(commons-statistics) branch master updated: STATISTICS-86: Add descriptive module to user guide

aherbert Tue, 25 Jun 2024 01:09:57 -0700

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-statistics.git



The following commit(s) were added to refs/heads/master by this push:
     new 5b7e0c0  STATISTICS-86: Add descriptive module to user guide
5b7e0c0 is described below

commit 5b7e0c0753c2519d0840e26479f6fc22def5ce9f
Author: Alex Herbert <aherb...@apache.org>
AuthorDate: Sun Jun 23 12:07:50 2024 +0100

    STATISTICS-86: Add descriptive module to user guide
---
 .../statistics/descriptive/UserGuideTest.java      | 113 +++++++---
 src/site/site.xml                                  |   1 +
 src/site/xdoc/index.xml                            |  25 +++
 src/site/xdoc/userguide/index.xml                  | 245 +++++++++++++++++++++
 4 files changed, 354 insertions(+), 30 deletions(-)

diff --git 
a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java
 
b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java
index 745b35c..1ed6e25 100644
--- 
a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java
+++ 
b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java
@@ -19,7 +19,9 @@ package org.apache.commons.statistics.descriptive;
 
 import java.util.Arrays;
 import java.util.EnumSet;
+import java.util.SplittableRandom;
 import java.util.function.DoubleSupplier;
+import java.util.function.IntConsumer;
 import java.util.stream.Collector;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
@@ -31,30 +33,31 @@ import org.junit.jupiter.api.Test;
  */
 class UserGuideTest {
     @Test
-    void testVariance() {
-        double[] values = {1, 1, 2, 3, 5, 8, 13, 21};
+    void testSingleStatistic() {
+        int[] values = {1, 1, 2, 3, 5, 8, 13, 21};
 
-        double v = Variance.of(values).getAsDouble();
+        double v = IntVariance.of(values).getAsDouble();
 
-        double v2 = Stream.of("one", "two", "three", "four")
-                          .mapToDouble(String::length)
-                          .collect(Variance::create, Variance::accept, 
Variance::combine)
-                          .getAsDouble();
+        double m = Stream.of("one", "two", "three", "four")
+                         .mapToInt(String::length)
+                         .collect(IntMean::create, IntMean::accept, 
IntMean::combine)
+                         .getAsDouble();
 
         // import numpy as np
         // np.var([1, 1, 2, 3, 5, 8, 13, 21], ddof=1)
         Assertions.assertEquals(49.92857142857143, v, 1e-10);
 
-        // np.var([3, 3, 5, 4], ddof=1)
-        Assertions.assertEquals(0.9166666666666666, v2);
+        // mean = sum([3, 3, 5, 4]) / 4
+        Assertions.assertEquals(15.0 / 4.0, m);
     }
 
     @Test
-    void testDoubleStatistics1() {
+    void testMultipleStatistics() {
         double[] data = {1, 2, 3, 4, 5, 6, 7, 8};
-        DoubleStatistics stats = DoubleStatistics.builder(
-            Statistic.MIN, Statistic.MAX, Statistic.VARIANCE)
-            .build(data);
+        // EnumSet and input array data
+        DoubleStatistics stats = DoubleStatistics.of(
+            EnumSet.of(Statistic.MIN, Statistic.MAX, Statistic.VARIANCE),
+            data);
         Assertions.assertEquals(1, stats.getAsDouble(Statistic.MIN));
         Assertions.assertEquals(8, stats.getAsDouble(Statistic.MAX));
         // Python numpy 1.24.4
@@ -62,12 +65,42 @@ class UserGuideTest {
         // np.std(np.arange(1, 9), ddof=1)
         Assertions.assertEquals(6.0, stats.getAsDouble(Statistic.VARIANCE), 
1e-10);
         // Get other statistics supported by the underlying computations
+        Assertions.assertTrue(stats.isSupported(Statistic.STANDARD_DEVIATION));
+        Assertions.assertTrue(stats.isSupported(Statistic.MEAN));
         Assertions.assertEquals(2.449489742783178, 
stats.getAsDouble(Statistic.STANDARD_DEVIATION), 1e-10);
         Assertions.assertEquals(4.5, stats.getAsDouble(Statistic.MEAN), 1e-10);
     }
 
     @Test
-    void testDoubleStatistics2() {
+    void testMultipleStatisticsIndividualValues() {
+        IntStatistics stats = IntStatistics.of(
+            Statistic.MIN, Statistic.MAX, Statistic.MEAN);
+        Stream.of("one", "two", "three", "four")
+            .mapToInt(String::length)
+            .forEach(stats::accept);
+
+        Assertions.assertEquals(3, stats.getAsInt(Statistic.MIN));
+        Assertions.assertEquals(5, stats.getAsInt(Statistic.MAX));
+        Assertions.assertEquals(15.0 / 4.0, stats.getAsDouble(Statistic.MEAN), 
1e-10);
+    }
+
+    @Test
+    void testMultipleStatisticsParallelStream() {
+        IntStatistics.Builder builder = IntStatistics.builder(
+            Statistic.MIN, Statistic.MAX, Statistic.MEAN);
+        IntStatistics stats =
+            Stream.of("one", "two", "three", "four")
+            .parallel()
+            .mapToInt(String::length)
+            .collect(builder::build, IntConsumer::accept, 
IntStatistics::combine);
+
+        Assertions.assertEquals(3, stats.getAsInt(Statistic.MIN));
+        Assertions.assertEquals(5, stats.getAsInt(Statistic.MAX));
+        Assertions.assertEquals(15.0 / 4.0, stats.getAsDouble(Statistic.MEAN), 
1e-10);
+    }
+
+    @Test
+    void testMultipleStatisticsMultipleArrays() {
         double[][] data = {
             {1, 2, 3, 4},
             {5, 6, 7, 8},
@@ -87,11 +120,12 @@ class UserGuideTest {
     }
 
     @Test
-    void testDoubleStatistics3() {
+    void testMultipleStatisticsCollector() {
         double[][] data = {
             {1, 2, 3, 4},
             {5, 6, 7, 8},
         };
+        // A re-usable Collector
         DoubleStatistics.Builder builder = DoubleStatistics.builder(
             Statistic.MIN, Statistic.MAX, Statistic.VARIANCE);
         Collector<double[], DoubleStatistics, DoubleStatistics> collector =
@@ -106,25 +140,18 @@ class UserGuideTest {
     }
 
     @Test
-    void testDoubleStatistics4() {
-        double[] data = {1, 2, 3, 4, 5, 6, 7, 8};
-        DoubleStatistics varStats = 
DoubleStatistics.builder(Statistic.VARIANCE).build(data);
-        DoubleStatistics meanStats = 
DoubleStatistics.builder(Statistic.MEAN).build(data);
+    void testStatisticsCombineCompatibility() {
+        double[] data1 = {1, 2, 3, 4};
+        double[] data2 = {5, 6, 7, 8};
+        DoubleStatistics varStats = 
DoubleStatistics.builder(Statistic.VARIANCE).build(data1);
+        DoubleStatistics meanStats = 
DoubleStatistics.builder(Statistic.MEAN).build(data2);
         Assertions.assertThrows(IllegalArgumentException.class, () -> 
varStats.combine(meanStats));
-        Assertions.assertDoesNotThrow(() -> meanStats.combine(varStats));
+        meanStats.combine(varStats);
+        Assertions.assertEquals(4.5, meanStats.getAsDouble(Statistic.MEAN), 
1e-10);
     }
 
     @Test
-    void testDoubleStatistics5() {
-        DoubleStatistics stats = DoubleStatistics.of(
-            EnumSet.of(Statistic.MIN, Statistic.MAX),
-            1, 1, 2, 3, 5, 8, 13);
-        Assertions.assertEquals(1, stats.getAsDouble(Statistic.MIN));
-        Assertions.assertEquals(13, stats.getAsDouble(Statistic.MAX));
-    }
-
-    @Test
-    void testDoubleStatistics6() {
+    void testStatisticsUpdating() {
         DoubleStatistics stats = DoubleStatistics.of(Statistic.MEAN, 
Statistic.MAX);
         DoubleSupplier mean = stats.getResult(Statistic.MEAN);
         DoubleSupplier max = stats.getResult(Statistic.MAX);
@@ -136,4 +163,30 @@ class UserGuideTest {
             // printf("[1 .. %d] mean=%.1f, max=%s%n", x, mean.getAsDouble(), 
max.getAsDouble());
         });
     }
+
+    @Test
+    void testMedian() {
+        double[] data = {8, 7, 6, 5, 4, 3, 2, 1};
+        double m = Median.withDefaults()
+                         .withCopy(true)
+                         .with(NaNPolicy.ERROR)
+                         .evaluate(data);
+        Assertions.assertEquals(4.5, m);
+    }
+
+    @Test
+    void testQuantile() {
+        int size = 10000;
+        double origin = 0;
+        double bound = 100;
+        double[] data =
+            new SplittableRandom(123)
+            .doubles(size, origin, bound)
+            .toArray();
+        double[] q = Quantile.withDefaults()
+                             .evaluate(data, 0.25, 0.5, 0.75);
+        Assertions.assertEquals(25.0, q[0], 0.5);
+        Assertions.assertEquals(50.0, q[1], 0.5);
+        Assertions.assertEquals(75.0, q[2], 0.5);
+    }
 }
diff --git a/src/site/site.xml b/src/site/site.xml
index 65e41c0..25b5fa3 100644
--- a/src/site/site.xml
+++ b/src/site/site.xml
@@ -48,6 +48,7 @@
       <item name="Contents" href="/userguide/index.html#toc"/>
       <item name="Overview" href="/userguide/index.html#overview"/>
       <item name="Example Modules" 
href="/userguide/index.html#example-modules"/>
+      <item name="Descriptive" href="/userguide/index.html#descriptive"/>
       <item name="Probability Distributions" 
href="/userguide/index.html#distributions"/>
       <item name="Inference" href="/userguide/index.html#inference"/>
       <item name="Ranking" href="/userguide/index.html#ranking"/>
diff --git a/src/site/xdoc/index.xml b/src/site/xdoc/index.xml
index 463e60f..296255b 100644
--- a/src/site/xdoc/index.xml
+++ b/src/site/xdoc/index.xml
@@ -30,6 +30,31 @@
         Apache Commons Statistics provides utilities for statistical 
applications.
       </p>
 
+
+      <p>
+        Descriptive statistics can be computed on array data or using the Java 
Stream API,
+        for example:
+      </p>
+
+<source class="prettyprint">
+int[] values = {1, 1, 2, 3, 5, 8, 13, 21};
+double v = IntVariance.of(values).getAsDouble();   // 49.929
+
+// A builder for specified statistics to allow
+// parallel computation on a stream of values
+IntStatistics.Builder builder = IntStatistics.builder(
+    Statistic.MIN, Statistic.MAX, Statistic.MEAN);
+IntStatistics stats =
+    Stream.of("one", "two", "three", "four")
+    .parallel()
+    .mapToInt(String::length)
+    .collect(builder::build, IntConsumer::accept, IntStatistics::combine);
+
+stats.getAsInt(Statistic.MIN);       // 3
+stats.getAsInt(Statistic.MAX);       // 5
+stats.getAsDouble(Statistic.MEAN);   // 15.0 / 4
+</source>
+
       <p>
         Support is provided for commonly used continuous and discrete 
distributions,
         for example:
diff --git a/src/site/xdoc/userguide/index.xml 
b/src/site/xdoc/userguide/index.xml
index c187a15..efdce38 100644
--- a/src/site/xdoc/userguide/index.xml
+++ b/src/site/xdoc/userguide/index.xml
@@ -38,6 +38,17 @@
         <li>
           <a href="#example-modules">Example Modules</a>
         </li>
+        <li>
+          <a href="#descriptive">Descriptive Statistics</a>
+          <ul>
+            <li>
+              <a href="#desc_overview">Overview</a>
+            </li>
+            <li>
+              <a href="#desc_examples">Examples</a>
+            </li>
+          </ul>
+        </li>
         <li>
           <a href="#distributions">Probability Distributions</a>
           <ul>
@@ -84,6 +95,11 @@
         Commons Statistics is divided into a number of submodules:
       </p>
       <ul>
+        <li>
+          <code><a href="../commons-statistics-descriptive/index.html">
+          commons-statistics-descriptive</a></code> - Provides computation
+          of descriptive statistics (mean, variance, median, etc.).
+        </li>
         <li>
           <code><a href="../commons-statistics-distribution/index.html">
           commons-statistics-distribution</a></code> - Provides interfaces
@@ -113,6 +129,234 @@
       <hr/>
     </section>
 
+    <section name="Descriptive Statistics" id="descriptive">
+      <p>
+        The <code>commons-statistics-descriptive</code> module provides 
descriptive statistics.
+      </p>
+      <subsection name="Overview" id="desc_overview">
+        <p>
+          The module provides classes to compute univariate statistics on 
<code>double</code>,
+          <code>int</code> and <code>long</code> data using array input or a 
Java stream. The
+          result is returned as a
+          <a 
href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/StatisticResult.html">StatisticResult</a>.
+          The <code>StatisticResult</code> provides methods to supply the 
result as a
+          <code>double</code>, <code>int</code>, <code>long</code> and 
<code>BigInteger</code>.
+          The integer types allow the exact result to be returned for integer 
data. For example
+          the sum of <code>long</code> values may not be exactly representable 
as a
+          <code>double</code> and may overflow a <code>long</code>.
+        </p>
+        <p>
+          Computation of an individual statistic involves creating an instance 
of
+          <code>StatisticResult</code> that can supply the current statistic 
value.
+          To allow addition of single values to update the statistic, instances
+          implement the primitive consumer interface for the supported type:
+          <code>DoubleConsumer</code>, <code>IntConsumer</code>, or 
<code>LongConsumer</code>.
+          Instances implement the
+          <a 
href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/StatisticAccumulator.html">StatisticAccumulator</a>
+          interface and can be combined with other instances. This allows 
computation in parallel on
+          subsets of data and combination to a final result. This can be 
performed using the
+          Java stream API.
+        </p>
+        <p>
+          Computation of multiple statistics uses a
+          <a 
href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/Statistic.html">Statistic</a>
+          enumeration to define the statistics to evaluate. A container class 
is created to
+          compute the desired statistics together and allows multiple 
statistics to be computed
+          concurrently using the Java stream API. Each statistic result is 
obtained using the
+          <code>Statistic</code> enum to access the required value. Providing 
a choice of the
+          statistics allows the user to avoid the computational cost of 
results that are not
+          required.
+        </p>
+        <p>
+          Note that <code>double</code> computations are subject to 
accumulated floating-point
+          rounding which can generate different results from permuted input 
data. Computation
+          on an array of <code>double</code> data can use a multiple-pass 
algorithm to increase
+          accuracy over a single-pass stream of values. This is the 
recommended approach if
+          all data is already stored in an array (i.e. is not dynamically 
generated).
+        </p>
+        <p>
+          If the data is an integer type then it is
+          preferred to use the integer specializations of the statistics.
+          Many implementations use exact integer math for the computation. 
This is faster than
+          using a <code>double</code> data type, more accurate and returns the 
same result
+          irrespective of the input order of the data. Note that for improved 
performance there
+          is no use of <code>BigInteger</code> in the accumulation of 
intermediate values; the
+          computation uses mutable fixed-precision integer classes for totals 
that may
+          overflow 64-bits.
+        </p>
+        <p>
+          Some statistics cannot be computed using a stream since they require 
all values for
+          computation, for example the median. These are evaluated on an array 
using an instance
+          of a computing class. The instance allows computation options to be 
changed. Instances
+          are immutable and the computation is thread-safe.
+        </p>
+      </subsection>
+      <subsection name="Examples" id="desc_examples">
+        <p>
+          Computation of a single statistic from an array of values, or a 
stream of data:
+        </p>
+<source class="prettyprint">
+int[] values = {1, 1, 2, 3, 5, 8, 13, 21};
+
+double v = IntVariance.of(values).getAsDouble();
+
+double m = Stream.of("one", "two", "three", "four")
+                 .mapToInt(String::length)
+                 .collect(IntMean::create, IntMean::accept, IntMean::combine)
+                 .getAsDouble();
+</source>
+        <p>
+          Computation of multiple statistics uses the <code>Statistic</code> 
enum.
+          These can be specified using an <code>EnumSet</code> together with 
the input array data.
+          Note that some statistics share the same underlying computation, for 
example the variance,
+          standard deviation and mean. When a container class is constructed 
using one of the
+          statistics, the other co-computed statistics are available in the 
result even if not
+          specified during construction. The <code>isSupported</code> method 
can
+          identify all results that are available from the container class.
+        </p>
+<source class="prettyprint">
+double[] data = {1, 2, 3, 4, 5, 6, 7, 8};
+DoubleStatistics stats = DoubleStatistics.of(
+    EnumSet.of(Statistic.MIN, Statistic.MAX, Statistic.VARIANCE),
+    data);
+
+stats.getAsDouble(Statistic.MIN);        // 1.0
+stats.getAsDouble(Statistic.MAX);        // 8.0
+stats.getAsDouble(Statistic.VARIANCE);   // 6.0
+
+// Get other statistics supported by the underlying computations
+stats.isSupported(Statistic.STANDARD_DEVIATION);    // true
+stats.getAsDouble(Statistic.STANDARD_DEVIATION);    // 2.449...
+</source>
+        <p>
+          Computation of multiple statistics on individual values can 
accumulate the results
+          using the <code>accept</code> method of the container class:
+        </p>
+<source class="prettyprint">
+IntStatistics stats = IntStatistics.of(
+    Statistic.MIN, Statistic.MAX, Statistic.MEAN);
+Stream.of("one", "two", "three", "four")
+    .mapToInt(String::length)
+    .forEach(stats::accept);
+
+stats.getAsInt(Statistic.MIN);       // 3
+stats.getAsInt(Statistic.MAX);       // 5
+stats.getAsDouble(Statistic.MEAN);   // 15.0 / 4
+</source>
+        <p>
+          Computation of multiple statistics on a stream of values in parallel.
+          This requires use of a <code>Builder</code> that
+          can supply instances of the container class to each worker with the
+          <code>build</code> method; populated using <code>accept</code>; and 
then collected
+          using <code>combine</code>:
+        </p>
+<source class="prettyprint">
+IntStatistics.Builder builder = IntStatistics.builder(
+    Statistic.MIN, Statistic.MAX, Statistic.MEAN);
+IntStatistics stats =
+    Stream.of("one", "two", "three", "four")
+    .parallel()
+    .mapToInt(String::length)
+    .collect(builder::build, IntConsumer::accept, IntStatistics::combine);
+
+stats.getAsInt(Statistic.MIN);       // 3
+stats.getAsInt(Statistic.MAX);       // 5
+stats.getAsDouble(Statistic.MEAN);   // 15.0 / 4
+</source>
+        <p>
+          Computation on multiple arrays. This requires use of a 
<code>Builder</code> that
+          can supply instances of the container class to compute each array 
with the
+          <code>build</code> method:
+        </p>
+<source class="prettyprint">
+double[][] data = {
+    {1, 2, 3, 4},
+    {5, 6, 7, 8},
+};
+DoubleStatistics.Builder builder = DoubleStatistics.builder(
+    Statistic.MIN, Statistic.MAX, Statistic.VARIANCE);
+DoubleStatistics stats = Arrays.stream(data)
+    .map(builder::build)
+    .reduce(DoubleStatistics::combine)
+    .get();
+
+stats.getAsDouble(Statistic.MIN);        // 1.0
+stats.getAsDouble(Statistic.MAX);        // 8.0
+stats.getAsDouble(Statistic.VARIANCE);   // 6.0
+
+// Get other statistics supported by the underlying computations
+stats.isSupported(Statistic.MEAN);    // true
+stats.getAsDouble(Statistic.MEAN);    // 4.5
+</source>
+        <p>
+          If computation on multiple arrays is to be repeated then this can be 
done with a
+          reusable <code>java.util.stream.Collector</code>:
+        </p>
+<source class="prettyprint">
+double[][] data = {
+    {1, 2, 3, 4},
+    {5, 6, 7, 8},
+};
+DoubleStatistics.Builder builder = DoubleStatistics.builder(
+    Statistic.MIN, Statistic.MAX, Statistic.VARIANCE);
+Collector&lt;double[], DoubleStatistics, DoubleStatistics&gt; collector =
+    Collector.of(builder::build, (s, d) -> s.combine(builder.build(d)), 
DoubleStatistics::combine);
+DoubleStatistics stats = Arrays.stream(data).collect(collector);
+
+stats.getAsDouble(Statistic.MIN);        // 1.0
+stats.getAsDouble(Statistic.MAX);        // 8.0
+stats.getAsDouble(Statistic.VARIANCE);   // 6.0
+</source>
+        <p>
+          Combination of multiple statistics requires them to be compatible, 
i.e. all supported
+          statistics in one container are also supported in the other. Note 
that combining another
+          container ignores any unsupported statistics and the compatibility
+          may be asymmetric.
+        </p>
+<source class="prettyprint">
+double[] data1 = {1, 2, 3, 4};
+double[] data2 = {5, 6, 7, 8};
+DoubleStatistics varStats = 
DoubleStatistics.builder(Statistic.VARIANCE).build(data1);
+DoubleStatistics meanStats = 
DoubleStatistics.builder(Statistic.MEAN).build(data2);
+
+// throws IllegalArgumentException
+varStats.combine(meanStats);
+
+// OK - mean is updated to 4.5
+meanStats.combine(varStats);
+</source>
+      <p>
+        Computation of a statistic that requires all data (i.e. does not 
support the
+        <code>Stream</code> API) uses a configurable instance of the computing 
class:
+      </p>
+<source class="prettyprint">
+double[] data = {8, 7, 6, 5, 4, 3, 2, 1};
+// Configure the statistic
+double m = Median.withDefaults()
+                 .withCopy(true)          // do not modify the input array
+                 .with(NaNPolicy.ERROR)   // raise an exception for NaN
+                 .evaluate(data);
+// m = 4.5
+</source>
+      <p>
+        Computation of multiple values of a statistic that requires all data:
+      </p>
+<source class="prettyprint">
+int size = 10000;
+double origin = 0;
+double bound = 100;
+double[] data =
+    new SplittableRandom(123)
+    .doubles(size, origin, bound)
+    .toArray();
+// Evaluate multiple statistics on the same data
+double[] q = Quantile.withDefaults()
+                     .evaluate(data, 0.25, 0.5, 0.75);   // probabilities
+// q ~ [25.0, 50.0, 75.0]
+</source>
+      </subsection>
+    </section>
+
     <section name="Probability Distributions" id="distributions">
       <subsection name="Overview" id="dist_overview">
         <p>
@@ -354,6 +598,7 @@ double x2 = chi2.inverseSurvivalProbability(q);
         </p>
       </subsection>
     </section>
+
     <section name="Inference" id="inference">
       <p>
         The <code>commons-statistics-inference</code> module provides 
hypothesis testing.

(commons-statistics) branch master updated: STATISTICS-86: Add descriptive module to user guide

Reply via email to