This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
The following commit(s) were added to refs/heads/master by this push: new 5b7e0c0 STATISTICS-86: Add descriptive module to user guide 5b7e0c0 is described below commit 5b7e0c0753c2519d0840e26479f6fc22def5ce9f Author: Alex Herbert <aherb...@apache.org> AuthorDate: Sun Jun 23 12:07:50 2024 +0100 STATISTICS-86: Add descriptive module to user guide --- .../statistics/descriptive/UserGuideTest.java | 113 +++++++--- src/site/site.xml | 1 + src/site/xdoc/index.xml | 25 +++ src/site/xdoc/userguide/index.xml | 245 +++++++++++++++++++++ 4 files changed, 354 insertions(+), 30 deletions(-) diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java index 745b35c..1ed6e25 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/UserGuideTest.java @@ -19,7 +19,9 @@ package org.apache.commons.statistics.descriptive; import java.util.Arrays; import java.util.EnumSet; +import java.util.SplittableRandom; import java.util.function.DoubleSupplier; +import java.util.function.IntConsumer; import java.util.stream.Collector; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -31,30 +33,31 @@ import org.junit.jupiter.api.Test; */ class UserGuideTest { @Test - void testVariance() { - double[] values = {1, 1, 2, 3, 5, 8, 13, 21}; + void testSingleStatistic() { + int[] values = {1, 1, 2, 3, 5, 8, 13, 21}; - double v = Variance.of(values).getAsDouble(); + double v = IntVariance.of(values).getAsDouble(); - double v2 = Stream.of("one", "two", "three", "four") - .mapToDouble(String::length) - .collect(Variance::create, Variance::accept, Variance::combine) - .getAsDouble(); + double m = Stream.of("one", "two", "three", "four") + .mapToInt(String::length) + 
.collect(IntMean::create, IntMean::accept, IntMean::combine) + .getAsDouble(); // import numpy as np // np.var([1, 1, 2, 3, 5, 8, 13, 21], ddof=1) Assertions.assertEquals(49.92857142857143, v, 1e-10); - // np.var([3, 3, 5, 4], ddof=1) - Assertions.assertEquals(0.9166666666666666, v2); + // mean = sum([3, 3, 5, 4]) / 4 + Assertions.assertEquals(15.0 / 4.0, m); } @Test - void testDoubleStatistics1() { + void testMultipleStatistics() { double[] data = {1, 2, 3, 4, 5, 6, 7, 8}; - DoubleStatistics stats = DoubleStatistics.builder( - Statistic.MIN, Statistic.MAX, Statistic.VARIANCE) - .build(data); + // EnumSet and input array data + DoubleStatistics stats = DoubleStatistics.of( + EnumSet.of(Statistic.MIN, Statistic.MAX, Statistic.VARIANCE), + data); Assertions.assertEquals(1, stats.getAsDouble(Statistic.MIN)); Assertions.assertEquals(8, stats.getAsDouble(Statistic.MAX)); // Python numpy 1.24.4 @@ -62,12 +65,42 @@ class UserGuideTest { // np.std(np.arange(1, 9), ddof=1) Assertions.assertEquals(6.0, stats.getAsDouble(Statistic.VARIANCE), 1e-10); // Get other statistics supported by the underlying computations + Assertions.assertTrue(stats.isSupported(Statistic.STANDARD_DEVIATION)); + Assertions.assertTrue(stats.isSupported(Statistic.MEAN)); Assertions.assertEquals(2.449489742783178, stats.getAsDouble(Statistic.STANDARD_DEVIATION), 1e-10); Assertions.assertEquals(4.5, stats.getAsDouble(Statistic.MEAN), 1e-10); } @Test - void testDoubleStatistics2() { + void testMultipleStatisticsIndividualValues() { + IntStatistics stats = IntStatistics.of( + Statistic.MIN, Statistic.MAX, Statistic.MEAN); + Stream.of("one", "two", "three", "four") + .mapToInt(String::length) + .forEach(stats::accept); + + Assertions.assertEquals(3, stats.getAsInt(Statistic.MIN)); + Assertions.assertEquals(5, stats.getAsInt(Statistic.MAX)); + Assertions.assertEquals(15.0 / 4.0, stats.getAsDouble(Statistic.MEAN), 1e-10); + } + + @Test + void testMultipleStatisticsParallelStream() { + IntStatistics.Builder 
builder = IntStatistics.builder( + Statistic.MIN, Statistic.MAX, Statistic.MEAN); + IntStatistics stats = + Stream.of("one", "two", "three", "four") + .parallel() + .mapToInt(String::length) + .collect(builder::build, IntConsumer::accept, IntStatistics::combine); + + Assertions.assertEquals(3, stats.getAsInt(Statistic.MIN)); + Assertions.assertEquals(5, stats.getAsInt(Statistic.MAX)); + Assertions.assertEquals(15.0 / 4.0, stats.getAsDouble(Statistic.MEAN), 1e-10); + } + + @Test + void testMultipleStatisticsMultipleArrays() { double[][] data = { {1, 2, 3, 4}, {5, 6, 7, 8}, @@ -87,11 +120,12 @@ class UserGuideTest { } @Test - void testDoubleStatistics3() { + void testMultipleStatisticsCollector() { double[][] data = { {1, 2, 3, 4}, {5, 6, 7, 8}, }; + // A re-usable Collector DoubleStatistics.Builder builder = DoubleStatistics.builder( Statistic.MIN, Statistic.MAX, Statistic.VARIANCE); Collector<double[], DoubleStatistics, DoubleStatistics> collector = @@ -106,25 +140,18 @@ class UserGuideTest { } @Test - void testDoubleStatistics4() { - double[] data = {1, 2, 3, 4, 5, 6, 7, 8}; - DoubleStatistics varStats = DoubleStatistics.builder(Statistic.VARIANCE).build(data); - DoubleStatistics meanStats = DoubleStatistics.builder(Statistic.MEAN).build(data); + void testStatisticsCombineCompatibility() { + double[] data1 = {1, 2, 3, 4}; + double[] data2 = {5, 6, 7, 8}; + DoubleStatistics varStats = DoubleStatistics.builder(Statistic.VARIANCE).build(data1); + DoubleStatistics meanStats = DoubleStatistics.builder(Statistic.MEAN).build(data2); Assertions.assertThrows(IllegalArgumentException.class, () -> varStats.combine(meanStats)); - Assertions.assertDoesNotThrow(() -> meanStats.combine(varStats)); + meanStats.combine(varStats); + Assertions.assertEquals(4.5, meanStats.getAsDouble(Statistic.MEAN), 1e-10); } @Test - void testDoubleStatistics5() { - DoubleStatistics stats = DoubleStatistics.of( - EnumSet.of(Statistic.MIN, Statistic.MAX), - 1, 1, 2, 3, 5, 8, 13); - 
Assertions.assertEquals(1, stats.getAsDouble(Statistic.MIN)); - Assertions.assertEquals(13, stats.getAsDouble(Statistic.MAX)); - } - - @Test - void testDoubleStatistics6() { + void testStatisticsUpdating() { DoubleStatistics stats = DoubleStatistics.of(Statistic.MEAN, Statistic.MAX); DoubleSupplier mean = stats.getResult(Statistic.MEAN); DoubleSupplier max = stats.getResult(Statistic.MAX); @@ -136,4 +163,30 @@ class UserGuideTest { // printf("[1 .. %d] mean=%.1f, max=%s%n", x, mean.getAsDouble(), max.getAsDouble()); }); } + + @Test + void testMedian() { + double[] data = {8, 7, 6, 5, 4, 3, 2, 1}; + double m = Median.withDefaults() + .withCopy(true) + .with(NaNPolicy.ERROR) + .evaluate(data); + Assertions.assertEquals(4.5, m); + } + + @Test + void testQuantile() { + int size = 10000; + double origin = 0; + double bound = 100; + double[] data = + new SplittableRandom(123) + .doubles(size, origin, bound) + .toArray(); + double[] q = Quantile.withDefaults() + .evaluate(data, 0.25, 0.5, 0.75); + Assertions.assertEquals(25.0, q[0], 0.5); + Assertions.assertEquals(50.0, q[1], 0.5); + Assertions.assertEquals(75.0, q[2], 0.5); + } } diff --git a/src/site/site.xml b/src/site/site.xml index 65e41c0..25b5fa3 100644 --- a/src/site/site.xml +++ b/src/site/site.xml @@ -48,6 +48,7 @@ <item name="Contents" href="/userguide/index.html#toc"/> <item name="Overview" href="/userguide/index.html#overview"/> <item name="Example Modules" href="/userguide/index.html#example-modules"/> + <item name="Descriptive" href="/userguide/index.html#descriptive"/> <item name="Probability Distributions" href="/userguide/index.html#distributions"/> <item name="Inference" href="/userguide/index.html#inference"/> <item name="Ranking" href="/userguide/index.html#ranking"/> diff --git a/src/site/xdoc/index.xml b/src/site/xdoc/index.xml index 463e60f..296255b 100644 --- a/src/site/xdoc/index.xml +++ b/src/site/xdoc/index.xml @@ -30,6 +30,31 @@ Apache Commons Statistics provides utilities for statistical 
applications. </p> + + <p> + Descriptive statistics can be computed on array data or using the Java Stream API, + for example: + </p> + +<source class="prettyprint"> +int[] values = {1, 1, 2, 3, 5, 8, 13, 21}; +double v = IntVariance.of(values).getAsDouble(); // 49.929 + +// A builder for specified statistics to allow +// parallel computation on a stream of values +IntStatistics.Builder builder = IntStatistics.builder( + Statistic.MIN, Statistic.MAX, Statistic.MEAN); +IntStatistics stats = + Stream.of("one", "two", "three", "four") + .parallel() + .mapToInt(String::length) + .collect(builder::build, IntConsumer::accept, IntStatistics::combine); + +stats.getAsInt(Statistic.MIN); // 3 +stats.getAsInt(Statistic.MAX); // 5 +stats.getAsDouble(Statistic.MEAN); // 15.0 / 4 +</source> + <p> Support is provided for commonly used continuous and discrete distributions, for example: diff --git a/src/site/xdoc/userguide/index.xml b/src/site/xdoc/userguide/index.xml index c187a15..efdce38 100644 --- a/src/site/xdoc/userguide/index.xml +++ b/src/site/xdoc/userguide/index.xml @@ -38,6 +38,17 @@ <li> <a href="#example-modules">Example Modules</a> </li> + <li> + <a href="#descriptive">Descriptive Statistics</a> + <ul> + <li> + <a href="#desc_overview">Overview</a> + </li> + <li> + <a href="#desc_examples">Examples</a> + </li> + </ul> + </li> <li> <a href="#distributions">Probability Distributions</a> <ul> @@ -84,6 +95,11 @@ Commons Statistics is divided into a number of submodules: </p> <ul> + <li> + <code><a href="../commons-statistics-descriptive/index.html"> + commons-statistics-descriptive</a></code> - Provides computation + of descriptive statistics (mean, variance, median, etc). 
+ </li> <li> <code><a href="../commons-statistics-distribution/index.html"> commons-statistics-distribution</a></code> - Provides interfaces @@ -113,6 +129,234 @@ <hr/> </section> + <section name="Descriptive Statistics" id="descriptive"> + <p> + The <code>commons-statistics-descriptive</code> module provides descriptive statistics. + </p> + <subsection name="Overview" id="desc_overview"> + <p> + The module provides classes to compute univariate statistics on <code>double</code>, + <code>int</code> and <code>long</code> data using array input or a Java stream. The + result is returned as a + <a href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/StatisticResult.html">StatisticResult</a>. + The <code>StatisticResult</code> provides methods to supply the result as a + <code>double</code>, <code>int</code>, <code>long</code> and <code>BigInteger</code>. + The integer types allow the exact result to be returned for integer data. For example + the sum of <code>long</code> values may not be exactly representable as a + <code>double</code> and may overflow a <code>long</code>. + </p> + <p> + Computation of an individual statistic involves creating an instance of + <code>StatisticResult</code> that can supply the current statistic value. + To allow addition of single values to update the statistic, instances + implement the primitive consumer interface for the supported type: + <code>DoubleConsumer</code>, <code>IntConsumer</code>, or <code>LongConsumer</code>. + Instances implement the + <a href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/StatisticAccumulator.html">StatisticAccumulator</a> + interface and can be combined with other instances. This allows computation in parallel on + subsets of data and combination to a final result. This can be performed using the + Java stream API. 
+ </p> + <p> + Computation of multiple statistics uses a + <a href="../commons-statistics-descriptive/apidocs/org/apache/commons/statistics/descriptive/Statistic.html">Statistic</a> + enumeration to define the statistics to evaluate. A container class is created to + compute the desired statistics together and allows multiple statistics to be computed + concurrently using the Java stream API. Each statistic result is obtained using the + <code>Statistic</code> enum to access the required value. Providing a choice of the + statistics allows the user to avoid the computational cost of results that are not + required. + </p> + <p> + Note that <code>double</code> computations are subject to accumulated floating-point + rounding which can generate different results from permuted input data. Computation + on an array of <code>double</code> data can use a multiple-pass algorithm to increase + accuracy over a single-pass stream of values. This is the recommended approach if + all data is already stored in an array (i.e. is not dynamically generated). + </p> + <p> + If the data is an integer type then it is + preferred to use the integer specializations of the statistics. + Many implementations use exact integer math for the computation. This is faster than + using a <code>double</code> data type, more accurate and returns the same result + irrespective of the input order of the data. Note that for improved performance there + is no use of <code>BigInteger</code> in the accumulation of intermediate values; the + computation uses mutable fixed-precision integer classes for totals that may + overflow 64-bits. + </p> + <p> + Some statistics cannot be computed using a stream since they require all values for + computation, for example the median. These are evaluated on an array using an instance + of a computing class. The instance allows computation options to be changed. Instances + are immutable and the computation is thread-safe. 
+ </p> + </subsection> + <subsection name="Examples" id="desc_examples"> + <p> + Computation of a single statistic from an array of values, or a stream of data: + </p> +<source class="prettyprint"> +int[] values = {1, 1, 2, 3, 5, 8, 13, 21}; + +double v = IntVariance.of(values).getAsDouble(); + +double m = Stream.of("one", "two", "three", "four") + .mapToInt(String::length) + .collect(IntMean::create, IntMean::accept, IntMean::combine) + .getAsDouble(); +</source> + <p> + Computation of multiple statistics uses the <code>Statistic</code> enum. + These can be specified using an <code>EnumSet</code> together with the input array data. + Note that some statistics share the same underlying computation, for example the variance, + standard deviation and mean. When a container class is constructed using one of the + statistics, the other co-computed statistics are available in the result even if not + specified during construction. The <code>isSupported</code> method can + identify all results that are available from the container class. + </p> +<source class="prettyprint"> +double[] data = {1, 2, 3, 4, 5, 6, 7, 8}; +DoubleStatistics stats = DoubleStatistics.of( + EnumSet.of(Statistic.MIN, Statistic.MAX, Statistic.VARIANCE), + data); + +stats.getAsDouble(Statistic.MIN); // 1.0 +stats.getAsDouble(Statistic.MAX); // 8.0 +stats.getAsDouble(Statistic.VARIANCE); // 6.0 + +// Get other statistics supported by the underlying computations +stats.isSupported(Statistic.STANDARD_DEVIATION); // true +stats.getAsDouble(Statistic.STANDARD_DEVIATION); // 2.449... 
+</source> + <p> + Computation of multiple statistics on individual values can accumulate the results + using the <code>accept</code> method of the container class: + </p> +<source class="prettyprint"> +IntStatistics stats = IntStatistics.of( + Statistic.MIN, Statistic.MAX, Statistic.MEAN); +Stream.of("one", "two", "three", "four") + .mapToInt(String::length) + .forEach(stats::accept); + +stats.getAsInt(Statistic.MIN); // 3 +stats.getAsInt(Statistic.MAX); // 5 +stats.getAsDouble(Statistic.MEAN); // 15.0 / 4 +</source> + <p> + Computation of multiple statistics on a stream of values in parallel. + This requires use of a <code>Builder</code> that + can supply instances of the container class to each worker with the + <code>build</code> method; populated using <code>accept</code>; and then collected + using <code>combine</code>: + </p> +<source class="prettyprint"> +IntStatistics.Builder builder = IntStatistics.builder( + Statistic.MIN, Statistic.MAX, Statistic.MEAN); +IntStatistics stats = + Stream.of("one", "two", "three", "four") + .parallel() + .mapToInt(String::length) + .collect(builder::build, IntConsumer::accept, IntStatistics::combine); + +stats.getAsInt(Statistic.MIN); // 3 +stats.getAsInt(Statistic.MAX); // 5 +stats.getAsDouble(Statistic.MEAN); // 15.0 / 4 +</source> + <p> + Computation on multiple arrays. 
This requires use of a <code>Builder</code> that + can supply instances of the container class to compute each array with the + <code>build</code> method: + </p> +<source class="prettyprint"> +double[][] data = { + {1, 2, 3, 4}, + {5, 6, 7, 8}, +}; +DoubleStatistics.Builder builder = DoubleStatistics.builder( + Statistic.MIN, Statistic.MAX, Statistic.VARIANCE); +DoubleStatistics stats = Arrays.stream(data) + .map(builder::build) + .reduce(DoubleStatistics::combine) + .get(); + +stats.getAsDouble(Statistic.MIN); // 1.0 +stats.getAsDouble(Statistic.MAX); // 8.0 +stats.getAsDouble(Statistic.VARIANCE); // 6.0 + +// Get other statistics supported by the underlying computations +stats.isSupported(Statistic.MEAN); // true +stats.getAsDouble(Statistic.MEAN); // 4.5 +</source> + <p> + If computation on multiple arrays is to be repeated then this can be done with a + reusable <code>java.util.stream.Collector</code>: + </p> +<source class="prettyprint"> +double[][] data = { + {1, 2, 3, 4}, + {5, 6, 7, 8}, +}; +DoubleStatistics.Builder builder = DoubleStatistics.builder( + Statistic.MIN, Statistic.MAX, Statistic.VARIANCE); +Collector&lt;double[], DoubleStatistics, DoubleStatistics&gt; collector = + Collector.of(builder::build, (s, d) -> s.combine(builder.build(d)), DoubleStatistics::combine); +DoubleStatistics stats = Arrays.stream(data).collect(collector); + +stats.getAsDouble(Statistic.MIN); // 1.0 +stats.getAsDouble(Statistic.MAX); // 8.0 +stats.getAsDouble(Statistic.VARIANCE); // 6.0 +</source> + <p> + Combination of multiple statistics requires them to be compatible, i.e. all supported + statistics in one container are also supported in the other. Note that combining another + container ignores any unsupported statistics and the compatibility + may be asymmetric. 
+ </p> +<source class="prettyprint"> +double[] data1 = {1, 2, 3, 4}; +double[] data2 = {5, 6, 7, 8}; +DoubleStatistics varStats = DoubleStatistics.builder(Statistic.VARIANCE).build(data1); +DoubleStatistics meanStats = DoubleStatistics.builder(Statistic.MEAN).build(data2); + +// throws IllegalArgumentException +varStats.combine(meanStats); + +// OK - mean is updated to 4.5 +meanStats.combine(varStats); +</source> + <p> + Computation of a statistic that requires all data (i.e. does not support the + <code>Stream</code> API) uses a configurable instance of the computing class: + </p> +<source class="prettyprint"> +double[] data = {8, 7, 6, 5, 4, 3, 2, 1}; +// Configure the statistic +double m = Median.withDefaults() + .withCopy(true) // do not modify the input array + .with(NaNPolicy.ERROR) // raise an exception for NaN + .evaluate(data); +// m = 4.5 +</source> + <p> + Computation of multiple values of a statistic that requires all data: + </p> +<source class="prettyprint"> +int size = 10000; +double origin = 0; +double bound = 100; +double[] data = + new SplittableRandom(123) + .doubles(size, origin, bound) + .toArray(); +// Evaluate multiple quantiles on the same data +double[] q = Quantile.withDefaults() + .evaluate(data, 0.25, 0.5, 0.75); // probabilities +// q ~ [25.0, 50.0, 75.0] +</source> + </subsection> + </section> + <section name="Probability Distributions" id="distributions"> <subsection name="Overview" id="dist_overview"> <p> @@ -354,6 +598,7 @@ double x2 = chi2.inverseSurvivalProbability(q); </p> </subsection> </section> + <section name="Inference" id="inference"> <p> The <code>commons-statistics-inference</code> module provides hypothesis testing.