This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
The following commit(s) were added to refs/heads/master by this push: new 2ef1d41 STATISTICS-85: Quantile implementation for discrete int[] data 2ef1d41 is described below commit 2ef1d41becd3e92683a79f6455be35d414496d7d Author: Alex Herbert <aherb...@apache.org> AuthorDate: Wed Jun 19 17:44:41 2024 +0100 STATISTICS-85: Quantile implementation for discrete int[] data --- commons-statistics-descriptive/pom.xml | 6 - .../{DoubleMath.java => Interpolation.java} | 18 +- .../commons/statistics/descriptive/Median.java | 51 +++- .../commons/statistics/descriptive/Quantile.java | 131 +++++++-- ...{DoubleMathTest.java => InterpolationTest.java} | 76 +++-- .../commons/statistics/descriptive/MedianTest.java | 157 ++++++++-- .../statistics/descriptive/QuantileTest.java | 320 +++++++++++++++++---- .../jmh/descriptive/MedianPerformance.java | 137 +++++++-- .../jmh/descriptive/QuantilePerformance.java | 222 +++++++++++--- 9 files changed, 927 insertions(+), 191 deletions(-) diff --git a/commons-statistics-descriptive/pom.xml b/commons-statistics-descriptive/pom.xml index ecb8f81..e53e4ca 100644 --- a/commons-statistics-descriptive/pom.xml +++ b/commons-statistics-descriptive/pom.xml @@ -66,12 +66,6 @@ <scope>test</scope> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> - <scope>test</scope> - </dependency> - <!-- Required for DoubleEquivalence --> <dependency> <groupId>org.apache.commons</groupId> diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/DoubleMath.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Interpolation.java similarity index 91% rename from commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/DoubleMath.java rename to commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Interpolation.java index 579ebcb..d2bf6dd 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/DoubleMath.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Interpolation.java @@ -17,13 +17,13 @@ package org.apache.commons.statistics.descriptive; /** - * Support class for double math. + * Support class for interpolation. * * @since 1.1 */ -final class DoubleMath { +final class Interpolation { /** No instances. */ - private DoubleMath() {} + private Interpolation() {} /** * Compute the arithmetic mean of the two values taking care to avoid overflow. @@ -41,6 +41,18 @@ final class DoubleMath { return x * 0.5 + y * 0.5; } + /** + * Compute the arithmetic mean of the two values. + * + * @param x Value. + * @param y Value. + * @return the mean + */ + static double mean(int x, int y) { + // long arithmetic handles a 32-bit signed integer + return ((long) x + y) * 0.5; + } + /** * Linear interpolation between sorted values {@code a <= b} using the * interpolant {@code t} taking care to avoid overflow. diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java index 03c1869..1946ec8 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java @@ -36,6 +36,8 @@ import org.apache.commons.numbers.arrays.Selection; * * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values. * + * <p>Instances of this class are immutable and thread-safe. + * * @see #with(NaNPolicy) * @see <a href="https://en.wikipedia.org/wiki/Median">Median (Wikipedia)</a> * @since 1.1 @@ -141,14 +143,55 @@ public final class Median { if (n <= 2) { switch (n) { case 2: - // Sort data handling NaN and signed zeros. - // Matches the default behaviour of Quantile using p=0.5. + // Sorting the array matches the behaviour of Quantile for n==2 + // Handle NaN and signed zeros if (Double.compare(x[1], x[0]) < 0) { final double t = x[0]; x[0] = x[1]; x[1] = t; } - return DoubleMath.mean(x[0], x[1]); + return Interpolation.mean(x[0], x[1]); + case 1: + return x[0]; + default: + return Double.NaN; + } + } + // Median index + final int m = n >>> 1; + // Odd + if ((n & 0x1) == 1) { + Selection.select(x, 0, n, m); + return x[m]; + } + // Even: require (m-1, m) + Selection.select(x, 0, n, new int[] {m - 1, m}); + return Interpolation.mean(x[m - 1], x[m]); + } + + /** + * Evaluate the median. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @return the median + */ + public double evaluate(int[] values) { + final int[] x = copy ? values.clone() : values; + final int n = values.length; + // Special cases + if (n <= 2) { + switch (n) { + case 2: + // Sorting the array matches the behaviour of Quantile for n==2 + if (x[1] < x[0]) { + final int t = x[0]; + x[0] = x[1]; + x[1] = t; + } + return Interpolation.mean(x[0], x[1]); case 1: return x[0]; default: @@ -164,6 +207,6 @@ public final class Median { } // Even: require (m-1, m) Selection.select(x, 0, n, new int[] {m - 1, m}); - return DoubleMath.mean(x[m - 1], x[m]); + return Interpolation.mean(x[m - 1], x[m]); } } diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java index 29117b9..f974d1b 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java @@ -42,6 +42,8 @@ import org.apache.commons.numbers.arrays.Selection; * * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values. * + * <p>Instances of this class are immutable and thread-safe. + * * @see #with(NaNPolicy) * @see <a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a> * @since 1.1 @@ -237,7 +239,7 @@ public final class Quantile { // Partition and compute if (pos > i) { Selection.select(x, 0, n, new int[] {i, i + 1}); - return DoubleMath.interpolate(x[i], x[i + 1], pos - i); + return Interpolation.interpolate(x[i], x[i + 1], pos - i); } Selection.select(x, 0, n, i); return x[i]; @@ -269,27 +271,95 @@ public final class Quantile { } // Collect interpolation positions. We use the output q as storage. - final int[] indices = new int[p.length << 1]; - int count = 0; + final int[] indices = computeIndices(n, p, q); + + // Partition + Selection.select(x, 0, n, indices); + + // Compute for (int k = 0; k < p.length; k++) { - final double pos = estimationType.index(p[k], n); - q[k] = pos; - final int i = (int) pos; - indices[count++] = i; - if (pos > i) { - // Require the next index for interpolation - indices[count++] = i + 1; + final int i = (int) q[k]; + if (q[k] > i) { + q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - i); + } else { + q[k] = x[i]; } } + return q; + } + + /** + * Evaluate the {@code p}-th quantile of the values. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * <p><strong>Performance</strong> + * + * <p>It is not recommended to use this method for repeat calls for different quantiles + * within the same values. The {@link #evaluate(int[], double...)} method should be used + * which provides better performance. + * + * @param values Values. + * @param p Probability for the quantile to compute. + * @return the quantile + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} + * @see #evaluate(int[], double...) + */ + public double evaluate(int[] values, double p) { + checkProbability(p); + final int n = values.length; + // Special cases + if (n <= 1) { + return n == 0 ? Double.NaN : values[0]; + } + final double pos = estimationType.index(p, n); + final int i = (int) pos; + + // Partition and compute + final int[] x = copy ? values.clone() : values; + if (pos > i) { + Selection.select(x, 0, n, new int[] {i, i + 1}); + return Interpolation.interpolate(x[i], x[i + 1], pos - i); + } + Selection.select(x, 0, n, i); + return x[i]; + } + + /** + * Evaluate the {@code p}-th quantiles of the values. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param p Probabilities for the quantiles to compute. + * @return the quantiles + * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; + * or no probabilities are specified. + */ + public double[] evaluate(int[] values, double... p) { + checkProbabilities(p); + final int n = values.length; + // Special cases + final double[] q = new double[p.length]; + if (n <= 1) { + Arrays.fill(q, n == 0 ? Double.NaN : values[0]); + return q; + } + + // Collect interpolation positions. We use the output q as storage. + final int[] indices = computeIndices(n, p, q); // Partition - Selection.select(x, 0, n, truncate(indices, count)); + final int[] x = copy ? values.clone() : values; + Selection.select(x, 0, n, indices); // Compute for (int k = 0; k < p.length; k++) { final int i = (int) q[k]; if (q[k] > i) { - q[k] = DoubleMath.interpolate(x[i], x[i + 1], q[k] - i); + q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - i); } else { q[k] = x[i]; } @@ -327,7 +397,7 @@ public final class Quantile { final double v1 = values.applyAsDouble(i); if (pos > i) { final double v2 = values.applyAsDouble(i + 1); - return DoubleMath.interpolate(v1, v2, pos - i); + return Interpolation.interpolate(v1, v2, pos - i); } return v1; } @@ -365,7 +435,7 @@ public final class Quantile { final double v1 = values.applyAsDouble(i); if (pos > i) { final double v2 = values.applyAsDouble(i + 1); - q[k] = DoubleMath.interpolate(v1, v2, pos - i); + q[k] = Interpolation.interpolate(v1, v2, pos - i); } else { q[k] = v1; } @@ -427,18 +497,33 @@ public final class Quantile { } /** - * Truncate the array {@code a} to the given {@code size}. If the array length matches - * the {@code size} the original array is returned. + * Compute the indices required for quantile interpolation. + * + * <p>The zero-based interpolation index in {@code [0, n)} is + * saved into the working array {@code q} for each {@code p}. * - * @param a Array. - * @param size Size. - * @return the array of the specified size + * @param n Size of the data. + * @param p Probabilities for the quantiles to compute. + * @param q Working array for quantiles. + * @return the indices */ - private static int[] truncate(int[] a, int size) { - if (size < a.length) { - return Arrays.copyOf(a, size); + private int[] computeIndices(int n, double[] p, double[] q) { + final int[] indices = new int[p.length << 1]; + int count = 0; + for (int k = 0; k < p.length; k++) { + final double pos = estimationType.index(p[k], n); + q[k] = pos; + final int i = (int) pos; + indices[count++] = i; + if (pos > i) { + // Require the next index for interpolation + indices[count++] = i + 1; + } + } + if (count < indices.length) { + return Arrays.copyOf(indices, count); } - return a; + return indices; } /** diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/DoubleMathTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/InterpolationTest.java similarity index 74% rename from commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/DoubleMathTest.java rename to commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/InterpolationTest.java index d695bd4..8cf32d4 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/DoubleMathTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/InterpolationTest.java @@ -27,13 +27,13 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; /** - * Test for {@link DoubleMath}. + * Test for {@link Interpolation}. */ -class DoubleMathTest { +class InterpolationTest { @ParameterizedTest @MethodSource void testMean(double x, double y, double expected) { - Assertions.assertEquals(expected, DoubleMath.mean(x, y)); + Assertions.assertEquals(expected, Interpolation.mean(x, y)); } static Stream<Arguments> testMean() { @@ -41,6 +41,7 @@ class DoubleMathTest { builder.add(Arguments.of(2, 3, 2.5)); builder.add(Arguments.of(-4, 3, -0.5)); builder.add(Arguments.of(-4, 4, 0)); + builder.add(Arguments.of(-4, 5, 0.5)); builder.add(Arguments.of(-0.0, -0.0, -0.0)); builder.add(Arguments.of(-Double.MAX_VALUE, Double.MAX_VALUE, 0)); builder.add(Arguments.of(Double.MIN_VALUE, Double.MIN_VALUE, Double.MIN_VALUE)); @@ -53,10 +54,32 @@ class DoubleMathTest { return builder.build(); } + @ParameterizedTest + @MethodSource + void testIntMean(int x, int y, double expected) { + Assertions.assertEquals(expected, Interpolation.mean(x, y)); + } + + static Stream<Arguments> testIntMean() { + final Stream.Builder<Arguments> builder = Stream.builder(); + builder.add(Arguments.of(2, 3, 2.5)); + builder.add(Arguments.of(-4, 3, -0.5)); + builder.add(Arguments.of(-4, 4, 0)); + builder.add(Arguments.of(-4, 5, 0.5)); + builder.add(Arguments.of(0, 0, 0)); + builder.add(Arguments.of(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE)); + builder.add(Arguments.of(-Integer.MAX_VALUE, Integer.MAX_VALUE, 0)); + builder.add(Arguments.of(Integer.MAX_VALUE, -Integer.MAX_VALUE, 0)); + builder.add(Arguments.of(Integer.MIN_VALUE, Integer.MIN_VALUE, Integer.MIN_VALUE)); + builder.add(Arguments.of(Integer.MIN_VALUE, Integer.MAX_VALUE, -0.5)); + builder.add(Arguments.of(Integer.MAX_VALUE, Integer.MIN_VALUE, -0.5)); + return builder.build(); + } + @ParameterizedTest @MethodSource void testInterpolate(double a, double b, double t, double expected) { - Assertions.assertEquals(expected, DoubleMath.interpolate(a, b, t)); + Assertions.assertEquals(expected, Interpolation.interpolate(a, b, t)); } static Stream<Arguments> testInterpolate() { @@ -120,21 +143,21 @@ class DoubleMathTest { // Interpolation between a==-0.0 and b>=0.0 is 0.0. // This is known behaviour and ignored as usage has t in (0, 1). if (Double.compare(a, -0.0) == 0 && Double.compare(b, 0.0) >= 0) { - Assertions.assertEquals(a, DoubleMath.interpolate(a, b, 0), 0.0, "exactness"); + Assertions.assertEquals(a, Interpolation.interpolate(a, b, 0), 0.0, "exactness"); } else { // result is -0.0 if a=b=-0.0 - Assertions.assertEquals(a, DoubleMath.interpolate(a, b, 0), "exactness"); + Assertions.assertEquals(a, Interpolation.interpolate(a, b, 0), "exactness"); } // Not supported //Assertions.assertEquals(b, DoubleMath.interpolate(a, b, 1)); Assertions.assertTrue( - Double.compare(DoubleMath.interpolate(a, b, t2), DoubleMath.interpolate(a, b, t1)) * + Double.compare(Interpolation.interpolate(a, b, t2), Interpolation.interpolate(a, b, t1)) * Double.compare(t2, t1) * Double.compare(b, a) >= 0, "monotonicity"); final double m = Double.MAX_VALUE; - Assertions.assertTrue(Double.isFinite(DoubleMath.interpolate(a * m, b * m, t1)), "boundedness"); - Assertions.assertTrue(Double.isFinite(DoubleMath.interpolate(a * m, b * m, t2)), "boundedness"); - Assertions.assertEquals(a, DoubleMath.interpolate(a, a, t1), "consistency"); - Assertions.assertEquals(b, DoubleMath.interpolate(b, b, t1), "consistency"); + Assertions.assertTrue(Double.isFinite(Interpolation.interpolate(a * m, b * m, t1)), "boundedness"); + Assertions.assertTrue(Double.isFinite(Interpolation.interpolate(a * m, b * m, t2)), "boundedness"); + Assertions.assertEquals(a, Interpolation.interpolate(a, a, t1), "consistency"); + Assertions.assertEquals(b, Interpolation.interpolate(b, b, t1), "consistency"); } static Stream<Arguments> testInterpolateProperties() { @@ -154,26 +177,47 @@ class DoubleMathTest { } @ParameterizedTest - @MethodSource + @MethodSource(value = {"anyFiniteDouble"}) void testInterpolateEdge(double a, double b) { if (b < a) { testInterpolateEdge(b, a); return; } // Test the extremes of t in (0, 1) - Assertions.assertTrue(DoubleMath.interpolate(a, b, 0.0) >= a, () -> "t=0 a=" + a + " b=" + b); - Assertions.assertTrue(DoubleMath.interpolate(a, b, Double.MIN_VALUE) >= a, () -> "t=min a=" + a + " b=" + b); - Assertions.assertTrue(DoubleMath.interpolate(a, b, Math.nextDown(1.0)) <= b, () -> "t=0.999... a=" + a + " b=" + b); + Assertions.assertTrue(Interpolation.interpolate(a, b, 0.0) >= a, () -> "t=0 a=" + a + " b=" + b); + Assertions.assertTrue(Interpolation.interpolate(a, b, Double.MIN_VALUE) >= a, () -> "t=min a=" + a + " b=" + b); + Assertions.assertTrue(Interpolation.interpolate(a, b, Math.nextDown(1.0)) <= b, () -> "t=0.999... a=" + a + " b=" + b); // This fails with the current implementation as it assumes interpolation never uses t=1 //Assertions.assertTrue(DoubleMath.interpolate(a, b, 1.0) <= b, () -> "t=1 a=" + a + " b=" + b); } - static Stream<Arguments> testInterpolateEdge() { + @ParameterizedTest + @MethodSource(value = {"anyFiniteDouble"}) + void testMeanVsInterpolate(double a, double b) { + if (b < a) { + testMeanVsInterpolate(b, a); + return; + } + // The mean should be the exact double + // but the interpolation has more float additions. + // The multiplication by the interpolant 0.5 is exact for normal numbers. + double expected = a + b; + if (!Double.isFinite(expected) || Math.abs(expected) < Double.MIN_NORMAL) { + return; + } + expected *= 0.5; + Assertions.assertEquals(expected, Interpolation.mean(a, b)); + Assertions.assertEquals(expected, Interpolation.interpolate(a, b, 0.5), Math.ulp(expected), + () -> a + ", " + b); + } + + static Stream<Arguments> anyFiniteDouble() { final Stream.Builder<Arguments> builder = Stream.builder(); final UniformRandomProvider rng = RandomSource.XO_RO_SHI_RO_128_PP.create(); final long infBits = Double.doubleToRawLongBits(Double.POSITIVE_INFINITY); final long signBit = Long.MIN_VALUE; for (int i = 0; i < 50; i++) { + // doubles in [-1, 1) builder.add(Arguments.of(signedDouble(rng), signedDouble(rng))); // Any finite double final long m = rng.nextLong(infBits); diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java index f832f5b..c691be3 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java @@ -19,8 +19,6 @@ package org.apache.commons.statistics.descriptive; import java.util.Arrays; import java.util.stream.Stream; -import org.apache.commons.math3.stat.descriptive.rank.Percentile; -import org.apache.commons.math3.stat.ranking.NaNStrategy; import org.apache.commons.rng.UniformRandomProvider; import org.apache.commons.rng.simple.RandomSource; import org.junit.jupiter.api.Assertions; @@ -41,18 +39,24 @@ class MedianTest { } @ParameterizedTest - @MethodSource(value = {"testMedian"}) - void testMedian(double[] values, double expected) { + @MethodSource(value = {"testDoubleMedian"}) + void testDoubleMedian(double[] values, double expected) { final double[] copy = values.clone(); Assertions.assertEquals(expected, Median.withDefaults().evaluate(values)); - // Test the result and data (modified in-place) match the quantile implementation - Assertions.assertEquals(expected, Quantile.withDefaults().evaluate(copy, 0.5)); + // Test the result and data (modified in-place) match the quantile implementation. + // Interpolation may be different by a single ulp. + final double q = Quantile.withDefaults().evaluate(copy, 0.5); + if (Double.isNaN(expected)) { + Assertions.assertEquals(expected, q); + } else { + Assertions.assertEquals(expected, q, Math.ulp(expected)); + } Assertions.assertArrayEquals(values, copy); } @ParameterizedTest - @MethodSource(value = {"testMedian"}) - void testMedianExcludeNaN(double[] values, double expected) { + @MethodSource(value = {"testDoubleMedian"}) + void testDoubleMedianExcludeNaN(double[] values, double expected) { // If NaN is present then the result will change from expected so ignore this Assumptions.assumeTrue(Arrays.stream(values).filter(Double::isNaN).count() == 0); // Note: Use copy here. This checks that the copy of the data @@ -71,11 +75,8 @@ class MedianTest { } } - static Stream<Arguments> testMedian() { + static Stream<Arguments> testDoubleMedian() { final Stream.Builder<Arguments> builder = Stream.builder(); - final Percentile p = new Percentile(50).withNaNStrategy(NaNStrategy.FIXED); - // Note: Cannot use CM when NaN is adjacent to the middle of an odd length - // as it always interpolates pairs and uses: low + 0.0 * (NaN - low) for (final double[] x : new double[][] { {1}, {1, 2}, @@ -87,20 +88,25 @@ class MedianTest { {Double.NaN, Double.NaN, 1, 2, 3, 4}, {Double.MAX_VALUE, Double.MAX_VALUE}, {-Double.MAX_VALUE, -Double.MAX_VALUE / 2}, + // Cases where quantile interpolation using p=0.5 is different. + // Fail cases taken from adaption of InterpolationTest.testMeanVsInterpolate. + {6.125850131710258E31, 2.11712251424532992E17}, + {3.550291387137841E117, 7.941355536862782E127}, + {9.026950581570208E-93, 1.5840864549779843E-77}, }) { - builder.add(Arguments.of(x, p.evaluate(x))); + builder.add(Arguments.of(x, evaluate(x))); } - // Cases where CM Percentile returns NaN - builder.add(Arguments.of(new double[]{1, 2, Double.NaN}, 2)); - builder.add(Arguments.of(new double[]{Double.NaN, 1, 2, 3, Double.NaN}, 3)); + // Cases where Commons Math Percentile returns NaN (because it always interpolates + // x[i] + g * (x[i+1] - x[i]) even when g==0) + builder.add(Arguments.of(new double[] {1, 2, Double.NaN}, 2)); + builder.add(Arguments.of(new double[] {Double.NaN, 1, 2, 3, Double.NaN}, 3)); - // Test against the percentile can fail at 1 ULP so used a fixed seed - final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(26378461823L); + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); // Sizes above and below the threshold for partitioning double[] x; for (final int size : new int[] {5, 6, 50, 51}) { final double[] values = rng.doubles(size, -4.5, 1.5).toArray(); - final double expected = p.evaluate(values); + final double expected = evaluate(values); for (int i = 0; i < 20; i++) { x = TestHelper.shuffle(rng, values.clone()); builder.add(Arguments.of(x, expected)); @@ -126,13 +132,118 @@ class MedianTest { return builder.build(); } + /** + * Evaluate the median using a full sort on a copy of the data. + * + * @param values Value. + * @return the median + */ + private static double evaluate(double[] values) { + final double[] x = values.clone(); + Arrays.sort(x); + final int m = x.length >> 1; + if ((x.length & 0x1) == 1) { + // odd + return x[m]; + } + return Interpolation.mean(x[m - 1], x[m]); + } + @Test - void testMedianWithCopy() { - final double[] values = {3, 4, 2, 1, 0}; + void testDoubleMedianWithCopy() { + assertMedianWithCopy(new double[] {2, 1}, 1.5); + assertMedianWithCopy(new double[] {3, 2, 1}, 2); + assertMedianWithCopy(new double[] {4, 3, 2, 1}, 2.5); + assertMedianWithCopy(new double[] {5, 4, 3, 2, 1}, 3); + // Special case for 2 values with signed zeros (must be unordered) + assertMedianWithCopy(new double[] {0.0, -0.0}, 0.0); + } + + private static void assertMedianWithCopy(double[] values, double expected) { final double[] original = values.clone(); - Assertions.assertEquals(2, Median.withDefaults().withCopy(true).evaluate(values)); + Assertions.assertEquals(expected, Median.withDefaults().withCopy(true).evaluate(values)); + Assertions.assertArrayEquals(original, values); + Assertions.assertEquals(expected, Median.withDefaults().withCopy(false).evaluate(values)); + Assertions.assertFalse(Arrays.equals(original, values)); + } + + @ParameterizedTest + @MethodSource(value = {"testIntMedian"}) + void testIntMedian(int[] values, double expected) { + final int[] copy = values.clone(); + Assertions.assertEquals(expected, Median.withDefaults().evaluate(values)); + // Test the result and data (modified in-place) match the quantile implementation. + // Results are either integer or half-integer so this should be exact. + Assertions.assertEquals(expected, Quantile.withDefaults().evaluate(copy, 0.5)); + Assertions.assertArrayEquals(values, copy); + } + + static Stream<Arguments> testIntMedian() { + final Stream.Builder<Arguments> builder = Stream.builder(); + for (final int[] x : new int[][] { + {1}, + {1, 2}, + {2, 1}, + {1, 2, 3, 4}, + {Integer.MAX_VALUE, Integer.MAX_VALUE / 2}, + {Integer.MIN_VALUE, Integer.MIN_VALUE / 2}, + }) { + builder.add(Arguments.of(x, evaluate(x))); + } + + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); + // Sizes above and below the threshold for partitioning + int[] x; + for (final int size : new int[] {5, 6, 50, 51}) { + final int[] values = rng.ints(size, -4500, 1500).toArray(); + final double expected = evaluate(values); + for (int i = 0; i < 20; i++) { + x = TestHelper.shuffle(rng, values.clone()); + builder.add(Arguments.of(x, expected)); + } + // Special values + for (final int y : new int[] {0, 1, Integer.MAX_VALUE, Integer.MIN_VALUE}) { + x = new int[size]; + Arrays.fill(x, y); + builder.add(Arguments.of(x, y)); + } + } + // Special cases + builder.add(Arguments.of(new int[] {}, Double.NaN)); + builder.add(Arguments.of(new int[] {-Integer.MAX_VALUE, Integer.MAX_VALUE}, 0)); + return builder.build(); + } + + /** + * Evaluate the median using a full sort on a copy of the data. + * + * @param values Value. + * @return the median + */ + private static double evaluate(int[] values) { + final int[] x = values.clone(); + Arrays.sort(x); + final int m = x.length >> 1; + if ((x.length & 0x1) == 1) { + // odd + return x[m]; + } + return Interpolation.mean(x[m - 1], x[m]); + } + + @Test + void testIntMedianWithCopy() { + assertMedianWithCopy(new int[] {2, 1}, 1.5); + assertMedianWithCopy(new int[] {3, 2, 1}, 2); + assertMedianWithCopy(new int[] {4, 3, 2, 1}, 2.5); + assertMedianWithCopy(new int[] {5, 4, 3, 2, 1}, 3); + } + + private static void assertMedianWithCopy(int[] values, double expected) { + final int[] original = values.clone(); + Assertions.assertEquals(expected, Median.withDefaults().withCopy(true).evaluate(values)); Assertions.assertArrayEquals(original, values); - Assertions.assertEquals(2, Median.withDefaults().withCopy(false).evaluate(values)); + Assertions.assertEquals(expected, Median.withDefaults().withCopy(false).evaluate(values)); Assertions.assertFalse(Arrays.equals(original, values)); } } diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java index 417dc08..cf4fae7 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java @@ -35,20 +35,6 @@ class QuantileTest { /** Estimation types to test. */ private static final EstimationMethod[] TYPES = EstimationMethod.values(); - /** - * Interface to test the quantile for a single probability. - */ - interface QuantileFunction { - double evaluate(Quantile m, double[] values, double p); - } - - /** - * Interface to test the quantiles for a multiple probabilities. - */ - interface QuantileFunction2 { - double[] evaluate(Quantile m, double[] values, double[] p); - } - @Test void testNullPropertyThrows() { final Quantile m = Quantile.withDefaults(); @@ -92,11 +78,14 @@ class QuantileTest { @Test void testBadQuantileThrows() { - final double[] values = {3, 4, 2, 1, 0}; + final double[] values1 = {3, 4, 2, 1, 0}; + final int[] values2 = {3, 4, 2, 1, 0}; final Quantile m = Quantile.withDefaults(); for (final double p : new double[] {-0.5, 1.2, Double.NaN}) { - Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values, p)); - Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values, new double[] {p})); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1, p)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1, new double[] {p})); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, p)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, new double[] {p})); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, p)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, new double[] {p})); } @@ -104,10 +93,14 @@ class QuantileTest { @Test void testNoQuantilesThrows() { - final double[] values = {3, 4, 2, 1, 0}; + final double[] values1 = {3, 4, 2, 1, 0}; + final int[] values2 = {3, 4, 2, 1, 0}; final Quantile m = Quantile.withDefaults(); - Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values)); - Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values, new double[0])); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1, new double[0])); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, new double[0])); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, new double[0])); } @@ -120,16 +113,32 @@ class QuantileTest { } } + // double[] + + /** + * Interface to test the quantile for a single probability. + */ + interface DoubleQuantileFunction { + double evaluate(Quantile m, double[] values, double p); + } + + /** + * Interface to test the quantiles for a multiple probabilities. + */ + interface DoubleQuantileFunctionN { + double[] evaluate(Quantile m, double[] values, double[] p); + } + @ParameterizedTest - @MethodSource(value = {"testQuantile"}) - void testQuantile(double[] values, double[] p, double[][] expected, double delta) { + @MethodSource(value = {"testDoubleQuantile"}) + void testDoubleQuantile(double[] values, double[] p, double[][] expected, double delta) { assertQuantile(Quantile.withDefaults(), values, p, expected, delta, Quantile::evaluate, Quantile::evaluate); } @ParameterizedTest - @MethodSource(value = {"testQuantile"}) - void testQuantileExcludeNaN(double[] values, double[] p, double[][] expected, double delta) { + @MethodSource(value = {"testDoubleQuantile"}) + void testDoubleQuantileExcludeNaN(double[] values, double[] p, double[][] expected, double delta) { // If NaN is present then the result will change from expected so ignore this Assumptions.assumeTrue(Arrays.stream(values).filter(Double::isNaN).count() == 0); // Note: Use copy here. This checks that the copy of the data @@ -150,7 +159,7 @@ class QuantileTest { } @ParameterizedTest - @MethodSource(value = {"testQuantile"}) + @MethodSource(value = {"testDoubleQuantile"}) void testQuantileSorted(double[] values, double[] p, double[][] expected, double delta) { assertQuantile(Quantile.withDefaults(), values, p, expected, delta, (m, x, q) -> { @@ -167,7 +176,7 @@ class QuantileTest { private static void assertQuantile(Quantile m, double[] values, double[] p, double[][] expected, double delta, - QuantileFunction f1, QuantileFunction2 f2) { + DoubleQuantileFunction f1, DoubleQuantileFunctionN fn) { Assertions.assertEquals(expected.length, TYPES.length); for (int i = 0; i < TYPES.length; i++) { final EstimationMethod type = TYPES[i]; @@ -178,22 +187,22 @@ class QuantileTest { assertEqualsOrExactlyEqual(expected[i][j], f1.evaluate(m, values.clone(), p[j]), delta, () -> type.toString()); } - assertEqualsOrExactlyEqual(expected[i][j], f2.evaluate(m, values.clone(), new double[] {p[j]})[0], delta, + assertEqualsOrExactlyEqual(expected[i][j], fn.evaluate(m, values.clone(), new double[] {p[j]})[0], delta, () -> type.toString()); } // Bulk quantiles if (delta < 0) { - Assertions.assertArrayEquals(expected[i], f2.evaluate(m, values.clone(), p), + Assertions.assertArrayEquals(expected[i], fn.evaluate(m, values.clone(), p), () -> type.toString()); } else { - Assertions.assertArrayEquals(expected[i], f2.evaluate(m, values.clone(), p), delta, + Assertions.assertArrayEquals(expected[i], fn.evaluate(m, values.clone(), p), delta, () -> type.toString()); } } } /** - * Assert that {@code expected} and {@code actual} are equal within the given{@code delta}. + * Assert that {@code expected} and {@code actual} are equal within the given {@code delta}. * If the {@code delta} is negative it is ignored and values must be exactly equal. * * @param expected Expected @@ -210,79 +219,79 @@ class QuantileTest { } } - static Stream<Arguments> testQuantile() { + static Stream<Arguments> testDoubleQuantile() { final Stream.Builder<Arguments> builder = Stream.builder(); // Special cases final double nan = Double.NaN; - addQuantiles(builder, new double[] {}, new double[] {0.75}, 1e-5, + addDoubleQuantiles(builder, new double[] {}, new double[] {0.75}, 1e-5, new double[] {nan, nan, nan, nan, nan, nan, nan, nan, nan}); - addQuantiles(builder, new double[] {42}, new double[] {0.75}, 1e-5, + addDoubleQuantiles(builder, new double[] {42}, new double[] {0.75}, 1e-5, new double[] {42, 42, 42, 42, 42, 42, 42, 42, 42}); // Cases from Commons Math PercentileTest - addQuantiles(builder, new double[] {1, 2, 3}, new double[] {0.75}, 1e-5, + addDoubleQuantiles(builder, new double[] {1, 2, 3}, new double[] {0.75}, 1e-5, new double[] {3, 3, 2, 2.25, 2.75, 3, 2.5, 2.83333, 2.81250}); - addQuantiles(builder, new double[] {0, 1}, new double[] {0.25}, 1e-5, + addDoubleQuantiles(builder, new double[] {0, 1}, new double[] {0.25}, 1e-5, new double[] {0, 0, 0, 0, 0, 0, 0.25, 0, 0}); final double[] d = new double[] {1, 3, 2, 4}; - addQuantiles(builder, d, new double[] {0.3, 0.25, 0.75, 0.5}, 1e-5, + addDoubleQuantiles(builder, d, new double[] {0.3, 0.25, 0.75, 0.5}, 1e-5, new double[] {2, 2, 1, 1.2, 1.7, 1.5, 1.9, 1.63333, 1.65}, new double[] {1, 1.5, 1, 1, 1.5, 1.25, 1.75, 1.41667, 1.43750}, new double[] {3, 3.5, 3, 3, 3.5, 3.75, 3.25, 3.58333, 3.56250}, new double[] {2, 2.5, 2, 2, 2.5, 2.5, 2.5, 2.5, 2.5}); // NIST example - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {95.1772, 95.1567, 95.1937, 95.1959, 95.1442, 95.0610, 95.1591, 95.1195, 95.1772, 95.0925, 95.1990, 95.1682}, - new double[] {0.9}, 1e-4, + new double[] {0.9}, 1e-5, new double[] {95.19590, 95.19590, 95.19590, 95.19546, 95.19683, 95.19807, 95.19568, 95.19724, 95.19714}); - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {12.5, 12.0, 11.8, 14.2, 14.9, 14.5, 21.0, 8.2, 10.3, 11.3, 14.1, 9.9, 12.2, 12.0, 12.1, 11.0, 19.8, 11.0, 10.0, 8.8, 9.0, 12.3}, new double[] {0.05}, 1e-4, new double[] {8.8000, 8.8000, 8.2000, 8.2600, 8.5600, 8.2900, 8.8100, 8.4700, 8.4925}); // Special values tests - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {nan}, new double[] {0.5}, 1e-4, new double[] {nan, nan, nan, nan, nan, nan, nan, nan, nan}); - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {nan, nan}, new double[] {0.5}, 1e-4, new double[] {nan, nan, nan, nan, nan, nan, nan, nan, nan}); - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {1, nan}, new double[] {0.5}, 1e-4, new double[] {1, nan, 1, 1, nan, nan, nan, nan, nan}); - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {1, 2, nan}, new double[] {0.5}, 1e-4, new double[] {2, 2, 2, 1.5, 2, 2, 2, 2, 2}); - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {1, 2, nan, nan}, new double[] {0.5}, 1e-4, new double[] {2, nan, 2, 2, nan, nan, nan, nan, nan}); // min/max test. This hits edge cases for bounds clipping when computing // the index in [0, n). - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {5, 4, 3, 2, 1}, - new double[] {0.0, 1.0}, 0.0, + new double[] {0.0, 1.0}, -1, new double[] {1, 1, 1, 1, 1, 1, 1, 1, 1}, new double[] {5, 5, 5, 5, 5, 5, 5, 5, 5}); // Note: This tests interpolation between -0.0 and -0.0, and -0.0 and 0.0. // When the quantile requires interpolation, the sign should be maintained // if the upper bound is -0.0. // No interpolation - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {-0.0, 0.0, 0.0}, new double[] {0.0}, -1, new double[] {-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0}); // Interpolation between negative zeros - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {-0.0, -0.0, -0.0, -0.0, -0.0}, new double[] {0.45}, -1, new double[] {-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0}); // Interpolation between -0.0 and 0.0 - addQuantiles(builder, + addDoubleQuantiles(builder, new double[] {-0.0, -0.0, 0.0, 0.0, 0.0}, new double[] {0.45}, -1, // Here only HF3 rounds to k=2; other discrete methods to k=3; @@ -354,7 +363,7 @@ class QuantileTest { * @param p Quantiles to compute. * @param expected Expected result for each p for every estimation type. */ - private static void addQuantiles(Stream.Builder<Arguments> builder, + private static void addDoubleQuantiles(Stream.Builder<Arguments> builder, double[] x, double[] p, double delta, double[]... expected) { Assertions.assertEquals(p.length, expected.length); for (final double[] e : expected) { @@ -371,7 +380,7 @@ class QuantileTest { } @Test - void testQuantileWithCopy() { + void testDoubleQuantileWithCopy() { final double[] values = {3, 4, 2, 1, 0}; final double[] original = values.clone(); Assertions.assertEquals(2, Quantile.withDefaults().withCopy(true).evaluate(values, 0.5)); @@ -381,7 +390,7 @@ class QuantileTest { } @Test - void testQuantileWithCopy2() { + void testDoubleQuantileWithCopy2() { final double[] values = {3, 4, 2, 1, 0}; final double[] original = values.clone(); Assertions.assertEquals(2, Quantile.withDefaults().withCopy(true).evaluate(values, new double[] {0.5})[0]); @@ -389,4 +398,211 @@ class QuantileTest { Assertions.assertEquals(2, Quantile.withDefaults().withCopy(false).evaluate(values, new double[] {0.5})[0]); Assertions.assertFalse(Arrays.equals(original, values)); } + + // int[] + + /** + * Interface to test the quantile for a single probability. + */ + interface IntQuantileFunction { + double evaluate(Quantile m, int[] values, double p); + } + + /** + * Interface to test the quantiles for a multiple probabilities. + */ + interface IntQuantileFunctionN { + double[] evaluate(Quantile m, int[] values, double[] p); + } + + @ParameterizedTest + @MethodSource(value = {"testIntQuantile"}) + void testIntQuantile(int[] values, double[] p, double[][] expected, double delta) { + assertQuantile(Quantile.withDefaults(), values, p, expected, delta, + Quantile::evaluate, Quantile::evaluate); + } + + @ParameterizedTest + @MethodSource(value = {"testIntQuantile"}) + void testQuantileSorted(int[] values, double[] p, double[][] expected, double delta) { + assertQuantile(Quantile.withDefaults(), values, p, expected, delta, + (m, x, q) -> { + // No clone here as later calls with the same array will also sort it + Arrays.sort(x); + return m.evaluate(x.length, i -> x[i], q); + }, + (m, x, q) -> { + // No clone here as later calls with the same array will also sort it + Arrays.sort(x); + return m.evaluate(x.length, i -> x[i], q); + }); + } + + private static void assertQuantile(Quantile m, int[] values, double[] p, + double[][] expected, double delta, + IntQuantileFunction f1, IntQuantileFunctionN fn) { + Assertions.assertEquals(expected.length, TYPES.length); + for (int i = 0; i < TYPES.length; i++) { + final EstimationMethod type = TYPES[i]; + m = m.with(type); + // Single quantiles + for (int j = 0; j < p.length; j++) { + if (f1 != null) { + assertEqualsOrExactlyEqual(expected[i][j], f1.evaluate(m, values.clone(), p[j]), delta, + () -> type.toString()); + } + assertEqualsOrExactlyEqual(expected[i][j], fn.evaluate(m, values.clone(), new double[] {p[j]})[0], delta, + () -> type.toString()); + } + // Bulk quantiles + if (delta < 0) { + Assertions.assertArrayEquals(expected[i], fn.evaluate(m, values.clone(), p), + () -> type.toString()); + } else { + Assertions.assertArrayEquals(expected[i], fn.evaluate(m, values.clone(), p), delta, + () -> type.toString()); + } + } + } + + static Stream<Arguments> testIntQuantile() { + final Stream.Builder<Arguments> builder = Stream.builder(); + // Special cases + final double nan = Double.NaN; + addIntQuantiles(builder, new int[] {}, new double[] {0.75}, 1e-5, + new double[] {nan, nan, nan, nan, nan, nan, nan, nan, nan}); + addIntQuantiles(builder, new int[] {42}, new double[] {0.75}, 1e-5, + new double[] {42, 42, 42, 42, 42, 42, 42, 42, 42}); + // Cases from Commons Math PercentileTest + addIntQuantiles(builder, new int[] {1, 2, 3}, new double[] {0.75}, 1e-5, + new double[] {3, 3, 2, 2.25, 2.75, 3, 2.5, 2.83333, 2.81250}); + addIntQuantiles(builder, new int[] {0, 1}, new double[] {0.25}, 1e-5, + new double[] {0, 0, 0, 0, 0, 0, 0.25, 0, 0}); + final int[] d = new int[] {1, 3, 2, 4}; + addIntQuantiles(builder, d, new double[] {0.3, 0.25, 0.75, 0.5}, 1e-5, + new double[] {2, 2, 1, 1.2, 1.7, 1.5, 1.9, 1.63333, 1.65}, + new double[] {1, 1.5, 1, 1, 1.5, 1.25, 1.75, 1.41667, 1.43750}, + new double[] {3, 3.5, 3, 3, 3.5, 3.75, 3.25, 3.58333, 3.56250}, + new double[] {2, 2.5, 2, 2, 2.5, 2.5, 2.5, 2.5, 2.5}); + // NIST example + // Scale example 1 by 1e4 + addIntQuantiles(builder, + new int[] {951772, 951567, 951937, 951959, 951442, 950610, 951591, 951195, 951772, 950925, + 951990, 951682}, + new double[] {0.9}, 1e-1, + new double[] {951959.0, 951959.0, 951959.0, 951954.6, 951968.3, 951980.7, 951956.8, 951972.4, 951971.4}); + // Scale example 2 by 10 + addIntQuantiles(builder, + new int[] {125, 120, 118, 142, 149, 145, 210, 82, 103, 113, 141, 99, 122, 120, 121, 110, + 198, 110, 100, 88, 90, 123}, + new double[] {0.05}, 1e-3, + new double[] {88.000, 88.000, 82.000, 82.600, 85.600, 82.900, 88.100, 84.700, 84.925}); + // min/max test. This hits edge cases for bounds clipping when computing + // the index in [0, n). + addIntQuantiles(builder, + new int[] {5, 4, 3, 2, 1}, + new double[] {0.0, 1.0}, -1, + new double[] {1, 1, 1, 1, 1, 1, 1, 1, 1}, + new double[] {5, 5, 5, 5, 5, 5, 5, 5, 5}); + // Interpolation between zeros + addIntQuantiles(builder, + new int[] {0, 0, 0, 0, 0}, + new double[] {0.45}, -1, + new double[] {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); + + // Test data samples using R version 4.3.3 + // require(stats) + // x = as.integer(c(1, 2, 3)) %% data + // p = c(0.1, 0.25, 0.5, 0.975) %% probabilities + // for (t in c(1:9)) { cat('{', paste(quantile(x, p, type=t), collapse=', '), "}, \n", sep="") } + + final double[] p = {0.1, 0.25, 0.5, 0.975}; + + /** Poisson samples: scipy.stats.poisson.rvs(45, size=100). */ + final int[] v4 = {42, 51, 38, 38, 49, 48, 42, 47, 51, 46, 45, 35, 39, 42, 49, 55, 53, 46, 49, 56, + 42, 46, 42, 53, 43, 55, 49, 52, 51, 45, 40, 49, 39, 40, 46, 43, 46, 48, 36, 44, 40, 49, 49, 43, 45, 44, 41, 55, + 52, 45, 57, 41, 43, 44, 38, 52, 44, 45, 43, 42, 38, 37, 47, 42, 47, 45, 70, 45, 50, 47, 46, 50, 47, 35, 43, 52, + 51, 41, 45, 42, 45, 53, 46, 48, 51, 43, 63, 48, 49, 41, 58, 51, 59, 43, 39, 32, 35, 46, 50, 50}; + + builder.add(Arguments.of(v4, p, new double[][] { + {38, 42, 46, 59}, + {38.5, 42, 46, 59}, + {38, 42, 46, 59}, + {38, 42, 46, 58.5}, + {38.5, 42, 46, 59}, + {38.1, 42, 46, 60.9}, + {38.9, 42, 46, 58.525}, + {38.3666666666667, 42, 46, 59.6333333333333}, + {38.4, 42, 46, 59.475}, + }, 1e-12)); + + // Discrete 'normal' samples: scipy.stats.norm.rvs(loc=3.4, scale=2250000, size=100).astype(int) + final int[] v = {659592, -849723, -1765944, 2610353, 2192883, -265784, 1126824, 1412069, 918175, -1066899, + -1289922, -1359925, 549577, 2891935, 5498383, 649055, -3774137, 3026349, 4317084, 16068, 1747179, -94833, + -891275, 146951, 659679, -1298483, -9717, -2372749, -213892, -956213, -2380241, -3265588, 515620, 334156, + 2595489, -463102, -1490342, -700231, -245959, -1596233, 1702451, 1265231, 2338985, -1298796, -2493882, + -1679698, 251933, -3446511, 3437103, -2940127, -1996991, -695605, -3127437, 2895523, 1659299, 935033, + 609115, -1245544, -1131839, 1645603, -1673754, -4318740, -163129, -1733950, 2609546, -536282, -2873472, + -204545, 872775, 448272, 1048969, -1781997, -3602571, -2346653, 2084923, 1289364, 2450980, -1809052, + -422631, 1895287, 72169, -4933595, 2602830, -1106753, 1295126, 1671634, 929809, -3094175, -787509, -769431, + -209387, 1517866, -1861080, 1863380, -699593, -174200, 2132930, 1957489, -340803, -2263330}; + + builder.add(Arguments.of(v, p, new double[][] { + {-2873472, -1490342, -204545, 3437103}, + {-2683677, -1425133.5, -189372.5, 3437103}, + {-2873472, -1490342, -204545, 3437103}, + {-2873472, -1490342, -204545, 3231726}, + {-2683677, -1425133.5, -189372.5, 3437103}, + {-2835513, -1457737.75, -189372.5, 3855093.975}, + {-2531841, -1392529.25, -189372.5, 3241994.85}, + {-2734289, -1436001.58333333, -189372.5, 3576433.325}, + {-2721636, -1433284.5625, -189372.5, 3541600.74374999}, + }, 1e-8)); + + return builder.build(); + } + + /** + * Adds the quantiles. + * + * @param builder Builder. + * @param x Data. + * @param p Quantiles to compute. + * @param expected Expected result for each p for every estimation type. + */ + private static void addIntQuantiles(Stream.Builder<Arguments> builder, + int[] x, double[] p, double delta, double[]... expected) { + Assertions.assertEquals(p.length, expected.length); + for (final double[] e : expected) { + Assertions.assertEquals(e.length, TYPES.length); + } + // Transpose + final double[][] t = new double[TYPES.length][p.length]; + for (int i = 0; i < t.length; i++) { + for (int j = 0; j < p.length; j++) { + t[i][j] = expected[j][i]; + } + } + builder.add(Arguments.of(x, p, t, delta)); + } + + @Test + void testIntQuantileWithCopy() { + final int[] values = {3, 4, 2, 1, 0}; + final int[] original = values.clone(); + Assertions.assertEquals(2, Quantile.withDefaults().withCopy(true).evaluate(values, 0.5)); + Assertions.assertArrayEquals(original, values); + Assertions.assertEquals(2, Quantile.withDefaults().withCopy(false).evaluate(values, 0.5)); + Assertions.assertFalse(Arrays.equals(original, values)); + } + + @Test + void testIntQuantileWithCopy2() { + final int[] values = {3, 4, 2, 1, 0}; + final int[] original = values.clone(); + Assertions.assertEquals(2, Quantile.withDefaults().withCopy(true).evaluate(values, new double[] {0.5})[0]); + Assertions.assertArrayEquals(original, values); + Assertions.assertEquals(2, Quantile.withDefaults().withCopy(false).evaluate(values, new double[] {0.5})[0]); + Assertions.assertFalse(Arrays.equals(original, values)); + } } diff --git a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/MedianPerformance.java b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/MedianPerformance.java index 760b4ec..4622c80 100644 --- a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/MedianPerformance.java +++ b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/MedianPerformance.java @@ -103,7 +103,7 @@ public class MedianPerformance { // we set it anyway for completeness. Objects.requireNonNull(name); if (JDK.equals(name)) { - function = MedianPerformance::sortMedian; + function = DoubleFunctionSource::sortMedian; } else if (CM3.equals(name)) { final org.apache.commons.math3.stat.descriptive.rank.Median m = new org.apache.commons.math3.stat.descriptive.rank.Median(); @@ -118,37 +118,106 @@ public class MedianPerformance { throw new IllegalStateException("Unknown double[] function: " + name); } } + + /** + * Sort the values and compute the median. + * + * @param values Values. + * @return the median + */ + private static double sortMedian(double[] values) { + // Implicit NPE + final int n = values.length; + // Special cases + if (n <= 2) { + switch (n) { + case 2: + return (values[0] + values[1]) * 0.5; + case 1: + return values[0]; + default: + return Double.NaN; + } + } + // A sort is required + Arrays.sort(values); + final int k = n >>> 1; + // Odd + if ((n & 0x1) == 0x1) { + return values[k]; + } + // Even + return (values[k - 1] + values[k]) * 0.5; + } } /** - * Sort the values and compute the median. - * - * @param values Values. - * @return the median + * Source of a {@link ToDoubleFunction} for a {@code int[]}. */ - static double sortMedian(double[] values) { - // Implicit NPE - final int n = values.length; - // Special cases - if (n <= 2) { - switch (n) { - case 2: - return (values[0] + values[1]) * 0.5; - case 1: - return values[0]; - default: - return Double.NaN; + @State(Scope.Benchmark) + public static class IntFunctionSource { + /** Name of the source. */ + @Param({JDK, STATISTICS}) + private String name; + + /** The action. */ + private ToDoubleFunction<int[]> function; + + /** + * @return the function + */ + public ToDoubleFunction<int[]> getFunction() { + return function; + } + + /** + * Create the function. + */ + @Setup + public void setup() { + // Note: Functions defensively copy the data by default + // Note: KeyStratgey does not matter for single / paired keys but + // we set it anyway for completeness. + Objects.requireNonNull(name); + if (JDK.equals(name)) { + function = IntFunctionSource::sortMedian; + } else if (STATISTICS.equals(name)) { + function = Median.withDefaults()::evaluate; + } else { + throw new IllegalStateException("Unknown int[] function: " + name); } } - // A sort is required - Arrays.sort(values); - final int k = n >>> 1; - // Odd - if ((n & 0x1) == 0x1) { - return values[k]; + + /** + * Sort the values and compute the median. + * + * @param values Values. + * @return the median + */ + private static double sortMedian(int[] values) { + // Implicit NPE + final int n = values.length; + // Special cases + if (n <= 2) { + switch (n) { + case 2: + return (values[0] + values[1]) * 0.5; + case 1: + return values[0]; + default: + return Double.NaN; + } + } + // A sort is required + Arrays.sort(values); + final int k = n >>> 1; + // Odd + if ((n & 0x1) == 0x1) { + return values[k]; + } + // Even + return (values[k - 1] + values[k]) * 0.5; } - // Even - return (values[k - 1] + values[k]) * 0.5; } /** @@ -159,11 +228,27 @@ public class MedianPerformance { * @param bh Data sink. */ @Benchmark - public void median(DoubleFunctionSource function, DataSource source, Blackhole bh) { + public void doubleMedian(DoubleFunctionSource function, DataSource source, Blackhole bh) { final int size = source.size(); final ToDoubleFunction<double[]> fun = function.getFunction(); for (int j = -1; ++j < size;) { bh.consume(fun.applyAsDouble(source.getData(j))); } } + + /** + * Create the statistic using an array. + * + * @param function Source of the function. + * @param source Source of the data. + * @param bh Data sink. + */ + @Benchmark + public void intMedian(IntFunctionSource function, DataSource source, Blackhole bh) { + final int size = source.size(); + final ToDoubleFunction<int[]> fun = function.getFunction(); + for (int j = -1; ++j < size;) { + bh.consume(fun.applyAsDouble(source.getIntData(j))); + } + } } diff --git a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/QuantilePerformance.java b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/QuantilePerformance.java index a5a16e1..ede1c07 100644 --- a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/QuantilePerformance.java +++ b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/descriptive/QuantilePerformance.java @@ -25,6 +25,7 @@ import java.util.Locale; import java.util.Objects; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiFunction; import java.util.function.BinaryOperator; import java.util.logging.Logger; import org.apache.commons.rng.UniformRandomProvider; @@ -245,6 +246,18 @@ public class QuantilePerformance { return getDataSample(order[index]); } + /** + * Gets the sample for the given {@code index}. + * + * <p>This is returned in a randomized order per iteration. + * + * @param index Index. + * @return the data sample + */ + public int[] getIntData(int index) { + return getIntDataSample(order[index]); + } + /** * Gets the sample for the given {@code index}. * @@ -260,6 +273,22 @@ public class QuantilePerformance { return x; } + /** + * Gets the sample for the given {@code index}. + * + * @param index Index. + * @return the data sample + */ + protected int[] getIntDataSample(int index) { + // For parity with other methods do not use data.clone() + final int[] a = data[index]; + final int[] x = new int[a.length]; + for (int i = -1; ++i < a.length;) { + x[i] = a[i]; + } + return x; + } + /** * Gets the sample size for the given {@code index}. * @@ -908,7 +937,7 @@ public class QuantilePerformance { * Source of a {@link BinaryOperator} for a {@code double[]} and quantiles. */ @State(Scope.Benchmark) - public static class QuantileFunctionSource { + public static class DoubleQuantileFunctionSource { /** Name of the source. */ @Param({JDK, CM3, CM4, STATISTICS}) private String name; @@ -931,7 +960,7 @@ public class QuantilePerformance { // Note: Functions should not defensively copy the data // as a clone is passed in from the data source. if (JDK.equals(name)) { - function = QuantilePerformance::sortQuantile; + function = DoubleQuantileFunctionSource::sortQuantile; } else if (CM3.equals(name)) { // No way to avoid a data copy here. CM does // defensive copying for most array input. @@ -968,48 +997,127 @@ public class QuantilePerformance { throw new IllegalStateException("Unknown double[] function: " + name); } } + + /** + * Sort the values and compute the median. + * + * @param values Values. + * @param p p-th quantiles to compute. + * @return the quantiles + */ + private static double[] sortQuantile(double[] values, double[] p) { + // Implicit NPE + final int n = values.length; + if (p.length == 0) { + throw new IllegalArgumentException("No quantiles specified for double[] data"); + } + for (final double pp : p) { + checkQuantile(pp); + } + // Special cases + final double[] q = new double[p.length]; + if (n <= 1) { + Arrays.fill(q, n == 0 ? Double.NaN : values[0]); + return q; + } + // A sort is required + Arrays.sort(values); + for (int i = 0; i < p.length; i++) { + // EstimationMethod.HF6 (as per the Apache Commons Math Percentile + // legacy implementation) + final double pos = p[i] * (n + 1); + final double fpos = Math.floor(pos); + final int j = (int) fpos; + final double g = pos - fpos; + if (j < 1) { + q[i] = values[0]; + } else if (j >= n) { + q[i] = values[n - 1]; + } else { + q[i] = (1 - g) * values[j - 1] + g * values[j]; + } + } + return q; + } } + /** - * Sort the values and compute the median. - * - * @param values Values. - * @param p p-th quantiles to compute. - * @return the quantiles + * Source of a {@link BiFunction} for an {@code int[]} and quantiles. */ - static double[] sortQuantile(double[] values, double[] p) { - // Implicit NPE - final int n = values.length; - if (p.length == 0) { - throw new IllegalArgumentException("No quantiles specified"); - } - for (final double pp : p) { - checkQuantile(pp); - } - // Special cases - final double[] q = new double[p.length]; - if (n <= 1) { - Arrays.fill(q, n == 0 ? Double.NaN : values[0]); - return q; + @State(Scope.Benchmark) + public static class IntQuantileFunctionSource { + /** Name of the source. */ + @Param({JDK, CM3, CM4, STATISTICS}) + private String name; + + /** The action. */ + private BiFunction<int[], double[], double[]> function; + + /** + * @return the function + */ + public BiFunction<int[], double[], double[]> getFunction() { + return function; } - // A sort is required - Arrays.sort(values); - for (int i = 0; i < p.length; i++) { - // EstimationMethod.HF6 (as per the Apache Commons Math Percentile - // legacy implementation) - final double pos = p[i] * (n + 1); - final double fpos = Math.floor(pos); - final int j = (int) fpos; - final double g = pos - fpos; - if (j < 1) { - q[i] = values[0]; - } else if (j >= n) { - q[i] = values[n - 1]; + + /** + * Create the function. + */ + @Setup + public void setup() { + // Note: Functions should not defensively copy the data + // as a clone is passed in from the data source. + if (JDK.equals(name)) { + function = IntQuantileFunctionSource::sortQuantile; + } else if (STATISTICS.equals(name)) { + function = Quantile.withDefaults()::evaluate; } else { - q[i] = (1 - g) * values[j - 1] + g * values[j]; + throw new IllegalStateException("Unknown int[] function: " + name); } } - return q; + + /** + * Sort the values and compute the median. + * + * @param values Values. + * @param p p-th quantiles to compute. + * @return the quantiles + */ + private static double[] sortQuantile(int[] values, double[] p) { + // Implicit NPE + final int n = values.length; + if (p.length == 0) { + throw new IllegalArgumentException("No quantiles specified for int[] data"); + } + for (final double pp : p) { + checkQuantile(pp); + } + // Special cases + final double[] q = new double[p.length]; + if (n <= 1) { + Arrays.fill(q, n == 0 ? Double.NaN : values[0]); + return q; + } + // A sort is required + Arrays.sort(values); + for (int i = 0; i < p.length; i++) { + // EstimationMethod.HF6 (as per the Apache Commons Math Percentile + // legacy implementation) + final double pos = p[i] * (n + 1); + final double fpos = Math.floor(pos); + final int j = (int) fpos; + final double g = pos - fpos; + if (j < 1) { + q[i] = values[0]; + } else if (j >= n) { + q[i] = values[n - 1]; + } else { + q[i] = (1 - g) * values[j - 1] + g * values[j]; + } + } + return q; + } } /** @@ -1033,7 +1141,7 @@ public class QuantilePerformance { * @param bh Data sink. */ @Benchmark - public void quantiles(QuantileFunctionSource function, DataSource source, + public void doubleQuantiles(DoubleQuantileFunctionSource function, DataSource source, QuantileSource quantiles, Blackhole bh) { final int size = source.size(); final double[] p = quantiles.getData(); @@ -1052,7 +1160,7 @@ public class QuantilePerformance { * @param bh Data sink. */ @Benchmark - public void quantileRange(QuantileFunctionSource function, DataSource source, + public void doubleQuantileRange(DoubleQuantileFunctionSource function, DataSource source, QuantileRangeSource quantiles, Blackhole bh) { final int size = source.size(); final double[] p = quantiles.getData(); @@ -1061,4 +1169,42 @@ public class QuantilePerformance { bh.consume(fun.apply(source.getData(j), p)); } } + + /** + * Create the statistic using an array and given quantiles. + * + * @param function Source of the function. + * @param source Source of the data. + * @param quantiles Source of the quantiles. + * @param bh Data sink. + */ + @Benchmark + public void intQuantiles(IntQuantileFunctionSource function, DataSource source, + QuantileSource quantiles, Blackhole bh) { + final int size = source.size(); + final double[] p = quantiles.getData(); + final BiFunction<int[], double[], double[]> fun = function.getFunction(); + for (int j = -1; ++j < size;) { + bh.consume(fun.apply(source.getIntData(j), p)); + } + } + + /** + * Create the statistic using an array and given quantiles. + * + * @param function Source of the function. + * @param source Source of the data. + * @param quantiles Source of the quantiles. + * @param bh Data sink. + */ + @Benchmark + public void intQuantileRange(IntQuantileFunctionSource function, DataSource source, + QuantileRangeSource quantiles, Blackhole bh) { + final int size = source.size(); + final double[] p = quantiles.getData(); + final BiFunction<int[], double[], double[]> fun = function.getFunction(); + for (int j = -1; ++j < size;) { + bh.consume(fun.apply(source.getIntData(j), p)); + } + } }