This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
commit bae8313a7f2c0a46b8f401ca9a4145db197be7fb Author: Alex Herbert <aherb...@apache.org> AuthorDate: Wed Mar 19 17:43:09 2025 +0000 STATISTICS-90: Support median and quantile using an array range --- .../commons/statistics/descriptive/Median.java | 127 +++++++-- .../statistics/descriptive/NaNTransformer.java | 16 +- .../statistics/descriptive/NaNTransformers.java | 64 +++-- .../commons/statistics/descriptive/Quantile.java | 287 ++++++++++++++++++--- .../commons/statistics/descriptive/Statistics.java | 19 ++ .../commons/statistics/descriptive/MedianTest.java | 138 +++++++++- .../descriptive/NaNTransformersTest.java | 95 +++++-- .../statistics/descriptive/QuantileTest.java | 171 ++++++++++++ 8 files changed, 810 insertions(+), 107 deletions(-) diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java index 1946ec8..13a3545 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Median.java @@ -133,39 +133,75 @@ public final class Median { * * @param values Values. * @return the median + * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} + * @see #with(NaNPolicy) */ public double evaluate(double[] values) { + return compute(values, 0, values.length); + } + + /** + * Evaluate the median of the specified range. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the median + * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @see #with(NaNPolicy) + * @since 1.2 + */ + public double evaluateRange(double[] values, int from, int to) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to); + } + + /** + * Compute the median of the specified range. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the median + */ + private double compute(double[] values, int from, int to) { // Floating-point data handling - final int[] bounds = new int[1]; - final double[] x = nanTransformer.apply(values, bounds); - final int n = bounds[0]; + final int[] bounds = new int[2]; + final double[] x = nanTransformer.apply(values, from, to, bounds); + final int start = bounds[0]; + final int end = bounds[1]; + final int n = end - start; // Special cases if (n <= 2) { switch (n) { case 2: // Sorting the array matches the behaviour of Quantile for n==2 // Handle NaN and signed zeros - if (Double.compare(x[1], x[0]) < 0) { - final double t = x[0]; - x[0] = x[1]; - x[1] = t; + if (Double.compare(x[start + 1], x[start]) < 0) { + final double t = x[start]; + x[start] = x[start + 1]; + x[start + 1] = t; } - return Interpolation.mean(x[0], x[1]); + return Interpolation.mean(x[start], x[start + 1]); case 1: - return x[0]; + return x[start]; default: return Double.NaN; } } - // Median index - final int m = n >>> 1; + // Median index (including the offset) + final int m = (start + end) >>> 1; // Odd if ((n & 0x1) == 1) { - Selection.select(x, 0, n, m); + Selection.select(x, start, end, m); return x[m]; } // Even: require (m-1, m) - Selection.select(x, 0, n, new int[] {m - 1, m}); + Selection.select(x, start, end, new int[] {m - 1, m}); return Interpolation.mean(x[m - 1], x[m]); } @@ -179,34 +215,75 @@ public final class Median { * @return the median */ public double evaluate(int[] values) { - final int[] x = copy ? values.clone() : values; - final int n = values.length; + return compute(values, 0, values.length); + } + + /** + * Evaluate the median of the specified range. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the median + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @since 1.2 + */ + public double evaluateRange(int[] values, int from, int to) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to); + } + + /** + * Compute the median of the specified range. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the median + */ + private double compute(int[] values, int from, int to) { + final int[] x; + final int start; + final int end; + if (copy) { + x = Statistics.copy(values, from, to); + start = 0; + end = x.length; + } else { + x = values; + start = from; + end = to; + } + final int n = end - start; // Special cases if (n <= 2) { switch (n) { case 2: // Sorting the array matches the behaviour of Quantile for n==2 - if (x[1] < x[0]) { - final int t = x[0]; - x[0] = x[1]; - x[1] = t; + if (x[start + 1] < x[start]) { + final int t = x[start]; + x[start] = x[start + 1]; + x[start + 1] = t; } - return Interpolation.mean(x[0], x[1]); + return Interpolation.mean(x[start], x[start + 1]); case 1: - return x[0]; + return x[start]; default: return Double.NaN; } } - // Median index - final int m = n >>> 1; + // Median index (including the offset) + final int m = (start + end) >>> 1; // Odd if ((n & 0x1) == 1) { - Selection.select(x, 0, n, m); + Selection.select(x, start, end, m); return x[m]; } // Even: require (m-1, m) - Selection.select(x, 0, n, new int[] {m - 1, m}); + Selection.select(x, start, end, new int[] {m - 1, m}); return Interpolation.mean(x[m - 1], x[m]); } } diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformer.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformer.java index f85a697..5b687b6 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformer.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformer.java @@ -38,8 +38,6 @@ package org.apache.commons.statistics.descriptive; * * <p>This interface allows implementations to respect the behaviour of * {@link Double#compare(double, double)}, or implement different behaviour. - * - * @since 1.1 */ interface NaNTransformer { /** @@ -50,13 +48,19 @@ interface NaNTransformer { * <p>The method will return: * <ul> * <li>An array to partition; this may be a copy. - * <li>The {@code size} of the data; this can be smaller than the input array length if - * the transformer is configured to exclude NaN values. + * <li>The {@code bounds} of the returned data as [start, end); this can be smaller than the + * input range if the transformer is configured to exclude NaN values. The start is inclusive + * and the end is exclusive. * </ul> * + * <p>Implementations may assume the input {@code [from, to)} range is valid given the + * length of the {@code data} array. + * * @param data Data. - * @param bounds [size]. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param bounds Set to [start, end). * @return pre-processed data (may be a copy) */ - double[] apply(double[] data, int[] bounds); + double[] apply(double[] data, int from, int to, int[] bounds); } diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformers.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformers.java index b0d6d18..74f4e7c 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformers.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/NaNTransformers.java @@ -18,8 +18,6 @@ package org.apache.commons.statistics.descriptive; /** * Support for creating {@link NaNTransformer} implementations. - * - * @since 1.1 */ final class NaNTransformers { @@ -64,11 +62,14 @@ final class NaNTransformers { } @Override - public double[] apply(double[] data, int[] bounds) { - bounds[0] = data.length; + public double[] apply(double[] data, int from, int to, int[] bounds) { if (copy) { - return data.clone(); + bounds[0] = 0; + bounds[1] = to - from; + return copy(data, from, to); } + bounds[0] = from; + bounds[1] = to; return data; } } @@ -88,20 +89,31 @@ final class NaNTransformers { } @Override - public double[] apply(double[] data, int[] bounds) { + public double[] apply(double[] data, int from, int to, int[] bounds) { // Optionally work on a copy - final double[] a = copy ? data.clone() : data; + final double[] a; + final int start; + int end; + if (copy) { + a = copy(data, from, to); + start = 0; + end = a.length; + } else { + a = data; + start = from; + end = to; + } // Move NaN to end - int end = a.length; - for (int i = end; --i >= 0;) { + for (int i = end; --i >= start;) { final double v = a[i]; if (v != v) { a[i] = a[--end]; a[end] = v; } } - // Set the size excluding NaN - bounds[0] = end; + // Set the bounds excluding NaN + bounds[0] = start; + bounds[1] = end; return a; } } @@ -121,22 +133,44 @@ final class NaNTransformers { } @Override - public double[] apply(double[] data, int[] bounds) { + public double[] apply(double[] data, int from, int to, int[] bounds) { // Delay copy until data is checked for NaN final double[] a = data; // Error on NaN - for (int i = a.length; --i >= 0;) { + for (int i = to; --i >= from;) { final double v = a[i]; if (v != v) { throw new IllegalArgumentException("NaN at " + i); } } - bounds[0] = a.length; // No NaNs so copy the data if required if (copy) { - return data.clone(); + bounds[0] = 0; + bounds[1] = to - from; + return copy(data, from, to); } + bounds[0] = from; + bounds[1] = to; return data; } } + + /** + * Copy the specified range of data. + * + * <p>This is a simplification of {@link Arrays#copyOfRange(double[], int, int)} + * and does not support range checks or padding of the original input to + * a longer output. + * + * @param data Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the copy + */ + static double[] copy(double[] data, int from, int to) { + final int length = to - from; + final double[] copy = new double[length]; + System.arraycopy(data, from, copy, 0, length); + return copy; + } } diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java index f974d1b..7f37a73 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Quantile.java @@ -220,28 +220,77 @@ public final class Quantile { * @param values Values. * @param p Probability for the quantile to compute. * @return the quantile - * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; + * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} * @see #evaluate(double[], double...) + * @see #with(NaNPolicy) */ public double evaluate(double[] values, double p) { + return compute(values, 0, values.length, p); + } + + /** + * Evaluate the {@code p}-th quantile of the specified range of values. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * <p><strong>Performance</strong> + * + * <p>It is not recommended to use this method for repeat calls for different quantiles + * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used + * which provides better performance. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probability for the quantile to compute. + * @return the quantile + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; + * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @see #evaluateRange(double[], int, int, double...) + * @see #with(NaNPolicy) + * @since 1.2 + */ + public double evaluateRange(double[] values, int from, int to, double p) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to, p); + } + + /** + * Compute the {@code p}-th quantile of the specified range of values. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probability for the quantile to compute. + * @return the quantile + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} + */ + private double compute(double[] values, int from, int to, double p) { checkProbability(p); // Floating-point data handling - final int[] bounds = new int[1]; - final double[] x = nanTransformer.apply(values, bounds); - final int n = bounds[0]; + final int[] bounds = new int[2]; + final double[] x = nanTransformer.apply(values, from, to, bounds); + final int start = bounds[0]; + final int end = bounds[1]; + final int n = end - start; // Special cases if (n <= 1) { - return n == 0 ? Double.NaN : x[0]; + return n == 0 ? Double.NaN : x[start]; } + final double pos = estimationType.index(p, n); - final int i = (int) pos; + final int ip = (int) pos; + final int i = start + ip; // Partition and compute - if (pos > i) { - Selection.select(x, 0, n, new int[] {i, i + 1}); - return Interpolation.interpolate(x[i], x[i + 1], pos - i); + if (pos > ip) { + Selection.select(x, start, end, new int[] {i, i + 1}); + return Interpolation.interpolate(x[i], x[i + 1], pos - ip); } - Selection.select(x, 0, n, i); + Selection.select(x, start, end, i); return x[i]; } @@ -255,32 +304,74 @@ public final class Quantile { * @param p Probabilities for the quantiles to compute. * @return the quantiles * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; - * or no probabilities are specified. + * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} + * @see #with(NaNPolicy) */ public double[] evaluate(double[] values, double... p) { + return compute(values, 0, values.length, p); + } + + /** + * Evaluate the {@code p}-th quantiles of the specified range of values. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probabilities for the quantiles to compute. + * @return the quantiles + * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; + * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @see #with(NaNPolicy) + * @since 1.2 + */ + public double[] evaluateRange(double[] values, int from, int to, double... p) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to, p); + } + + /** + * Compute the {@code p}-th quantiles of the specified range of values. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probabilities for the quantiles to compute. + * @return the quantiles + * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; + * or no probabilities are specified. + */ + private double[] compute(double[] values, int from, int to, double... p) { checkProbabilities(p); // Floating-point data handling - final int[] bounds = new int[1]; - final double[] x = nanTransformer.apply(values, bounds); - final int n = bounds[0]; + final int[] bounds = new int[2]; + final double[] x = nanTransformer.apply(values, from, to, bounds); + final int start = bounds[0]; + final int end = bounds[1]; + final int n = end - start; // Special cases final double[] q = new double[p.length]; if (n <= 1) { - Arrays.fill(q, n == 0 ? Double.NaN : x[0]); + Arrays.fill(q, n == 0 ? Double.NaN : x[start]); return q; } // Collect interpolation positions. We use the output q as storage. - final int[] indices = computeIndices(n, p, q); + final int[] indices = computeIndices(n, p, q, start); // Partition - Selection.select(x, 0, n, indices); + Selection.select(x, start, end, indices); // Compute for (int k = 0; k < p.length; k++) { - final int i = (int) q[k]; - if (q[k] > i) { - q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - i); + // ip in [0, n); i in [start, end) + final int ip = (int) q[k]; + final int i = start + ip; + if (q[k] > ip) { + q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); } else { q[k] = x[i]; } @@ -307,22 +398,78 @@ public final class Quantile { * @see #evaluate(int[], double...) */ public double evaluate(int[] values, double p) { + return compute(values, 0, values.length, p); + } + + /** + * Evaluate the {@code p}-th quantile of the specified range of values. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * <p><strong>Performance</strong> + * + * <p>It is not recommended to use this method for repeat calls for different quantiles + * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used + * which provides better performance. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probability for the quantile to compute. + * @return the quantile + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @see #evaluateRange(int[], int, int, double...) + * @since 1.2 + */ + public double evaluateRange(int[] values, int from, int to, double p) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to, p); + } + + /** + * Compute the {@code p}-th quantile of the specified range of values. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probability for the quantile to compute. + * @return the quantile + * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} + */ + private double compute(int[] values, int from, int to, double p) { checkProbability(p); - final int n = values.length; + final int n = to - from; // Special cases if (n <= 1) { - return n == 0 ? Double.NaN : values[0]; + return n == 0 ? Double.NaN : values[from]; + } + + // Create the range + final int[] x; + final int start; + final int end; + if (copy) { + x = Statistics.copy(values, from, to); + start = 0; + end = n; + } else { + x = values; + start = from; + end = to; } + final double pos = estimationType.index(p, n); - final int i = (int) pos; + final int ip = (int) pos; + final int i = start + ip; // Partition and compute - final int[] x = copy ? values.clone() : values; - if (pos > i) { - Selection.select(x, 0, n, new int[] {i, i + 1}); - return Interpolation.interpolate(x[i], x[i + 1], pos - i); + if (pos > ip) { + Selection.select(x, start, end, new int[] {i, i + 1}); + return Interpolation.interpolate(x[i], x[i + 1], pos - ip); } - Selection.select(x, 0, n, i); + Selection.select(x, start, end, i); return x[i]; } @@ -339,27 +486,81 @@ public final class Quantile { * or no probabilities are specified. */ public double[] evaluate(int[] values, double... p) { + return compute(values, 0, values.length, p); + } + + /** + * Evaluate the {@code p}-th quantiles of the specified range of values.. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probabilities for the quantiles to compute. + * @return the quantiles + * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; + * or no probabilities are specified. + * @throws IndexOutOfBoundsException if the sub-range is out of bounds + * @since 1.2 + */ + public double[] evaluateRange(int[] values, int from, int to, double... p) { + Statistics.checkFromToIndex(from, to, values.length); + return compute(values, from, to, p); + } + + /** + * Evaluate the {@code p}-th quantiles of the specified range of values.. + * + * <p>Note: This method may partially sort the input values if not configured to + * {@link #withCopy(boolean) copy} the input data. + * + * @param values Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param p Probabilities for the quantiles to compute. + * @return the quantiles + * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; + * or no probabilities are specified. + */ + private double[] compute(int[] values, int from, int to, double... p) { checkProbabilities(p); - final int n = values.length; + final int n = to - from; // Special cases final double[] q = new double[p.length]; if (n <= 1) { - Arrays.fill(q, n == 0 ? Double.NaN : values[0]); + Arrays.fill(q, n == 0 ? Double.NaN : values[from]); return q; } + // Create the range + final int[] x; + final int start; + final int end; + if (copy) { + x = Statistics.copy(values, from, to); + start = 0; + end = n; + } else { + x = values; + start = from; + end = to; + } + // Collect interpolation positions. We use the output q as storage. - final int[] indices = computeIndices(n, p, q); + final int[] indices = computeIndices(n, p, q, start); // Partition - final int[] x = copy ? values.clone() : values; - Selection.select(x, 0, n, indices); + Selection.select(x, start, end, indices); // Compute for (int k = 0; k < p.length; k++) { - final int i = (int) q[k]; - if (q[k] > i) { - q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - i); + // ip in [0, n); i in [start, end) + final int ip = (int) q[k]; + final int i = start + ip; + if (q[k] > ip) { + q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); } else { q[k] = x[i]; } @@ -502,22 +703,26 @@ public final class Quantile { * <p>The zero-based interpolation index in {@code [0, n)} is * saved into the working array {@code q} for each {@code p}. * + * <p>The indices are incremented by the provided {@code offset} to allow + * addressing sub-ranges of a larger array. + * * @param n Size of the data. * @param p Probabilities for the quantiles to compute. - * @param q Working array for quantiles. - * @return the indices + * @param q Working array for quantiles in {@code [0, n)}. + * @param offset Array offset. + * @return the indices in {@code [offset, offset + n)} */ - private int[] computeIndices(int n, double[] p, double[] q) { + private int[] computeIndices(int n, double[] p, double[] q, int offset) { final int[] indices = new int[p.length << 1]; int count = 0; for (int k = 0; k < p.length; k++) { final double pos = estimationType.index(p[k], n); q[k] = pos; final int i = (int) pos; - indices[count++] = i; + indices[count++] = offset + i; if (pos > i) { // Require the next index for interpolation - indices[count++] = i + 1; + indices[count++] = offset + i + 1; } } if (count < indices.length) { diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Statistics.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Statistics.java index 2126883..ebd639c 100644 --- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Statistics.java +++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Statistics.java @@ -497,4 +497,23 @@ final class Statistics { } return s; } + + /** + * Copy the specified range of data. + * + * <p>This is a simplification of {@link Arrays#copyOfRange(double[], int, int)} + * and does not support range checks or padding of the original input to + * a longer output. + * + * @param data Values. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @return the copy + */ + static int[] copy(int[] data, int from, int to) { + final int length = to - from; + final int[] copy = new int[length]; + System.arraycopy(data, from, copy, 0, length); + return copy; + } } diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java index 1fc2ce0..dfc4423 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/MedianTest.java @@ -18,6 +18,7 @@ package org.apache.commons.statistics.descriptive; import java.util.Arrays; +import java.util.function.Supplier; import java.util.stream.Stream; import org.apache.commons.rng.UniformRandomProvider; import org.apache.commons.rng.sampling.ArraySampler; @@ -33,6 +34,9 @@ import org.junit.jupiter.params.provider.MethodSource; * Test for {@link Median}. */ class MedianTest { + /** The number of random trials to perform. */ + private static final int RANDOM_TRIALS = 5; + @Test void testNullPropertyThrows() { final Median m = Median.withDefaults(); @@ -52,6 +56,8 @@ class MedianTest { } else { Assertions.assertEquals(expected, q, Math.ulp(expected)); } + // Note: This assertion is not strictly necessary. If the median uses a different + // partial sort than the quantile the assertion can be removed. Assertions.assertArrayEquals(values, copy); } @@ -59,7 +65,7 @@ class MedianTest { @MethodSource(value = {"testDoubleMedian"}) void testDoubleMedianExcludeNaN(double[] values, double expected) { // If NaN is present then the result will change from expected so ignore this - Assumptions.assumeTrue(Arrays.stream(values).filter(Double::isNaN).count() == 0); + Assumptions.assumeFalse(Arrays.stream(values).anyMatch(Double::isNaN)); // Note: Use copy here. This checks that the copy of the data // (with excluded NaNs) is used for special cases. final Median m = Median.withDefaults().with(NaNPolicy.EXCLUDE).withCopy(true); @@ -76,6 +82,19 @@ class MedianTest { } } + @ParameterizedTest + @MethodSource(value = {"testDoubleMedian"}) + void testDoubleMedianErrorNaN(double[] values, double expected) { + final Median m = Median.withDefaults().with(NaNPolicy.ERROR); + final double[] y = values.clone(); + if (Arrays.stream(values).anyMatch(Double::isNaN)) { + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values)); + Assertions.assertArrayEquals(y, values, "Input was modified"); + } else { + Assertions.assertEquals(expected, m.evaluate(values)); + } + } + static Stream<Arguments> testDoubleMedian() { final Stream.Builder<Arguments> builder = Stream.builder(); for (final double[] x : new double[][] { @@ -168,6 +187,74 @@ class MedianTest { Assertions.assertFalse(Arrays.equals(original, values)); } + @ParameterizedTest + @MethodSource(value = {"org.apache.commons.statistics.descriptive.TestData#arrayRangeTestData"}) + final void testDoubleMedianRangeThrows(int from, int to, int length) { + final double[] values = new double[length]; + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Median.withDefaults().evaluateRange(values, from, to), + () -> String.format("range [%d, %d) in length %d", from, to, length)); + } + + /** + * Test data with an internal region evaluates exactly the same when using + * a copy of the internal region evaluated as a full length array, + * or the range method on the full array. + */ + @Test + void testDoubleMedianRange() { + // Empty range + assertMedianRange(new double[] {1, 2, 3, 4, 5}, 2, 2); + // Range range + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); + for (int count = RANDOM_TRIALS; --count >= 0;) { + final int n = 10 + count; + final double[] x = rng.doubles(n).toArray(); + final int i = rng.nextInt(n); + final int j = rng.nextInt(n); + assertMedianRange(x, Math.min(i, j), Math.max(i, j)); + } + // NaN in the range + final double[] x = rng.doubles(10).toArray(); + x[5] = Double.NaN; + assertMedianRange(x.clone(), 2, 8); + assertMedianRange(x.clone(), 2, 9); + } + + private static void assertMedianRange(double[] values, int from, int to) { + // Test all NaN policies as these apply to double[] data + for (final NaNPolicy p : NaNPolicy.values()) { + assertMedianRange(values.clone(), from, to, p); + } + } + + private static void assertMedianRange(double[] values, int from, int to, NaNPolicy nanPolicy) { + final Supplier<String> msg = () -> String.format("NaN=%s; range [%d, %d) in length %d", + nanPolicy, from, to, values.length); + final double[] original = values.clone(); + final double[] x = Arrays.copyOfRange(values, from, to); + // Test with/without modification of the input + final Median m = Median.withDefaults().with(nanPolicy).withCopy(false); + final Median mCopy = Median.withDefaults().with(nanPolicy).withCopy(true); + try { + // Reference result operating in-place + final double expected = m.evaluate(x); + // With copy the input is unchanged + Assertions.assertEquals(expected, mCopy.evaluateRange(values, from, to), msg); + Assertions.assertArrayEquals(original, values, msg); + // Without copy only the values inside the range should be modified. + // Compose the expected result. + System.arraycopy(x, 0, original, from, x.length); + Assertions.assertEquals(expected, m.evaluateRange(values, from, to), msg); + Assertions.assertArrayEquals(original, values, msg); + } catch (IllegalArgumentException e) { + // NaN input + Assertions.assertThrows(e.getClass(), () -> mCopy.evaluateRange(values, from, to), msg); + Assertions.assertThrows(e.getClass(), () -> m.evaluateRange(values, from, to), msg); + Assertions.assertArrayEquals(original, values, msg); + } + } + @ParameterizedTest @MethodSource(value = {"testIntMedian"}) void testIntMedian(int[] values, double expected) { @@ -247,4 +334,53 @@ class MedianTest { Assertions.assertEquals(expected, Median.withDefaults().withCopy(false).evaluate(values)); Assertions.assertFalse(Arrays.equals(original, values)); } + + @ParameterizedTest + @MethodSource(value = {"org.apache.commons.statistics.descriptive.TestData#arrayRangeTestData"}) + final void testIntMedianRangeThrows(int from, int to, int length) { + final int[] values = new int[length]; + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Median.withDefaults().evaluateRange(values, from, to), + () -> String.format("range [%d, %d) in length %d", from, to, length)); + } + + /** + * Test data with an internal region evaluates exactly the same when using + * a copy of the internal region evaluated as a full length array, + * or the range method on the full array. + */ + @Test + void testIntMedianRange() { + // Empty range + assertMedianRange(new int[] {1, 2, 3, 4, 5}, 2, 2); + // Range range + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); + for (int count = RANDOM_TRIALS; --count >= 0;) { + final int n = 10 + count; + final int[] x = rng.ints(n).toArray(); + final int i = rng.nextInt(n); + final int j = rng.nextInt(n); + assertMedianRange(x, Math.min(i, j), Math.max(i, j)); + } + } + + private static void assertMedianRange(int[] values, int from, int to) { + final Supplier<String> msg = () -> String.format("range [%d, %d) in length %d", + from, to, values.length); + final int[] original = values.clone(); + final int[] x = Arrays.copyOfRange(values, from, to); + // Test with/without modification of the input + final Median m = Median.withDefaults().withCopy(false); + final Median mCopy = Median.withDefaults().withCopy(true); + // Reference result operating in-place + final double expected = m.evaluate(x); + // With copy the input is unchanged + Assertions.assertEquals(expected, mCopy.evaluateRange(values, from, to), msg); + Assertions.assertArrayEquals(original, values, msg); + // Without copy only the values inside the range should be modified. + // Compose the expected result. + System.arraycopy(x, 0, original, from, x.length); + Assertions.assertEquals(expected, m.evaluateRange(values, from, to), msg); + Assertions.assertArrayEquals(original, values, msg); + } } diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/NaNTransformersTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/NaNTransformersTest.java index d14634b..c77aacc 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/NaNTransformersTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/NaNTransformersTest.java @@ -30,11 +30,27 @@ class NaNTransformersTest { @ParameterizedTest @MethodSource(value = {"nanData"}) void testNaNErrorWithNaN(double[] a) { - final int[] bound = new int[1]; - final NaNTransformer t1 = NaNTransformers.createNaNTransformer(NaNPolicy.ERROR, false); - Assertions.assertThrows(IllegalArgumentException.class, () -> t1.apply(a, bound)); - final NaNTransformer t2 = NaNTransformers.createNaNTransformer(NaNPolicy.ERROR, true); - Assertions.assertThrows(IllegalArgumentException.class, () -> t2.apply(a, bound)); + final int[] bounds = new int[2]; + for (final boolean copy : new boolean[] {false, true}) { + final NaNTransformer t = NaNTransformers.createNaNTransformer(NaNPolicy.ERROR, copy); + Assertions.assertThrows(IllegalArgumentException.class, () -> t.apply(a, 0, a.length, bounds)); + // Partial ranges. + // Note: Checks on the contents of the returned array range without NaN are + // done in a separate test on the nonNanData. + // First half + final int half = a.length >> 1; + if (Arrays.stream(a, 0, half).anyMatch(Double::isNaN)) { + Assertions.assertThrows(IllegalArgumentException.class, () -> t.apply(a, 0, half, bounds)); + } else { + Assertions.assertDoesNotThrow(() -> t.apply(a, 0, half, bounds)); + } + // Second half + if (Arrays.stream(a, half, a.length).anyMatch(Double::isNaN)) { + Assertions.assertThrows(IllegalArgumentException.class, () -> t.apply(a, half, a.length, bounds)); + } else { + Assertions.assertDoesNotThrow(() -> t.apply(a, half, a.length, bounds)); + } + } } @ParameterizedTest @@ -68,24 +84,65 @@ class NaNTransformersTest { */ private static void assertNaNTransformer(double[] a, NaNTransformer t, boolean includeNaN, boolean copy) { - final int[] bounds = new int[1]; - final double[] b = t.apply(a, bounds); + final int n = a.length; + assertNaNTransformer(a.clone(), 0, n, t, includeNaN, copy); + assertNaNTransformer(a.clone(), 0, n >> 1, t, includeNaN, copy); + assertNaNTransformer(a.clone(), n >> 1, n, t, includeNaN, copy); + } + + /** + * Assert the NaN transformer allows including or excluding NaN. + * + * @param a Data. + * @param from Inclusive start of the range. + * @param to Exclusive end of the range. + * @param t Transformer. + * @param includeNaN True if the size should include NaN. + * @param copy True if the pre-processed data should be a copy. + */ + private static void assertNaNTransformer(double[] a, int from, int to, NaNTransformer t, + boolean includeNaN, boolean copy) { + // Count NaN + final int nanCount = (int) Arrays.stream(a, from, to).filter(Double::isNaN).count(); + final int length = to - from - (includeNaN ? 0 : nanCount); + + final int[] bounds = {-1, -1}; + final double[] original = a.clone(); + final double[] b = t.apply(a, from, to, bounds); + + // Sort the original and the returned range to allow equality comparison + double[] s1 = Arrays.copyOfRange(original, from, to); + Arrays.sort(s1); + final double[] s2 = Arrays.copyOfRange(b, bounds[0], bounds[1]); + Arrays.sort(s2); + if (copy) { - Assertions.assertNotSame(a, b); + Assertions.assertNotSame(a, b, "copy returned the original"); + Assertions.assertArrayEquals(original, a, "original array was modified"); + // b is a new array, it can be any length but the returned range + // should contains the same values as a, optionally excluding NaN + if (!includeNaN) { + s1 = Arrays.copyOf(s1, length); + } + Assertions.assertArrayEquals(s1, s2); } else { - Assertions.assertSame(a, b); - } - // Count NaN - final int nanCount = (int) Arrays.stream(a).filter(Double::isNaN).count(); - final int size = a.length - (includeNaN ? 0 : nanCount); - Assertions.assertEquals(size, bounds[0], "Size of data"); - if (!includeNaN) { - for (int i = 0; i < size; i++) { - Assertions.assertNotEquals(Double.NaN, b[i], "NaN in unsorted range"); + Assertions.assertSame(a, b, "no copy returned a new array"); + // Unchanged outside of [from, to) + for (int i = 0; i < from; i++) { + Assertions.assertEquals(original[i], a[i]); + } + for (int i = to; i < a.length; i++) { + Assertions.assertEquals(original[i], a[i]); } - for (int i = size; i < b.length; i++) { - Assertions.assertEquals(Double.NaN, b[i], "non-NaN in upper range"); + // Same values inside the range (including NaN) + final double[] s3 = Arrays.copyOfRange(b, from, to); + Arrays.sort(s3); + Assertions.assertArrayEquals(s1, s3); + // The returned range has the same values optionally excluding NaN + if (!includeNaN) { + s1 = Arrays.copyOf(s1, length); } + Assertions.assertArrayEquals(s1, s2); } } diff --git a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java index cf4fae7..ebd8faf 100644 --- a/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java +++ b/commons-statistics-descriptive/src/test/java/org/apache/commons/statistics/descriptive/QuantileTest.java @@ -20,6 +20,8 @@ package org.apache.commons.statistics.descriptive; import java.util.Arrays; import java.util.function.Supplier; import java.util.stream.Stream; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.simple.RandomSource; import org.apache.commons.statistics.descriptive.Quantile.EstimationMethod; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assumptions; @@ -34,6 +36,8 @@ import org.junit.jupiter.params.provider.MethodSource; class QuantileTest { /** Estimation types to test. */ private static final EstimationMethod[] TYPES = EstimationMethod.values(); + /** The number of random trials to perform. */ + private static final int RANDOM_TRIALS = 5; @Test void testNullPropertyThrows() { @@ -86,6 +90,10 @@ class QuantileTest { Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1, new double[] {p})); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, p)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, new double[] {p})); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values1, 0, values1.length, p)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values1, 0, values1.length, new double[] {p})); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values2, 0, values2.length, p)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values2, 0, values2.length, new double[] {p})); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, p)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, new double[] {p})); } @@ -100,6 +108,10 @@ class QuantileTest { Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values1, new double[0])); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(values2, new double[0])); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values1, 0, values1.length)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values1, 0, values1.length, new double[0])); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values2, 0, values2.length)); + Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluateRange(values2, 0, values2.length, new double[0])); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1)); Assertions.assertThrows(IllegalArgumentException.class, () -> m.evaluate(10, i -> 1, new double[0])); } @@ -158,6 +170,20 @@ class QuantileTest { } } + @ParameterizedTest + @MethodSource(value = {"testDoubleQuantile"}) + void testDoubleQuantileErrorNaN(double[] values, double[] p, double[][] expected, double delta) { + final Quantile q = Quantile.withDefaults().with(NaNPolicy.ERROR); + final double[] y = values.clone(); + if (Arrays.stream(values).anyMatch(Double::isNaN)) { + Assertions.assertThrows(IllegalArgumentException.class, () -> q.evaluate(values)); + Assertions.assertArrayEquals(y, values, "Input was modified"); + } else { + assertQuantile(q, values, p, expected, delta, + Quantile::evaluate, Quantile::evaluate); + } + } + @ParameterizedTest @MethodSource(value = {"testDoubleQuantile"}) void testQuantileSorted(double[] values, double[] p, double[][] expected, double delta) { @@ -399,6 +425,87 @@ class QuantileTest { Assertions.assertFalse(Arrays.equals(original, values)); } + @ParameterizedTest + @MethodSource(value = {"org.apache.commons.statistics.descriptive.TestData#arrayRangeTestData"}) + final void testDoubleQuantileRangeThrows(int from, int to, int length) { + final double[] values = new double[length]; + final Supplier<String> msg = () -> String.format("range [%d, %d) in length %d", from, to, length); + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Quantile.withDefaults().evaluateRange(values, from, to, 0.5), msg); + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Quantile.withDefaults().evaluateRange(values, from, to, 0.25, 0.5), msg); + } + + /** + * Test data with an internal region evaluates exactly the same when using + * a copy of the internal region evaluated as a full length array, + * or the range method on the full array. + */ + @Test + void testDoubleQuantileRange() { + // Empty range + assertQuantileRange(new double[] {1, 2, 3, 4, 5}, 2, 2); + // Range range + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); + for (int count = RANDOM_TRIALS; --count >= 0;) { + final int n = 10 + count; + final double[] x = rng.doubles(n).toArray(); + final int i = rng.nextInt(n); + final int j = rng.nextInt(n); + assertQuantileRange(x, Math.min(i, j), Math.max(i, j)); + } + // NaN in the range + final double[] x = rng.doubles(10).toArray(); + x[5] = Double.NaN; + assertQuantileRange(x.clone(), 2, 8); + assertQuantileRange(x.clone(), 2, 9); + } + + private static void assertQuantileRange(double[] values, int from, int to) { + // Test all NaN policies as these apply to double[] data + for (final NaNPolicy p : NaNPolicy.values()) { + // Using p={0, 1} ensures quantiles with no interpolation are included + for (final double prob : new double[] {0, 0.5, 0.75, 1}) { + assertQuantileRange(values.clone(), from, to, p, prob); + } + } + } + + private static void assertQuantileRange(double[] values, int from, int to, NaNPolicy nanPolicy, double prob) { + final Supplier<String> msg = () -> String.format("NaN=%s; p=%.2f; range [%d, %d) in length %d", + nanPolicy, prob, from, to, values.length); + final double[] original = values.clone(); + final double[] x = Arrays.copyOfRange(values, from, to); + final double[] p = {prob}; + // Test with/without modification of the input + final Quantile q = Quantile.withDefaults().with(nanPolicy).withCopy(false); + final Quantile qCopy = Quantile.withDefaults().with(nanPolicy).withCopy(true); + try { + // Reference result operating in-place + final double expected = q.evaluate(x, p[0]); + // With copy the input is unchanged + Assertions.assertEquals(expected, qCopy.evaluateRange(values, from, to, p[0]), msg); + Assertions.assertArrayEquals(original, values, msg); + Assertions.assertEquals(expected, qCopy.evaluateRange(values, from, to, p)[0], msg); + Assertions.assertArrayEquals(original, values, msg); + // Without copy only the values inside the range should be modified. + // Compose the expected result. + System.arraycopy(x, 0, original, from, x.length); + final double[] copy = values.clone(); + Assertions.assertEquals(expected, q.evaluateRange(values, from, to, p)[0], msg); + Assertions.assertArrayEquals(original, values, msg); + Assertions.assertEquals(expected, q.evaluateRange(copy, from, to, p[0]), msg); + Assertions.assertArrayEquals(original, copy, msg); + } catch (IllegalArgumentException e) { + // NaN input + Assertions.assertThrows(e.getClass(), () -> qCopy.evaluateRange(values, from, to, p[0]), msg); + Assertions.assertThrows(e.getClass(), () -> qCopy.evaluateRange(values, from, to, p), msg); + Assertions.assertThrows(e.getClass(), () -> q.evaluateRange(values, from, to, p[0]), msg); + Assertions.assertThrows(e.getClass(), () -> q.evaluateRange(values, from, to, p), msg); + Assertions.assertArrayEquals(original, values, msg); + } + } + // int[] /** @@ -605,4 +712,68 @@ class QuantileTest { Assertions.assertEquals(2, Quantile.withDefaults().withCopy(false).evaluate(values, new double[] {0.5})[0]); Assertions.assertFalse(Arrays.equals(original, values)); } + + @ParameterizedTest + @MethodSource(value = {"org.apache.commons.statistics.descriptive.TestData#arrayRangeTestData"}) + final void testIntQuantileRangeThrows(int from, int to, int length) { + final int[] values = new int[length]; + final Supplier<String> msg = () -> String.format("range [%d, %d) in length %d", from, to, length); + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Quantile.withDefaults().evaluateRange(values, from, to, 0.5), msg); + Assertions.assertThrows(IndexOutOfBoundsException.class, + () -> Quantile.withDefaults().evaluateRange(values, from, to, 0.25, 0.5), msg); + } + + /** + * Test data with an internal region evaluates exactly the same when using + * a copy of the internal region evaluated as a full length array, + * or the range method on the full array. + */ + @Test + void testIntQuantileRange() { + // Empty range + assertQuantileRange(new int[] {1, 2, 3, 4, 5}, 2, 2); + // Range range + final UniformRandomProvider rng = RandomSource.XO_SHI_RO_128_PP.create(); + for (int count = RANDOM_TRIALS; --count >= 0;) { + final int n = 10 + count; + final int[] x = rng.ints(n).toArray(); + final int i = rng.nextInt(n); + final int j = rng.nextInt(n); + assertQuantileRange(x, Math.min(i, j), Math.max(i, j)); + } + } + + private static void assertQuantileRange(int[] values, int from, int to) { + // Using p={0, 1} ensures quantiles with no interpolation are included + for (final double p : new double[] {0, 0.5, 0.75, 1}) { + assertQuantileRange(values.clone(), from, to, p); + } + } + + private static void assertQuantileRange(int[] values, int from, int to, double prob) { + final Supplier<String> msg = () -> String.format("p=%.2f range [%d, %d) in length %d", + prob, from, to, values.length); + final int[] original = values.clone(); + final int[] x = Arrays.copyOfRange(values, from, to); + final double[] p = {prob}; + // Test with/without modification of the input + final Quantile q = Quantile.withDefaults().withCopy(false); + final Quantile qCopy = Quantile.withDefaults().withCopy(true); + // Reference result operating in-place + final double expected = q.evaluate(x, p[0]); + // With copy the input is unchanged + Assertions.assertEquals(expected, qCopy.evaluateRange(values, from, to, p[0]), msg); + Assertions.assertArrayEquals(original, values, msg); + Assertions.assertEquals(expected, qCopy.evaluateRange(values, from, to, p)[0], msg); + Assertions.assertArrayEquals(original, values, msg); + // Without copy only the values inside the range should be modified. + // Compose the expected result. + System.arraycopy(x, 0, original, from, x.length); + final int[] copy = values.clone(); + Assertions.assertEquals(expected, q.evaluateRange(values, from, to, p)[0], msg); + Assertions.assertArrayEquals(original, values, msg); + Assertions.assertEquals(expected, q.evaluateRange(copy, from, to, p[0]), msg); + Assertions.assertArrayEquals(original, copy, msg); + } }