This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
The following commit(s) were added to refs/heads/master by this push: new 018b1a8 STATISTICS-63: Add ranking module 018b1a8 is described below commit 018b1a8401c6165a0b876c5814121e09ae4757e3 Author: Alex Herbert <aherb...@apache.org> AuthorDate: Sat Dec 3 19:02:55 2022 +0000 STATISTICS-63: Add ranking module Port o.a.c.math4.stat.ranking to a new ranking module. --- commons-statistics-examples/examples-jmh/pom.xml | 6 + .../jmh/ranking/NaturalRankingPerformance.java | 304 ++++++++++ .../examples/jmh/ranking/package-info.java | 21 + .../jmh/ranking/NaturalRankingPerformanceTest.java | 77 +++ commons-statistics-examples/pom.xml | 5 + commons-statistics-ranking/LICENSE | 201 +++++++ commons-statistics-ranking/NOTICE | 5 + commons-statistics-ranking/README.md | 105 ++++ commons-statistics-ranking/pom.xml | 73 +++ .../commons/statistics/ranking/NaNStrategy.java | 50 ++ .../commons/statistics/ranking/NaturalRanking.java | 582 ++++++++++++++++++++ .../statistics/ranking/RankingAlgorithm.java | 42 ++ .../commons/statistics/ranking/TiesStrategy.java | 68 +++ .../commons/statistics/ranking/package-info.java | 21 + .../src/site/resources/profile.jacoco | 17 + commons-statistics-ranking/src/site/site.xml | 38 ++ commons-statistics-ranking/src/site/xdoc/index.xml | 52 ++ .../statistics/ranking/NaturalRankingTest.java | 611 +++++++++++++++++++++ .../commons/statistics/ranking/UserGuideTest.java | 34 ++ pom.xml | 1 + src/conf/pmd/pmd-ruleset.xml | 14 + src/conf/spotbugs/spotbugs-exclude-filter.xml | 5 + 22 files changed, 2332 insertions(+) diff --git a/commons-statistics-examples/examples-jmh/pom.xml b/commons-statistics-examples/examples-jmh/pom.xml index 2e4abed..cfa3ab8 100644 --- a/commons-statistics-examples/examples-jmh/pom.xml +++ b/commons-statistics-examples/examples-jmh/pom.xml @@ -36,6 +36,11 @@ <artifactId>commons-statistics-distribution</artifactId> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-statistics-ranking</artifactId> + </dependency> + <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-rng-simple</artifactId> @@ -204,6 +209,7 @@ </goals> <configuration> <finalName>${uberjar.name}</finalName> + <createDependencyReducedPom>false</createDependencyReducedPom> <transformers> <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> <mainClass>${project.mainClass}</mainClass> diff --git a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformance.java b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformance.java new file mode 100644 index 0000000..923a56b --- /dev/null +++ b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformance.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.statistics.examples.jmh.ranking; + +import java.util.Arrays; +import java.util.concurrent.TimeUnit; +import java.util.function.UnaryOperator; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.sampling.distribution.DirichletSampler; +import org.apache.commons.rng.simple.RandomSource; +import org.apache.commons.statistics.ranking.NaturalRanking; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Executes a benchmark of the ranking of {@code double} array data. + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@State(Scope.Benchmark) +@Fork(value = 1, jvmArgs = {"-server", "-Xms512M", "-Xmx512M"}) +public class NaturalRankingPerformance { + /** + * Source of {@code double} array data to rank. + */ + @State(Scope.Benchmark) + public static class DataSource { + /** Data length. */ + @Param({"10000"}) + private int length; + /** Fraction of total length that has tied (equal) data. */ + @Param({"0", "0.1", "0.5"}) + private double tieFraction; + /** Count of the number of distinct runs of tied (equal) data. */ + @Param({"20"}) + private int ties; + /** Concentration parameter for the distribution of tie lengths. */ + @Param({"1"}) + private double alpha; + + /** Data to rank. */ + private double[] data; + + /** + * @return the data + */ + public double[] getData() { + return data; + } + + /** + * Create the data to rank. + */ + @Setup(Level.Iteration) + public void setup() { + // Data will be randomized per iteration + final UniformRandomProvider rng = RandomSource.XO_RO_SHI_RO_128_PP.create(); + data = createData(rng, length, tieFraction, ties, alpha); + } + + /** + * Creates the data. + * This is package-private for testing. + * + * @param rng Source of randomness + * @param length Data length. + * @param tieFraction Fraction of total length that has tied (equal) data. + * @param ties Count of the number of distinct runs of tied (equal) data. + * @param alpha Concentration parameter for the distribution of tie lengths. + * @return the data + */ + static double[] createData(UniformRandomProvider rng, + int length, double tieFraction, int ties, double alpha) { + assert length > 0 : "Invalid data length"; + assert tieFraction <= 1 && tieFraction >= 0 : "Invalid tie fraction"; + assert ties >= 0 : "Invalid number of ties"; + assert alpha >= 0 : "Invalid concentration parameter"; + + // The data will contain n regions of data, each with the same value, + // then the rest is a sequence. This is then shuffled. + final double[] data = new double[length]; + int count = 0; + int value = 0; + + // Create tie regions + final int tiesLength = (int) Math.round(tieFraction * length); + if (ties > 0 && tiesLength > 0) { + // Cut the ties length into parts. + // Note that due to randomness some lengths may be <= 1 and therefore not a tie. + // This increasingly occurs as alpha -> 0. + // See: https://en.wikipedia.org/wiki/Dirichlet_distribution#String_cutting + final double[] tieFractions = DirichletSampler.symmetric(rng, ties, alpha).sample(); + final int[] lengths = Arrays.stream(tieFractions) + .mapToInt(f -> (int) Math.round(f * tiesLength)) + .toArray(); + for (final int len : lengths) { + ++value; + // Lengths may sum to more than tiesLength due to rounding so + // consume most we can + for (int i = Math.min(len, tiesLength - count); i > 0; i--) { + data[count++] = value; + } + } + } + + // Fill remaining values + while (count < data.length) { + data[count++] = ++value; + } + + // Fisher-Yates shuffle + for (int i = data.length; i > 1; i--) { + swap(data, i - 1, rng.nextInt(i)); + } + + return data; + } + + /** + * Swaps the two specified elements in the specified array. + * + * @param array Data array + * @param i First index + * @param j Second index + */ + private static void swap(double[] array, int i, int j) { + final double tmp = array[i]; + array[i] = array[j]; + array[j] = tmp; + } + } + + /** + * Source of a function to rank {@code double} array data. + */ + @State(Scope.Benchmark) + public static class RankingSource { + /** Name for baseline implementation. */ + private static final String METHOD_BASELINE = "baseline"; + /** Name for Commons Statistics implementation. */ + private static final String METHOD_STATISTICS = "statistics"; + /** Name for Commons Math3 implementation. */ + private static final String METHOD_MATH3 = "math3"; + /** The ranking method. */ + @Param({"baseline", METHOD_STATISTICS, METHOD_MATH3}) + private String ranking; + + /** The ranking function. */ + private UnaryOperator<double[]> fun; + + /** + * @return the ranking function + */ + public UnaryOperator<double[]> getFunction() { + return fun; + } + + /** + * Create the ranking function. + */ + @Setup(Level.Iteration) + public void setup() { + fun = createFunction(ranking); + } + + /** + * Creates the ranking function. + * This is package-private for testing. + * + * @param name The function name. + * @return the ranking function + */ + static UnaryOperator<double[]> createFunction(String name) { + if (METHOD_BASELINE.equals(name)) { + return new SortRanking(); + } else if (METHOD_STATISTICS.equals(name)) { + return new NaturalRanking(); + } else if (METHOD_MATH3.equals(name)) { + return new org.apache.commons.math3.stat.ranking.NaturalRanking()::rank; + } else { + throw new IllegalStateException("Unknown method: " + name); + } + } + + /** + * Gets the names for valid ranking functions. + * This is package-private for testing. + * + * @return the function names + */ + static String[] getFunctionNames() { + // Do not return the baseline method + return new String[] {METHOD_STATISTICS, METHOD_MATH3}; + } + + /** + * Class to create a ranking using a sort of the data. This ranking does not + * resolve ties and is a baseline for the speed of {@link Arrays#sort(Object[])}. + */ + private static class SortRanking implements UnaryOperator<double[]> { + @Override + public double[] apply(double[] in) { + final DataPosition[] data = new DataPosition[in.length]; + for (int i = 0; i < in.length; i++) { + data[i] = new DataPosition(in[i], i); + } + Arrays.sort(data); + final double[] out = new double[in.length]; + for (int i = 0; i < in.length; i++) { + out[data[i].getPosition()] = i + 1.0; + } + return out; + } + + // Copied from NaturalRanking for baseline equivalence. + + /** + * Represents the position of a {@code double} value in a data array. The + * Comparable interface is implemented so Arrays.sort can be used to sort an + * array of data positions by value. Note that the implicitly defined natural + * ordering is NOT consistent with equals. + */ + private static class DataPosition implements Comparable<DataPosition> { + /** Data value. */ + private final double value; + /** Data position. */ + private final int position; + + /** + * Create an instance with the given value and position. + * + * @param value Data value. + * @param position Data position. + */ + DataPosition(double value, int position) { + this.value = value; + this.position = position; + } + + /** + * Compare this value to another. + * Only the <strong>values</strong> are compared. + * + * @param other the other pair to compare this to + * @return result of {@code Double.compare(value, other.value)} + */ + @Override + public int compareTo(DataPosition other) { + return Double.compare(value, other.value); + } + + // N.B. equals() and hashCode() are not implemented; see MATH-610 for discussion. + + /** + * Returns the data position. + * + * @return position + */ + int getPosition() { + return position; + } + } + } + } + + /** + * Rank {@code double} array data. + * + * @param source Source of the data. + * @param ranking Source of the ranking function. + * @return the ranking + */ + @Benchmark + public double[] sample(DataSource source, RankingSource ranking) { + return ranking.getFunction().apply(source.getData()); + } +} diff --git a/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/package-info.java b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/package-info.java new file mode 100644 index 0000000..9f62bd8 --- /dev/null +++ b/commons-statistics-examples/examples-jmh/src/main/java/org/apache/commons/statistics/examples/jmh/ranking/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Benchmarks for the {@code org.apache.commons.statistics.ranking} components. + */ +package org.apache.commons.statistics.examples.jmh.ranking; diff --git a/commons-statistics-examples/examples-jmh/src/test/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformanceTest.java b/commons-statistics-examples/examples-jmh/src/test/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformanceTest.java new file mode 100644 index 0000000..3cd885a --- /dev/null +++ b/commons-statistics-examples/examples-jmh/src/test/java/org/apache/commons/statistics/examples/jmh/ranking/NaturalRankingPerformanceTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.statistics.examples.jmh.ranking; + +import java.util.function.UnaryOperator; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.simple.RandomSource; +import org.apache.commons.statistics.examples.jmh.ranking.NaturalRankingPerformance.DataSource; +import org.apache.commons.statistics.examples.jmh.ranking.NaturalRankingPerformance.RankingSource; +import org.apache.commons.statistics.ranking.NaturalRanking; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +/** + * Executes tests for {@link NaturalRankingPerformance}. + */ +class NaturalRankingPerformanceTest { + /** + * Test the ranking of data from the data source. + * + * <p>Each known method in the benchmark is tested to: create the same ranking; + * and not destructively modify the data. Violation of these assumptions will + * invalidate the performance benchmark. + */ + @ParameterizedTest + @CsvSource({ + "100, 0, 0, 1", + "100, 0.5, 5, 1", + "100, 1.0, 5, 0.5", + "100, 0.25, 5, 2", + }) + void testDataSource(int length, double tieFraction, int ties, double alpha) { + final UniformRandomProvider rng = RandomSource.XO_RO_SHI_RO_128_PP.create(); + final double[] data = DataSource.createData(rng, length, tieFraction, ties, alpha); + + final double[] original = data.clone(); + double[] ranking = null; + for (final String name : RankingSource.getFunctionNames()) { + final double[] r = RankingSource.createFunction(name).apply(data); + Assertions.assertArrayEquals(original, data, () -> name + " destroyed the data"); + if (ranking == null) { + ranking = r; + } else { + Assertions.assertArrayEquals(ranking, r, () -> name + " has a different ranking"); + } + } + } + + /** + * Demonstrate the Commons Math3 bug that has been fixed in statistics. + */ + @Test + void testNoData() { + final double[] empty = {}; + final UnaryOperator<double[]> fun1 = new org.apache.commons.math3.stat.ranking.NaturalRanking()::rank; + Assertions.assertThrows(ArrayIndexOutOfBoundsException.class, () -> fun1.apply(empty)); + final UnaryOperator<double[]> fun2 = new NaturalRanking(); + Assertions.assertArrayEquals(empty, fun2.apply(empty)); + } +} diff --git a/commons-statistics-examples/pom.xml b/commons-statistics-examples/pom.xml index f3690f8..d95611b 100644 --- a/commons-statistics-examples/pom.xml +++ b/commons-statistics-examples/pom.xml @@ -60,6 +60,11 @@ <artifactId>commons-statistics-distribution</artifactId> <version>1.1-SNAPSHOT</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-statistics-ranking</artifactId> + <version>1.1-SNAPSHOT</version> + </dependency> <dependency> <groupId>info.picocli</groupId> <artifactId>picocli</artifactId> diff --git a/commons-statistics-ranking/LICENSE b/commons-statistics-ranking/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/commons-statistics-ranking/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/commons-statistics-ranking/NOTICE b/commons-statistics-ranking/NOTICE new file mode 100644 index 0000000..5c79569 --- /dev/null +++ b/commons-statistics-ranking/NOTICE @@ -0,0 +1,5 @@ +Apache Commons Statistics +Copyright 2018-2022 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). diff --git a/commons-statistics-ranking/README.md b/commons-statistics-ranking/README.md new file mode 100644 index 0000000..5d1e5f4 --- /dev/null +++ b/commons-statistics-ranking/README.md @@ -0,0 +1,105 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!--- + +======================================================================+ + |**** ****| + |**** THIS FILE IS GENERATED BY THE COMMONS BUILD PLUGIN ****| + |**** DO NOT EDIT DIRECTLY ****| + |**** ****| + +======================================================================+ + | TEMPLATE FILE: readme-md-template.md | + | commons-build-plugin/trunk/src/main/resources/commons-xdoc-templates | + +======================================================================+ + | | + | 1) Re-generate using: mvn commons-build:readme-md | + | | + | 2) Set the following properties in the component's pom: | + | - commons.componentid (required, alphabetic, lower case) | + | - commons.release.version (required) | + | | + | 3) Example Properties | + | | + | <properties> | + | <commons.componentid>math</commons.componentid> | + | <commons.release.version>1.2</commons.release.version> | + | </properties> | + | | + +======================================================================+ +---> +Apache Commons Statistics Ranking +=================== + +[](https://github.com/apache/commons-statistics/actions/workflows/maven.yml) +[](https://app.codecov.io/gh/apache/commons-statistics) +[](https://maven-badges.herokuapp.com/maven-central/org.apache.commons/commons-statistics-ranking/) +[](http://www.apache.org/licenses/LICENSE-2.0.html) + +Statistical rank transformations. + +Documentation +------------- + +More information can be found on the [Apache Commons Statistics Ranking Homepage](https://commons.apache.org/proper/commons-statistics). +The [Javadoc](https://commons.apache.org/proper/commons-statistics/javadocs/api-release) can be browsed. +Questions related to the usage of Apache Commons Statistics Ranking should be posted to the [user mailing list][ml]. + +Where can I get the latest release? +----------------------------------- +You can download source and binaries from our [download page](https://commons.apache.org/proper/commons-statistics/download_statistics.cgi). + +Alternatively you can pull it from the central Maven repositories: + +```xml +<dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-statistics-ranking</artifactId> + <version>1.0</version> +</dependency> +``` + +Contributing +------------ + +We accept Pull Requests via GitHub. The [developer mailing list][ml] is the main channel of communication for contributors. +There are some guidelines which will make applying PRs easier for us: ++ No tabs! Please use spaces for indentation. ++ Respect the code style. ++ Create minimal diffs - disable on save actions like reformat source code or organize imports. If you feel the source code should be reformatted create a separate PR for this change. ++ Provide JUnit tests for your changes and make sure your changes don't break any existing tests by running ```mvn```. + +If you plan to contribute on a regular basis, please consider filing a [contributor license agreement](https://www.apache.org/licenses/#clas). +You can learn more about contributing via GitHub in our [contribution guidelines](CONTRIBUTING.md). + +License +------- +This code is under the [Apache Licence v2](https://www.apache.org/licenses/LICENSE-2.0). + +See the `NOTICE` file for required notices and attributions. + +Donations +--------- +You like Apache Commons Statistics Ranking? Then [donate back to the ASF](https://www.apache.org/foundation/contributing.html) to support the development. + +Additional Resources +-------------------- + ++ [Apache Commons Homepage](https://commons.apache.org/) ++ [Apache Issue Tracker (JIRA)](https://issues.apache.org/jira/browse/STATISTICS) ++ [Apache Commons Twitter Account](https://twitter.com/ApacheCommons) ++ `#apache-commons` IRC channel on `irc.freenode.org` + +[ml]:https://commons.apache.org/mail-lists.html diff --git a/commons-statistics-ranking/pom.xml b/commons-statistics-ranking/pom.xml new file mode 100644 index 0000000..91a1ecb --- /dev/null +++ b/commons-statistics-ranking/pom.xml @@ -0,0 +1,73 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" + xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.commons</groupId> + <artifactId>commons-statistics-parent</artifactId> + <version>1.1-SNAPSHOT</version> + </parent> + + <artifactId>commons-statistics-ranking</artifactId> + <version>1.1-SNAPSHOT</version> + <name>Apache Commons Statistics Ranking</name> + + <description>Statistical rank transformations.</description> + + <properties> + <!-- OSGi --> + <commons.osgi.symbolicName>org.apache.commons.statistics.ranking</commons.osgi.symbolicName> + <commons.osgi.export>org.apache.commons.statistics.ranking</commons.osgi.export> + <!-- Java 9+ --> + <commons.module.name>org.apache.commons.statistics.ranking</commons.module.name> + <!-- Workaround to avoid duplicating config files. --> + <statistics.parent.dir>${basedir}/..</statistics.parent.dir> + <statistics.jira.component>ranking</statistics.jira.component> + </properties> + + <dependencies> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-rng-client-api</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-rng-simple</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-rng-sampling</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <scope>test</scope> + </dependency> + + </dependencies> + +</project> diff --git a/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaNStrategy.java b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaNStrategy.java new file mode 100644 index 0000000..de7c326 --- /dev/null +++ b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaNStrategy.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.ranking; + +/** + * Strategies for handling {@link Double#NaN} values in rank transformations. + * + * @since 1.1 + */ +public enum NaNStrategy { + + /** + * NaNs are considered minimal in the ordering, equivalent to (that is, tied + * with) {@link Double#NEGATIVE_INFINITY}. + */ + MINIMAL, + + /** + * NaNs are considered maximal in the ordering, equivalent to (that is, tied + * with) {@link Double#POSITIVE_INFINITY}. + */ + MAXIMAL, + + /** NaNs are removed before rank transform is applied. */ + REMOVED, + + /** + * NaNs are left fixed "in place", that is the rank transformation is applied to + * the other elements in the input array, but the NaN elements are returned + * unchanged. + */ + FIXED, + + /** NaNs result in an exception. */ + FAILED +} diff --git a/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaturalRanking.java b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaturalRanking.java new file mode 100644 index 0000000..c729bd1 --- /dev/null +++ b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/NaturalRanking.java @@ -0,0 +1,582 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.ranking; + +import java.util.Arrays; +import java.util.SplittableRandom; +import java.util.function.DoubleUnaryOperator; +import java.util.function.LongSupplier; +import org.apache.commons.rng.UniformRandomProvider; + +/** + * Ranking based on the natural ordering on floating-point values. + * + * <p>{@link Double#NaN NaNs} are treated according to the configured + * {@link NaNStrategy} and ties are handled using the selected + * {@link TiesStrategy}. Configuration settings are supplied in optional + * constructor arguments. Defaults are {@link NaNStrategy#FAILED} and + * {@link TiesStrategy#AVERAGE}, respectively. + * + * <p>When using {@link TiesStrategy#RANDOM}, a generator of random bits may be + * supplied as a {@link LongSupplier} argument; otherwise a default is created + * on-demand. The source of randomness can be supplied using a method reference. + * The following example creates a ranking with NaN values with the highest + * ranking and ties resolved randomly: + * + * <pre> + * NaturalRanking ranking = new NaturalRanking(NaNStrategy.MAXIMAL, + * new SplittableRandom()::nextLong); + * </pre> + * + * <p>Note: Using {@link TiesStrategy#RANDOM} is not thread-safe due to the mutable + * generator of randomness. Instances not using random resolution of ties are + * thread-safe. + * + * <p>Examples: + * + * <table border=""> + * <caption>Examples</caption> + * <tr><th colspan="3"> + * Input data: [20, 17, 30, 42.3, 17, 50, Double.NaN, Double.NEGATIVE_INFINITY, 17] + * </th></tr> + * <tr><th>NaNStrategy</th><th>TiesStrategy</th> + * <th>{@code rank(data)}</th> + * <tr> + * <td>MAXIMAL</td> + * <td>default (ties averaged)</td> + * <td>[5, 3, 6, 7, 3, 8, 9, 1, 3]</td></tr> + * <tr> + * <td>MAXIMAL</td> + * <td>MINIMUM</td> + * <td>[5, 2, 6, 7, 2, 8, 9, 1, 2]</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>default (ties averaged]</td> + * <td>[6, 4, 7, 8, 4, 9, 1.5, 1.5, 4]</td></tr> + * <tr> + * <td>REMOVED</td> + * <td>SEQUENTIAL</td> + * <td>[5, 2, 6, 7, 3, 8, 1, 4]</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>MAXIMUM</td> + * <td>[6, 5, 7, 8, 5, 9, 2, 2, 5]</td></tr> + * <tr> + * <td>MINIMAL</td> + * <td>MAXIMUM</td> + * <td>[6, 5, 7, 8, 5, 9, 2, 2, 5]</td></tr> + * </table> + * + * @since 1.1 + */ +public class NaturalRanking implements RankingAlgorithm { + /** Default NaN strategy. */ + private static final NaNStrategy DEFAULT_NAN_STRATEGY = NaNStrategy.FAILED; + /** Default ties strategy. */ + private static final TiesStrategy DEFAULT_TIES_STRATEGY = TiesStrategy.AVERAGE; + /** Map values to positive infinity. */ + private static final DoubleUnaryOperator ACTION_POS_INF = x -> Double.POSITIVE_INFINITY; + /** Map values to negative infinity. */ + private static final DoubleUnaryOperator ACTION_NEG_INF = x -> Double.NEGATIVE_INFINITY; + /** Raise an exception for values. */ + private static final DoubleUnaryOperator ACTION_ERROR = new DoubleUnaryOperator() { + @Override + public double applyAsDouble(double operand) { + throw new IllegalArgumentException("Invalid data: " + operand); + } + }; + + /** NaN strategy. */ + private final NaNStrategy nanStrategy; + /** Ties strategy. */ + private final TiesStrategy tiesStrategy; + /** Source of randomness when ties strategy is RANDOM. + * Can be null to default to a JDK implementation. */ + private final LongSupplier randomSource; + /** Random generator created on demand when ties strategy is RANDOM. */ + private UniformRandomProvider rng; + + /** + * Creates an instance with {@link NaNStrategy#FAILED} and + * {@link TiesStrategy#AVERAGE}. + */ + public NaturalRanking() { + this(DEFAULT_NAN_STRATEGY, DEFAULT_TIES_STRATEGY, null); + } + + /** + * Creates an instance with {@link NaNStrategy#FAILED} and the + * specified @{@code tiesStrategy}. + * + * <p>If the ties strategy is {@link TiesStrategy#RANDOM RANDOM} a default + * source of randomness is used to resolve ties. + * + * @param tiesStrategy TiesStrategy to use. + */ + public NaturalRanking(TiesStrategy tiesStrategy) { + this(DEFAULT_NAN_STRATEGY, tiesStrategy, null); + } + + /** + * Creates an instance with the specified @{@code nanStrategy} and + * {@link TiesStrategy#AVERAGE}. + * + * @param nanStrategy NaNStrategy to use. + */ + public NaturalRanking(NaNStrategy nanStrategy) { + this(nanStrategy, DEFAULT_TIES_STRATEGY, null); + } + + /** + * Creates an instance with the specified @{@code nanStrategy} and the + * specified @{@code tiesStrategy}. + * + * <p>If the ties strategy is {@link TiesStrategy#RANDOM RANDOM} a default + * source of randomness is used to resolve ties. + * + * @param nanStrategy NaNStrategy to use. + * @param tiesStrategy TiesStrategy to use. + */ + public NaturalRanking(NaNStrategy nanStrategy, + TiesStrategy tiesStrategy) { + this(nanStrategy, tiesStrategy, null); + } + + /** + * Creates an instance with {@link NaNStrategy#FAILED}, + * {@link TiesStrategy#RANDOM} and the given the source of random data. + * + * @param randomSource Source of random data. + */ + public NaturalRanking(LongSupplier randomSource) { + this(DEFAULT_NAN_STRATEGY, TiesStrategy.RANDOM, randomSource); + } + + /** + * Creates an instance with the specified @{@code nanStrategy}, + * {@link TiesStrategy#RANDOM} and the given the source of random data. + * + * @param nanStrategy NaNStrategy to use. + * @param randomSource Source of random data. + */ + public NaturalRanking(NaNStrategy nanStrategy, + LongSupplier randomSource) { + this(nanStrategy, TiesStrategy.RANDOM, randomSource); + } + + /** + * @param nanStrategy NaNStrategy to use. + * @param tiesStrategy TiesStrategy to use. + * @param randomSource Source of random data. + */ + private NaturalRanking(NaNStrategy nanStrategy, + TiesStrategy tiesStrategy, + LongSupplier randomSource) { + this.nanStrategy = nanStrategy; + this.tiesStrategy = tiesStrategy; + this.randomSource = randomSource; + } + + /** + * Return the {@link NaNStrategy}. + * + * @return the strategy for handling NaN + */ + public NaNStrategy getNanStrategy() { + return nanStrategy; + } + + /** + * Return the {@link TiesStrategy}. + * + * @return the strategy for handling ties + */ + public TiesStrategy getTiesStrategy() { + return tiesStrategy; + } + + /** + * Rank {@code data} using the natural ordering on floating-point values, with + * NaN values handled according to {@code nanStrategy} and ties resolved using + * {@code tiesStrategy}. + * + * @throws IllegalArgumentException if the selected {@link NaNStrategy} is + * {@code FAILED} and a {@link Double#NaN} is encountered in the input data. + */ + @Override + public double[] apply(double[] data) { + // Convert data for sorting. + // NaNs are counted for the FIXED strategy. + final int[] nanCount = {0}; + final DataPosition[] ranks = createRankData(data, nanCount); + + // Sorting will move NaNs to the end and we do not have to resolve ties in them. + final int nonNanSize = ranks.length - nanCount[0]; + + // Edge case for empty data + if (nonNanSize == 0) { + // Either NaN are left in-place or removed + return nanStrategy == NaNStrategy.FIXED ? data : new double[0]; + } + + Arrays.sort(ranks); + + // Walk the sorted array, filling output array using sorted positions, + // resolving ties as we go. + int pos = 1; + final double[] out = new double[ranks.length]; + + DataPosition current = ranks[0]; + out[current.getPosition()] = pos; + + // Store all previous elements of a tie. + // Note this lags behind the length of the tie sequence by 1. + // In the event there are no ties this is not used. + final IntList tiesTrace = new IntList(ranks.length); + + for (int i = 1; i < nonNanSize; i++) { + final DataPosition previous = current; + current = ranks[i]; + if (current.compareTo(previous) > 0) { + // Check for a previous tie sequence + if (tiesTrace.size() != 0) { + resolveTie(out, tiesTrace, previous.getPosition()); + } + pos = i + 1; + } else { + // Tie sequence. Add the matching previous element. + tiesTrace.add(previous.getPosition()); + } + out[current.getPosition()] = pos; + } + // Handle tie sequence at end + if (tiesTrace.size() != 0) { + resolveTie(out, tiesTrace, current.getPosition()); + } + // For the FIXED strategy consume the remaining NaN elements + if (nanStrategy == NaNStrategy.FIXED) { + for (int i = nonNanSize; i < ranks.length; i++) { + out[ranks[i].getPosition()] = Double.NaN; + } + } + return out; + } + + /** + * Creates the rank data. If using {@link NaNStrategy#REMOVED} then NaNs are + * filtered. Otherwise NaNs may be mapped to an infinite value, counted to allow + * subsequent processing, or cause an exception to be thrown. + * + * @param data Source data. + * @param nanCount Output counter for NaN values. + * @return the rank data + * @throws IllegalArgumentException if the data contains NaN values when using + * {@link NaNStrategy#FAILED}. + */ + private DataPosition[] createRankData(double[] data, final int[] nanCount) { + return nanStrategy == NaNStrategy.REMOVED ? + createNonNaNRankData(data) : + createMappedRankData(data, createNaNAction(nanCount)); + } + + /** + * Creates the NaN action. + * + * @param nanCount Output counter for NaN values. + * @return the operator applied to NaN values + */ + private DoubleUnaryOperator createNaNAction(int[] nanCount) { + switch (nanStrategy) { + case MAXIMAL: // Replace NaNs with +INFs + return ACTION_POS_INF; + case MINIMAL: // Replace NaNs with -INFs + return ACTION_NEG_INF; + case REMOVED: // NaNs are removed + case FIXED: // NaNs are unchanged + // Count the NaNs in the data that must be handled + return x -> { + nanCount[0]++; + return x; + }; + case FAILED: + return ACTION_ERROR; + default: + // this should not happen unless NaNStrategy enum is changed + throw new IllegalStateException(); + } + } + + /** + * Creates the rank data with NaNs removed. + * + * @param data Source data. + * @return the rank data + */ + private static DataPosition[] createNonNaNRankData(double[] data) { + final DataPosition[] ranks = new DataPosition[data.length]; + int size = 0; + for (final double v : data) { + if (!Double.isNaN(v)) { + ranks[size] = new DataPosition(v, size); + size++; + } + } + return size == data.length ? ranks : Arrays.copyOf(ranks, size); + } + + /** + * Creates the rank data. + * + * @param data Source data. + * @param nanAction Mapping operator applied to NaN values. + * @return the rank data + */ + private static DataPosition[] createMappedRankData(double[] data, DoubleUnaryOperator nanAction) { + final DataPosition[] ranks = new DataPosition[data.length]; + for (int i = 0; i < data.length; i++) { + double v = data[i]; + if (Double.isNaN(v)) { + v = nanAction.applyAsDouble(v); + } + ranks[i] = new DataPosition(v, i); + } + return ranks; + } + + /** + * Resolve a sequence of ties, using the configured {@link TiesStrategy}. The + * input {@code ranks} array is expected to take the same value for all indices + * in {@code tiesTrace}. The common value is recoded according to the + * tiesStrategy. For example, if ranks = [5,8,2,6,2,7,1,2], tiesTrace = [2,4,7] + * and tiesStrategy is MINIMUM, ranks will be unchanged. The same array and + * trace with tiesStrategy AVERAGE will come out [5,8,3,6,3,7,1,3]. + * + * <p>Note: For convenience the final index of the trace is passed as an argument; + * it is assumed the list is already non-empty. At the end of the method the + * list of indices is cleared. + * + * @param ranks Array of ranks. + * @param tiesTrace List of indices where {@code ranks} is constant, that is, + * for any i and j in {@code tiesTrace}: {@code ranks[i] == ranks[j]}. + * @param finalIndex The final index to add to the sequence of ties. + */ + private void resolveTie(double[] ranks, IntList tiesTrace, int finalIndex) { + tiesTrace.add(finalIndex); + + // Constant value of ranks over tiesTrace. + // Note: c is a rank counter starting from 1 so limited to an int. + final double c = ranks[tiesTrace.get(0)]; + + // length of sequence of tied ranks + final int length = tiesTrace.size(); + + switch (tiesStrategy) { + case AVERAGE: // Replace ranks with average: (lower + upper) / 2 + fill(ranks, tiesTrace, (2 * c + length - 1) * 0.5); + break; + case MAXIMUM: // Replace ranks with maximum values + fill(ranks, tiesTrace, c + length - 1); + break; + case MINIMUM: // Replace ties with minimum + // Note that the tie sequence already has all values set to c so + // no requirement to fill again. + break; + case SEQUENTIAL: // Fill sequentially from c to c + length - 1 + case RANDOM: // Fill with randomized sequential values in [c, c + length - 1] + // This cast is safe as c is a counter. + int r = (int) c; + if (tiesStrategy == TiesStrategy.RANDOM) { + tiesTrace.shuffle(getRNG()); + } + final int size = tiesTrace.size(); + for (int i = 0; i < size; i++) { + ranks[tiesTrace.get(i)] = r++; + } + break; + default: // this should not happen unless TiesStrategy enum is changed + throw new IllegalStateException(); + } + + tiesTrace.clear(); + } + + /** + * Sets {@code data[i] = value} for each i in {@code tiesTrace}. + * + * @param data Array to modify. + * @param tiesTrace List of index values to set. + * @param value Value to set. + */ + private static void fill(double[] data, IntList tiesTrace, double value) { + final int size = tiesTrace.size(); + for (int i = 0; i < size; i++) { + data[tiesTrace.get(i)] = value; + } + } + + /** + * Gets the random generator from the source of randomness. + * Defaults to a system provided generator if the source of randomness is null. + * + * @return the RNG + */ + private UniformRandomProvider getRNG() { + UniformRandomProvider r = rng; + if (r == null) { + // Use the provided source, or default to a SplittableRandom + final LongSupplier source = randomSource != null ? + randomSource : + new SplittableRandom()::nextLong; + rng = r = source::getAsLong; + } + return r; + } + + /** + * An expandable list of int values. This allows tracking array positions + * without using boxed values in a {@code List<Integer>}. + */ + private static class IntList { + /** The maximum size of array to allocate. */ + private final int max; + + /** The size of the list. */ + private int size; + /** The list data. Initialised with space to store a tie of 2 values. */ + private int[] data = new int[2]; + + /** + * @param max Maximum size of array to allocate. Can use the length of the parent array + * for which this is used to track indices. + */ + IntList(int max) { + this.max = max; + } + + /** + * Adds the value to the list. + * + * @param value the value + */ + void add(int value) { + if (size == data.length) { + // Overflow safe doubling of the current size. + data = Arrays.copyOf(data, (int) Math.min(max, size * 2L)); + } + data[size++] = value; + } + + /** + * Gets the element at the specified {@code index}. + * + * @param index Element index + * @return the element + */ + int get(int index) { + return data[index]; + } + + /** + * Gets the number of elements in the list. + * + * @return the size + */ + int size() { + return size; + } + + /** + * Clear the list. + */ + void clear() { + size = 0; + } + + /** + * Shuffle the list. + * + * @param rng Source of randomness + */ + void shuffle(UniformRandomProvider rng) { + // Fisher-Yates shuffle + final int[] array = data; + for (int i = size; i > 1; i--) { + swap(array, i - 1, rng.nextInt(i)); + } + } + + /** + * Swaps the two specified elements in the specified array. + * + * @param array Data array + * @param i First index + * @param j Second index + */ + private static void swap(int[] array, int i, int j) { + final int tmp = array[i]; + array[i] = array[j]; + array[j] = tmp; + } + } + + /** + * Represents the position of a {@code double} value in a data array. The + * Comparable interface is implemented so Arrays.sort can be used to sort an + * array of data positions by value. Note that the implicitly defined natural + * ordering is NOT consistent with equals. + */ + private static class DataPosition implements Comparable<DataPosition> { + /** Data value. */ + private final double value; + /** Data position. */ + private final int position; + + /** + * Create an instance with the given value and position. + * + * @param value Data value. + * @param position Data position. + */ + DataPosition(double value, int position) { + this.value = value; + this.position = position; + } + + /** + * Compare this value to another. + * Only the <strong>values</strong> are compared. + * + * @param other the other pair to compare this to + * @return result of {@code Double.compare(value, other.value)} + */ + @Override + public int compareTo(DataPosition other) { + return Double.compare(value, other.value); + } + + // N.B. equals() and hashCode() are not implemented; see MATH-610 for discussion. + + /** + * Returns the data position. + * + * @return position + */ + int getPosition() { + return position; + } + } +} diff --git a/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/RankingAlgorithm.java b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/RankingAlgorithm.java new file mode 100644 index 0000000..2cc2134 --- /dev/null +++ b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/RankingAlgorithm.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.ranking; + +import java.util.function.UnaryOperator; + +/** + * Interface representing a rank transformation. + * + * @since 1.1 + */ +@FunctionalInterface +public interface RankingAlgorithm extends UnaryOperator<double[]> { + /** + * <p>Performs a rank transformation on the input data, returning an array of + * ranks. + * + * <p>Ranks should be 1-based: the smallest value returned in an array of ranks + * should be greater than or equal to one, rather than 0. Ranks should in + * general take integer values, though implementations may return averages or + * other floating point values to resolve ties in the input data. + * + * @param data Array of data to be ranked. + * @return an array of ranks corresponding to the elements of the input array + */ + @Override + double[] apply(double[] data); +} diff --git a/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/TiesStrategy.java b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/TiesStrategy.java new file mode 100644 index 0000000..02730c3 --- /dev/null +++ b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/TiesStrategy.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.ranking; + +/** + * Strategies for handling tied values in rank transformations. + * + * @since 1.1 + */ +public enum TiesStrategy { + + /** + * Ties are assigned ranks in order of occurrence in the original array. + * + * <p>For example, {@code [1, 3, 4, 3]} is ranked as {@code [1, 2, 4, 3]}. + */ + SEQUENTIAL, + + /** + * Tied values are assigned the minimum applicable rank, or the rank of the + * first occurrence. + * + * <p>For example, {@code [1, 3, 4, 3]} is ranked as {@code [1, 2, 4, 2]}. + */ + MINIMUM, + + /** + * Tied values are assigned the maximum applicable rank, or the rank of the last + * occurrence. <p>For example, {@code [1, 3, 4, 3]} is ranked as {@code [1, 3, 4, 3]}. + */ + MAXIMUM, + + /** + * Tied values are assigned the average of the applicable ranks. + * + * <p>For example, {@code [1, 3, 4, 3]} is ranked as {@code [1, 2.5, 4, 2.5]}. + */ + AVERAGE, + + /** + * Tied values are assigned a <em>unique</em> random integral rank from among the + * applicable values. + * + * <p>For example, {@code [1, 3, 4, 3]} is ranked as either {@code [1, 2, 4, 3]} or + * {@code [1, 3, 4, 2]} where the choice is random. + * + * <p>The assigned rank will always be an integer, (inclusively) between the + * values returned by the {@link #MINIMUM} and {@link #MAXIMUM} strategies. + * + * <p>Use of a <em>unique</em> rank ensures that ties are resolved so that the + * rank order is stable. + */ + RANDOM +} diff --git a/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/package-info.java b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/package-info.java new file mode 100644 index 0000000..ff9a5b5 --- /dev/null +++ b/commons-statistics-ranking/src/main/java/org/apache/commons/statistics/ranking/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Classes providing rank transformations. + */ +package org.apache.commons.statistics.ranking; diff --git a/commons-statistics-ranking/src/site/resources/profile.jacoco b/commons-statistics-ranking/src/site/resources/profile.jacoco new file mode 100644 index 0000000..a12755f --- /dev/null +++ b/commons-statistics-ranking/src/site/resources/profile.jacoco @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- +# +# Empty file used to automatically trigger JaCoCo profile from commons parent pom diff --git a/commons-statistics-ranking/src/site/site.xml b/commons-statistics-ranking/src/site/site.xml new file mode 100644 index 0000000..0ad859d --- /dev/null +++ b/commons-statistics-ranking/src/site/site.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="Statistics"> + <bannerRight> + <name>Apache Commons Statistics</name> + <!-- Use a full URL allows a correct banner for the modules. --> + <src>https://commons.apache.org/proper/commons-statistics/images/commons_statistics.small.png</src> + <href>https://commons.apache.org/proper/commons-statistics/index.html</href> + </bannerRight> + + <body> + <menu name="Statistics Ranking"> + <item name="Overview" href="index.html"/> + <item name="Latest API docs (development)" + href="apidocs/index.html"/> + <!-- TODO: Uncomment for initial release + <item name="Javadoc (1.1 release)" + href="https://commons.apache.org/statistics/commons-statistics-ranking/javadocs/api-1.1/index.html"/> + --> + </menu> + + </body> +</project> diff --git a/commons-statistics-ranking/src/site/xdoc/index.xml b/commons-statistics-ranking/src/site/xdoc/index.xml new file mode 100644 index 0000000..0580e9e --- /dev/null +++ b/commons-statistics-ranking/src/site/xdoc/index.xml @@ -0,0 +1,52 @@ +<?xml version="1.0"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + +<document> + + <properties> + <title>Apache Commons Statistics Ranking</title> + </properties> + + <body> + + <section name="Apache Commons Statistics: Ranking" href="summary"> + <p> + Apache Commons Statistics provides statistical rank transformations. + </p> + + <p> + Example: + </p> + +<source class="prettyprint">import org.apache.commons.statistics.ranking.NaturalRanking; + +NaturalRanking ranking = new NaturalRanking(); +ranking.apply(new double[] {5, 6, 7, 8}); // 1, 2, 3, 4 +ranking.apply(new double[] {8, 5, 7, 6}); // 4, 1, 3, 2 +ranking.apply(new double[] {6, 5, 6, 7}); // 2.5, 1, 2.5, 4 +</source> + + <p> + Browse the <a href="apidocs/index.html">Javadoc</a> for more information. + </p> + </section> + + </body> + +</document> diff --git a/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/NaturalRankingTest.java b/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/NaturalRankingTest.java new file mode 100644 index 0000000..a0acb7c --- /dev/null +++ b/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/NaturalRankingTest.java @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.ranking; + +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import java.util.function.LongSupplier; +import java.util.stream.Collectors; +import java.util.stream.DoubleStream; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.commons.math3.stat.inference.ChiSquareTest; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.sampling.PermutationSampler; +import org.apache.commons.rng.simple.RandomSource; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Test cases for {@link NaturalRanking}. + */ +class NaturalRankingTest { + + /** Examples data in the {@link NaturalRanking} class javadoc. */ + private static final double[] EXAMPLE_DATA = {20, 17, 30, 42.3, 17, 50, Double.NaN, Double.NEGATIVE_INFINITY, 17}; + private static final double[] TIES_FIRST = {0, 0, 2, 1, 4}; + private static final double[] TIES_LAST = {4, 4, 1, 0}; + private static final double[] MULTIPLE_NANS = {0, 1, Double.NaN, Double.NaN}; + private static final double[] MULTIPLE_TIES = {3, 2, 5, 5, 6, 6, 1}; + private static final double[] ALL_SAME = {0, 0, 0, 0}; + + /** + * Test the strategies are correctly configured using the various constructors. + */ + @Test + void testProperties() { + final TiesStrategy defaultTs = TiesStrategy.AVERAGE; + final NaNStrategy defaultNs = NaNStrategy.FAILED; + final LongSupplier randomSource = null; + NaturalRanking ranking; + + ranking = new NaturalRanking(); + Assertions.assertEquals(defaultTs, ranking.getTiesStrategy()); + Assertions.assertEquals(defaultNs, ranking.getNanStrategy()); + ranking = new NaturalRanking(randomSource); + Assertions.assertEquals(TiesStrategy.RANDOM, ranking.getTiesStrategy()); + Assertions.assertEquals(defaultNs, ranking.getNanStrategy()); + + final TiesStrategy[] ts = TiesStrategy.values(); + final NaNStrategy[] ns = NaNStrategy.values(); + for (final NaNStrategy n : ns) { + ranking = new NaturalRanking(n); + Assertions.assertEquals(defaultTs, ranking.getTiesStrategy()); + Assertions.assertEquals(n, ranking.getNanStrategy()); + ranking = new NaturalRanking(n, randomSource); + Assertions.assertEquals(TiesStrategy.RANDOM, ranking.getTiesStrategy()); + Assertions.assertEquals(n, ranking.getNanStrategy()); + for (final TiesStrategy t : ts) { + ranking = new NaturalRanking(n, t); + Assertions.assertEquals(t, ranking.getTiesStrategy()); + Assertions.assertEquals(n, ranking.getNanStrategy()); + } + } + for (final TiesStrategy t : ts) { + ranking = new NaturalRanking(t); + Assertions.assertEquals(t, ranking.getTiesStrategy()); + Assertions.assertEquals(defaultNs, ranking.getNanStrategy()); + } + } + + @Test + void testNullStrategy() { + final double[] data = new double[5]; + final NaturalRanking r1 = new NaturalRanking((NaNStrategy)null); + Assertions.assertThrows(NullPointerException.class, () -> r1.apply(data)); + final NaturalRanking r2 = new NaturalRanking((TiesStrategy)null); + Assertions.assertThrows(NullPointerException.class, () -> r2.apply(data)); + } + + /** + * Test the ranks on the standard test cases. + * + * <p>If the expected result is null then the algorithm is expected to throw an + * {@link IllegalArgumentException}. + * + * @param ranking Ranking algorithm + * @param example Ranks for Ranks for the example data + * @param tiesFirst Ranks for the ties first data + * @param tiesLast Ranks for the ties last data + * @param multipleNaNs Ranks for the multiple NaNs data + * @param multipleTies Ranks for the multiple ties data + * @param allSame Ranks for the all same data + */ + @ParameterizedTest + @MethodSource + void testRanks(RankingAlgorithm ranking, double[] example, double[] tiesFirst, double[] tiesLast, + double[] multipleNaNs, double[] multipleTies, double[] allSame) { + assertRanks(ranking, EXAMPLE_DATA, example, "Example data"); + assertRanks(ranking, TIES_FIRST, tiesFirst, "Ties first"); + assertRanks(ranking, TIES_LAST, tiesLast, "Ties last"); + assertRanks(ranking, MULTIPLE_NANS, multipleNaNs, "Multiple NaNs"); + assertRanks(ranking, MULTIPLE_TIES, multipleTies, "Multiple ties"); + assertRanks(ranking, ALL_SAME, allSame, "All same"); + } + + /** + * Provide expected results for the standard test cases using different algorithms. + * + * @return the arguments + */ + static Stream<Arguments> testRanks() { + final Stream.Builder<Arguments> builder = Stream.builder(); + builder.add(Arguments.of( + // Default: Ties averaged, NaNs failed + new NaturalRanking(), + null, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + null, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.FAILED), + null, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + null, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE), + null, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + null, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + builder.add(Arguments.of( + new NaturalRanking(TiesStrategy.AVERAGE), + null, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + null, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.MAXIMAL, TiesStrategy.MINIMUM), + new double[] {5, 2, 6, 7, 2, 8, 9, 1, 2}, + new double[] {1, 1, 4, 3, 5}, + new double[] {3, 3, 2, 1}, + new double[] {1, 2, 3, 3}, + new double[] {3, 2, 4, 4, 6, 6, 1}, + new double[] {1, 1, 1, 1} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.REMOVED, TiesStrategy.SEQUENTIAL), + new double[] {5, 2, 6, 7, 3, 8, 1, 4}, + new double[] {1, 2, 4, 3, 5}, + new double[] {3, 4, 2, 1}, + new double[] {1, 2}, + new double[] {3, 2, 4, 5, 6, 7, 1}, + new double[] {1, 2, 3, 4} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.MINIMAL, TiesStrategy.MAXIMUM), + new double[] {6, 5, 7, 8, 5, 9, 2, 2, 5}, + new double[] {2, 2, 4, 3, 5}, + new double[] {4, 4, 2, 1}, + new double[] {3, 4, 2, 2}, + new double[] {3, 2, 5, 5, 7, 7, 1}, + new double[] {4, 4, 4, 4} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.MINIMAL), + new double[] {6, 4, 7, 8, 4, 9, 1.5, 1.5, 4}, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + new double[] {3, 4, 1.5, 1.5}, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.FAILED), + null, + new double[] {1.5, 1.5, 4, 3, 5}, + new double[] {3.5, 3.5, 2, 1}, + null, + new double[] {3, 2, 4.5, 4.5, 6.5, 6.5, 1}, + new double[] {2.5, 2.5, 2.5, 2.5} + )); + return builder.build(); + } + + /** + * Test the ranks on the standard test cases. This method requires the output ranking + * to have unique indices using a natural sequence from one. The order within groups + * is arbitrary. All elements of each successive group must be ranked after the previous + * group. + * + * <p>If the expected result is null then the algorithm is expected to throw an + * {@link IllegalArgumentException}. + * + * @param ranking Ranking algorithm + * @param example Rank groups for Rank groups for the example data + * @param tiesFirst Rank groups for the ties first data + * @param tiesLast Rank groups for the ties last data + * @param multipleNaNs Rank groups for the multiple NaNs data + * @param multipleTies Rank groups for the multiple ties data + * @param allSame Rank groups for the all same data + */ + @ParameterizedTest + @MethodSource + void testRanksTiesRandom(RankingAlgorithm ranking, int[] example, int[] tiesFirst, int[] tiesLast, + int[] multipleNaNs, int[] multipleTies, int[] allSame) { + assertRanks(ranking, EXAMPLE_DATA, example, "Example data"); + assertRanks(ranking, TIES_FIRST, tiesFirst, "Ties first"); + assertRanks(ranking, TIES_LAST, tiesLast, "Ties last"); + assertRanks(ranking, MULTIPLE_NANS, multipleNaNs, "Multiple NaNs"); + assertRanks(ranking, MULTIPLE_TIES, multipleTies, "Multiple ties"); + assertRanks(ranking, ALL_SAME, allSame, "All same"); + } + + /** + * Provide expected results for the standard test cases using different algorithms. + * + * @return the arguments + */ + static Stream<Arguments> testRanksTiesRandom() { + final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(); + final Stream.Builder<Arguments> builder = Stream.builder(); + builder.add(Arguments.of( + new NaturalRanking(rng::nextLong), + null, + new int[] {1, 1, 3, 2, 4}, + new int[] {3, 3, 2, 1}, + null, + new int[] {3, 2, 4, 4, 5, 5, 1}, + new int[] {1, 1, 1, 1} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.FIXED, rng::nextLong), + new int[] {3, 2, 4, 5, 2, 6, 0, 1, 2}, + new int[] {1, 1, 3, 2, 4}, + new int[] {3, 3, 2, 1}, + new int[] {1, 2, 0, 0}, + new int[] {3, 2, 4, 4, 5, 5, 1}, + new int[] {1, 1, 1, 1} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.REMOVED, rng::nextLong), + new int[] {3, 2, 4, 5, 2, 6, -1, 1, 2}, + new int[] {1, 1, 3, 2, 4}, + new int[] {3, 3, 2, 1}, + new int[] {1, 2, -1, -1}, + new int[] {3, 2, 4, 4, 5, 5, 1}, + new int[] {1, 1, 1, 1} + )); + // The test method works even when not using TiesStrategy.RANDOM but the + // ties strategy must output unique indices so use SEQUENTIAL. + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.MAXIMAL, TiesStrategy.SEQUENTIAL), + new int[] {5, 2, 6, 7, 3, 8, 9, 1, 4}, + new int[] {1, 2, 4, 3, 5}, + new int[] {3, 4, 2, 1}, + new int[] {1, 2, 3, 4}, + new int[] {3, 2, 4, 5, 6, 7, 1}, + new int[] {1, 2, 3, 4} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.REMOVED, TiesStrategy.SEQUENTIAL), + new int[] {5, 2, 6, 7, 3, 8, -1, 1, 4}, + new int[] {1, 2, 4, 3, 5}, + new int[] {3, 4, 2, 1}, + new int[] {1, 2, -1, -1}, + new int[] {3, 2, 4, 5, 6, 7, 1}, + new int[] {1, 2, 3, 4} + )); + builder.add(Arguments.of( + new NaturalRanking(NaNStrategy.MINIMAL, TiesStrategy.SEQUENTIAL), + new int[] {5, 2, 6, 7, 3, 8, 1, 1, 4}, + new int[] {1, 2, 4, 3, 5}, + new int[] {3, 4, 2, 1}, + new int[] {2, 3, 1, 1}, + new int[] {3, 2, 4, 5, 6, 7, 1}, + new int[] {1, 2, 3, 4} + )); + return builder.build(); + } + + /** + * Test ranking of data with no ties. This should work for any ties strategy. + */ + @ParameterizedTest + @EnumSource(value = TiesStrategy.class) + void testNoTies(TiesStrategy tiesStrategy) { + // Ordered values required for the test. These are randomized below. + final double[] values = {-13, -6, 1, 13.5, 66.9}; + final int[] indices = PermutationSampler.natural(values.length); + final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(); + final double[] data = new double[values.length]; + final double[] expected = new double[values.length]; + final NaturalRanking ranking = new NaturalRanking(tiesStrategy); + for (int i = 0; i < 3; i++) { + PermutationSampler.shuffle(rng, indices); + for (int j = 0; j < values.length; j++) { + data[j] = values[indices[j]]; + expected[j] = indices[j] + 1; + } + assertRanks(ranking, data, expected, tiesStrategy.toString()); + } + } + + /** + * Test ranking of data that contains NaN and positive and negative infinities. + */ + @ParameterizedTest + @MethodSource + void testNaNsAndInfinities(NaNStrategy nanStrategy, double[] expected) { + final double[] data = {0, Double.POSITIVE_INFINITY, Double.NaN, Double.NEGATIVE_INFINITY}; + assertRanks(new NaturalRanking(nanStrategy), data, expected, nanStrategy.toString()); + } + + static Stream<Arguments> testNaNsAndInfinities() { + final Stream.Builder<Arguments> builder = Stream.builder(); + builder.add(Arguments.of(NaNStrategy.MAXIMAL, new double[] {2, 3.5, 3.5, 1})); + builder.add(Arguments.of(NaNStrategy.MINIMAL, new double[] {3, 4, 1.5, 1.5})); + builder.add(Arguments.of(NaNStrategy.REMOVED, new double[] {2, 3, 1})); + builder.add(Arguments.of(NaNStrategy.FIXED, new double[] {2, 3, Double.NaN, 1})); + builder.add(Arguments.of(NaNStrategy.FAILED, null)); + return builder.build(); + } + + /** + * Test ranking of no data. This is enumerated for all NaN strategies to ensure + * all methods searching for NaN handle no data. + */ + @ParameterizedTest + @EnumSource(value = NaNStrategy.class) + void testEmpty(NaNStrategy nanStrategy) { + final double[] data = {}; + assertRanks(new NaturalRanking(nanStrategy), data, data, nanStrategy.toString()); + } + + /** + * Test ranking of only NaN data. + */ + @ParameterizedTest + @MethodSource + void testNaN(NaNStrategy nanStrategy, double[] expected) { + final double[] data = {Double.NaN, Double.NaN}; + assertRanks(new NaturalRanking(nanStrategy), data, expected, nanStrategy.toString()); + } + + static Stream<Arguments> testNaN() { + final Stream.Builder<Arguments> builder = Stream.builder(); + builder.add(Arguments.of(NaNStrategy.MAXIMAL, new double[] {1.5, 1.5})); + builder.add(Arguments.of(NaNStrategy.MINIMAL, new double[] {1.5, 1.5})); + builder.add(Arguments.of(NaNStrategy.REMOVED, new double[0])); + builder.add(Arguments.of(NaNStrategy.FIXED, new double[] {Double.NaN, Double.NaN})); + builder.add(Arguments.of(NaNStrategy.FAILED, null)); + return builder.build(); + } + + /** + * Test the random allocation of ties is uniform for each tied-position. + * + * @param before Number of values before the length of ties + * @param ties Number of tied values + * @param after Number of values after the length of ties + * @param seed Random seed (ensures test does not fail the build due to randomness) + */ + @ParameterizedTest + @CsvSource({ + "0, 10, 0, 23657426436", + "1, 8, 0, 21637427438", + "0, 6, 3, -9879847797", + "1, 12, 1, -253672793297", + }) + void testRandom(int before, int ties, int after, long seed) { + Assertions.assertTrue(ties > 0); + final int n = 1000; + final DoubleStream.Builder builder = DoubleStream.builder(); + final double[] value = {0}; + IntStream.range(0, before).forEach(i -> builder.add(value[0]++)); + IntStream.range(0, ties).forEach(i -> builder.add(value[0])); + IntStream.range(0, after).forEach(i -> builder.add(++value[0])); + final double[] data = builder.build().toArray(); + final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(seed); + final NaturalRanking ranking = new NaturalRanking(rng::nextLong); + final int k = before + 1; + final int m = before + ties; + // Frequency of ranks for each tied position in the data + final long[][] counts = new long[ties][ties]; + for (int i = 0; i < n; i++) { + final double[] ranks = ranking.apply(data); + int j = 0; + for (; j < before; j++) { + Assertions.assertEquals(j + 1, ranks[j]); + } + for (; j < m; j++) { + counts[j - before][(int)ranks[j] - k]++; + } + for (; j < ranks.length; j++) { + Assertions.assertEquals(j + 1, ranks[j]); + } + } + final double p = new ChiSquareTest().chiSquareTest(counts); + Assertions.assertFalse(p < 1e-3, () -> "p-value too large: " + p); + } + + /** + * Test random allocation of ties works without a supplied source of randomness. + */ + @Test + void testDefaultRandom() { + // This is big enough the test should never fail to create 2 different results + final double[] data = new double[1000]; + Arrays.fill(data, 1.23); + final NaturalRanking ranking = new NaturalRanking(TiesStrategy.RANDOM); + final double[] ranks1 = ranking.apply(data); + final double[] ranks2 = ranking.apply(data); + Assertions.assertFalse(Arrays.equals(ranks1, ranks2)); + final double[] expected = IntStream.rangeClosed(1, data.length).asDoubleStream().toArray(); + Arrays.sort(ranks1); + Arrays.sort(ranks2); + Assertions.assertArrayEquals(expected, ranks1); + Assertions.assertArrayEquals(expected, ranks2); + } + + /** + * Assert the data ranks created by the algorithm are equal to the expected + * ranks. The input data is tested to ensure it is unchanged (i.e. the ranking + * does not destructively modify the input data). The expected ranks are passed + * through the algorithm and the result must be unchanged thus ensuring the + * algorithm is stable. + * + * @param ranking Ranking algorithm + * @param data Input data + * @param expected Expected ranks (if null the algorithm is expected to raise an {@link IllegalArgumentException}) + * @param msg Prefix for any assertion failure message + */ + private static void assertRanks(RankingAlgorithm ranking, double[] data, double[] expected, String msg) { + if (expected == null) { + Assertions.assertThrows(IllegalArgumentException.class, () -> ranking.apply(data)); + } else { + final double[] original = data.clone(); + final double[] ranks = ranking.apply(data); + Assertions.assertArrayEquals(original, data, () -> msg + ": Data was destructively modified"); + Assertions.assertArrayEquals(expected, ranks, () -> msg + ": Ranking failed"); + final double[] ranks2 = ranking.apply(ranks); + Assertions.assertArrayEquals(ranks, ranks2, () -> msg + ": Repeat ranking changed the result"); + } + } + + /** + * Assert the data ranks created by the algorithm are assigned to the expected + * groups. The input data is tested to ensure it is unchanged (i.e. the ranking + * does not destructively modify the input data). The expected ranks are passed + * through the algorithm and the result must be unchanged thus ensuring the + * algorithm is stable. + * + * <p>Groups must use a sequential ordering from 1. + * Any negative expected group marks data to be removed (e.g. NaNs). + * Any group of zero marks data to be left unchanged (e.g. NaNs). + * Note that the current test assumes the removed and unchanged options are mutually exclusive. + * Groups are mapped to an expected rank using a sequential ordering from 1. + * The order within a group can be random. + * + * For example: + * <pre> + * groups: [1, 2, 2, 2, 3] + * + * [1, 2, 3, 4, 5] : pass + * [1, 4, 3, 2, 5] : pass + * [1, 2, 4, 3, 5] : pass + * [1, 3, 3, 3, 5] : fail + * [1, 2, 2, 4, 5] : fail + * [1, 2, 3, 4, 4] : fail + * [1, 2, 3, 4, 6] : fail + * </pre> + * + * @param ranking Ranking algorithm + * @param data Input data + * @param expectedGroups Expected groups (if null the algorithm is expected to raise an {@link IllegalArgumentException}) + * @param msg Prefix for any assertion failure message + */ + private static void assertRanks(RankingAlgorithm ranking, double[] data, int[] expectedGroup, String msg) { + if (expectedGroup == null) { + Assertions.assertThrows(IllegalArgumentException.class, () -> ranking.apply(data)); + } else { + Assertions.assertEquals(data.length, expectedGroup.length, "Groups must be assigned to all data"); + + final double[] original = data.clone(); + final double[] ranks = ranking.apply(data); + Assertions.assertArrayEquals(original, data, () -> msg + ": Data was destructively modified"); + final double[] ranks2 = ranking.apply(ranks); + Assertions.assertArrayEquals(ranks, ranks2, () -> msg + ": Repeat ranking changed the result"); + + // TODO: Simplify the collation of groups into sets. + + // Check groups + int max = 0; + int numberOfElements = 0; + int unchanged = 0; + int removed = 0; + for (int i = 0; i < expectedGroup.length; i++) { + if (expectedGroup[i] > 0) { + max = Math.max(max, expectedGroup[i]); + // Reduce to only the expected elements. + // This filters unchanged/removed elements. + expectedGroup[numberOfElements] = expectedGroup[i]; + numberOfElements++; + } else if (expectedGroup[i] == 0) { + Assertions.assertEquals(data[i], ranks[i], "Element was changed"); + // Flag for removal + ranks[i] = -1; + unchanged++; + } else { + removed++; + } + } + Assertions.assertTrue(removed == 0 || unchanged == 0, "removed and unchanged options are mutually exclusive"); + Assertions.assertEquals(ranks.length, data.length - removed, "Did not remove expected number of elements"); + Assertions.assertTrue(max <= numberOfElements, "Maximum group above the number of valid elements"); + + if (unchanged != 0) { + // Unchanged elements are not removed by the ranking algorithm. + // These have already been asserted as unchanged so we remove them + // so only the grouped elements remain. + int j = 0; + for (int i = 0; i < ranks.length; i++) { + if (ranks[i] < 0) { + continue; + } + ranks[j++] = ranks[i]; + } + Assertions.assertEquals(numberOfElements, j, "Error removing unchanged elements"); + } + + // Count groups sizes + final int[] sizes = new int[max + 1]; + for (int i = 0; i < numberOfElements; i++) { + sizes[expectedGroup[i]]++; + } + // Each must be non-zero + for (int i = 1; i <= max; i++) { + final int index = i; + Assertions.assertNotEquals(0, sizes[i], () -> "Empty group: " + index); + } + + // Here we have a number of groups with non-zero sizes. + // The expected ranks must be sequential from 1. + // Create a BitSet for each group filled with bits enabled for each rank + // within the group. This is typically a single value or a range: [1], [2, 3], [4], etc. + // Store the set in an array using the rank as the index to allow look-up of + // the set from the rank. + final int[] rankIndex = {1}; + final BitSet[] rankToGroup = new BitSet[data.length + 1]; + final List<BitSet> groups = + IntStream.rangeClosed(1, max) + .mapToObj(i -> { + final BitSet bs = new BitSet(); + for (int j = 0; j < sizes[i]; j++) { + bs.set(rankIndex[0]); + rankToGroup[rankIndex[0]] = bs; + rankIndex[0]++; + } + return bs; + }).collect(Collectors.toList()); + + // For the actual rank, map back to the group and check it is allowed. + for (int i = 0; i < numberOfElements; i++) { + final double r = ranks[i]; + Assertions.assertTrue((int) r == r, "Non-integer rank: " + r); + final int rank = (int) r; + final BitSet groupSet = rankToGroup[rank]; + final int group = expectedGroup[i]; + Assertions.assertTrue(groupSet.get(rank), + () -> String.format("Unexpected rank %d in group %d", rank, group)); + groupSet.clear(rank); + } + + // Check all groups should now be empty + groups.forEach(g -> Assertions.assertEquals(0, g.cardinality(), "Non-empty group")); + } + } +} diff --git a/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/UserGuideTest.java b/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/UserGuideTest.java new file mode 100644 index 0000000..03c43d1 --- /dev/null +++ b/commons-statistics-ranking/src/test/java/org/apache/commons/statistics/ranking/UserGuideTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.statistics.ranking; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Test code used in the ranking section of the user guide. + */ +class UserGuideTest { + @Test + void testRanking() { + NaturalRanking ranking = new NaturalRanking(); + Assertions.assertArrayEquals(new double[] {1, 2, 3, 4}, ranking.apply(new double[] {5, 6, 7, 8})); + Assertions.assertArrayEquals(new double[] {4, 1, 3, 2}, ranking.apply(new double[] {8, 5, 7, 6})); + Assertions.assertArrayEquals(new double[] {2.5, 1, 2.5, 4}, ranking.apply(new double[] {6, 5, 6, 7})); + } +} diff --git a/pom.xml b/pom.xml index 6abb936..677f932 100644 --- a/pom.xml +++ b/pom.xml @@ -623,6 +623,7 @@ This is avoided by creating an empty directory when svn is not available. <modules> <module>commons-statistics-distribution</module> + <module>commons-statistics-ranking</module> <module>commons-statistics-regression</module> </modules> diff --git a/src/conf/pmd/pmd-ruleset.xml b/src/conf/pmd/pmd-ruleset.xml index 16c2da5..ce00cc4 100644 --- a/src/conf/pmd/pmd-ruleset.xml +++ b/src/conf/pmd/pmd-ruleset.xml @@ -124,6 +124,12 @@ value="./ancestor-or-self::ClassOrInterfaceDeclaration[@SimpleName='AbstractContinuousDistribution']"/> </properties> </rule> + <rule ref="category/java/design.xml/GodClass"> + <properties> + <property name="violationSuppressXPath" + value="./ancestor-or-self::ClassOrInterfaceDeclaration[@SimpleName='NaturalRanking']"/> + </properties> + </rule> <rule ref="category/java/errorprone.xml/AvoidLiteralsInIfCondition"> <properties> @@ -132,4 +138,12 @@ </properties> </rule> + <rule ref="category/java/errorprone.xml/AvoidFieldNameMatchingMethodName"> + <properties> + <!-- size() method return the list size. --> + <property name="violationSuppressXPath" + value="./ancestor-or-self::ClassOrInterfaceDeclaration[@SimpleName='IntList']"/> + </properties> + </rule> + </ruleset> diff --git a/src/conf/spotbugs/spotbugs-exclude-filter.xml b/src/conf/spotbugs/spotbugs-exclude-filter.xml index 629ddcb..629481e 100644 --- a/src/conf/spotbugs/spotbugs-exclude-filter.xml +++ b/src/conf/spotbugs/spotbugs-exclude-filter.xml @@ -62,4 +62,9 @@ <Bug pattern="FL_FLOATS_AS_LOOP_COUNTERS" /> </Match> + <Match> + <Class name="org.apache.commons.statistics.ranking.NaturalRanking$DataPosition" /> + <Bug pattern="EQ_COMPARETO_USE_OBJECT_EQUALS" /> + </Match> + </FindBugsFilter>