Repository: commons-math Updated Branches: refs/heads/field-ode b713e4ca0 -> 10c271f2c
Added bootstrap method to KolmogorovSmirnovTest. JIRA: MATH-1246. Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/fbc327e9 Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/fbc327e9 Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/fbc327e9 Branch: refs/heads/field-ode Commit: fbc327e9e30093acdc0fc325b1719cae4ea8bac1 Parents: d391146 Author: Phil Steitz <phil.ste...@gmail.com> Authored: Sun Nov 22 12:41:00 2015 -0700 Committer: Phil Steitz <phil.ste...@gmail.com> Committed: Sun Nov 22 12:41:00 2015 -0700 ---------------------------------------------------------------------- src/changes/changes.xml | 3 + .../stat/inference/KolmogorovSmirnovTest.java | 60 ++++++++++++++++++++ src/test/R/KolmogorovSmirnovTestCases.R | 48 ++++++++++++---- .../inference/KolmogorovSmirnovTestTest.java | 35 ++++++++++++ 4 files changed, 134 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-math/blob/fbc327e9/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 202f368..7dfacc5 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible trailing spaces! </properties> <body> <release version="3.6" date="XXXX-XX-XX" description=""> + <action dev="psteitz" type="update" issue="MATH-1246"> + Added bootstrap method to KolmogorovSmirnov test. + </action> <action dev="psteitz" type="update" issue="MATH-1287"> Added constructors taking sample data as arguments to enumerated real and integer distributions. </action> http://git-wip-us.apache.org/repos/asf/commons-math/blob/fbc327e9/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java index 0b8332e..7c19b1a 100644 --- a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java @@ -21,6 +21,7 @@ import java.math.BigDecimal; import java.util.Arrays; import java.util.Iterator; +import org.apache.commons.math3.distribution.EnumeratedRealDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.exception.InsufficientDataException; import org.apache.commons.math3.exception.MathArithmeticException; @@ -376,6 +377,65 @@ public class KolmogorovSmirnovTest { } /** + * Estimates the <i>p-value</i> of a two-sample + * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> + * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same + * probability distribution. This method estimates the p-value by repeatedly sampling sets of size + * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample. + * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function + * ks.boot, described in <pre> + * Jasjeet S. Sekhon. 2011. `Multivariate and Propensity Score Matching + * Software with Automated Balance Optimization: The Matching package for R.` + * Journal of Statistical Software, 42(7): 1-52. + * </pre> + * @param x first sample + * @param y second sample + * @param iterations number of bootstrap resampling iterations + * @param strict whether or not the null hypothesis is expressed as a strict inequality + * @return estimated p-value + */ + public double bootstrap(double[] x, double[] y, int iterations, boolean strict) { + final int xLength = x.length; + final int yLength = y.length; + final double[] combined = new double[xLength + yLength]; + System.arraycopy(x, 0, combined, 0, xLength); + System.arraycopy(y, 0, combined, xLength, yLength); + final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(combined); + final long d = integralKolmogorovSmirnovStatistic(x, y); + int greaterCount = 0; + int equalCount = 0; + double[] curX; + double[] curY; + long curD; + for (int i = 0; i < iterations; i++) { + curX = dist.sample(xLength); + curY = dist.sample(yLength); + curD = integralKolmogorovSmirnovStatistic(curX, curY); + if (curD > d) { + greaterCount++; + } else if (curD == d) { + equalCount++; + } + } + return strict ? greaterCount / (double) iterations : + (greaterCount + equalCount) / (double) iterations; + } + + /** + * Computes {@code bootstrap(x, y, iterations, true)}. + * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching + * package function. See #bootstrap(double[], double[], int, boolean). + * + * @param x first sample + * @param y second sample + * @param iterations number of bootstrap resampling iterations + * @return estimated p-value + */ + public double bootstrap(double[] x, double[] y, int iterations) { + return bootstrap(x, y, iterations, true); + } + + /** * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme * values given in [2] (see above). The result is not exact as with * {@link #cdfExact(double, int)} because calculations are based on http://git-wip-us.apache.org/repos/asf/commons-math/blob/fbc327e9/src/test/R/KolmogorovSmirnovTestCases.R ---------------------------------------------------------------------- diff --git a/src/test/R/KolmogorovSmirnovTestCases.R b/src/test/R/KolmogorovSmirnovTestCases.R index 6b70155..3146325 100644 --- a/src/test/R/KolmogorovSmirnovTestCases.R +++ b/src/test/R/KolmogorovSmirnovTestCases.R @@ -21,12 +21,20 @@ # into the same directory, launch R from this directory and then enter # source("<name-of-this-file>") # +# NOTE: the 2-sample bootstrap test requires the "Matching" library +## https://cran.r-project.org/web/packages/Matching/index.html +## See http://sekhon.berkeley.edu/matching for additional documentation. +## Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching +## Software with Automated Balance Optimization: The Matching package for R.'' +## Journal of Statistical Software, 42(7): 1-52. +# #------------------------------------------------------------------------------ tol <- 1E-14 # error tolerance for tests #------------------------------------------------------------------------------ # Function definitions source("testFunctions") # utility test functions +require("Matching") # for ks.boot verifyOneSampleGaussian <- function(data, expectedP, expectedD, mean, sigma, exact, tol, desc) { results <- ks.test(data, "pnorm", mean, sigma, exact = exact) @@ -34,12 +42,12 @@ verifyOneSampleGaussian <- function(data, expectedP, expectedD, mean, sigma, exa displayPadded(c(desc," p-value test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " p-value test"), FAILED, WIDTH) - } + } if (assertEquals(expectedD, results$statistic, tol, "D statistic value")) { displayPadded(c(desc," D statistic test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " D statistic test"), FAILED, WIDTH) - } + } } verifyOneSampleUniform <- function(data, expectedP, expectedD, min, max, exact, tol, desc) { @@ -48,12 +56,12 @@ verifyOneSampleUniform <- function(data, expectedP, expectedD, min, max, exact, displayPadded(c(desc," p-value test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " p-value test"), FAILED, WIDTH) - } + } if (assertEquals(expectedD, results$statistic, tol, "D statistic value")) { displayPadded(c(desc," D statistic test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " D statistic test"), FAILED, WIDTH) - } + } } verifyTwoSampleLargeSamples <- function(sample1, sample2, expectedP, expectedD, tol, desc) { @@ -62,12 +70,12 @@ verifyTwoSampleLargeSamples <- function(sample1, sample2, expectedP, expectedD, displayPadded(c(desc," p-value test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " p-value test"), FAILED, WIDTH) - } + } if (assertEquals(expectedD, results$statistic, tol, "D statistic value")) { displayPadded(c(desc," D statistic test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " D statistic test"), FAILED, WIDTH) - } + } } verifyTwoSampleSmallSamplesExact <- function(sample1, sample2, expectedP, expectedD, tol, desc) { @@ -76,14 +84,22 @@ verifyTwoSampleSmallSamplesExact <- function(sample1, sample2, expectedP, expect displayPadded(c(desc," p-value test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " p-value test"), FAILED, WIDTH) - } + } if (assertEquals(expectedD, results$statistic, tol, "D statistic value")) { displayPadded(c(desc," D statistic test"), SUCCEEDED, WIDTH) } else { displayPadded(c(desc, " D statistic test"), FAILED, WIDTH) - } + } } +verifyTwoSampleBootstrap <- function(sample1, sample2, expectedP, tol, desc) { + results <- ks.boot(sample1, sample2,nboots=10000 ) + if (assertEquals(expectedP, results$ks.boot.pvalue, tol, "p-value")) { + displayPadded(c(desc, " p-value test"), SUCCEEDED, WIDTH) + } else { + displayPadded(c(desc, " p-value test"), FAILED, WIDTH) + } +} cat("KolmogorovSmirnovTest test cases\n") @@ -100,7 +116,7 @@ gaussian <- c(0.26055895, -0.63665233, 1.51221323, 0.61246988, -0.03013003, -1.7 0.25165971, -0.04125231, -0.23756244, -0.93389975, 0.75551407, 0.08347445, -0.27482228, -0.4717632, -0.1867746, -0.1166976, 0.5763333, 0.1307952, 0.7630584, -0.3616248, 2.1383790,-0.7946630, 0.0231885, 0.7919195, 1.6057144, -0.3802508, 0.1229078, 1.5252901, -0.8543149, 0.3025040) - + shortGaussian <- gaussian[1:50] gaussian2 <- c(2.88041498038308, -0.632349445671017, 0.402121295225571, 0.692626364613243, 1.30693446815426, @@ -137,10 +153,14 @@ uniform <- c(0.7930305, 0.6424382, 0.8747699, 0.7156518, 0.1845909, 0.2022326, 0.85201008, 0.02945562, 0.26200374, 0.11382818, 0.17238856, 0.36449473, 0.69688273, 0.96216330, 0.4859432,0.4503438, 0.1917656, 0.8357845, 0.9957812, 0.4633570, 0.8654599, 0.4597996, 0.68190289, 0.58887855, 0.09359396, 0.98081979, 0.73659533, 0.89344777, 0.18903099, 0.97660425) - + smallSample1 <- c(6, 7, 9, 13, 19, 21, 22, 23, 24) smallSample2 <- c(10, 11, 12, 16, 20, 27, 28, 32, 44, 54) - +bootSample1 <- c(0, 2, 4, 6, 8, 8, 10, 15, 22, 30, 33, 36, 38) +bootSample2 <- c(9, 17, 20, 33, 40, 51, 60, 60, 72, 90, 101) +roundingSample1 <- c(2,4,6,8,9,10,11,12,13) +roundingSample2 <- c(0,1,3,5,7) + shortUniform <- uniform[1:20] verifyOneSampleGaussian(gaussian, 0.3172069207622391, 0.0932947561266756, 0, 1, @@ -167,6 +187,10 @@ verifyTwoSampleLargeSamples(gaussian, gaussian2, 0.0319983962391632, 0.202352941 verifyTwoSampleSmallSamplesExact(smallSample1, smallSample2, 0.105577085453247, .5, tol, "Two sample small samples exact") +verifyTwoSampleBootstrap(bootSample1, bootSample2, 0.0059, 1E-3, "Two sample bootstrap - isolated failures possible") +verifyTwoSampleBootstrap(gaussian, gaussian2, 0.0237, 1E-2, "Two sample bootstrap - isolated failures possible") +verifyTwoSampleBootstrap(roundingSample1, roundingSample2, 0.06303, 1E-2, "Two sample bootstrap - isolated failures possible") + displayDashes(WIDTH) - + http://git-wip-us.apache.org/repos/asf/commons-math/blob/fbc327e9/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java index 808c0d4..609a0fd 100644 --- a/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java @@ -532,6 +532,41 @@ public class KolmogorovSmirnovTestTest { } /** + * Test an example with ties in the data. Reference data is R 3.2.0, + * ks.boot implemented in Matching (Version 4.8-3.4, Build Date: 2013/10/28) + */ + @Test + public void testBootstrapSmallSamplesWithTies() { + final double[] x = {0, 2, 4, 6, 8, 8, 10, 15, 22, 30, 33, 36, 38}; + final double[] y = {9, 17, 20, 33, 40, 51, 60, 60, 72, 90, 101}; + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(2000)); + Assert.assertEquals(0.0059, test.bootstrap(x, y, 10000, false), 1E-3); + } + + /** + * Reference data is R 3.2.0, ks.boot implemented in + * Matching (Version 4.8-3.4, Build Date: 2013/10/28) + */ + @Test + public void testBootstrapLargeSamples() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); + Assert.assertEquals(0.0237, test.bootstrap(gaussian, gaussian2, 10000), 1E-2); + } + + /** + * Test an example where D-values are close (subject to rounding). + * Reference data is R 3.2.0, ks.boot implemented in + * Matching (Version 4.8-3.4, Build Date: 2013/10/28) + */ + @Test + public void testBootstrapRounding() { + final double[] x = {2,4,6,8,9,10,11,12,13}; + final double[] y = {0,1,3,5,7}; + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); + Assert.assertEquals(0.06303, test.bootstrap(x, y, 10000, false), 1E-2); + } + + /** * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * m, false). *