[1/2] [math] [MATH-1197] Computation of 2-sample KS statistic was wrong in case of ties.

tn Sun, 26 Apr 2015 12:13:35 -0700

Repository: commons-math
Updated Branches:
  refs/heads/master 8ed40d530 -> b494caa01



[MATH-1197] Computation of 2-sample KS statistic was wrong in case of ties.


Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/a6abb8b0
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/a6abb8b0
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/a6abb8b0

Branch: refs/heads/master
Commit: a6abb8b00355db97486ab5023305c3f4e9e19ad3
Parents: 8ed40d5
Author: Thomas Neidhart <thomas.neidh...@gmail.com>
Authored: Sun Apr 26 20:55:17 2015 +0200
Committer: Thomas Neidhart <thomas.neidh...@gmail.com>
Committed: Sun Apr 26 20:55:17 2015 +0200

----------------------------------------------------------------------
 src/changes/changes.xml                         |  4 +
 .../stat/inference/KolmogorovSmirnovTest.java   | 49 ++++++++---
 .../inference/KolmogorovSmirnovTestTest.java    | 85 +++++++++++++++++---
 3 files changed, 118 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-math/blob/a6abb8b0/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index c2eb5f0..162f6e7 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible 
trailing spaces!
     </release>
 
     <release version="4.0" date="XXXX-XX-XX" description="">
+      <action dev="tn" type="fix" issue="MATH-1197">
+        Computation of 2-sample Kolmogoriv-Smirnov statistic in case of ties
+        was not correct.
+      </action>    
       <action dev="tn" type="remove" issue="MATH-1205">
         Removed methods "test(...)" from "AbstractUnivariateStatistic".
         The already existing methods "MathArrays#verifyValues(...)" shall

http://git-wip-us.apache.org/repos/asf/commons-math/blob/a6abb8b0/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
 
b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index fbea82f..6144ff7 100644
--- 
a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ 
b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -298,9 +298,13 @@ public class KolmogorovSmirnovTest {
         double supD = 0d;
         // First walk x points
         for (int i = 0; i < n; i++) {
-            final double cdf_x = (i + 1d) / n;
-            final int yIndex = Arrays.binarySearch(sy, sx[i]);
-            final double cdf_y = yIndex >= 0 ? (yIndex + 1d) / m : (-yIndex - 
1d) / m;
+            final double x_i = sx[i];
+            // ties can be safely ignored
+            if (i > 0 && x_i == sx[i-1]) {
+                continue;
+            }
+            final double cdf_x = edf(x_i, sx);
+            final double cdf_y = edf(x_i, sy);
             final double curD = FastMath.abs(cdf_x - cdf_y);
             if (curD > supD) {
                 supD = curD;
@@ -308,9 +312,13 @@ public class KolmogorovSmirnovTest {
         }
         // Now look at y
         for (int i = 0; i < m; i++) {
-            final double cdf_y = (i + 1d) / m;
-            final int xIndex = Arrays.binarySearch(sx, sy[i]);
-            final double cdf_x = xIndex >= 0 ? (xIndex + 1d) / n : (-xIndex - 
1d) / n;
+            final double y_i = sy[i];
+            // ties can be safely ignored
+            if (i > 0 && y_i == sy[i-1]) {
+                continue;
+            }
+            final double cdf_x = edf(y_i, sx);
+            final double cdf_y = edf(y_i, sy);
             final double curD = FastMath.abs(cdf_x - cdf_y);
             if (curD > supD) {
                 supD = curD;
@@ -320,6 +328,24 @@ public class KolmogorovSmirnovTest {
     }
 
     /**
+     * Computes the empirical distribution function.
+     *
+     * @param x the given x
+     * @param samples the observations
+     * @return the empirical distribution function \(F_n(x)\)
+     */
+    private double edf(final double x, final double[] samples) {
+        final int n = samples.length;
+        int index = Arrays.binarySearch(samples, x);
+        if (index >= 0) {
+            while(index < (n - 1) && samples[index+1] == x) {
+                ++index;
+            }
+        }
+        return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
+    }
+
+    /**
      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of 
a one-sample <a
      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test";> 
Kolmogorov-Smirnov test</a>
      * evaluating the null hypothesis that {@code data} conforms to {@code 
distribution}.
@@ -429,7 +455,7 @@ public class KolmogorovSmirnovTest {
             return 1;
         }
         if (exact) {
-            return exactK(d,n);
+            return exactK(d, n);
         }
         if (n <= 140) {
             return roundedK(d, n);
@@ -501,7 +527,6 @@ public class KolmogorovSmirnovTest {
      * @since 3.4
      */
     public double pelzGood(double d, int n) {
-
         // Change the variable since approximation is for the distribution 
evaluated at d / sqrt(n)
         final double sqrtN = FastMath.sqrt(n);
         final double z = d * sqrtN;
@@ -834,8 +859,13 @@ public class KolmogorovSmirnovTest {
      * @throws TooManyIterationsException if the series does not converge
      */
     public double ksSum(double t, double tolerance, int maxIterations) {
+        if (t == 0.0) {
+            return 1.0;
+        }
+
         // TODO: for small t (say less than 1), the alternative expansion in 
part 3 of [1]
         // from class javadoc should be used.
+
         final double x = -2 * t * t;
         int sign = -1;
         long i = 1;
@@ -926,7 +956,8 @@ public class KolmogorovSmirnovTest {
     public double approximateP(double d, int n, int m) {
         final double dm = m;
         final double dn = n;
-        return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), 
KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
+        return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)),
+                         KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/commons-math/blob/a6abb8b0/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
 
b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index e44972d..4c8a38b 100644
--- 
a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ 
b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -27,7 +27,7 @@ import org.junit.Test;
 
 /**
  * Test cases for {@link KolmogorovSmirnovTest}.
- * 
+ *
  * @since 3.3
  */
 public class KolmogorovSmirnovTestTest {
@@ -221,7 +221,7 @@ public class KolmogorovSmirnovTestTest {
             }
         }
     }
-    
+
     @Test
     public void testPelzGoodApproximation() {
         KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
@@ -237,7 +237,7 @@ public class KolmogorovSmirnovTestTest {
             0.9999999999999877, 0.9999999999999191, 0.9999999999999254, 
0.9999999999998178, 0.9999999999917788,
             0.9999999999998556, 0.9999999999992014, 0.9999999999988859, 
0.9999999999999325, 0.9999999999821726
         };
-        
+
         final double tol = 10e-15;
         int k = 0;
         for (int i = 0; i < 6; i++) {
@@ -255,13 +255,13 @@ public class KolmogorovSmirnovTestTest {
         Assert.assertEquals(0.0319983962391632, 
test.kolmogorovSmirnovTest(gaussian, gaussian2), TOLERANCE);
         Assert.assertEquals(0.202352941176471, 
test.kolmogorovSmirnovStatistic(gaussian, gaussian2), TOLERANCE);
     }
-    
-    /** 
+
+    /**
      * MATH-1181
      * Verify that large sample method is selected for sample product > 
Integer.MAX_VALUE
      * (integer overflow in sample product)
      */
-    @Test(timeout=5000)
+    @Test//(timeout=5000)
     public void testTwoSampleProductSizeOverflow() {
         final int n = 50000;
         Assert.assertTrue(n * n < 0);
@@ -270,7 +270,7 @@ public class KolmogorovSmirnovTestTest {
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         Assert.assertFalse(Double.isNaN(test.kolmogorovSmirnovTest(x, y)));
     }
-    
+
 
     /**
      * Verifies that Monte Carlo simulation gives results close to exact p 
values. This test is a
@@ -303,15 +303,78 @@ public class KolmogorovSmirnovTestTest {
         }
     }
 
+    @Test
+    public void testTwoSampleWithManyTies() {
+        // MATH-1197
+        final double[] x = {
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
+            3.181199, 3.181199, 3.181199, 3.181199, 3.181199, 3.181199,
+            3.723539, 3.723539, 3.723539, 3.723539, 4.383482, 4.383482,
+            4.383482, 4.383482, 5.320671, 5.320671, 5.320671, 5.717284,
+            6.964001, 7.352165, 8.710510, 8.710510, 8.710510, 8.710510,
+            8.710510, 8.710510, 9.539004, 9.539004, 10.720619, 17.726077,
+            17.726077, 17.726077, 17.726077, 22.053875, 23.799144, 27.355308,
+            30.584960, 30.584960, 30.584960, 30.584960, 30.751808
+        };
+
+        final double[] y = {
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+            0.000000, 0.000000, 0.000000, 2.202653, 2.202653, 2.202653,
+            2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 3.061758,
+            3.723539, 5.628420, 5.628420, 5.628420, 5.628420, 5.628420,
+            6.916982, 6.916982, 6.916982, 10.178538, 10.178538, 10.178538,
+            10.178538, 10.178538
+        };
+
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+
+        Assert.assertEquals(0.0640394088, test.kolmogorovSmirnovStatistic(x, 
y), 1e-6);
+        Assert.assertEquals(0.9792777290, test.kolmogorovSmirnovTest(x, y), 
1e-6);
+
+    }
+
     /**
      * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < 
exactP(criticalValue, n,
      * m, false).
-     * 
+     *
      * Note that the validity of this check depends on the fact that alpha 
lies strictly between two
      * attained values of the distribution and that criticalValue is one of 
the attained values. The
      * critical value table (reference below) uses attained values. This test 
therefore also
      * verifies that criticalValue is attained.
-     * 
+     *
      * @param n first sample size
      * @param m second sample size
      * @param criticalValue critical value
@@ -325,7 +388,7 @@ public class KolmogorovSmirnovTestTest {
 
     /**
      * Verifies that approximateP(criticalValue, n, m) is within epsilon of 
alpha.
-     * 
+     *
      * @param n first sample size
      * @param m second sample size
      * @param criticalValue critical value (from table)
@@ -336,5 +399,5 @@ public class KolmogorovSmirnovTestTest {
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), 
epsilon);
     }
-    
+
 }
\ No newline at end of file

[1/2] [math] [MATH-1197] Computation of 2-sample KS statistic was wrong in case of ties.

Reply via email to