Author: psteitz Date: Sun Oct 20 05:49:10 2013 New Revision: 1533853 URL: http://svn.apache.org/r1533853 Log: Javadoc fixes
Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java - copied, changed from r1488409, commons/proper/math/trunk/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/BinomialConfidenceInterval.java Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/BinomialConfidenceInterval.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/BinomialConfidenceInterval.java?rev=1533853&r1=1533852&r2=1533853&view=diff ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/BinomialConfidenceInterval.java (original) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/BinomialConfidenceInterval.java Sun Oct 20 05:49:10 2013 @@ -65,7 +65,7 @@ public class BinomialConfidenceInterval * @throws NotStrictlyPositiveException if {@code numberOfTrials <= 0}. * @throws NotPositiveException if {@code numberOfSuccesses < 0}. * @throws NumberIsTooLargeException if {@code numberOfSuccesses > numberOfTrials}. - * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code [0, 1]}. + * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code (0, 1)}. */ public ConfidenceInterval getClopperPearsonInterval(int numberOfTrials, int numberOfSuccesses, double confidenceLevel) { @@ -131,7 +131,7 @@ public class BinomialConfidenceInterval * @throws NotStrictlyPositiveException if {@code numberOfTrials <= 0}. * @throws NotPositiveException if {@code numberOfSuccesses < 0}. * @throws NumberIsTooLargeException if {@code numberOfSuccesses > numberOfTrials}. - * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code [0, 1]}. + * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code (0, 1)}. */ public ConfidenceInterval getAgrestiCoullInterval(int numberOfTrials, int numberOfSuccesses, double confidenceLevel) { checkParameters(numberOfTrials, numberOfSuccesses, confidenceLevel); @@ -162,7 +162,7 @@ public class BinomialConfidenceInterval * @throws NotStrictlyPositiveException if {@code numberOfTrials <= 0}. * @throws NotPositiveException if {@code numberOfSuccesses < 0}. * @throws NumberIsTooLargeException if {@code numberOfSuccesses > numberOfTrials}. - * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code [0, 1]}. + * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code (0, 1)}. */ public ConfidenceInterval getWilsonScoreInterval(int numberOfTrials, int numberOfSuccesses, double confidenceLevel) { checkParameters(numberOfTrials, numberOfSuccesses, confidenceLevel); @@ -186,13 +186,13 @@ public class BinomialConfidenceInterval /** * Verifies that parameters satisfy preconditions. * - * @param numberOfTrials Number of trials (must be positive) - * @param numberOfSuccesses Number of successes (must not exceed numberOfTrials) - * @param confidenceLevel Confidence level (must be strictly between 0 and 1) + * @param numberOfTrials number of trials (must be positive) + * @param numberOfSuccesses number of successes (must not exceed numberOfTrials) + * @param confidenceLevel confidence level (must be strictly between 0 and 1) * @throws NotStrictlyPositiveException if {@code numberOfTrials <= 0}. * @throws NotPositiveException if {@code numberOfSuccesses < 0}. * @throws NumberIsTooLargeException if {@code numberOfSuccesses > numberOfTrials}. - * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code [0, 1]}. + * @throws OutOfRangeException if {@code confidenceLevel} is not in the interval {@code (0, 1)}. */ private void checkParameters(int numberOfTrials, int numberOfSuccesses, double confidenceLevel) { if (numberOfTrials <= 0) { Copied: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java (from r1488409, commons/proper/math/trunk/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java) URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java?p2=commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java&p1=commons/proper/math/trunk/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java&r1=1488409&r2=1533853&rev=1533853&view=diff ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/distribution/KolmogorovSmirnovDistribution.java (original) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java Sun Oct 20 05:49:10 2013 @@ -1,28 +1,31 @@ /* * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law + * or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. */ -package org.apache.commons.math3.distribution; +package org.apache.commons.math3.stat.inference; -import java.io.Serializable; import java.math.BigDecimal; +import java.util.Arrays; +import java.util.Iterator; +import org.apache.commons.math3.distribution.KolmogorovSmirnovDistribution; +import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.exception.MathArithmeticException; -import org.apache.commons.math3.exception.NotStrictlyPositiveException; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NullArgumentException; import org.apache.commons.math3.exception.NumberIsTooLargeException; +import org.apache.commons.math3.exception.OutOfRangeException; +import org.apache.commons.math3.exception.TooManyIterationsException; import org.apache.commons.math3.exception.util.LocalizedFormats; import org.apache.commons.math3.fraction.BigFraction; import org.apache.commons.math3.fraction.BigFractionField; @@ -31,102 +34,280 @@ import org.apache.commons.math3.linear.A import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.FieldMatrix; import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.util.CombinatoricsUtils; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.MathArrays; /** - * Implementation of the Kolmogorov-Smirnov distribution. - * - * <p> - * Treats the distribution of the two-sided {@code P(D_n < d)} where - * {@code D_n = sup_x |G(x) - G_n (x)|} for the theoretical cdf {@code G} and - * the empirical cdf {@code G_n}. - * </p> - * <p> - * This implementation is based on [1] with certain quick decisions for extreme - * values given in [2]. - * </p> + * Implementation of the <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions. * <p> - * In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is - * to write {@code d = (k - h) / n} for positive integer {@code k} and - * {@code 0 <= h < 1}. Then {@code P(D_n < d) = (n! / n^n) * t_kk}, where - * {@code t_kk} is the {@code (k, k)}'th entry in the special matrix - * {@code H^n}, i.e. {@code H} to the {@code n}'th power. + * The K-S test uses a statistic based on the maximum deviation of the empirical + * distribution of sample data points from the distribution expected under the + * null hypothesis. Specifically, what is computed is \(D_n=\sup_x + * |F_n(x)-F(x)|\), where \(F\) is the expected distribution and \(F_n\) is the + * empirical distribution of the \(n\) sample data points. The distribution of + * \(D_n\) is estimated using a method based on [1] with certain quick decisions + * for extreme values given in [2]. * </p> * <p> * References: * <ul> - * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> - * Evaluating Kolmogorov's Distribution</a> by George Marsaglia, Wai - * Wan Tsang, and Jingbo Wang</li> - * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> - * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard - * and Pierre L'Ecuyer</li> + * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's + * Distribution</a> by George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li> + * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided + * Kolmogorov-Smirnov Distribution</a> by Richard Simard and Pierre L'Ecuyer</li> * </ul> - * Note that [1] contains an error in computing h, refer to - * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. + * Note that [1] contains an error in computing h, refer to <a + * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for + * details. * </p> * - * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> - * Kolmogorov-Smirnov test (Wikipedia)</a> + * @since 3.3 * @version $Id$ */ -public class KolmogorovSmirnovDistribution implements Serializable { +public class KolmogorovSmirnovTest { + + /** + * Bound on the number of partial sums in + * {@link #ksSum(double, double, long)} + */ + private static final long MAXIMUM_PARTIAL_SUM_COUNT = 100000; + + /** Convergence criterion for {@link #ksSum(double, double, long)} */ + private static final double KS_SUM_CAUCHY_CRITERION = 1e-15; + + /** Cutoff for default 2-sample test to use K-S distribution approximation */ + private static final long SMALL_SAMPLE_PRODUCT = 10000; + + /** + * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a + * one-sample <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test</a> evaluating the null hypothesis that + * {@code data} conforms to {@code distribution}. If {@code exact} is true, + * the distribution used to compute the p-value is computed using extended + * precision. See {@link #cdfExact(double, int)}. + * + * @param distribution reference distribution + * @param data sample being being evaluated + * @param exact whether or not to force exact computation of the p-value + * @return the p-value associated with the null hypothesis that {@code data} + * is a sample from {@code distribution} + */ + public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) { + return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact); + } + + /** + * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x + * |F_n(x)-F(x)|\) where \(F\) is the distribution (cdf) function associated + * with {@code distribution}, \(n\) is the length of {@code data} and + * \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of + * the values in {@code data}. + * + * @param distribution reference distribution + * @param data sample being evaluated + * @return Kolmogorov-Smirnov statistic \(D_n\) + * @throws MathIllegalArgumentException if {@code data} does not have length + * at least 2 + */ + public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { + final int n = data.length; + final double nd = n; + final double[] dataCopy = new double[n]; + System.arraycopy(data, 0, dataCopy, 0, n); + Arrays.sort(dataCopy); + double d = 0d; + for (int i = 1; i <= n; i++) { + final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); + final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); + if (currD > d) { + d = currD; + } + } + return d; + } - /** Serializable version identifier. */ - private static final long serialVersionUID = -4670676796862967187L; + /** + * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a + * two-sample <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test</a> evaluating the null hypothesis that {@code x} + * and {@code y} are samples drawn from the same probability distribution. + * If {@code exact} is true, the discrete distribution of the test statistic + * is computed and used directly; otherwise the asymptotic + * (Kolmogorov-Smirnov) distribution is used to estimate the p-value. + * + * @param x first sample dataset + * @param y second sample dataset + * @param exact whether or not the exact distribution of the \(D\( statistic + * is used + * @return p-value associated with the null hypothesis that {@code x} and + * {@code y} represent samples from the same distribution + */ + public double kolmogorovSmirnovTest(double[] x, double[] y, boolean exact) { + if (exact) { + return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, false); + } else { + return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); + } + } - /** Number of observations. */ - private int n; + /** + * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a + * two-sample <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test</a> evaluating the null hypothesis that {@code x} + * and {@code y} are samples drawn from the same probability distribution. + * If the product of the lengths of x and y is less than 10,000, the + * discrete distribution of the test statistic is computed and used + * directly; otherwise the asymptotic (Kolmogorov-Smirnov) distribution is + * used to estimate the p-value. + * + * @param x first sample dataset + * @param y second sample dataset + * @return p-value associated with the null hypothesis that {@code x} and + * {@code y} represent samples from the same distribution + */ + public double kolmogorovSmirnovTest(double[] x, double[] y) { + if (x.length * y.length < SMALL_SAMPLE_PRODUCT) { + return kolmogorovSmirnovTest(x, y, true); + } else { + return kolmogorovSmirnovTest(x, y, false); + } + } /** - * @param n Number of observations - * @throws NotStrictlyPositiveException if {@code n <= 0} + * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_n,m=\sup_x + * |F_n(x)-F_m(x)|\) \(n\) is the length of {@code x}, \(m\) is the length + * of {@code y}, \(F_n\) is the empirical distribution that puts mass + * \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical + * distribution of the {@code y} values. + * + * @param x first sample + * @param y second sample + * @return test statistic \(D_n,m\) used to evaluate the null hypothesis + * that {@code x} and {@code y} represent samples from the same + * underlying distribution + * @throws MathIllegalArgumentException if either {@code x} or {@code y} + * does not have length at least 2. */ - public KolmogorovSmirnovDistribution(int n) - throws NotStrictlyPositiveException { - if (n <= 0) { - throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n); + public double kolmogorovSmirnovStatistic(double[] x, double[] y) { + checkArray(x); + checkArray(y); + // Copy and sort the sample arrays + final double[] sx = MathArrays.copyOf(x); + final double[] sy = MathArrays.copyOf(y); + Arrays.sort(sx); + Arrays.sort(sy); + final int n = sx.length; + final int m = sy.length; + + // Compare empirical distribution cdf values at each (combined) sample + // point. + // D_n.m is the max difference. + // cdf value is (insertion point - 1) / length if not an element; + // index / n if element is in the array. + double supD = 0d; + // First walk x points + for (int i = 0; i < n; i++) { + final double cdf_x = (i + 1d) / n; + final int yIndex = Arrays.binarySearch(sy, sx[i]); + final double cdf_y = yIndex >= 0 ? (yIndex + 1d) / m : (-yIndex - 1d) / m; + final double curD = FastMath.abs(cdf_x - cdf_y); + if (curD > supD) { + supD = curD; + } + } + // Now look at y + for (int i = 0; i < m; i++) { + final double cdf_y = (i + 1d) / m; + final int xIndex = Arrays.binarySearch(sx, sy[i]); + final double cdf_x = xIndex >= 0 ? (xIndex + 1d) / n : (-xIndex - 1d) / n; + final double curD = FastMath.abs(cdf_x - cdf_y); + if (curD > supD) { + supD = curD; + } } + return supD; + } - this.n = n; + /** + * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a + * one-sample <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test</a> evaluating the null hypothesis that + * {@code data} conforms to {@code distribution}. + * + * @param distribution reference distribution + * @param data sample being being evaluated + * @return the p-value associated with the null hypothesis that {@code data} + * is a sample from {@code distribution} + */ + public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) { + return kolmogorovSmirnovTest(distribution, data, false); + } + + /** + * Performs a <a + * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test</a> evaluating the null hypothesis that + * {@code data} conforms to {@code distribution}. + * + * @param distribution reference distribution + * @param data sample being being evaluated + * @param alpha significance level of the test + * @return true iff the null hypothesis that {@code data} is a sample from + * {@code distribution} can be rejected with confidence 1 - + * {@code alpha} + */ + public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) { + if ((alpha <= 0) || (alpha > 0.5)) { + throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); + } + return kolmogorovSmirnovTest(distribution, data) < alpha; } /** * Calculates {@code P(D_n < d)} using method described in [1] with quick * decisions for extreme values given in [2] (see above). The result is not - * exact as with - * {@link KolmogorovSmirnovDistribution#cdfExact(double)} because - * calculations are based on {@code double} rather than + * exact as with {@link KolmogorovSmirnovDistribution#cdfExact(double)} + * because calculations are based on {@code double} rather than * {@link org.apache.commons.math3.fraction.BigFraction}. * * @param d statistic * @return the two-sided probability of {@code P(D_n < d)} * @throws MathArithmeticException if algorithm fails to convert {@code h} - * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing - * {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * to a {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - public double cdf(double d) throws MathArithmeticException { - return this.cdf(d, false); + public double cdf(double d, int n) + throws MathArithmeticException { + return cdf(d, n, false); } /** - * Calculates {@code P(D_n < d)} using method described in [1] with quick - * decisions for extreme values given in [2] (see above). The result is - * exact in the sense that BigFraction/BigReal is used everywhere at the - * expense of very slow execution time. Almost never choose this in real - * applications unless you are very sure; this is almost solely for - * verification purposes. Normally, you would choose - * {@link KolmogorovSmirnovDistribution#cdf(double)} + * Calculates {@code P(D_n < d)}. The result is exact in the sense that + * BigFraction/BigReal is used everywhere at the expense of very slow + * execution time. Almost never choose this in real applications unless you + * are very sure; this is almost solely for verification purposes. Normally, + * you would choose {@link KolmogorovSmirnovDistribution#cdf(double)}. See + * the class javadoc for definitions and algorithm description. * * @param d statistic * @return the two-sided probability of {@code P(D_n < d)} - * @throws MathArithmeticException if algorithm fails to convert {@code h} - * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing - * {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * @throws MathArithmeticException if the algorithm fails to convert + * {@code h} to a + * {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - public double cdfExact(double d) throws MathArithmeticException { - return this.cdf(d, true); + public double cdfExact(double d, int n) + throws MathArithmeticException { + return cdf(d, n, true); } /** @@ -135,48 +316,39 @@ public class KolmogorovSmirnovDistributi * * @param d statistic * @param exact whether the probability should be calculated exact using - * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the - * expense of very slow execution time, or if {@code double} should be used - * convenient places to gain speed. Almost never choose {@code true} in real - * applications unless you are very sure; {@code true} is almost solely for - * verification purposes. + * {@link org.apache.commons.math3.fraction.BigFraction} everywhere + * at the expense of very slow execution time, or if {@code double} + * should be used convenient places to gain speed. Almost never + * choose {@code true} in real applications unless you are very sure; + * {@code true} is almost solely for verification purposes. * @return the two-sided probability of {@code P(D_n < d)} * @throws MathArithmeticException if algorithm fails to convert {@code h} - * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing - * {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * to a {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - public double cdf(double d, boolean exact) throws MathArithmeticException { + public double cdf(double d, int n, boolean exact) + throws MathArithmeticException { final double ninv = 1 / ((double) n); final double ninvhalf = 0.5 * ninv; if (d <= ninvhalf) { - return 0; - } else if (ninvhalf < d && d <= ninv) { - double res = 1; - double f = 2 * d - ninv; - + final double f = 2 * d - ninv; // n! f^n = n*f * (n-1)*f * ... * 1*x for (int i = 1; i <= n; ++i) { res *= i * f; } - return res; - } else if (1 - ninv <= d && d < 1) { - return 1 - 2 * Math.pow(1 - d, n); - } else if (1 <= d) { - return 1; } - - return exact ? exactK(d) : roundedK(d); + return exact ? exactK(d, n) : roundedK(d, n); } /** @@ -187,15 +359,16 @@ public class KolmogorovSmirnovDistributi * @param d statistic * @return the two-sided probability of {@code P(D_n < d)} * @throws MathArithmeticException if algorithm fails to convert {@code h} - * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing - * {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * to a {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - private double exactK(double d) throws MathArithmeticException { + private double exactK(double d, int n) + throws MathArithmeticException { final int k = (int) Math.ceil(n * d); - final FieldMatrix<BigFraction> H = this.createH(d); + final FieldMatrix<BigFraction> H = this.createH(d, n); final FieldMatrix<BigFraction> Hpower = H.power(n); BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); @@ -219,19 +392,20 @@ public class KolmogorovSmirnovDistributi * @param d statistic * @return the two-sided probability of {@code P(D_n < d)} * @throws MathArithmeticException if algorithm fails to convert {@code h} - * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing - * {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * to a {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - private double roundedK(double d) throws MathArithmeticException { + private double roundedK(double d, int n) + throws MathArithmeticException { final int k = (int) Math.ceil(n * d); - final FieldMatrix<BigFraction> HBigFraction = this.createH(d); + final FieldMatrix<BigFraction> HBigFraction = this.createH(d, n); final int m = HBigFraction.getRowDimension(); /* - * Here the rounding part comes into play: use - * RealMatrix instead of FieldMatrix<BigFraction> + * Here the rounding part comes into play: use RealMatrix instead of + * FieldMatrix<BigFraction> */ final RealMatrix H = new Array2DRowRealMatrix(m, m); @@ -259,17 +433,18 @@ public class KolmogorovSmirnovDistributi * @return H matrix * @throws NumberIsTooLargeException if fractional part is greater than 1 * @throws FractionConversionException if algorithm fails to convert - * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in - * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and - * {@code 0 <= h < 1}. + * {@code h} to a + * {@link org.apache.commons.math3.fraction.BigFraction} in + * expressing {@code d} as {@code (k - h) / m} for integer + * {@code k, m} and {@code 0 <= h < 1}. */ - private FieldMatrix<BigFraction> createH(double d) - throws NumberIsTooLargeException, FractionConversionException { + private FieldMatrix<BigFraction> createH(double d, int n) + throws NumberIsTooLargeException, FractionConversionException { - int k = (int) Math.ceil(n * d); + final int k = (int) Math.ceil(n * d); - int m = 2 * k - 1; - double hDouble = k - n * d; + final int m = 2 * k - 1; + final double hDouble = k - n * d; if (hDouble >= 1) { throw new NumberIsTooLargeException(hDouble, 1.0, false); @@ -279,10 +454,10 @@ public class KolmogorovSmirnovDistributi try { h = new BigFraction(hDouble, 1.0e-20, 10000); - } catch (FractionConversionException e1) { + } catch (final FractionConversionException e1) { try { h = new BigFraction(hDouble, 1.0e-10, 10000); - } catch (FractionConversionException e2) { + } catch (final FractionConversionException e2) { h = new BigFraction(hDouble, 1.0e-5, 10000); } } @@ -334,11 +509,9 @@ public class KolmogorovSmirnovDistributi * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already * put, so only division with (i - j + 1)! is needed in the elements * that have 1's. There is no need to calculate (i - j + 1)! and then - * divide - small steps avoid overflows. - * - * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to - * m. Also note that it is started at g = 2 because dividing by 1 isn't - * really necessary. + * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + + * 1 > j instead of j'ing all the way to m. Also note that it is started + * at g = 2 because dividing by 1 isn't really necessary. */ for (int i = 0; i < m; ++i) { for (int j = 0; j < i + 1; ++j) { @@ -352,4 +525,130 @@ public class KolmogorovSmirnovDistributi return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); } + + /** + * Verifies that array has length at least 2, throwing MIAE if not. + * + * @param array array to test + * @throws NullArgumentException if array is null + * @throws MathIllegalArgumentException if array is too short + */ + private void checkArray(double[] array) { + if (array == null) { + throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED); + } + if (array.length < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, + array.length, 2); + } + } + + /** + * Compute \( \sum_{k=-\infty}^\infty (-1)^k e^{-2 k^2 x^2} = 1 + 2 + * \sum_{k=1}^\infty (-1)^k e^{-2 k^2 x^2} = \frac{\sqrt{2\pi}}{x} + * \sum_{k=1}^\infty \exp(-(2k-1)^2\pi^2/(8x^2)) \) See e.g. J. Durbin + * (1973), Distribution Theory for Tests Based on the Sample Distribution + * Function. SIAM. The 'standard' series expansion obviously cannot be used + * close to 0; we use the alternative series for x < 1, and a rather crude + * estimate of the series remainder term in this case, in particular using + * that \(ue^(-lu^2) \le e^(-lu^2 + u) \le e^(-(l-1)u^2 - u^2+u) \le + * e^(-(l-1))\) provided that u and l are >= 1. (But note that for + * reasonable tolerances, one could simply take 0 as the value for x < 0.2, + * and use the standard expansion otherwise.) + */ + public double pkstwo(double x, double tol) { + final double M_PI_2 = Math.PI / 2; + final double M_PI_4 = Math.PI / 4; + final double M_1_SQRT_2PI = 1 / Math.sqrt(Math.PI * 2); + double newx, old, s; + int k; + final int k_max = (int) Math.sqrt(2 - Math.log(tol)); + if (x < 1) { + final double z = -(M_PI_2 * M_PI_4) / (x * x); + final double w = Math.log(x); + s = 0; + for (k = 1; k < k_max; k += 2) { + s += Math.exp(k * k * z - w); + } + return s / M_1_SQRT_2PI; + } else { + final double z = -2 * x * x; + s = -1; + k = 1; + old = 0; + newx = 1; + while (Math.abs(old - newx) > tol) { + old = newx; + newx += 2 * s * Math.exp(z * k * k); + s *= -1; + k++; + } + return newx; + } + } + + /** + * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping + * when successive partial sums are within {@code tolerance} of one another, + * or when {@code maxIter} partial sums have been computed. If the sum does + * not converge before {@code maxIter} iterations a + * {@link TooManyIterationsException} is thrown. + * + * @param t argument + * @param tolerance Cauchy criterion for partial sums + * @param maxIter maximum number of partial sums to compute + * @throws TooManyIterationsException if the series does not converge + */ + public double ksSum(double t, double tolerance, long maxIter) { + final double x = -2 * t * t; + double sign = -1; + int i = 1; + double lastPartialSum = -1d; + double partialSum = 0.5d; + long iterationCount = 0; + while (FastMath.abs(lastPartialSum - partialSum) > tolerance && iterationCount < maxIter) { + lastPartialSum = partialSum; + partialSum += sign * FastMath.exp(x * i * i); + sign *= -1; + i++; + } + if (iterationCount == maxIter) { + throw new TooManyIterationsException(maxIter); + } + return partialSum * 2; + } + + public double exactP(double d, int n, int m, boolean strict) { + Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); + long tail = 0; + final double[] nSet = new double[n]; + final double[] mSet = new double[m]; + while (combinationsIterator.hasNext()) { + // Generate an n-set + final int[] nSetI = combinationsIterator.next(); + // Copy the n-set to nSet and its complement to mSet + int j = 0; + int k = 0; + for (int i = 0; i < n + m; i++) { + if (j < n && nSetI[j] == i) { + nSet[j++] = i; + } else { + mSet[k++] = i; + } + } + final double curD = kolmogorovSmirnovStatistic(nSet, mSet); + if (curD > d) { + tail++; + } else if (curD == d && !strict) { + tail++; + } + } + return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n); + } + + public double approximateP(double d, int n, int m) { + return 1 - ksSum(d * FastMath.sqrt((double) (m * n) / (double) (m + n)), KS_SUM_CAUCHY_CRITERION, + MAXIMUM_PARTIAL_SUM_COUNT); + } + }