Author: mikl Date: Mon Mar 21 09:39:23 2011 New Revision: 1083716 URL: http://svn.apache.org/viewvc?rev=1083716&view=rev Log: Fixes MATH-437
Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java (with props) commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java (with props) commons/proper/math/trunk/src/test/R/KolmogorovSmirnovDistributionTestCases.R commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java (with props) Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java?rev=1083716&view=auto ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java (added) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java Mon Mar 21 09:39:23 2011 @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.distribution; + +/** + * Treats the distribution of the two-sided + * {@code P(D<sub>n</sup> < d)} + * where {@code D<sub>n</sup> = sup_x | G(x) - Gn (x) |} for the + * theoretical cdf G and the emperical cdf Gn. + * + * This implementation is based on [1] with certain quick + * decisions for extreme values given in [2]. + * + * In short, when wanting to evaluate {@code P(D<sub>n</sup> < d)}, + * the method in [1] is to write {@code d = (k - h) / n} for positive + * integer {@code k} and {@code 0 <= h < 1}. Then + * {@code P(D<sub>n</sup> < d) = (n!/n^n) * t_kk} + * where {@code t_kk} is the (k, k)'th entry in the special matrix {@code H^n}, + * i.e. {@code H} to the {@code n}'th power. + * + * See also <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> + * Kolmogorov-Smirnov test on Wikipedia</a> for details. + * + * References: + * [1] Evaluating Kolmogorov's Distribution by George Marsaglia, Wai + * Wan Tsang, Jingbo Wang http://www.jstatsoft.org/v08/i18/paper + * + * [2] <a href="http://www.iro.umontreal.ca/~lecuyer/myftp/papers/ksdist.pdf"> + * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard + * and Pierre L'Ecuyer + * + * Note that [1] contains an error in computing h, refer to + * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. + * + * @version $Revision$ $Date$ + */ +public interface KolmogorovSmirnovDistribution { + + public double cdf(double d); + +} Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistribution.java ------------------------------------------------------------------------------ svn:keywords = Author Date Id Revision Added: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java?rev=1083716&view=auto ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java (added) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java Mon Mar 21 09:39:23 2011 @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.distribution; + +import java.io.Serializable; +import java.math.BigDecimal; + +import org.apache.commons.math.exception.MathArithmeticException; +import org.apache.commons.math.exception.NotStrictlyPositiveException; +import org.apache.commons.math.exception.util.LocalizedFormats; +import org.apache.commons.math.fraction.BigFraction; +import org.apache.commons.math.fraction.FractionConversionException; +import org.apache.commons.math.linear.Array2DRowFieldMatrix; +import org.apache.commons.math.linear.Array2DRowRealMatrix; +import org.apache.commons.math.linear.FieldMatrix; +import org.apache.commons.math.linear.RealMatrix; + +/** + * The default implementation of {@link KolmogorovSmirnovDistribution}. + * + * @version $Revision$ $Date$ + */ +public class KolmogorovSmirnovDistributionImpl implements KolmogorovSmirnovDistribution, Serializable { + + /** Serializable version identifier. */ + private static final long serialVersionUID = -4670676796862967187L; + + private int n; + + /** + * @param n Number of observations + * @throws NotStrictlyPositiveException + * if n <= 0 + */ + public KolmogorovSmirnovDistributionImpl(int n) { + if (n <= 0) { + throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n); + } + + this.n = n; + } + + /** + * Calculates {@code P(D<sub>n</sup> < d)} using method described in + * [1] with quick decisions for extreme values given in [2] (see above). The + * result is not exact as with + * {@link KolmogorovSmirnovDistributionImpl#cdfExact(double)} because + * calculations are based on double rather than + * {@link org.apache.commons.math.fraction.BigFraction}. + * + * @param d statistic + * @return the two-sided probability of {@code P(D<sub>n</sup> < d)} + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing d as (k - h) / m for integer k, m and 0 <= h < 1. + */ + @Override + public double cdf(double d) throws MathArithmeticException { + return this.cdf(d, false); + } + + /** + * Calculates {@code P(D<sub>n</sup> < d)} using method described in + * [1] with quick decisions for extreme values given in [2] (see above). + * The result is exact in the sense that BigFraction/BigReal is used everywhere + * at the expense of very slow execution time. Almost never choose this in + * real applications unless you are very sure; this is almost solely for + * verification purposes. Normally, you would choose + * {@link KolmogorovSmirnovDistributionImpl#cdf(double)} + * + * @param d statistic + * @return the two-sided probability of {@code P(D<sub>n</sup> < d)} + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing d as (k - h) / m for integer k, m and 0 <= h < 1. + */ + public double cdfExact(double d) throws MathArithmeticException { + return this.cdf(d, true); + } + + /** + * Calculates {@code P(D<sub>n</sup> < d)} using method described in + * [1] with quick decisions for extreme values given in [2] (see above). + * + * @param d statistic + * @param exact + * whether the probability should be calculated exact using + * BigFraction everywhere at the expense of very + * slow execution time, or if double should be used convenient + * places to gain speed. Never choose true in real applications + * unless you are very sure; true is almost solely for + * verification purposes. + * @return the two-sided probability of {@code P(D<sub>n</sup> < d)} + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing d as (k - h) / m for integer k, m and 0 <= h < 1. + */ + public double cdf(double d, boolean exact) + throws MathArithmeticException { + + final int n = this.n; + + final double ninv = 1 / ((double) n); + final double ninvhalf = 0.5 * ninv; + + if (d <= ninvhalf) { + + return 0; + + } else if (ninvhalf < d && d <= ninv) { + + double res = 1; + double f = 2 * d - ninv; + + // n! f^n = n*f * (n-1)*f * ... * 1*x + for (int i = 1; i <= n; ++i) { + res *= i * f; + } + + return res; + + } else if (1 - ninv <= d && d < 1) { + + return 1 - 2 * Math.pow(1 - d, n); + + } else if (1 <= d) { + + return 1; + } + + return (exact) ? this.exactK(d) : this.roundedK(d); + } + + /** + * Calculates {@code P(D<sub>n</sup> < d)} exact using method + * described in [1] and BigFraction (see above). + * + * @param d statistic + * @return the two-sided probability of {@code P(D<sub>n</sup> < d)} + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing d as (k - h) / m for integer k, m and 0 <= h < 1. + */ + private double exactK(double d) + throws MathArithmeticException { + + final int n = this.n; + final int k = (int) Math.ceil(n * d); + + final FieldMatrix<BigFraction> H = this.createH(d); + final FieldMatrix<BigFraction> Hpower = H.power(n); + + BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); + + for (int i = 1; i <= n; ++i) { + pFrac = pFrac.multiply(i).divide(n); + } + + /* + * BigFraction.doubleValue converts numerator to double and the + * denominator to double and divides afterwards. That gives NaN quite + * easy. This does not (scale is the number of digits): + */ + return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP) + .doubleValue(); + } + + /** + * Calculates <code>P(D<sub>n</sup> < d)</code> using method described in + * [1] and doubles (see above). + * + * @param d statistic + * @return the two-sided probability of {@code P(D<sub>n</sup> < d)} + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing d as (k - h) / m for integer k, m and 0 <= h < 1. + */ + private double roundedK(double d) + throws MathArithmeticException { + + final int n = this.n; + final int k = (int) Math.ceil(n * d); + final FieldMatrix<BigFraction> HBigFraction = this.createH(d); + final int m = HBigFraction.getRowDimension(); + + /* + * Here the rounding part comes into play: use + * RealMatrix instead of FieldMatrix<BigFraction> + */ + final RealMatrix H = new Array2DRowRealMatrix(m, m); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < m; ++j) { + H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue()); + } + } + + final RealMatrix Hpower = H.power(n); + + double pFrac = Hpower.getEntry(k - 1, k - 1); + + for (int i = 1; i <= n; ++i) { + pFrac *= (double)i / (double)n; + } + + return pFrac; + } + + /*** + * Creates H of size m x m as described in [1] (see above). + * + * @param d statistic + * + * @throws MathArithmeticException + * if algorithm fails to convert h to a BigFraction in + * expressing x as (k - h) / m for integer k, m and 0 <= h < 1. + */ + private FieldMatrix<BigFraction> createH(double d) + throws MathArithmeticException { + + int n = this.n; + int k = (int) Math.ceil(n * d); + + int m = 2 * k - 1; + double hDouble = k - n * d; + + if (hDouble >= 1) { + throw new ArithmeticException("Could not "); + } + + BigFraction h = null; + + try { + h = new BigFraction(hDouble, 1.0e-20, 10000); + } catch (FractionConversionException e1) { + try { + h = new BigFraction(hDouble, 1.0e-10, 10000); + } catch (FractionConversionException e2) { + try { + h = new BigFraction(hDouble, 1.0e-5, 10000); + } catch (FractionConversionException e3) { + //throw new MathArithmeticException(hDouble, 10000); + throw new MathArithmeticException(LocalizedFormats.CANNOT_CONVERT_OBJECT_TO_FRACTION, hDouble); + } + } + } + + final BigFraction[][] Hdata = new BigFraction[m][m]; + + /* + * Start by filling everything with either 0 or 1. + */ + for (int i = 0; i < m; ++i) { + for (int j = 0; j < m; ++j) { + if (i - j + 1 < 0) + Hdata[i][j] = BigFraction.ZERO; + else + Hdata[i][j] = BigFraction.ONE; + } + } + + /* + * Setting up power-array to avoid calculating the same value twice: + * hPowers[0] = h^1 ... hPowers[m-1] = h^m + */ + final BigFraction[] hPowers = new BigFraction[m]; + hPowers[0] = h; + for (int i = 1; i < m; ++i) { + hPowers[i] = h.multiply(hPowers[i - 1]); + } + + /* + * First column and last row has special values (each other reversed). + */ + for (int i = 0; i < m; ++i) { + Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); + Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); + } + + /* + * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix + * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > + * 1/2 is sufficient to check: + */ + if (h.compareTo(BigFraction.ONE_HALF) == 1) { + Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1) + .pow(m)); + } + + /* + * Aside from the first column and last row, the (i, j)-th element is + * 1/(i - j + 1)! if i â j + 1 >= 0, else 0. 1's and 0's are already + * put, so only division with (i - j + 1)! is needed in the elements + * that have 1's. There is no need to calculate (i - j + 1)! and then + * divide - small steps avoid overflows. + * + * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to + * m. Also note that it is started at g = 2 because dividing by 1 isn't + * really necessary. + */ + for (int i = 0; i < m; ++i) { + for (int j = 0; j < i + 1; ++j) { + if (i - j + 1 > 0) { + for (int g = 2; g <= i - j + 1; ++g) { + Hdata[i][j] = Hdata[i][j].divide(g); + } + } + } + } + + return new Array2DRowFieldMatrix<BigFraction>(Hdata); + } +} Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/math/trunk/src/main/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionImpl.java ------------------------------------------------------------------------------ svn:keywords = Author Date Id Revision Added: commons/proper/math/trunk/src/test/R/KolmogorovSmirnovDistributionTestCases.R URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/R/KolmogorovSmirnovDistributionTestCases.R?rev=1083716&view=auto ============================================================================== --- commons/proper/math/trunk/src/test/R/KolmogorovSmirnovDistributionTestCases.R (added) +++ commons/proper/math/trunk/src/test/R/KolmogorovSmirnovDistributionTestCases.R Mon Mar 21 09:39:23 2011 @@ -0,0 +1,22 @@ +cat("/* ", version$version.string, " */\n\n\n", sep = "") + +ns <- c(200, 341, 389) +ps <- c(0.005, 0.02, 0.031111, 0.04) + +for (n in ns) { + for (p in ps) { + res <- .C("pkolmogorov2x", p = as.double(p), n = as.integer(n), PACKAGE = "stats")$p + + cat("/* formatC(.C(\"pkolmogorov2x\", p = as.double(", p, "), n = as.integer(", n, "), PACKAGE = \"stats\")$p, 40) gives\n", sep = "") + cat(" * ", formatC(res, digits = 40), "\n", sep = "") + cat(" */\n") + + cat("dist = new KolmogorovSmirnovDistributionImpl(", n, ");\n", sep = "") + #cat("assertEquals(", formatC(res, digits = 40), ", dist.cdf(", p, ", true), TOLERANCE);\n", sep = "") + cat("assertEquals(", formatC(res, digits = 40), ", dist.cdf(", p, ", false), TOLERANCE);\n", sep = "") + cat("\n") + + #cat("System.out.println(\"", formatC(res, digits = 20), " - \" + dist.cdf(", p, ", false) + \" = \" + (", res, " - dist.cdf(", p, ", false)));\n", sep = "") + } +} + Added: commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java?rev=1083716&view=auto ============================================================================== --- commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java (added) +++ commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java Mon Mar 21 09:39:23 2011 @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.math.distribution; + +import junit.framework.TestCase; + +/** + * Test cases for {@link KolmogorovSmirnovDistributionImpl}. + * + * @version $Revision$ $Date$ + */ +public class KolmogorovSmirnovDistributionTest extends TestCase { + + private static final double TOLERANCE = 10e-10; + + public void testCumulativeDensityFunction() throws Exception { + + KolmogorovSmirnovDistributionImpl dist; + + /* The code below is generated using the R-script located in + * /src/test/R/KolmogorovSmirnovDistributionTestCases.R + */ + + /* R version 2.11.1 (2010-05-31) */ + + + /* formatC(.C("pkolmogorov2x", p = as.double(0.005), n = as.integer(200), PACKAGE = "stats")$p, 40) gives + * 4.907829957616471622388047046469198862537e-86 + */ + dist = new KolmogorovSmirnovDistributionImpl(200); + assertEquals(4.907829957616471622388047046469198862537e-86, dist.cdf(0.005, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.02), n = as.integer(200), PACKAGE = "stats")$p, 40) gives + * 5.151982014280041957199687829849210629618e-06 + */ + dist = new KolmogorovSmirnovDistributionImpl(200); + assertEquals(5.151982014280041957199687829849210629618e-06, dist.cdf(0.02, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.031111), n = as.integer(200), PACKAGE = "stats")$p, 40) gives + * 0.01291614648162886340443389343590752105229 + */ + dist = new KolmogorovSmirnovDistributionImpl(200); + assertEquals(0.01291614648162886340443389343590752105229, dist.cdf(0.031111, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.04), n = as.integer(200), PACKAGE = "stats")$p, 40) gives + * 0.1067137011362679355208626930107129737735 + */ + dist = new KolmogorovSmirnovDistributionImpl(200); + assertEquals(0.1067137011362679355208626930107129737735, dist.cdf(0.04, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.005), n = as.integer(341), PACKAGE = "stats")$p, 40) gives + * 1.914734701559404553985102395145063418825e-53 + */ + dist = new KolmogorovSmirnovDistributionImpl(341); + assertEquals(1.914734701559404553985102395145063418825e-53, dist.cdf(0.005, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.02), n = as.integer(341), PACKAGE = "stats")$p, 40) gives + * 0.001171328985781981343872182321774744195864 + */ + dist = new KolmogorovSmirnovDistributionImpl(341); + assertEquals(0.001171328985781981343872182321774744195864, dist.cdf(0.02, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.031111), n = as.integer(341), PACKAGE = "stats")$p, 40) gives + * 0.1142955196267499418105728636874118819833 + */ + dist = new KolmogorovSmirnovDistributionImpl(341); + assertEquals(0.1142955196267499418105728636874118819833, dist.cdf(0.031111, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.04), n = as.integer(341), PACKAGE = "stats")$p, 40) gives + * 0.3685529520496805266915885113121476024389 + */ + dist = new KolmogorovSmirnovDistributionImpl(341); + assertEquals(0.3685529520496805266915885113121476024389, dist.cdf(0.04, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.005), n = as.integer(389), PACKAGE = "stats")$p, 40) gives + * 1.810657144595055888918455512707637574637e-47 + */ + dist = new KolmogorovSmirnovDistributionImpl(389); + assertEquals(1.810657144595055888918455512707637574637e-47, dist.cdf(0.005, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.02), n = as.integer(389), PACKAGE = "stats")$p, 40) gives + * 0.003068542559702356568168690742481885536108 + */ + dist = new KolmogorovSmirnovDistributionImpl(389); + assertEquals(0.003068542559702356568168690742481885536108, dist.cdf(0.02, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.031111), n = as.integer(389), PACKAGE = "stats")$p, 40) gives + * 0.1658291700122746237244797384846606291831 + */ + dist = new KolmogorovSmirnovDistributionImpl(389); + assertEquals(0.1658291700122746237244797384846606291831, dist.cdf(0.031111, false), TOLERANCE); + + /* formatC(.C("pkolmogorov2x", p = as.double(0.04), n = as.integer(389), PACKAGE = "stats")$p, 40) gives + * 0.4513143712128902529379104180407011881471 + */ + dist = new KolmogorovSmirnovDistributionImpl(389); + assertEquals(0.4513143712128902529379104180407011881471, dist.cdf(0.04, false), TOLERANCE); + + } + +} Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/math/trunk/src/test/java/org/apache/commons/math/distribution/KolmogorovSmirnovDistributionTest.java ------------------------------------------------------------------------------ svn:keywords = Author Date Id Revision