This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
The following commit(s) were added to refs/heads/master by this push: new 490c8bb STATISTICS-91: Confidence intervals for a normal population 490c8bb is described below commit 490c8bb91e4b4bae39bb34977561812ccd2661c2 Author: Alex Herbert <aherb...@apache.org> AuthorDate: Fri Jun 13 14:44:24 2025 +0100 STATISTICS-91: Confidence intervals for a normal population --- commons-statistics-interval/pom.xml | 7 ++ .../commons/statistics/interval/ArgumentUtils.java | 41 ++++++++ .../interval/BinomialConfidenceInterval.java | 5 +- .../interval/NormalConfidenceInterval.java | 97 ++++++++++++++++++ .../interval/NormalConfidenceIntervalTest.java | 110 +++++++++++++++++++++ .../commons/statistics/interval/UserGuideTest.java | 29 ++++++ src/changes/changes.xml | 4 + src/conf/checkstyle/checkstyle-suppressions.xml | 1 + src/site/xdoc/userguide/index.xml | 29 ++++++ 9 files changed, 319 insertions(+), 4 deletions(-) diff --git a/commons-statistics-interval/pom.xml b/commons-statistics-interval/pom.xml index fc04b91..92b7a4d 100644 --- a/commons-statistics-interval/pom.xml +++ b/commons-statistics-interval/pom.xml @@ -50,6 +50,13 @@ <version>1.2-SNAPSHOT</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-statistics-descriptive</artifactId> + <version>1.2-SNAPSHOT</version> + <scope>test</scope> + </dependency> + </dependencies> </project> diff --git a/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/ArgumentUtils.java b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/ArgumentUtils.java new file mode 100644 index 0000000..668bfbf --- /dev/null +++ b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/ArgumentUtils.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.interval; + +/** + * Utilities for argument validation. + * + * @since 1.2 + */ +final class ArgumentUtils { + /** No instances. */ + private ArgumentUtils() {} + + /** + * Check the error rate {@code alpha} is in the open interval {@code (0, 1)}. + * + * @param alpha Error rate. + * @throws IllegalArgumentException if {@code alpha} is not in the open interval {@code (0, 1)}. 
+ */ + static void checkErrorRate(double alpha) { + if (alpha > 0 && alpha < 1) { + return; + } + // Out-of-range or NaN + throw new IllegalArgumentException("Error rate is not in (0, 1): " + alpha); + } +} diff --git a/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/BinomialConfidenceInterval.java b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/BinomialConfidenceInterval.java index 078cbc8..2b0eb88 100644 --- a/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/BinomialConfidenceInterval.java +++ b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/BinomialConfidenceInterval.java @@ -177,10 +177,7 @@ public enum BinomialConfidenceInterval { String.format("Number of successes (%d) must be less than or equal to number of trials (%d)", numberOfSuccesses, numberOfTrials)); } - // Negation of alpha inside the interval (0, 1) detects NaN - if (!(alpha > 0 && alpha < 1)) { - throw new IllegalArgumentException("Error rate is not in (0, 1): " + alpha); - } + ArgumentUtils.checkErrorRate(alpha); return create(numberOfTrials, numberOfSuccesses, alpha); } diff --git a/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/NormalConfidenceInterval.java b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/NormalConfidenceInterval.java new file mode 100644 index 0000000..2906688 --- /dev/null +++ b/commons-statistics-interval/src/main/java/org/apache/commons/statistics/interval/NormalConfidenceInterval.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.interval; + +import org.apache.commons.statistics.distribution.ChiSquaredDistribution; +import org.apache.commons.statistics.distribution.TDistribution; + +/** + * Generate confidence intervals for a normally distributed population. + * + * @see <a + * href="https://en.wikipedia.org/wiki/Normal_distribution#Confidence_intervals">Normal + * distribution confidence interval (Wikipedia)</a> + * + * @since 1.2 + */ +public enum NormalConfidenceInterval { + /** + * Create a confidence interval for the true mean of an unknown normally distributed population. + */ + MEAN { + @Override + Interval create(double mean, double variance, long n, double alpha) { + final double c = TDistribution.of(n - 1).inverseSurvivalProbability(alpha * 0.5); + final double distance = c * Math.sqrt(variance / n); + return new BaseInterval(mean - distance, mean + distance); + } + }, + /** + * Create a confidence interval for the true variance of an unknown normally distributed population. 
+ */ + VARIANCE { + @Override + Interval create(double mean, double variance, long n, double alpha) { + final ChiSquaredDistribution d = ChiSquaredDistribution.of(n - 1); + final double f = variance * (n - 1.0); + final double lower = f / d.inverseSurvivalProbability(alpha * 0.5); + final double upper = f / d.inverseCumulativeProbability(alpha * 0.5); + return new BaseInterval(lower, upper); + } + }; + + /** + * Create a confidence interval from an independent sample from an unknown normally + * distributed population with the given error rate. + * + * <p>The error rate {@code alpha} is related to the confidence level that the + * interval contains the true value of the estimated parameter (mean or variance) as + * {@code alpha = 1 - confidence}, where {@code confidence} is the confidence level + * in {@code [0, 1]}. For example a 95% confidence level is an {@code alpha} of 0.05. + * + * <p>The unbiased variance is the sum of the squared deviations from the mean divided + * by {@code n - 1}. + * + * @param mean Sample mean. + * @param variance Unbiased sample variance. + * @param n Sample size. + * @param alpha Desired error rate that the true value of the estimated parameter falls + * <em>outside</em> the returned interval. + * @return Confidence interval containing the target with error rate {@code alpha} + * @throws IllegalArgumentException if {@code n <= 1}, or if {@code alpha} is not in + * the open interval {@code (0, 1)}. + */ + public Interval fromErrorRate(double mean, double variance, long n, double alpha) { + if (n <= 1) { + throw new IllegalArgumentException("Sample size is not above one: " + n); + } + ArgumentUtils.checkErrorRate(alpha); + return create(mean, variance, n, alpha); + } + + /** + * Create a confidence interval from an independent sample from an unknown normally + * distributed population with the given error rate. + * + * @param mean Sample mean. + * @param variance Unbiased sample variance. + * @param n Sample size. + * @param alpha Desired error rate. 
+ * @return Confidence interval + */ + abstract Interval create(double mean, double variance, long n, double alpha); +} diff --git a/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/NormalConfidenceIntervalTest.java b/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/NormalConfidenceIntervalTest.java new file mode 100644 index 0000000..7efa61b --- /dev/null +++ b/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/NormalConfidenceIntervalTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.statistics.interval; + +import java.util.stream.Stream; +import java.util.stream.Stream.Builder; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Test cases for {@link NormalConfidenceInterval}. 
+ */ +class NormalConfidenceIntervalTest { + @ParameterizedTest + @EnumSource + void testInvalidArgumentsThrow(NormalConfidenceInterval method) { + double mean = 0.1; + double variance = 1.23; + int n = 42; + double alpha = 0.05; + Assertions.assertDoesNotThrow(() -> method.fromErrorRate(mean, variance, n, alpha)); + // n < 2 + Assertions.assertDoesNotThrow(() -> method.fromErrorRate(mean, variance, 2, alpha)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, 1, alpha)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, 0, alpha)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, -1, alpha)); + // alpha not in (0, 1) + Assertions.assertDoesNotThrow(() -> method.fromErrorRate(mean, variance, n, Math.nextUp(0.0))); + Assertions.assertDoesNotThrow(() -> method.fromErrorRate(mean, variance, n, Math.nextDown(1.0))); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, n, 0.0)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, n, 1.0)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, n, -0.01)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, n, 1.01)); + Assertions.assertThrows(IllegalArgumentException.class, () -> method.fromErrorRate(mean, variance, n, Double.NaN)); + } + + @ParameterizedTest + @MethodSource() + void testInterval(NormalConfidenceInterval method, double mean, double variance, int n, double alpha, + double lower, double upper, double relativeError) { + final Interval i = method.fromErrorRate(mean, variance, n, alpha); + Assertions.assertEquals(lower, i.getLowerBound(), lower * relativeError, "lower"); + Assertions.assertEquals(upper, i.getUpperBound(), upper * relativeError, "upper"); + } + + static 
Stream<Arguments> testInterval() { + final Builder<Arguments> builder = Stream.builder(); + + // mean cases generated using R 4.4.3, e.g. + // options(digits=17) + // x <- c(1, 2, 3, 4) + // mean(x); var(x); length(x) + // t.test(x, conf.level=0.95)$conf.int + // Data generated in R using random numbers, e.g. + // x = runif(100); x = rnorm(50, mean=3, sd=2) + NormalConfidenceInterval method; + method = NormalConfidenceInterval.MEAN; + add(builder, method, 2.5, 1.6666666666666667, 4, 0.05, 0.44573974323947924, 4.55426025676052060, 1e-14); + add(builder, method, 0.5263914421340451, 0.079384303412544904, 100, 0.05, 0.47048569257011025, + 0.58229719169798000, 1e-14); + add(builder, method, 2.9535381946732131, 5.2628380291790835, 50, 0.1, 2.4096097871064539, 3.4974666022399732, + 1e-14); + + // variance cases manually computed in R 4.4.3 (data x as above) e.g. + // alpha=0.05; n=length(x); v=var(x); (n-1)*v/qchisq(alpha/2, n-1, lower.tail=F); (n-1)*v/qchisq(alpha/2, n-1) + method = NormalConfidenceInterval.VARIANCE; + add(builder, method, 2.5, 1.6666666666666667, 4, 0.05, 0.53485067734936409, 23.170107980137484, 1e-14); + add(builder, method, 0.5263914421340451, 0.079384303412544904, 100, 0.05, 0.061197043596933121, + 0.10712827588348012, 1e-14); + add(builder, method, 2.9535381946732131, 5.2628380291790835, 50, 0.1, 3.887312567406342, 7.6002576083181186, + 1e-14); + + // Approximate formula for asymptotic distributions at large n uses z critical value + // from a normal distribution, here z_{0.025} = 1.96 + final double z = 1.96; + final double mean = 1.23; + final double variance = 3.45; + final int n = 100_000; + double dist = z * Math.sqrt(variance / n); + add(builder, NormalConfidenceInterval.MEAN, mean, variance, n, 0.05, mean - dist, mean + dist, 1e-6); + dist = z * Math.sqrt(2.0 / n) * variance; + add(builder, NormalConfidenceInterval.VARIANCE, mean, variance, n, 0.05, variance - dist, variance + dist, + 1e-4); + + return builder.build(); + } + + private static 
void add(Builder<Arguments> builder, NormalConfidenceInterval method, + double mean, double variance, int n, double alpha, + double lower, double upper, double relativeError) { + builder.accept(Arguments.of(method, mean, variance, n, alpha, lower, upper, relativeError)); + } +} diff --git a/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/UserGuideTest.java b/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/UserGuideTest.java index d7b4d01..a0aa990 100644 --- a/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/UserGuideTest.java +++ b/commons-statistics-interval/src/test/java/org/apache/commons/statistics/interval/UserGuideTest.java @@ -17,6 +17,9 @@ package org.apache.commons.statistics.interval; +import java.util.EnumSet; +import org.apache.commons.statistics.descriptive.DoubleStatistics; +import org.apache.commons.statistics.descriptive.Statistic; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -41,6 +44,32 @@ class UserGuideTest { assertInterval(method.fromErrorRate(10000, 5000, alpha), 0.49020, 0.50980, 1e-5); } + @Test + void testInterval2() { + // Results generated using R 4.4.3, e.g. 
+ // options(digits=3) + // rnorm(15, 1.45, 0.1) + // Create data using the rounded sample: + // x = c(1.47, 1.40, 1.55, 1.44, 1.41, 1.38, 1.53, 1.42, 1.55, 1.55, 1.31, 1.37, 1.53, 1.47, 1.51) + // options(digits=17) + // mean(x); var(x); length(x) + // t.test(x, conf.level=0.95)$conf.int + double[] x = {1.47, 1.40, 1.55, 1.44, 1.41, 1.38, 1.53, 1.42, 1.55, 1.55, 1.31, 1.37, 1.53, 1.47, 1.51}; + DoubleStatistics stats = DoubleStatistics.of(EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x); + + double mean = stats.getAsDouble(Statistic.MEAN); + double variance = stats.getAsDouble(Statistic.VARIANCE); + long n = stats.getCount(); + double alpha = 0.05; + + Assertions.assertEquals(1.46, mean, 1e-2); + Assertions.assertEquals(0.0058, variance, 1e-4); + + Interval interval = NormalConfidenceInterval.MEAN.fromErrorRate(mean, variance, n, alpha); + Assertions.assertEquals(1.4170, interval.getLowerBound(), 1e-4); + Assertions.assertEquals(1.5017, interval.getUpperBound(), 1e-4); + } + private static void assertInterval(Interval interval, double lower, double upper, double relError) { Assertions.assertEquals(lower, interval.getLowerBound(), lower * relError, "lower"); Assertions.assertEquals(upper, interval.getUpperBound(), upper * relError, "upper"); diff --git a/src/changes/changes.xml b/src/changes/changes.xml index d837c90..b35754c 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -56,6 +56,10 @@ If the output is not quite correct, check for invisible trailing spaces! <release version="1.2" date="TBD" description=" New features, updates and bug fixes (requires Java 8). "> + <action dev="aherbert" type="add" issue="STATISTICS-91"> + "NormalConfidenceInterval": Support confidence intervals for a normally + distributed population. + </action> <action dev="aherbert" type="add" issue="STATISTICS-6"> Add a commons-statistics-interval module for statistical intervals. This ports and updates functionality in org.apache.commons.math4.stat.interval. 
diff --git a/src/conf/checkstyle/checkstyle-suppressions.xml b/src/conf/checkstyle/checkstyle-suppressions.xml index 7e47f14..1da396b 100644 --- a/src/conf/checkstyle/checkstyle-suppressions.xml +++ b/src/conf/checkstyle/checkstyle-suppressions.xml @@ -41,6 +41,7 @@ <suppress checks="ParameterNumber" files=".*[/\\]WilcoxonSignedRankTestTest.java" /> <suppress checks="ParameterNumber" files=".*[/\\]UnconditionedExactTestTest.java" /> <suppress checks="ParameterNumber" files=".*[/\\]BinomialConfidenceIntervalTest.java" /> + <suppress checks="ParameterNumber" files=".*[/\\]NormalConfidenceIntervalTest.java" /> <suppress checks="MethodLength" files=".*[/\\]WilcoxonSignedRankTestTest.java" /> <suppress checks="IllegalCatch" files=".*[/\\]TestHelper.java" lines="295-410" /> <suppress checks="IllegalCatch" files=".*[/\\]BaseStatisticTest.java" lines="280-400" /> diff --git a/src/site/xdoc/userguide/index.xml b/src/site/xdoc/userguide/index.xml index cff4d81..cc6a08c 100644 --- a/src/site/xdoc/userguide/index.xml +++ b/src/site/xdoc/userguide/index.xml @@ -763,6 +763,35 @@ interval.getUpperBound(); // 0.76341 method.fromErrorRate(100, 50, alpha); // 0.40383, 0.59617 method.fromErrorRate(1000, 500, alpha); // 0.46907, 0.53093 method.fromErrorRate(10000, 5000, alpha); // 0.49020, 0.50980 +</source> + <p> + The <code>NormalConfidenceInterval</code> enumeration provides methods + to create a confidence interval for a normally distributed population. + Intervals can be created for the mean or the variance from a sample of + the population. + </p> + <p> + The following example demonstrates how to generate a 95% confidence interval + for the mean given a sample. The mean and variance of the sample are + required for the interval; here they are generated using the descriptive + statistics API. 
+ </p> +<source class="prettyprint"> +double[] x = { + 1.47, 1.40, 1.55, 1.44, 1.41, + 1.38, 1.53, 1.42, 1.55, 1.55, + 1.31, 1.37, 1.53, 1.47, 1.51 +}; +DoubleStatistics stats = DoubleStatistics.of(EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x); + +double mean = stats.getAsDouble(Statistic.MEAN); // 1.46 +double variance = stats.getAsDouble(Statistic.VARIANCE); // 0.0058 +long n = stats.getCount(); // 15 +double alpha = 0.05; + +Interval interval = NormalConfidenceInterval.MEAN.fromErrorRate(mean, variance, n, alpha); +interval.getLowerBound(); // 1.4170 +interval.getUpperBound(); // 1.5017 </source> </section> <section name="Ranking" id="ranking">