This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
The following commit(s) were added to refs/heads/master by this push: new 1e5c695 STATISTICS-59: Update sampling of Pareto distribution with extreme shape 1e5c695 is described below commit 1e5c695164a886d4654418cf6c64e26ea791b8c2 Author: aherbert <aherb...@apache.org> AuthorDate: Wed Nov 23 17:30:06 2022 +0000 STATISTICS-59: Update sampling of Pareto distribution with extreme shape Large shape should sample from uniform p in [0, 1) to concentrate samples towards the scale parameter. Tiny shape should sample from uniform p in (0, 1] to concentrate samples towards infinity. Added test resource files for tiny shape when 1 / shape is infinite. --- .../distribution/ParetoDistribution.java | 40 ++++- .../distribution/ParetoDistributionTest.java | 182 +++++++++++++++++++++ ...reto.6.properties => test.pareto.10.properties} | 20 ++- ...reto.7.properties => test.pareto.11.properties} | 20 ++- .../distribution/test.pareto.5.properties | 5 +- .../distribution/test.pareto.6.properties | 5 +- .../distribution/test.pareto.7.properties | 5 +- ...areto.5.properties => test.pareto.9.properties} | 20 ++- 8 files changed, 264 insertions(+), 33 deletions(-) diff --git a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java index 48506a7..13a71a2 100644 --- a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java +++ b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java @@ -82,7 +82,7 @@ public final class ParetoDistribution extends AbstractContinuousDistribution { pdf = x -> Math.exp(logpdf.applyAsDouble(x)); } else { // Assume Dirac function - logpdf = x -> x > scale ? -Double.POSITIVE_INFINITY : Double.POSITIVE_INFINITY; + logpdf = x -> x > scale ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; // PDF has infinite value at lower bound pdf = x -> x > scale ? 0 : Double.POSITIVE_INFINITY; } @@ -103,7 +103,6 @@ public final class ParetoDistribution extends AbstractContinuousDistribution { if (scale <= 0 || scale == Double.POSITIVE_INFINITY) { throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE_FINITE, scale); } - if (shape <= 0) { throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE, shape); } @@ -297,6 +296,41 @@ public final class ParetoDistribution extends AbstractContinuousDistribution { @Override public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) { // Pareto distribution sampler. - return InverseTransformParetoSampler.of(rng, scale, shape)::sample; + // Commons RNG v1.5 uses nextDouble() for (1 - p) effectively sampling from p in (0, 1]. + // Ensure sampling is concentrated at the lower / upper bound at extreme shapes: + // Large shape should sample using p in [0, 1) (lower bound) + // Small shape should sample using p in (0, 1] (upper bound) + // Note: For small shape the input RNG is also wrapped to use nextLong as the source of + // randomness; this ensures the nextDouble method uses the interface output of [0, 1). + final UniformRandomProvider wrappedRng = shape >= 1 ? new InvertedRNG(rng) : rng::nextLong; + return InverseTransformParetoSampler.of(wrappedRng, scale, shape)::sample; + } + + /** + * Create a RNG that inverts the output from nextDouble() as (1 - nextDouble()). + */ + private static class InvertedRNG implements UniformRandomProvider { + /** Source of randomness. */ + private final UniformRandomProvider rng; + + /** + * @param rng Source of randomness + */ + InvertedRNG(UniformRandomProvider rng) { + this.rng = rng; + } + + @Override + public long nextLong() { + // Delegate the source of randomness + return rng.nextLong(); + } + + @Override + public double nextDouble() { + // Return a value in (0, 1]. + // This assumes the interface method outputs in [0, 1). + return 1 - UniformRandomProvider.super.nextDouble(); + } } } diff --git a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java index 2ab90e1..f719f40 100644 --- a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java +++ b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java @@ -17,6 +17,8 @@ package org.apache.commons.statistics.distribution; +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.simple.RandomSource; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -27,6 +29,13 @@ import org.junit.jupiter.params.provider.CsvSource; * Extends {@link BaseContinuousDistributionTest}. See javadoc of that class for details. */ class ParetoDistributionTest extends BaseContinuousDistributionTest { + /** + * The difference each of the 2^53 dyadic rationals in [0, 1). + * This is the smallest non-zero value for p to use when inverse transform sampling. + * Equal to 2^-53. + */ + private static final double U = 0x1.0p-53; + @Override ContinuousDistribution makeDistribution(Object... parameters) { final double scale = (Double) parameters[0]; @@ -211,4 +220,177 @@ class ParetoDistributionTest extends BaseContinuousDistributionTest { Assertions.assertTrue(Double.isFinite(Math.log(shape) + Math.log(scale2) * shape)); } } + + /** + * Test sampling with a large shape. As {@code shape -> inf} then the distribution + * approaches a delta function with {@code CDF(x=scale)} = 0 and {@code CDF(x>scale) = 1}. + * This test verifies that a large shape is effectively sampled from p in [0, 1) to avoid + * spurious infinite samples when p=1. + * + * <p>Sampling Details + * + * <p>Note that sampling is using inverse transform sampling by inverting the CDF: + * <pre> + * CDF(x) = 1 - (scale / x)^shape + * x = scale / (1 - p)^(1 / shape) + * = scale / exp(log(1 - p) / shape) + * </pre> + * + * <p>The sampler in Commons RNG is inverting the CDF function using Math.pow: + * <pre> + * x = scale / Math.pow(1 - p, 1 / shape) + * </pre> + * + * <p>The Pareto distribution uses log functions to achieve the same result: + * <pre> + * x = scale / Math.exp(Math.log1p(-p) / shape); + * </pre> + * + * <p>Inversion will return the scale when Math.exp(X) == 1 where X (in [-inf, 0]) is: + * <pre> + * X = log(1 - p) / shape + * </pre> + * + * <p>This occurs when {@code X > log(1.0 - epsilon)}, or larger (closer to zero) than + * {@code Math.log(Math.nextDown(1.0))}; X is approximately -1.11e-16. + * During sampling p is bounded to the 2^53 dyadic rationals in [0, 1). The largest + * finite value for the logarithm is log(2^-53) thus the critical size for shape is around: + * <pre> + * shape = log(2^-53) / -1.1102230246251565e-16 = 3.3089568271276403e17 + * </pre> + * + * <p>Note that if the p-value is 1 then inverseCumulativeProbability(1.0) == inf. + * However using the power function to invert this ignores this possibility when the shape + * is infinite and will always return scale / x^0 = scale / 1 = scale. If the inversion + * using logarithms is directly used then a log(0) / inf == -inf / inf == NaN occurs. + */ + @ParameterizedTest + @CsvSource({ + // Scale values match those from the test resource files where the sampling test is disabled + "10, Infinity", + "1, Infinity", + "0.1, Infinity", + // This behaviour occurs even when the shape is not infinite due to limited precision + // of double values. Shape is set to twice the limit derived above to account for rounding: + // double p = 0x1.0p-53 + // Math.pow(p, 1 / (Math.log(p) / -p)) ==> 0.9999999999999999 + // Math.pow(p, 1 / (2 * Math.log(p) / -p)) ==> 1.0 + // shape = (2 * Math.log(p) / -p) + "10, 6.6179136542552806e17", + "1, 6.6179136542552806e17", + "0.1, 6.6179136542552806e17", + }) + void testSamplingWithLargeShape(double scale, double shape) { + final ParetoDistribution dist = ParetoDistribution.of(scale, shape); + + // Sampling should act as if inverting p in [0, 1) + final double x0 = dist.inverseCumulativeProbability(0); + final double x1 = dist.inverseCumulativeProbability(1 - U); + Assertions.assertEquals(scale, x0); + Assertions.assertEquals(x0, x1, "Test parameters did not create an extreme distribution"); + + // Sampling for p in [0, 1): returns scale when shape is large + assertSampler(dist, scale); + } + + /** + * Test sampling with a tiny shape. As {@code shape -> 0} then the distribution + * approaches a function with {@code CDF(x=inf) = 1} and {@code CDF(x>=scale) = 0}. + * This test verifies that a tiny shape is effectively sampled from p in (0, 1] to avoid + * spurious NaN samples when p=0. + * + * <p>Sampling Details + * + * <p>The sampler in Commons RNG is inverting the CDF function using Math.pow: + * <pre> + * x = scale / Math.pow(1 - p, 1 / shape) + * </pre> + * + * <p>However Math.pow(1, infinity) == NaN. This can be avoided if p=0 is not used. + * For all other values Math.pow(1 - p, infinity) == 0 and the sample is infinite. + */ + @ParameterizedTest + @CsvSource({ + // 1 / shape is infinite + // Scale values match those from the test resource files where the sampling test is disabled + "10, 4.9e-324", + "1, 4.9e-324", + "0.1, 4.9e-324", + // This behaviour occurs even when 1 / shape is not infinite due to limited precision + // of double values. Shape provides the largest possible finite value from 1 / shape: + // shape = (1.0 + Math.ulp(1.0)*2) / Double.MAX_VALUE + // 1 / shape = 1.7976931348623143e308 + // 1 / Math.nextDown(shape) = Infinity + "10, 5.56268464626801E-309", + "1, 5.56268464626801E-309", + "0.1, 5.56268464626801E-309", + // Lower limit is where pow(1 - p, 1 / shape) < Double.MIN_VALUE: + // shape < log(1 - p) / log(MIN_VALUE) + // Shape is set to half this limit to account for rounding: + // double p = 0x1.0p-53 + // Math.pow(1 - p, 1 / (Math.log(1 - p) / Math.log(Double.MIN_VALUE))) ==> 4.9e-324 + // Math.pow(1 - p, 2 / (Math.log(1 - p) / Math.log(Double.MIN_VALUE))) ==> 0.0 + // shape = 0.5 * Math.log(1 - p) / Math.log(Double.MIN_VALUE) + "10, 7.456765604783329e-20", + "1, 7.456765604783329e-20", + // Use smallest possible scale: test will fail if shape is not half the limit + "4.9e-324, 7.456765604783329e-20", + }) + void testSamplingWithTinyShape(double scale, double shape) { + final ParetoDistribution dist = ParetoDistribution.of(scale, shape); + + // Sampling should act as if inverting p in (0, 1] + final double x0 = dist.inverseCumulativeProbability(U); + final double x1 = dist.inverseCumulativeProbability(1); + Assertions.assertEquals(Double.POSITIVE_INFINITY, x1); + Assertions.assertEquals(x1, x0, "Test parameters did not create an extreme distribution"); + + // Sampling for p in [0, 1): returns infinity when shape is tiny + assertSampler(dist, Double.POSITIVE_INFINITY); + } + + /** + * Assert the sampler produces the expected sample value irrespective of the values from the RNG. + * + * @param dist Distribution + * @param expected Expected sample value + */ + private static void assertSampler(ParetoDistribution dist, double expected) { + // Extreme random numbers using no bits or all bits, then combinations + // that may be used to generate a double from the lower or upper 53-bits + final long[] values = {0, -1, 1, 1L << 11, -2, -2L << 11}; + final UniformRandomProvider rng = createRNG(values); + ContinuousDistribution.Sampler s = dist.createSampler(rng); + for (final long l : values) { + Assertions.assertEquals(expected, s.sample(), () -> "long bits = " + l); + } + // Any random number + final long seed = RandomSource.createLong(); + s = dist.createSampler(RandomSource.SPLIT_MIX_64.create(seed)); + for (int i = 0; i < 100; i++) { + Assertions.assertEquals(expected, s.sample(), () -> "seed = " + seed); + } + } + + /** + * Creates the RNG to return the given values from the nextLong() method. + * + * @param values Long values + * @return the RNG + */ + private static UniformRandomProvider createRNG(long... values) { + return new UniformRandomProvider() { + private int i; + + @Override + public long nextLong() { + return values[i++]; + } + + @Override + public double nextDouble() { + throw new IllegalStateException("nextDouble cannot be trusted to be in [0, 1) and should be ignored"); + } + }; + } } diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties similarity index 60% copy from commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties copy to commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties index 3291bd3..7882d53 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties @@ -13,14 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function -parameters = 1.0 Infinity -mean = 1 -variance = 0 +# 1 / shape -> infinity pushes the CDF towards infinity +parameters = 1.0 4.9e-324 +mean = Inf +variance = Inf lower = 1.0 -cdf.points = 1.0 1.1 1.2 1.3 -cdf.values = 0 1 1 1 -pdf.values = Inf 0 0 0 +cdf.points = 1.0 1.1 1.2 Inf +cdf.values = 0 0 0 1 +# Note: pdf(x=inf) remains zero. The value is small but not representable. +pdf.values = 0 0 0 0 +# Log values computed manually. Density reduces as x increases. +logpdf.values = -744.4400719213812 -744.5353821011855 -744.6223934781751 -Inf -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As shape -> infinity then sampling will return the scale parameter. disable.sample = true diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties similarity index 61% copy from commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties copy to commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties index 5d4b407..f22b078 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties @@ -13,14 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function -parameters = 0.1 Infinity -mean = 0.1 -variance = 0 +# 1 / shape -> infinity pushes the CDF towards infinity +parameters = 0.1 4.9e-324 +mean = Inf +variance = Inf lower = 0.1 -cdf.points = 0.1 0.2 0.3 -cdf.values = 0 1 1 -pdf.values = Inf 0 0 +cdf.points = 0.1 0.2 0.3 Inf +cdf.values = 0 0 0 1 +# PDF values computed manually +pdf.values = 4.9E-323 2.5E-323 1.5E-323 0 +# Log values computed manually. Density reduces as x increases. +logpdf.values = -742.1374868283872 -742.8306340089471 -743.2360991170552 -Inf -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As shape -> infinity then sampling will return the scale parameter. disable.sample = true diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties index ba83852..f23e9b1 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function +# shape -> infinity creates a delta function parameters = 10.0 Infinity mean = 10 variance = 0 @@ -22,5 +22,6 @@ cdf.points = 10.0 10.1 10.2 cdf.values = 0 1 1 pdf.values = Inf 0 0 -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As shape -> infinity then sampling will return the scale parameter. disable.sample = true diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties index 3291bd3..9f89f68 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function +# shape -> infinity creates a delta function parameters = 1.0 Infinity mean = 1 variance = 0 @@ -22,5 +22,6 @@ cdf.points = 1.0 1.1 1.2 1.3 cdf.values = 0 1 1 1 pdf.values = Inf 0 0 0 -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As shape -> infinity then sampling will return the scale parameter. disable.sample = true diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties index 5d4b407..24871e6 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function +# shape -> infinity creates a delta function parameters = 0.1 Infinity mean = 0.1 variance = 0 @@ -22,5 +22,6 @@ cdf.points = 0.1 0.2 0.3 cdf.values = 0 1 1 pdf.values = Inf 0 0 -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As shape -> infinity then sampling will return the scale parameter. disable.sample = true diff --git a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties similarity index 60% copy from commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties copy to commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties index ba83852..eb18228 100644 --- a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties +++ b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties @@ -13,14 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -# scale -> infinity creates a delta function -parameters = 10.0 Infinity -mean = 10 -variance = 0 +# 1 / shape -> infinity pushes the CDF towards infinity +parameters = 10.0 4.9e-324 +mean = Inf +variance = Inf lower = 10.0 -cdf.points = 10.0 10.1 10.2 -cdf.values = 0 1 1 -pdf.values = Inf 0 0 +cdf.points = 10.0 10.1 10.2 Inf +cdf.values = 0 0 0 1 +# Note: pdf(x=inf) remains zero. The value is small but not representable. +pdf.values = 0 0 0 0 +# Log values computed manually. Density reduces as x increases. +logpdf.values = -746.7426570143753 -746.7526073452284 -746.7624596416714 -Inf -# TODO: fix sampling. Possibly return Math.nextUp(shape). +# The sampling test is not applicable as there are no quartiles. +# As 1 / shape -> infinity then sampling will return infinity. disable.sample = true