This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-statistics.git


The following commit(s) were added to refs/heads/master by this push:
     new 1e5c695  STATISTICS-59: Update sampling of Pareto distribution with 
extreme shape
1e5c695 is described below

commit 1e5c695164a886d4654418cf6c64e26ea791b8c2
Author: aherbert <aherb...@apache.org>
AuthorDate: Wed Nov 23 17:30:06 2022 +0000

    STATISTICS-59: Update sampling of Pareto distribution with extreme shape
    
    Large shape should sample from uniform p in [0, 1) to concentrate
    samples towards the scale parameter.
    
    Tiny shape should sample from uniform p in (0, 1] to concentrate samples
    towards infinity.
    
    Added test resource files for tiny shape when 1 / shape is infinite.
---
 .../distribution/ParetoDistribution.java           |  40 ++++-
 .../distribution/ParetoDistributionTest.java       | 182 +++++++++++++++++++++
 ...reto.6.properties => test.pareto.10.properties} |  20 ++-
 ...reto.7.properties => test.pareto.11.properties} |  20 ++-
 .../distribution/test.pareto.5.properties          |   5 +-
 .../distribution/test.pareto.6.properties          |   5 +-
 .../distribution/test.pareto.7.properties          |   5 +-
 ...areto.5.properties => test.pareto.9.properties} |  20 ++-
 8 files changed, 264 insertions(+), 33 deletions(-)

diff --git 
a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java
 
b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java
index 48506a7..13a71a2 100644
--- 
a/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java
+++ 
b/commons-statistics-distribution/src/main/java/org/apache/commons/statistics/distribution/ParetoDistribution.java
@@ -82,7 +82,7 @@ public final class ParetoDistribution extends 
AbstractContinuousDistribution {
                 pdf = x -> Math.exp(logpdf.applyAsDouble(x));
             } else  {
                 // Assume Dirac function
-                logpdf = x -> x > scale ? -Double.POSITIVE_INFINITY : 
Double.POSITIVE_INFINITY;
+                logpdf = x -> x > scale ? Double.NEGATIVE_INFINITY : 
Double.POSITIVE_INFINITY;
                 // PDF has infinite value at lower bound
                 pdf = x -> x > scale ? 0 : Double.POSITIVE_INFINITY;
             }
@@ -103,7 +103,6 @@ public final class ParetoDistribution extends 
AbstractContinuousDistribution {
         if (scale <= 0 || scale == Double.POSITIVE_INFINITY) {
             throw new 
DistributionException(DistributionException.NOT_STRICTLY_POSITIVE_FINITE, 
scale);
         }
-
         if (shape <= 0) {
             throw new 
DistributionException(DistributionException.NOT_STRICTLY_POSITIVE, shape);
         }
@@ -297,6 +296,41 @@ public final class ParetoDistribution extends 
AbstractContinuousDistribution {
     @Override
     public ContinuousDistribution.Sampler createSampler(final 
UniformRandomProvider rng) {
         // Pareto distribution sampler.
-        return InverseTransformParetoSampler.of(rng, scale, shape)::sample;
+        // Commons RNG v1.5 uses nextDouble() for (1 - p) effectively sampling 
from p in (0, 1].
+        // Ensure sampling is concentrated at the lower / upper bound at 
extreme shapes:
+        // Large shape should sample using p in [0, 1)  (lower bound)
+        // Small shape should sample using p in (0, 1]  (upper bound)
+        // Note: For small shape the input RNG is also wrapped to use nextLong 
as the source of
+        // randomness; this ensures the nextDouble method uses the interface 
output of [0, 1).
+        final UniformRandomProvider wrappedRng = shape >= 1 ? new 
InvertedRNG(rng) : rng::nextLong;
+        return InverseTransformParetoSampler.of(wrappedRng, scale, 
shape)::sample;
+    }
+
+    /**
+     * Create a RNG that inverts the output from nextDouble() as (1 - 
nextDouble()).
+     */
+    private static class InvertedRNG implements UniformRandomProvider {
+        /** Source of randomness. */
+        private final UniformRandomProvider rng;
+
+        /**
+         * @param rng Source of randomness
+         */
+        InvertedRNG(UniformRandomProvider rng) {
+            this.rng = rng;
+        }
+
+        @Override
+        public long nextLong() {
+            // Delegate the source of randomness
+            return rng.nextLong();
+        }
+
+        @Override
+        public double nextDouble() {
+            // Return a value in (0, 1].
+            // This assumes the interface method outputs in [0, 1).
+            return 1 - UniformRandomProvider.super.nextDouble();
+        }
     }
 }
diff --git 
a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java
 
b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java
index 2ab90e1..f719f40 100644
--- 
a/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java
+++ 
b/commons-statistics-distribution/src/test/java/org/apache/commons/statistics/distribution/ParetoDistributionTest.java
@@ -17,6 +17,8 @@
 
 package org.apache.commons.statistics.distribution;
 
+import org.apache.commons.rng.UniformRandomProvider;
+import org.apache.commons.rng.simple.RandomSource;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -27,6 +29,13 @@ import org.junit.jupiter.params.provider.CsvSource;
  * Extends {@link BaseContinuousDistributionTest}. See javadoc of that class 
for details.
  */
 class ParetoDistributionTest extends BaseContinuousDistributionTest {
+    /**
+     * The difference each of the 2^53 dyadic rationals in [0, 1).
+     * This is the smallest non-zero value for p to use when inverse transform 
sampling.
+     * Equal to 2^-53.
+     */
+    private static final double U = 0x1.0p-53;
+
     @Override
     ContinuousDistribution makeDistribution(Object... parameters) {
         final double scale = (Double) parameters[0];
@@ -211,4 +220,177 @@ class ParetoDistributionTest extends 
BaseContinuousDistributionTest {
             Assertions.assertTrue(Double.isFinite(Math.log(shape) + 
Math.log(scale2) * shape));
         }
     }
+
+    /**
+     * Test sampling with a large shape. As {@code shape -> inf} then the 
distribution
+     * approaches a delta function with {@code CDF(x=scale)} = 0 and {@code 
CDF(x>scale) = 1}.
+     * This test verifies that a large shape is effectively sampled from p in 
[0, 1) to avoid
+     * spurious infinite samples when p=1.
+     *
+     * <p>Sampling Details
+     *
+     * <p>Note that sampling is using inverse transform sampling by inverting 
the CDF:
+     * <pre>
+     * CDF(x) = 1 - (scale / x)^shape
+     * x = scale / (1 - p)^(1 / shape)
+     *   = scale / exp(log(1 - p) / shape)
+     * </pre>
+     *
+     * <p>The sampler in Commons RNG is inverting the CDF function using 
Math.pow:
+     * <pre>
+     * x = scale / Math.pow(1 - p, 1 / shape)
+     * </pre>
+     *
+     * <p>The Pareto distribution uses log functions to achieve the same 
result:
+     * <pre>
+     * x = scale / Math.exp(Math.log1p(-p) / shape);
+     * </pre>
+     *
+     * <p>Inversion will return the scale when Math.exp(X) == 1 where X (in 
[-inf, 0]) is:
+     * <pre>
+     * X = log(1 - p) / shape
+     * </pre>
+     *
+     * <p>This occurs when {@code X > log(1.0 - epsilon)}, or larger (closer 
to zero) than
+     * {@code Math.log(Math.nextDown(1.0))}; X is approximately -1.11e-16.
+     * During sampling p is bounded to the 2^53 dyadic rationals in [0, 1). 
The largest
+     * finite value for the logarithm is log(2^-53) thus the critical size for 
shape is around:
+     * <pre>
+     * shape = log(2^-53) / -1.1102230246251565e-16 = 3.3089568271276403e17
+     * </pre>
+     *
+     * <p>Note that if the p-value is 1 then inverseCumulativeProbability(1.0) 
== inf.
+     * However using the power function to invert this ignores this 
possibility when the shape
+     * is infinite and will always return scale / x^0 = scale / 1 = scale. If 
the inversion
+     * using logarithms is directly used then a log(0) / inf == -inf / inf == 
NaN occurs.
+     */
+    @ParameterizedTest
+    @CsvSource({
+        // Scale values match those from the test resource files where the 
sampling test is disabled
+        "10, Infinity",
+        "1, Infinity",
+        "0.1, Infinity",
+        // This behaviour occurs even when the shape is not infinite due to 
limited precision
+        // of double values. Shape is set to twice the limit derived above to 
account for rounding:
+        // double p = 0x1.0p-53
+        // Math.pow(p, 1 / (Math.log(p) / -p))     ==> 0.9999999999999999
+        // Math.pow(p, 1 / (2 * Math.log(p) / -p)) ==> 1.0
+        // shape = (2 * Math.log(p) / -p)
+        "10, 6.6179136542552806e17",
+        "1, 6.6179136542552806e17",
+        "0.1, 6.6179136542552806e17",
+    })
+    void testSamplingWithLargeShape(double scale, double shape) {
+        final ParetoDistribution dist = ParetoDistribution.of(scale, shape);
+
+        // Sampling should act as if inverting p in [0, 1)
+        final double x0 = dist.inverseCumulativeProbability(0);
+        final double x1 = dist.inverseCumulativeProbability(1 - U);
+        Assertions.assertEquals(scale, x0);
+        Assertions.assertEquals(x0, x1, "Test parameters did not create an 
extreme distribution");
+
+        // Sampling for p in [0, 1): returns scale when shape is large
+        assertSampler(dist, scale);
+    }
+
+    /**
+     * Test sampling with a tiny shape. As {@code shape -> 0} then the 
distribution
+     * approaches a function with {@code CDF(x=inf) = 1} and {@code 
CDF(x>=scale) = 0}.
+     * This test verifies that a tiny shape is effectively sampled from p in 
(0, 1] to avoid
+     * spurious NaN samples when p=0.
+     *
+     * <p>Sampling Details
+     *
+     * <p>The sampler in Commons RNG is inverting the CDF function using 
Math.pow:
+     * <pre>
+     * x = scale / Math.pow(1 - p, 1 / shape)
+     * </pre>
+     *
+     * <p>However Math.pow(1, infinity) == NaN. This can be avoided if p=0 is 
not used.
+     * For all other values Math.pow(1 - p, infinity) == 0 and the sample is 
infinite.
+     */
+    @ParameterizedTest
+    @CsvSource({
+        // 1 / shape is infinite
+        // Scale values match those from the test resource files where the 
sampling test is disabled
+        "10, 4.9e-324",
+        "1, 4.9e-324",
+        "0.1, 4.9e-324",
+        // This behaviour occurs even when 1 / shape is not infinite due to 
limited precision
+        // of double values. Shape provides the largest possible finite value 
from 1 / shape:
+        // shape = (1.0 + Math.ulp(1.0)*2) / Double.MAX_VALUE
+        // 1 / shape = 1.7976931348623143e308
+        // 1 / Math.nextDown(shape) = Infinity
+        "10, 5.56268464626801E-309",
+        "1, 5.56268464626801E-309",
+        "0.1, 5.56268464626801E-309",
+        // Lower limit is where pow(1 - p, 1 / shape) < Double.MIN_VALUE:
+        // shape < log(1 - p) / log(MIN_VALUE)
+        // Shape is set to half this limit to account for rounding:
+        // double p = 0x1.0p-53
+        // Math.pow(1 - p, 1 / (Math.log(1 - p) / Math.log(Double.MIN_VALUE))) 
==> 4.9e-324
+        // Math.pow(1 - p, 2 / (Math.log(1 - p) / Math.log(Double.MIN_VALUE))) 
==> 0.0
+        // shape = 0.5 * Math.log(1 - p) / Math.log(Double.MIN_VALUE)
+        "10, 7.456765604783329e-20",
+        "1, 7.456765604783329e-20",
+        // Use smallest possible scale: test will fail if shape is not half 
the limit
+        "4.9e-324, 7.456765604783329e-20",
+    })
+    void testSamplingWithTinyShape(double scale, double shape) {
+        final ParetoDistribution dist = ParetoDistribution.of(scale, shape);
+
+        // Sampling should act as if inverting p in (0, 1]
+        final double x0 = dist.inverseCumulativeProbability(U);
+        final double x1 = dist.inverseCumulativeProbability(1);
+        Assertions.assertEquals(Double.POSITIVE_INFINITY, x1);
+        Assertions.assertEquals(x1, x0, "Test parameters did not create an 
extreme distribution");
+
+        // Sampling for p in [0, 1): returns infinity when shape is tiny
+        assertSampler(dist, Double.POSITIVE_INFINITY);
+    }
+
+    /**
+     * Assert the sampler produces the expected sample value irrespective of 
the values from the RNG.
+     *
+     * @param dist Distribution
+     * @param expected Expected sample value
+     */
+    private static void assertSampler(ParetoDistribution dist, double 
expected) {
+        // Extreme random numbers using no bits or all bits, then combinations
+        // that may be used to generate a double from the lower or upper 
53-bits
+        final long[] values = {0, -1, 1, 1L << 11, -2, -2L << 11};
+        final UniformRandomProvider rng = createRNG(values);
+        ContinuousDistribution.Sampler s = dist.createSampler(rng);
+        for (final long l : values) {
+            Assertions.assertEquals(expected, s.sample(), () -> "long bits = " 
+ l);
+        }
+        // Any random number
+        final long seed = RandomSource.createLong();
+        s = dist.createSampler(RandomSource.SPLIT_MIX_64.create(seed));
+        for (int i = 0; i < 100; i++) {
+            Assertions.assertEquals(expected, s.sample(), () -> "seed = " + 
seed);
+        }
+    }
+
+    /**
+     * Creates the RNG to return the given values from the nextLong() method.
+     *
+     * @param values Long values
+     * @return the RNG
+     */
+    private static UniformRandomProvider createRNG(long... values) {
+        return new UniformRandomProvider() {
+            private int i;
+
+            @Override
+            public long nextLong() {
+                return values[i++];
+            }
+
+            @Override
+            public double nextDouble() {
+                throw new IllegalStateException("nextDouble cannot be trusted 
to be in [0, 1) and should be ignored");
+            }
+        };
+    }
 }
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties
similarity index 60%
copy from 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
copy to 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties
index 3291bd3..7882d53 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.10.properties
@@ -13,14 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
-parameters = 1.0 Infinity
-mean = 1
-variance = 0
+# 1 / shape -> infinity pushes the CDF towards infinity
+parameters = 1.0 4.9e-324
+mean = Inf
+variance = Inf
 lower = 1.0
-cdf.points = 1.0 1.1 1.2 1.3
-cdf.values = 0 1 1 1
-pdf.values = Inf 0 0 0
+cdf.points = 1.0 1.1 1.2 Inf
+cdf.values = 0 0 0 1
+# Note: pdf(x=inf) remains zero. The value is small but not representable.
+pdf.values = 0 0 0 0
+# Log values computed manually. Density reduces as x increases.
+logpdf.values = -744.4400719213812 -744.5353821011855 -744.6223934781751 -Inf
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As shape -> infinity then sampling will return the scale parameter.
 disable.sample = true
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties
similarity index 61%
copy from 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
copy to 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties
index 5d4b407..f22b078 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.11.properties
@@ -13,14 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
-parameters = 0.1 Infinity
-mean = 0.1
-variance = 0
+# 1 / shape -> infinity pushes the CDF towards infinity
+parameters = 0.1 4.9e-324
+mean = Inf
+variance = Inf
 lower = 0.1
-cdf.points = 0.1 0.2 0.3
-cdf.values = 0 1 1
-pdf.values = Inf 0 0
+cdf.points = 0.1 0.2 0.3 Inf
+cdf.values = 0 0 0 1
+# PDF values computed manually
+pdf.values = 4.9E-323 2.5E-323 1.5E-323 0
+# Log values computed manually. Density reduces as x increases.
+logpdf.values = -742.1374868283872 -742.8306340089471 -743.2360991170552 -Inf
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As shape -> infinity then sampling will return the scale parameter.
 disable.sample = true
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
index ba83852..f23e9b1 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
+# shape -> infinity creates a delta function
 parameters = 10.0 Infinity
 mean = 10
 variance = 0
@@ -22,5 +22,6 @@ cdf.points = 10.0 10.1 10.2
 cdf.values = 0 1 1
 pdf.values = Inf 0 0
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As shape -> infinity then sampling will return the scale parameter.
 disable.sample = true
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
index 3291bd3..9f89f68 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.6.properties
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
+# shape -> infinity creates a delta function
 parameters = 1.0 Infinity
 mean = 1
 variance = 0
@@ -22,5 +22,6 @@ cdf.points = 1.0 1.1 1.2 1.3
 cdf.values = 0 1 1 1
 pdf.values = Inf 0 0 0
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As shape -> infinity then sampling will return the scale parameter.
 disable.sample = true
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
index 5d4b407..24871e6 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.7.properties
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
+# shape -> infinity creates a delta function
 parameters = 0.1 Infinity
 mean = 0.1
 variance = 0
@@ -22,5 +22,6 @@ cdf.points = 0.1 0.2 0.3
 cdf.values = 0 1 1
 pdf.values = Inf 0 0
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As shape -> infinity then sampling will return the scale parameter.
 disable.sample = true
diff --git 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties
similarity index 60%
copy from 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
copy to 
commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties
index ba83852..eb18228 100644
--- 
a/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.5.properties
+++ 
b/commons-statistics-distribution/src/test/resources/org/apache/commons/statistics/distribution/test.pareto.9.properties
@@ -13,14 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# scale -> infinity creates a delta function
-parameters = 10.0 Infinity
-mean = 10
-variance = 0
+# 1 / shape -> infinity pushes the CDF towards infinity
+parameters = 10.0 4.9e-324
+mean = Inf
+variance = Inf
 lower = 10.0
-cdf.points = 10.0 10.1 10.2
-cdf.values = 0 1 1
-pdf.values = Inf 0 0
+cdf.points = 10.0 10.1 10.2 Inf
+cdf.values = 0 0 0 1
+# Note: pdf(x=inf) remains zero. The value is small but not representable.
+pdf.values = 0 0 0 0
+# Log values computed manually. Density reduces as x increases.
+logpdf.values = -746.7426570143753 -746.7526073452284 -746.7624596416714 -Inf
 
-# TODO: fix sampling. Possibly return Math.nextUp(shape).
+# The sampling test is not applicable as there are no quartiles.
+# As 1 / shape -> infinity then sampling will return infinity.
 disable.sample = true

Reply via email to