This is an automated email from the ASF dual-hosted git repository. erans pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-math.git
commit 9dbceb0ed1d58c6ccb4d841a2384fe8f6d98149c Author: Gilles Sadowski <gillese...@gmail.com> AuthorDate: Tue Jul 20 11:45:45 2021 +0200 MATH-1616: Refactor "EmpiricalDistribution". * No default bin count (cf. MATH-1462). * No data loading from external sources (file, URL). * No data abstraction layer. * Return defensive copies of the internal state. * Make class immutable. * Allow user-defined within-bin kernel. --- .../legacy/distribution/EmpiricalDistribution.java | 494 +++++++-------------- .../distribution/EmpiricalDistributionTest.java | 420 ++++-------------- 2 files changed, 264 insertions(+), 650 deletions(-) diff --git a/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java b/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java index 65e5808..3fd8bbf 100644 --- a/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java +++ b/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java @@ -17,29 +17,16 @@ package org.apache.commons.math4.legacy.distribution; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URL; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; +import java.util.function.Function; import org.apache.commons.statistics.distribution.NormalDistribution; import org.apache.commons.statistics.distribution.ContinuousDistribution; import org.apache.commons.statistics.distribution.ConstantContinuousDistribution; import org.apache.commons.numbers.core.Precision; -import org.apache.commons.rng.UniformRandomProvider; -import org.apache.commons.math4.legacy.exception.MathIllegalStateException; -import org.apache.commons.math4.legacy.exception.MathInternalError; -import org.apache.commons.math4.legacy.exception.NullArgumentException; import org.apache.commons.math4.legacy.exception.OutOfRangeException; -import org.apache.commons.math4.legacy.exception.ZeroException; import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException; -import org.apache.commons.math4.legacy.exception.util.LocalizedFormats; import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary; import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics; import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath; @@ -99,290 +86,138 @@ import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath; * * <strong>USAGE NOTES:</strong> * <ul> - * <li>The {@code binCount} is set by default to 1000. A good rule of thumb - * is to set the bin count to approximately the length of the input file divided - * by 10. </li> - * <li>The input file <i>must</i> be a plain text file containing one valid numeric - * entry per line.</li> + * <li> + * The {@code binCount} is set by default to 1000. A good rule of thumb + * is to set the bin count to approximately the length of the input file + * divided by 10. </li> + * <li> + * The input file <i>must</i> be a plain text file containing one valid + * numeric entry per line.</li> * </ul> */ -public class EmpiricalDistribution extends AbstractRealDistribution +public final class EmpiricalDistribution extends AbstractRealDistribution implements ContinuousDistribution { - /** Default bin count. */ - public static final int DEFAULT_BIN_COUNT = 1000; - /** Character set for file input. */ - private static final String FILE_CHARSET = "US-ASCII"; - /** Bins' characteristics. */ + /** Bins characteristics. */ private final List<SummaryStatistics> binStats; /** Sample statistics. */ - private SummaryStatistics sampleStats; + private final SummaryStatistics sampleStats; /** Max loaded value. */ - private double max = Double.NEGATIVE_INFINITY; + private final double max; /** Min loaded value. */ - private double min = Double.POSITIVE_INFINITY; + private final double min; /** Grid size. */ - private double delta; + private final double delta; /** Number of bins. */ private final int binCount; - /** Whether the distribution is loaded. */ - private boolean loaded; - /** Upper bounds of subintervals in (0,1) belonging to the bins. */ - private double[] upperBounds; + /** Upper bounds of subintervals in (0, 1) belonging to the bins. */ + private final double[] upperBounds; + /** Kernel factory. */ + private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory; /** - * Creates a new EmpiricalDistribution with the default bin count. - */ - public EmpiricalDistribution() { - this(DEFAULT_BIN_COUNT); - } - - /** - * Creates a new EmpiricalDistribution with the specified bin count. + * Creates a new instance with the specified data. * - * @param binCount number of bins. Must be strictly positive. + * @param binCount Number of bins. Must be strictly positive. + * @param input Input data. Cannot be {@code null}. + * @param kernelFactory Kernel factory. * @throws NotStrictlyPositiveException if {@code binCount <= 0}. */ - public EmpiricalDistribution(int binCount) { + private EmpiricalDistribution(int binCount, + double[] input, + Function<SummaryStatistics, ContinuousDistribution> kernelFactory) { if (binCount <= 0) { throw new NotStrictlyPositiveException(binCount); } this.binCount = binCount; - binStats = new ArrayList<>(); - } - /** - * Computes the empirical distribution from the provided - * array of numbers. - * - * @param in the input data array - * @exception NullArgumentException if in is null - */ - public void load(double[] in) { - DataAdapter da = new ArrayDataAdapter(in); - try { - da.computeStats(); - // new adapter for the second pass - fillBinStats(new ArrayDataAdapter(in)); - } catch (IOException ex) { - // Can't happen - throw new MathInternalError(); + // First pass through the data. + sampleStats = new SummaryStatistics(); + for (int i = 0; i < input.length; i++) { + sampleStats.addValue(input[i]); } - loaded = true; - } + // Set up grid. + min = sampleStats.getMin(); + max = sampleStats.getMax(); + delta = (max - min) / binCount; - /** - * Computes the empirical distribution using data read from a URL. - * - * <p>The input file <i>must</i> be an ASCII text file containing one - * valid numeric entry per line.</p> - * - * @param url url of the input file - * - * @throws IOException if an IO error occurs - * @throws NullArgumentException if url is null - * @throws ZeroException if URL contains no data - */ - public void load(URL url) throws IOException { - NullArgumentException.check(url); - Charset charset = Charset.forName(FILE_CHARSET); - BufferedReader in = - new BufferedReader(new InputStreamReader(url.openStream(), charset)); - try { - DataAdapter da = new StreamDataAdapter(in); - da.computeStats(); - if (sampleStats.getN() == 0) { - throw new ZeroException(LocalizedFormats.URL_CONTAINS_NO_DATA, url); - } - // new adapter for the second pass - in = new BufferedReader(new InputStreamReader(url.openStream(), charset)); - fillBinStats(new StreamDataAdapter(in)); - loaded = true; - } finally { - try { - in.close(); - } catch (IOException ex) { //NOPMD - // ignore - } - } - } + // Second pass through the data. + binStats = createBinStats(input); - /** - * Computes the empirical distribution from the input file. - * - * <p>The input file <i>must</i> be an ASCII text file containing one - * valid numeric entry per line.</p> - * - * @param file the input file - * @throws IOException if an IO error occurs - * @throws NullArgumentException if file is null - */ - public void load(File file) throws IOException { - NullArgumentException.check(file); - Charset charset = Charset.forName(FILE_CHARSET); - InputStream is = new FileInputStream(file); - BufferedReader in = new BufferedReader(new InputStreamReader(is, charset)); - try { - DataAdapter da = new StreamDataAdapter(in); - da.computeStats(); - // new adapter for second pass - is = new FileInputStream(file); - in = new BufferedReader(new InputStreamReader(is, charset)); - fillBinStats(new StreamDataAdapter(in)); - loaded = true; - } finally { - try { - in.close(); - } catch (IOException ex) { //NOPMD - // ignore - } + // Assign upper bounds based on bin counts. + upperBounds = new double[binCount]; + final double n = (double) sampleStats.getN(); + upperBounds[0] = binStats.get(0).getN() / n; + for (int i = 1; i < binCount - 1; i++) { + upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n; } - } + upperBounds[binCount - 1] = 1d; - /** - * Provides methods for computing {@code sampleStats} and - * {@code beanStats} abstracting the source of data. - */ - private abstract class DataAdapter { - /** - * Compute bin stats. - * - * @throws IOException if an error occurs computing bin stats - */ - public abstract void computeBinStats() throws IOException; - - /** - * Compute sample statistics. - * - * @throws IOException if an error occurs computing sample stats - */ - public abstract void computeStats() throws IOException; - } + this.kernelFactory = kernelFactory; + } /** - * {@code DataAdapter} for data provided through some input stream. + * Factory that creates a new instance from the specified data. + * + * @param binCount Number of bins. Must be strictly positive. + * @param input Input data. Cannot be {@code null}. + * @param kernelFactory Factory for creating within-bin kernels. + * @return a new instance. + * @throws NotStrictlyPositiveException if {@code binCount <= 0}. */ - private class StreamDataAdapter extends DataAdapter { - /** Input stream providing access to the data. */ - private final BufferedReader inputStream; - - /** - * Create a StreamDataAdapter from a BufferedReader. - * - * @param in BufferedReader input stream - */ - StreamDataAdapter(BufferedReader in){ - inputStream = in; - } - - /** {@inheritDoc} */ - @Override - public void computeBinStats() throws IOException { - String str = null; - double val = 0.0d; - while ((str = inputStream.readLine()) != null) { - val = Double.parseDouble(str); - SummaryStatistics stats = binStats.get(findBin(val)); - stats.addValue(val); - } - - inputStream.close(); - } - - /** {@inheritDoc} */ - @Override - public void computeStats() throws IOException { - String str = null; - double val = 0.0; - sampleStats = new SummaryStatistics(); - while ((str = inputStream.readLine()) != null) { - val = Double.parseDouble(str); - sampleStats.addValue(val); - } - inputStream.close(); - } + public static EmpiricalDistribution from(int binCount, + double[] input, + Function<SummaryStatistics, ContinuousDistribution> kernelFactory) { + return new EmpiricalDistribution(binCount, + input, + kernelFactory); } /** - * {@code DataAdapter} for data provided as array of doubles. + * Factory that creates a new instance from the specified data. + * + * @param binCount Number of bins. Must be strictly positive. + * @param input Input data. Cannot be {@code null}. + * @return a new instance. + * @throws NotStrictlyPositiveException if {@code binCount <= 0}. */ - private class ArrayDataAdapter extends DataAdapter { - /** Array of input data values. */ - private final double[] inputArray; - - /** - * Construct an ArrayDataAdapter from a double[] array. - * - * @param in double[] array holding the data - * @throws NullArgumentException if in is null - */ - ArrayDataAdapter(double[] in) { - NullArgumentException.check(in); - inputArray = in; - } - - /** {@inheritDoc} */ - @Override - public void computeStats() throws IOException { - sampleStats = new SummaryStatistics(); - for (int i = 0; i < inputArray.length; i++) { - sampleStats.addValue(inputArray[i]); - } - } - - /** {@inheritDoc} */ - @Override - public void computeBinStats() throws IOException { - for (int i = 0; i < inputArray.length; i++) { - SummaryStatistics stats = binStats.get(findBin(inputArray[i])); - stats.addValue(inputArray[i]); - } - } + public static EmpiricalDistribution from(int binCount, + double[] input) { + return from(binCount, input, defaultKernel()); } /** - * Fills binStats array (second pass through data file). + * Create statistics (second pass through the data). * - * @param da object providing access to the data - * @throws IOException if an IO error occurs + * @param input Input data. + * @return bins statistics. */ - private void fillBinStats(final DataAdapter da) throws IOException { - // Set up grid - min = sampleStats.getMin(); - max = sampleStats.getMax(); - delta = (max - min) / binCount; + private List<SummaryStatistics> createBinStats(double[] input) { + final List<SummaryStatistics> binStats = new ArrayList<>(); - // Initialize binStats ArrayList - if (!binStats.isEmpty()) { - binStats.clear(); - } for (int i = 0; i < binCount; i++) { - SummaryStatistics stats = new SummaryStatistics(); - binStats.add(i, stats); + binStats.add(i, new SummaryStatistics()); } - // Filling data in binStats Array - da.computeBinStats(); - - // Assign upperBounds based on bin counts - upperBounds = new double[binCount]; - upperBounds[0] = binStats.get(0).getN() / (double) sampleStats.getN(); - for (int i = 1; i < binCount - 1; i++) { - upperBounds[i] = upperBounds[i - 1] + - binStats.get(i).getN() / (double) sampleStats.getN(); + // Second pass though the data. + for (int i = 0; i < input.length; i++) { + final double v = input[i]; + binStats.get(findBin(v)).addValue(v); } - upperBounds[binCount - 1] = 1d; + + return binStats; } /** * Returns the index of the bin to which the given value belongs. * - * @param value the value whose bin we are trying to find - * @return the index of the bin containing the value + * @param value Value whose bin we are trying to find. + * @return the index of the bin containing the value. */ private int findBin(double value) { - return AccurateMath.min(AccurateMath.max((int) AccurateMath.ceil((value - min) / delta) - 1, 0), - binCount - 1); + return Math.min(Math.max((int) AccurateMath.ceil((value - min) / delta) - 1, + 0), + binCount - 1); } /** @@ -394,7 +229,7 @@ public class EmpiricalDistribution extends AbstractRealDistribution * @throws IllegalStateException if the distribution has not been loaded */ public StatisticalSummary getSampleStats() { - return sampleStats; + return sampleStats.copy(); } /** @@ -407,27 +242,33 @@ public class EmpiricalDistribution extends AbstractRealDistribution } /** - * Returns a List of {@link SummaryStatistics} instances containing - * statistics describing the values in each of the bins. The list is - * indexed on the bin number. + * Returns a copy of the {@link SummaryStatistics} instances containing + * statistics describing the values in each of the bins. + * The list is indexed on the bin number. * - * @return List of bin statistics. + * @return the bins statistics. */ public List<SummaryStatistics> getBinStats() { - return binStats; + final List<SummaryStatistics> copy = new ArrayList<>(); + for (SummaryStatistics s : binStats) { + copy.add(s.copy()); + } + return copy; } /** - * <p>Returns a fresh copy of the array of upper bounds for the bins. - * Bins are: <br> - * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],..., - * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p> + * Returns the upper bounds of the bins. + * + * Assuming array {@code u} is returned by this method, the bins are: + * <ul> + * <li>{@code (min, u[0])},</li> + * <li>{@code (u[0], u[1])},</li> + * <li>... ,</li> + * <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li> + * </ul> * - * <p>Note: In versions 1.0-2.0 of commons-math, this method - * incorrectly returned the array of probability generator upper - * bounds now returned by {@link #getGeneratorUpperBounds()}.</p> + * @return the bins upper bounds. * - * @return array of bin upper bounds * @since 2.1 */ public double[] getUpperBounds() { @@ -440,20 +281,18 @@ public class EmpiricalDistribution extends AbstractRealDistribution } /** - * <p>Returns a fresh copy of the array of upper bounds of the subintervals - * of [0,1] used in generating data from the empirical distribution. - * Subintervals correspond to bins with lengths proportional to bin counts.</p> + * Returns the upper bounds of the subintervals of [0, 1] used in generating + * data from the empirical distribution. + * Subintervals correspond to bins with lengths proportional to bin counts. * * <strong>Preconditions:</strong><ul> * <li>the distribution must be loaded before invoking this method</li></ul> * - * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned - * by {@link #getUpperBounds()}.</p> - * - * @since 2.1 * @return array of upper bounds of subintervals used in data generation * @throws NullPointerException unless a {@code load} method has been * called beforehand. + * + * @since 2.1 */ public double[] getGeneratorUpperBounds() { int len = upperBounds.length; @@ -462,15 +301,6 @@ public class EmpiricalDistribution extends AbstractRealDistribution return out; } - /** - * Property indicating whether or not the distribution has been loaded. - * - * @return true if the distribution has been loaded - */ - public boolean isLoaded() { - return loaded; - } - // Distribution methods. /** @@ -485,15 +315,18 @@ public class EmpiricalDistribution extends AbstractRealDistribution /** * {@inheritDoc} * - * <p>Returns the kernel density normalized so that its integral over each bin - * equals the bin mass.</p> + * Returns the kernel density normalized so that its integral over each bin + * equals the bin mass. + * + * Algorithm description: + * <ol> + * <li>Find the bin B that x belongs to.</li> + * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the + * integral of the kernel density over B).</li> + * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density + * and P(B) is the mass of B.</li> + * </ol> * - * <p>Algorithm description: <ol> - * <li>Find the bin B that x belongs to.</li> - * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the - * integral of the kernel density over B).</li> - * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density - * and P(B) is the mass of B.</li></ol> * @since 3.1 */ @Override @@ -509,13 +342,15 @@ public class EmpiricalDistribution extends AbstractRealDistribution /** * {@inheritDoc} * - * <p>Algorithm description:<ol> - * <li>Find the bin B that x belongs to.</li> - * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li> - * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel - * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li> - * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where - * K(x) is the within-bin kernel distribution function evaluated at x.</li></ol> + * Algorithm description: + * <ol> + * <li>Find the bin B that x belongs to.</li> + * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li> + * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel + * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li> + * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where + * K(x) is the within-bin kernel distribution function evaluated at x.</li> + * </ol> * If K is a constant distribution, we return P(B-) + P(B) (counting the full * mass of B). * @@ -550,20 +385,24 @@ public class EmpiricalDistribution extends AbstractRealDistribution /** * {@inheritDoc} * - * <p>Algorithm description:<ol> - * <li>Find the smallest i such that the sum of the masses of the bins - * through i is at least p.</li> - * <li> - * Let K be the within-bin kernel distribution for bin i.<br> - * Let K(B) be the mass of B under K. <br> - * Let K(B-) be K evaluated at the lower endpoint of B (the combined - * mass of the bins below B under K).<br> - * Let P(B) be the probability of bin i.<br> - * Let P(B-) be the sum of the bin masses below bin i. <br> - * Let pCrit = p - P(B-)<br> - * <li>Return the inverse of K evaluated at <br> + * Algorithm description: + * <ol> + * <li>Find the smallest i such that the sum of the masses of the bins + * through i is at least p.</li> + * <li> + * <ol> + * <li>Let K be the within-bin kernel distribution for bin i.</li> + * <li>Let K(B) be the mass of B under K.</li> + * <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined + * mass of the bins below B under K).</li> + * <li>Let P(B) be the probability of bin i.</li> + * <li>Let P(B-) be the sum of the bin masses below bin i.</li> + * <li>Let pCrit = p - P(B-)</li> + * </ol> + * </li> + * <li>Return the inverse of K evaluated at * K(B-) + pCrit * K(B) / P(B) </li> - * </ol> + * </ol> * * @since 3.1 */ @@ -651,15 +490,6 @@ public class EmpiricalDistribution extends AbstractRealDistribution return true; } - /**{@inheritDoc} */ - @Override - public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) { - if (!loaded) { - throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED); - } - return super.createSampler(rng); - } - /** * The probability of bin i. * @@ -717,19 +547,29 @@ public class EmpiricalDistribution extends AbstractRealDistribution } /** - * The within-bin smoothing kernel. Returns a Gaussian distribution - * parameterized by {@code bStats}, unless the bin contains only one - * observation, in which case a constant distribution is returned. + * @param stats Bin statistics. + * @return the within-bin kernel. + */ + private ContinuousDistribution getKernel(SummaryStatistics stats) { + return kernelFactory.apply(stats); + } + + /** + * The within-bin smoothing kernel: A Gaussian distribution + * (unless the bin contains only one observation, in which case + * a constant distribution is returned). * - * @param bStats summary statistics for the bin - * @return within-bin kernel parameterized by bStats + * @return the within-bin kernel factory. */ - protected ContinuousDistribution getKernel(SummaryStatistics bStats) { - if (bStats.getN() <= 1 || - bStats.getVariance() == 0) { - return new ConstantContinuousDistribution(bStats.getMean()); - } else { - return new NormalDistribution(bStats.getMean(), bStats.getStandardDeviation()); - } + private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() { + return stats -> { + if (stats.getN() <= 1 || + stats.getVariance() == 0) { + return new ConstantContinuousDistribution(stats.getMean()); + } else { + return new NormalDistribution(stats.getMean(), + stats.getStandardDeviation()); + } + }; } } diff --git a/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java b/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java index b86b7c3..3217317 100644 --- a/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java +++ b/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java @@ -17,7 +17,6 @@ package org.apache.commons.math4.legacy.distribution; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; @@ -35,8 +34,6 @@ import org.apache.commons.math4.legacy.TestUtils; import org.apache.commons.math4.legacy.analysis.UnivariateFunction; import org.apache.commons.math4.legacy.analysis.integration.BaseAbstractUnivariateIntegrator; import org.apache.commons.math4.legacy.analysis.integration.IterativeLegendreGaussIntegrator; -import org.apache.commons.math4.legacy.exception.MathIllegalStateException; -import org.apache.commons.math4.legacy.exception.NullArgumentException; import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException; import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics; import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath; @@ -48,32 +45,28 @@ import org.junit.Test; * Test cases for the {@link EmpiricalDistribution} class. */ public final class EmpiricalDistributionTest extends RealDistributionAbstractTest { - - protected EmpiricalDistribution empiricalDistribution = null; - protected EmpiricalDistribution empiricalDistribution2 = null; - protected File file = null; - protected URL url = null; - protected double[] dataArray = null; - protected final int n = 10000; + private EmpiricalDistribution empiricalDistribution = null; + private double[] dataArray = null; + private final int n = 10000; + /** Uniform bin mass = 10/10001 == mass of all but the first bin */ + private final double binMass = 10d / (n + 1); + /** Mass of first bin = 11/10001 */ + private final double firstBinMass = 11d / (n + 1); @Override @Before public void setUp() { super.setUp(); - empiricalDistribution = new EmpiricalDistribution(100); - url = getClass().getResource("testData.txt"); + + final URL url = getClass().getResource("testData.txt"); final ArrayList<Double> list = new ArrayList<>(); try { - empiricalDistribution2 = new EmpiricalDistribution(100); - BufferedReader in = - new BufferedReader(new InputStreamReader( - url.openStream())); + final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream())); String str = null; while ((str = in.readLine()) != null) { list.add(Double.valueOf(str)); } in.close(); - in = null; } catch (IOException ex) { Assert.fail("IOException " + ex); } @@ -84,193 +77,68 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes dataArray[i] = data.doubleValue(); i++; } + + empiricalDistribution = EmpiricalDistribution.from(100, dataArray); } // MATH-1279 @Test(expected=NotStrictlyPositiveException.class) public void testPrecondition1() { - new EmpiricalDistribution(0); + EmpiricalDistribution.from(0, new double[] {1,2,3}); } /** - * Test EmpiricalDistribution.load() using sample data file.<br> - * Check that the sampleCount, mu and sigma match data in - * the sample data file. Also verify that load is idempotent. + * Test using data taken from sample data file. + * Check that the sampleCount, mu and sigma match data in the sample data file. */ @Test - public void testLoad() throws Exception { - // Load from a URL - empiricalDistribution.load(url); - checkDistribution(); - - // Load again from a file (also verifies idempotency of load) - File file = new File(url.toURI()); - empiricalDistribution.load(file); - checkDistribution(); - } - - private void checkDistribution() { + public void testDoubleLoad() { // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1 // Make sure that loaded distribution matches this - Assert.assertEquals(empiricalDistribution.getSampleStats().getN(),1000,10E-7); + Assert.assertEquals(empiricalDistribution.getSampleStats().getN(), + 1000, 1e-7); //TODO: replace with statistical tests Assert.assertEquals(empiricalDistribution.getSampleStats().getMean(), - 5.069831575018909,10E-7); + 5.069831575018909, 1e-7); Assert.assertEquals(empiricalDistribution.getSampleStats().getStandardDeviation(), - 1.0173699343977738,10E-7); - } + 1.0173699343977738, 1e-7); - /** - * Test EmpiricalDistribution.load(double[]) using data taken from - * sample data file.<br> - * Check that the sampleCount, mu and sigma match data in - * the sample data file. - */ - @Test - public void testDoubleLoad() throws Exception { - empiricalDistribution2.load(dataArray); - // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1 - // Make sure that loaded distribution matches this - Assert.assertEquals(empiricalDistribution2.getSampleStats().getN(),1000,10E-7); - //TODO: replace with statistical tests - Assert.assertEquals(empiricalDistribution2.getSampleStats().getMean(), - 5.069831575018909,10E-7); - Assert.assertEquals(empiricalDistribution2.getSampleStats().getStandardDeviation(), - 1.0173699343977738,10E-7); - - double[] bounds = empiricalDistribution2.getGeneratorUpperBounds(); + double[] bounds = empiricalDistribution.getGeneratorUpperBounds(); Assert.assertEquals(bounds.length, 100); Assert.assertEquals(bounds[99], 1.0, 10e-12); - } - // MATH-1531 @Test public void testMath1531() { - final EmpiricalDistribution inputDistribution = new EmpiricalDistribution(120); - inputDistribution.load(new double[] { - 50.993456376721454, - 49.455345691918055, - 49.527276095295804, - 50.017183448668845, - 49.10508147470046, - 49.813998274118696, - 50.87195348756139, - 50.419474110037, - 50.63614906979689, - 49.49694777179407, - 50.71799078406067, - 50.03192853759164, - 49.915092423165994, - 49.56895392597687, - 51.034638001064934, - 50.681227971275945, - 50.43749845081759, - 49.86513120270245, - 50.21475262482965, - 49.99202971042547, - 50.02382189838519, - 49.386888585302884, - 49.45585010202781, - 49.988009479855435, - 49.8136712206123, - 49.6715197127997, - 50.1981278397565, - 49.842297508010276, - 49.62491227740015, - 50.05101916097176, - 48.834912763303926, - 49.806787657848574, - 49.478236106374695, - 49.56648347371614, - 49.95069238081982, - 49.71845132077346, - 50.6097468705947, - 49.80724637775541, - 49.90448813086025, - 49.39641861662603, - 50.434295712893714, - 49.227176959566734, - 49.541126466050905, - 49.03416593170446, - 49.11584328494423, - 49.61387482435674, - 49.92877857995328, - 50.70638552955101, - 50.60078208448842, - 49.39326233277838, - 49.21488424364095, - 49.69503351015096, - 50.13733214001718, - 50.22084761458942, - 51.09804435604931, - 49.18559131120419, - 49.52286371605357, - 49.34804374996689, - 49.6901827776375, - 50.01316351359638, - 48.7751460520373, - 50.12961836291053, - 49.9978419772511, - 49.885658399408584, - 49.673438879979834, - 49.45565980965606, - 50.429747484906564, - 49.40129274804164, - 50.13034614008073, - 49.87685735146651, - 50.12967905393557, - 50.323560376181696, - 49.83519233651367, - 49.37333369733053, - 49.70074301611427, - 50.11626105774947, - 50.28249500380083, - 50.543354367136466, - 50.05866241335002, - 50.39516515672527, - 49.4838561463057, - 50.451757089234796, - 50.31370674203726, - 49.79063762614284, - 50.19652349768548, - 49.75881420748814, - 49.98371855036422, - 49.82171344472916, - 48.810793204162415, - 49.37040569084592, - 50.050641186203976, - 50.48360952263646, - 50.86666450358076, - 50.463268776129844, - 50.137489751888666, - 50.23823061444118, - 49.881460479468004, - 50.641174398764356, - 49.09314136851421, - 48.80877928574451, - 50.46197084844826, - 49.97691704141741, - 49.99933997561926, - 50.25692254481885, - 49.52973451252715, - 49.81229858420664, - 48.996112655915994, - 48.740531054814674, - 50.026642633066416, - 49.98696633604899, - 49.61307159972952, - 50.5115278979726, - 50.75245152442404, - 50.51807785445929, - 49.60929671768147, - 49.1079533564074, - 49.65347196551866, - 49.31684818724059, - 50.4906368627049, - 50.37483603684714}); - inputDistribution.inverseCumulativeProbability(0.7166666666666669); + final double[] data = new double[] { + 50.993456376721454, 49.455345691918055, 49.527276095295804, 50.017183448668845, 49.10508147470046, + 49.813998274118696, 50.87195348756139, 50.419474110037, 50.63614906979689, 49.49694777179407, + 50.71799078406067, 50.03192853759164, 49.915092423165994, 49.56895392597687, 51.034638001064934, + 50.681227971275945, 50.43749845081759, 49.86513120270245, 50.21475262482965, 49.99202971042547, + 50.02382189838519, 49.386888585302884, 49.45585010202781, 49.988009479855435, 49.8136712206123, + 49.6715197127997, 50.1981278397565, 49.842297508010276, 49.62491227740015, 50.05101916097176, + 48.834912763303926, 49.806787657848574, 49.478236106374695, 49.56648347371614, 49.95069238081982, + 49.71845132077346, 50.6097468705947, 49.80724637775541, 49.90448813086025, 49.39641861662603, + 50.434295712893714, 49.227176959566734, 49.541126466050905, 49.03416593170446, 49.11584328494423, + 49.61387482435674, 49.92877857995328, 50.70638552955101, 50.60078208448842, 49.39326233277838, + 49.21488424364095, 49.69503351015096, 50.13733214001718, 50.22084761458942, 51.09804435604931, + 49.18559131120419, 49.52286371605357, 49.34804374996689, 49.6901827776375, 50.01316351359638, + 48.7751460520373, 50.12961836291053, 49.9978419772511, 49.885658399408584, 49.673438879979834, + 49.45565980965606, 50.429747484906564, 49.40129274804164, 50.13034614008073, 49.87685735146651, + 50.12967905393557, 50.323560376181696, 49.83519233651367, 49.37333369733053, 49.70074301611427, + 50.11626105774947, 50.28249500380083, 50.543354367136466, 50.05866241335002, 50.39516515672527, + 49.4838561463057, 50.451757089234796, 50.31370674203726, 49.79063762614284, 50.19652349768548, + 49.75881420748814, 49.98371855036422, 49.82171344472916, 48.810793204162415, 49.37040569084592, + 50.050641186203976, 50.48360952263646, 50.86666450358076, 50.463268776129844, 50.137489751888666, + 50.23823061444118, 49.881460479468004, 50.641174398764356, 49.09314136851421, 48.80877928574451, + 50.46197084844826, 49.97691704141741, 49.99933997561926, 50.25692254481885, 49.52973451252715, + 49.81229858420664, 48.996112655915994, 48.740531054814674, 50.026642633066416, 49.98696633604899, + 49.61307159972952, 50.5115278979726, 50.75245152442404, 50.51807785445929, 49.60929671768147, + 49.1079533564074, 49.65347196551866, 49.31684818724059, 50.4906368627049, 50.37483603684714 + }; + + EmpiricalDistribution.from(120, data).inverseCumulativeProbability(0.7166666666666669); } /** @@ -279,84 +147,42 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes * these tests will fail even if the code is working as designed. */ @Test - public void testNext() throws Exception { - tstGen(0.1); - tstDoubleGen(0.1); - } - - /** - * Make sure exception thrown if sampling is attempted - * before loading empiricalDistribution. - */ - @Test - public void testNextFail1() { - try { - empiricalDistribution.createSampler(RandomSource.create(RandomSource.JDK)).sample(); - Assert.fail("Expecting MathIllegalStateException"); - } catch (MathIllegalStateException ex) { - // expected - } - } - - /** - * Make sure exception thrown if sampling is attempted - * before loading empiricalDistribution. - */ - @Test - public void testNextFail2() { - try { - empiricalDistribution2.createSampler(RandomSource.create(RandomSource.JDK)).sample(); - Assert.fail("Expecting MathIllegalStateException"); - } catch (MathIllegalStateException ex) { - // expected - } + public void testNext() { + tstGen(empiricalDistribution, + 0.1); } /** * Make sure we can handle a grid size that is too fine */ @Test - public void testGridTooFine() throws Exception { - empiricalDistribution = new EmpiricalDistribution(1001); - tstGen(0.1); - empiricalDistribution2 = new EmpiricalDistribution(1001); - tstDoubleGen(0.1); + public void testGridTooFine() { + tstGen(EmpiricalDistribution.from(1001, dataArray), + 0.1); } /** * How about too fat? */ @Test - public void testGridTooFat() throws Exception { - empiricalDistribution = new EmpiricalDistribution(1); - tstGen(5); // ridiculous tolerance; but ridiculous grid size + public void testGridTooFat() { + tstGen(EmpiricalDistribution.from(1, dataArray), + 5); // ridiculous tolerance; but ridiculous grid size // really just checking to make sure we do not bomb - empiricalDistribution2 = new EmpiricalDistribution(1); - tstDoubleGen(5); } /** * Test bin index overflow problem (BZ 36450) */ @Test - public void testBinIndexOverflow() throws Exception { + public void testBinIndexOverflow() { double[] x = new double[] {9474.94326071674, 2080107.8865462579}; - new EmpiricalDistribution().load(x); + EmpiricalDistribution.from(1000, x); } - @Test(expected=NullArgumentException.class) + @Test(expected=NullPointerException.class) public void testLoadNullDoubleArray() { - new EmpiricalDistribution().load((double[]) null); - } - - @Test(expected=NullArgumentException.class) - public void testLoadNullURL() throws Exception { - new EmpiricalDistribution().load((URL) null); - } - - @Test(expected=NullArgumentException.class) - public void testLoadNullFile() throws Exception { - new EmpiricalDistribution().load((File) null); + EmpiricalDistribution.from(1000, null); } /** @@ -365,8 +191,7 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes @Test public void testGetBinUpperBounds() { double[] testData = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10}; - EmpiricalDistribution dist = new EmpiricalDistribution(5); - dist.load(testData); + EmpiricalDistribution dist = EmpiricalDistribution.from(5, testData); double[] expectedBinUpperBounds = {2, 4, 6, 8, 10}; double[] expectedGeneratorUpperBounds = {4d/13d, 7d/13d, 9d/13d, 11d/13d, 1}; double tol = 10E-12; @@ -374,23 +199,22 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes TestUtils.assertEquals(expectedGeneratorUpperBounds, dist.getGeneratorUpperBounds(), tol); } - private void verifySame(EmpiricalDistribution d1, EmpiricalDistribution d2) { - Assert.assertEquals(d1.isLoaded(), d2.isLoaded()); + private void verifySame(EmpiricalDistribution d1, + EmpiricalDistribution d2) { Assert.assertEquals(d1.getBinCount(), d2.getBinCount()); Assert.assertEquals(d1.getSampleStats(), d2.getSampleStats()); - if (d1.isLoaded()) { - for (int i = 0; i < d1.getUpperBounds().length; i++) { - Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0); - } - Assert.assertEquals(d1.getBinStats(), d2.getBinStats()); + + for (int i = 0; i < d1.getUpperBounds().length; i++) { + Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0); } + Assert.assertEquals(d1.getBinStats(), d2.getBinStats()); } - private void tstGen(double tolerance)throws Exception { - empiricalDistribution.load(url); - ContinuousDistribution.Sampler sampler - = empiricalDistribution.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); - SummaryStatistics stats = new SummaryStatistics(); + private void tstGen(EmpiricalDistribution dist, + double tolerance) { + final ContinuousDistribution.Sampler sampler + = dist.createSampler(RandomSource.WELL_19937_C.create(1000)); + final SummaryStatistics stats = new SummaryStatistics(); for (int i = 1; i < 1000; i++) { stats.addValue(sampler.sample()); } @@ -398,38 +222,19 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(),tolerance); } - private void tstDoubleGen(double tolerance)throws Exception { - empiricalDistribution2.load(dataArray); - ContinuousDistribution.Sampler sampler - = empiricalDistribution2.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); - SummaryStatistics stats = new SummaryStatistics(); - for (int i = 1; i < 1000; i++) { - stats.addValue(sampler.sample()); - } - Assert.assertEquals("mean", 5.069831575018909, stats.getMean(), tolerance); - Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(), tolerance); - } - // Setup for distribution tests @Override public ContinuousDistribution makeDistribution() { - // Create a uniform distribution on [0, 10,000] + // Create a uniform distribution on [0, 10,000]. final double[] sourceData = new double[n + 1]; for (int i = 0; i < n + 1; i++) { sourceData[i] = i; } - EmpiricalDistribution dist = new EmpiricalDistribution(); - dist.load(sourceData); + EmpiricalDistribution dist = EmpiricalDistribution.from(1000, sourceData); return dist; } - /** Uniform bin mass = 10/10001 == mass of all but the first bin */ - private final double binMass = 10d / (n + 1); - - /** Mass of first bin = 11/10001 */ - private final double firstBinMass = 11d / (n + 1); - @Override public double[] makeCumulativeTestPoints() { final double[] testPoints = new double[] {9, 10, 15, 1000, 5004, 9999}; @@ -529,10 +334,9 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes for (int i = 51; i < 100; i++) { data[i] = 1 - 1 / (100 - (double) i + 2); } - EmpiricalDistribution dist = new EmpiricalDistribution(10); - dist.load(data); + EmpiricalDistribution dist = EmpiricalDistribution.from(10, data); ContinuousDistribution.Sampler sampler - = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); + = dist.createSampler(RandomSource.WELL_19937_C.create(1000)); for (int i = 0; i < 1000; i++) { final double dev = sampler.sample(); Assert.assertTrue(dev < 1); @@ -546,10 +350,9 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes @Test public void testNoBinVariance() { final double[] data = {0, 0, 1, 1}; - EmpiricalDistribution dist = new EmpiricalDistribution(2); - dist.load(data); + EmpiricalDistribution dist = EmpiricalDistribution.from(2, data); ContinuousDistribution.Sampler sampler - = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); + = dist.createSampler(RandomSource.WELL_19937_C.create(1000)); for (int i = 0; i < 1000; i++) { final double dev = sampler.sample(); Assert.assertTrue(dev == 0 || dev == 1); @@ -588,17 +391,18 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes @Test public void testKernelOverrideConstant() { - final EmpiricalDistribution dist = new ConstantKernelEmpiricalDistribution(5); - final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d}; - dist.load(data); - ContinuousDistribution.Sampler sampler - = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); + final double[] data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + final EmpiricalDistribution dist = + EmpiricalDistribution.from(5, data, + s -> new ConstantContinuousDistribution(s.getMean())); + final ContinuousDistribution.Sampler sampler + = dist.createSampler(RandomSource.WELL_19937_C.create(1000)); // Bin masses concentrated on 2, 5, 8, 11, 14 <- effectively discrete uniform distribution over these - double[] values = {2d, 5d, 8d, 11d, 14d}; + final double[] values = {2d, 5d, 8d, 11d, 14d}; for (int i = 0; i < 20; i++) { Assert.assertTrue(Arrays.binarySearch(values, sampler.sample()) >= 0); } - final double tol = 10E-12; + final double tol = 1e-12; Assert.assertEquals(0.0, dist.cumulativeProbability(1), tol); Assert.assertEquals(0.2, dist.cumulativeProbability(2), tol); Assert.assertEquals(0.6, dist.cumulativeProbability(10), tol); @@ -616,11 +420,12 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes @Test public void testKernelOverrideUniform() { - final EmpiricalDistribution dist = new UniformKernelEmpiricalDistribution(5); - final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d}; - dist.load(data); - ContinuousDistribution.Sampler sampler - = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000)); + final double[] data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + final EmpiricalDistribution dist = + EmpiricalDistribution.from(5, data, + s -> new UniformContinuousDistribution(s.getMin(), s.getMax())); + final ContinuousDistribution.Sampler sampler + = dist.createSampler(RandomSource.WELL_19937_C.create(1000)); // Kernels are uniform distributions on [1,3], [4,6], [7,9], [10,12], [13,15] final double bounds[] = {3d, 6d, 9d, 12d}; final double tol = 10E-12; @@ -648,7 +453,7 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes @Test public void testMath1431() { - final UniformRandomProvider rng = RandomSource.create(RandomSource.WELL_19937_C, 1000); + final UniformRandomProvider rng = RandomSource.WELL_19937_C.create(1000); final ContinuousDistribution.Sampler exponentialDistributionSampler = new ExponentialDistribution(0.05).createSampler(rng); final double[] empiricalDataPoints = new double[3000]; @@ -656,8 +461,7 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes empiricalDataPoints[i] = exponentialDistributionSampler.sample(); } - final EmpiricalDistribution testDistribution = new EmpiricalDistribution(100); - testDistribution.load(empiricalDataPoints); + final EmpiricalDistribution testDistribution = EmpiricalDistribution.from(100, empiricalDataPoints); for (int i = 0; i < 1000; i++) { final double point = rng.nextDouble(); @@ -684,8 +488,7 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes 6461.3944, 6384.1345 }; - final EmpiricalDistribution ed = new EmpiricalDistribution(data.length / 10); - ed.load(data); + final EmpiricalDistribution ed = EmpiricalDistribution.from(data.length / 10, data); double v; double p; @@ -702,33 +505,4 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes v = ed.inverseCumulativeProbability(p); Assert.assertTrue("p=" + p + " => v=" + v, v < 6350); } - - /** - * Empirical distribution using a constant smoothing kernel. - */ - private class ConstantKernelEmpiricalDistribution extends EmpiricalDistribution { - private static final long serialVersionUID = 1L; - ConstantKernelEmpiricalDistribution(int i) { - super(i); - } - // Use constant distribution equal to bin mean within bin - @Override - protected ContinuousDistribution getKernel(SummaryStatistics bStats) { - return new ConstantContinuousDistribution(bStats.getMean()); - } - } - - /** - * Empirical distribution using a uniform smoothing kernel. - */ - private class UniformKernelEmpiricalDistribution extends EmpiricalDistribution { - private static final long serialVersionUID = 2963149194515159653L; - UniformKernelEmpiricalDistribution(int i) { - super(i); - } - @Override - protected ContinuousDistribution getKernel(SummaryStatistics bStats) { - return new UniformContinuousDistribution(bStats.getMin(), bStats.getMax()); - } - } }