Author: luc Date: Wed Mar 23 13:19:23 2011 New Revision: 1084577 URL: http://svn.apache.org/viewvc?rev=1084577&view=rev Log: Added a consistency check for the number of points with respect to the number of clusters in k-means++ clustering
Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java commons/proper/math/trunk/src/site/xdoc/changes.xml commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java?rev=1084577&r1=1084576&r2=1084577&view=diff ============================================================================== --- commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java (original) +++ commons/proper/math/trunk/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java Wed Mar 23 13:19:23 2011 @@ -23,8 +23,12 @@ import java.util.List; import java.util.Random; import org.apache.commons.math.exception.ConvergenceException; +import org.apache.commons.math.exception.MathIllegalArgumentException; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.NumberIsTooSmallException; import org.apache.commons.math.exception.util.LocalizedFormats; import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.util.MathUtils; /** * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm. @@ -88,9 +92,21 @@ public class KMeansPlusPlusClusterer<T e * @param maxIterations the maximum number of iterations to run the algorithm * for. 
If negative, no maximum will be used * @return a list of clusters containing the points + * @throws MathIllegalArgumentException if the data points are null or the number + * of clusters is larger than the number of data points */ - public List<Cluster<T>> cluster(final Collection<T> points, - final int k, final int maxIterations) { + public List<Cluster<T>> cluster(final Collection<T> points, final int k, + final int maxIterations) + throws MathIllegalArgumentException { + + // sanity checks + MathUtils.checkNotNull(points); + + // number of clusters has to be smaller or equal the number of data points + if (points.size() < k) { + throw new NumberIsTooSmallException(points.size(), k, false); + } + // create the initial clusters List<Cluster<T>> clusters = chooseInitialCenters(points, k, random); assignPointsToClusters(clusters, points); Modified: commons/proper/math/trunk/src/site/xdoc/changes.xml URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/site/xdoc/changes.xml?rev=1084577&r1=1084576&r2=1084577&view=diff ============================================================================== --- commons/proper/math/trunk/src/site/xdoc/changes.xml (original) +++ commons/proper/math/trunk/src/site/xdoc/changes.xml Wed Mar 23 13:19:23 2011 @@ -52,6 +52,10 @@ The <action> type attribute can be add,u If the output is not quite correct, check for invisible trailing spaces! --> <release version="3.0" date="TBD" description="TBD"> + <action dev="luc" type="add" issue="MATH-436" due-to="Thomas Neidhart"> + Added a consistency check for number of points with respect to the number + of clusters in Kmeans++ clustering + </action> <action dev="mikl" type="add" issue="MATH-437"> Added two sided Kolmogorov-Smirnov distribution using modified Marsaglia et al. 
(2003) implementation and quick decisions for certain Modified: commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java?rev=1084577&r1=1084576&r2=1084577&view=diff ============================================================================== --- commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java (original) +++ commons/proper/math/trunk/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java Wed Mar 23 13:19:23 2011 @@ -24,6 +24,7 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import org.apache.commons.math.exception.NumberIsTooSmallException; import org.junit.Assert; import org.junit.Test; @@ -246,4 +247,26 @@ public class KMeansPlusPlusClustererTest } Assert.assertTrue(uniquePointIsCenter); } + + /** + * 2 variables cannot be clustered into 3 clusters. See issue MATH-436. + */ + @Test(expected=NumberIsTooSmallException.class) + public void testPerformClusterAnalysisToManyClusters() { + KMeansPlusPlusClusterer<EuclideanIntegerPoint> transformer = + new KMeansPlusPlusClusterer<EuclideanIntegerPoint>( + new Random(1746432956321l)); + + EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] { + new EuclideanIntegerPoint(new int[] { + 1959, 325100 + }), new EuclideanIntegerPoint(new int[] { + 1960, 373200 + }) + }; + + transformer.cluster(Arrays.asList(points), 3, 1); + + } + }