Repository: commons-text Updated Branches: refs/heads/master 43ba72325 -> e3bb7f75a
TEXT-2 Jaccard Index and Jaccard Distance Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/a6a6a274 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/a6a6a274 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/a6a6a274 Branch: refs/heads/master Commit: a6a6a274a6ed92945ae0ea0707b38629c9a01ace Parents: 5888d49 Author: drajakumar <drajaku...@paypal.com> Authored: Sun Nov 27 13:59:43 2016 +0530 Committer: drajakumar <drajaku...@paypal.com> Committed: Sun Nov 27 13:59:43 2016 +0530 ---------------------------------------------------------------------- .../text/similarity/JaccardDistance.java | 50 ++++++++++++ .../text/similarity/JaccardSimilarity.java | 86 ++++++++++++++++++++ .../text/similarity/JaccardDistanceTest.java | 66 +++++++++++++++ .../text/similarity/JaccardSimilarityTest.java | 66 +++++++++++++++ 4 files changed, 268 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java new file mode 100644 index 0000000..0c6d491 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * Measures the Jaccard distance of two sets of character sequence. Jaccard + * distance is the dissimilarity between two sets. Its the complementary of + * Jaccard similarity. + * + * <p> + * For further explanation about Jaccard Distance, refer + * https://en.wikipedia.org/wiki/Jaccard_index + * </p> + */ +public class JaccardDistance implements EditDistance<Double> { + + private final JaccardSimilarity jaccardSimilarity = new JaccardSimilarity(); + + /** + * Calculates Jaccard distance of two set character sequence passed as + * input. Calculates Jaccard similarity and returns the complementary of it. + * + * @param left first character sequence + * @param right second character sequence + * @return index + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Double apply(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d; + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java new file mode 100644 index 0000000..6c78bad --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.HashSet; +import java.util.Set; + +/** + * Measures the Jaccard similarity (aka Jaccard index) of two sets of character + * sequence. Jaccard similarity is the size of the intersection divided by the + * size of the union of the two sets. + * + * <p> + * For further explanation about Jaccard Similarity, refer + * https://en.wikipedia.org/wiki/Jaccard_index + * </p> + */ +public class JaccardSimilarity implements EditDistance<Double> { + + /** + * Calculates Jaccard Similarity of two set character sequence passed as + * input. + * + * @param left first character sequence + * @param right second character sequence + * @return index + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Double apply(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d; + } + + /** + * Calculates Jaccard Similarity of two set character sequence passed as + * input. Does the calculation by identifying the union (characters in at + * least one of the two sets) of the two sets and intersection (characters + * which are present in set one which are present in set two) + * + * @param left first character sequence + * @param right second character sequence + * @return index + */ + private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) { + Set<String> intersectionSet = new HashSet<String>(); + Set<String> unionSet = new HashSet<String>(); + boolean unionFilled = false; + int leftLength = left.length(); + int rightLength = right.length(); + if (leftLength == 0 || rightLength == 0) { + return 0d; + } + + for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) { + unionSet.add(String.valueOf(left.charAt(leftIndex))); + for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) { + if (!unionFilled) { + unionSet.add(String.valueOf(right.charAt(rightIndex))); + } + if (left.charAt(leftIndex) == right.charAt(rightIndex)) { + intersectionSet.add(String.valueOf(left.charAt(leftIndex))); + } + } + unionFilled = true; + } + return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java new file mode 100644 index 0000000..4f3f55d --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.JaccardDistance}. + */ +public class JaccardDistanceTest { + + private static JaccardDistance classBeingTested; + + @BeforeClass + public static void setUp() { + classBeingTested = new JaccardDistance(); + } + + @Test + public void testGettingJaccardDistance() { + assertEquals(1.00d, classBeingTested.apply("", ""), 0.0d); + assertEquals(1.00d, classBeingTested.apply("left", ""), 0.0d); + assertEquals(1.00d, classBeingTested.apply("", "right"), 0.0d); + assertEquals(0.25d, classBeingTested.apply("frog", "fog"), 0.0d); + assertEquals(1.00d, classBeingTested.apply("fly", "ant"), 0.0d); + assertEquals(0.78d, classBeingTested.apply("elephant", "hippo"), 0.0d); + assertEquals(0.36d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.24d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); + assertEquals(0.11d, classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.10d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.87d, classBeingTested.apply("left", "right"), 0.0d); + assertEquals(0.87d, classBeingTested.apply("leettteft", "ritttght"), 0.0d); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardDistanceNullNull() throws Exception { + classBeingTested.apply(null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardDistanceStringNull() throws Exception { + classBeingTested.apply(" ", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardDistanceNullString() throws Exception { + classBeingTested.apply(null, "right"); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/a6a6a274/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java new file mode 100644 index 0000000..0e8773c --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.JaccardSimilarity}. + */ +public class JaccardSimilarityTest { + + private static JaccardSimilarity classBeingTested; + + @BeforeClass + public static void setUp() { + classBeingTested = new JaccardSimilarity(); + } + + @Test + public void testGettingJaccardSimilarity() { + assertEquals(0.00d, classBeingTested.apply("", ""), 0.0d); + assertEquals(0.00d, classBeingTested.apply("left", ""), 0.0d); + assertEquals(0.00d, classBeingTested.apply("", "right"), 0.0d); + assertEquals(0.75d, classBeingTested.apply("frog", "fog"), 0.0d); + assertEquals(0.00d, classBeingTested.apply("fly", "ant"), 0.0d); + assertEquals(0.22d, classBeingTested.apply("elephant", "hippo"), 0.0d); + assertEquals(0.64d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.76d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); + assertEquals(0.89d, classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.9d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.13d, classBeingTested.apply("left", "right"), 0.0d); + assertEquals(0.13d, classBeingTested.apply("leettteft", "ritttght"), 0.0d); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardSimilarityNullNull() throws Exception { + classBeingTested.apply(null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardSimilarityStringNull() throws Exception { + classBeingTested.apply(" ", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGettingJaccardSimilarityNullString() throws Exception { + classBeingTested.apply(null, "right"); + } +}