This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit 639e4abb7fa57319311277753eb9f6aabfa2cf30 Author: Alex Herbert <aherb...@apache.org> AuthorDate: Sun Mar 10 13:22:14 2019 +0000 TEXT-155: Add a generic IntersectionSimilarity measure --- .../text/similarity/IntersectionResult.java | 118 +++++++ .../text/similarity/IntersectionSimilarity.java | 237 ++++++++++++++ .../text/similarity/IntersectionResultTest.java | 153 ++++++++++ .../similarity/IntersectionSimilarityTest.java | 340 +++++++++++++++++++++ 4 files changed, 848 insertions(+) diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java new file mode 100644 index 0000000..1034c69 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Objects; + +/** + * Represents the intersection result between two sets. + * + * <p>Stores the size of set A, set B and the intersection of A and B + * (<code>|A ∩ B|</code>).</p> + * + * <p>This class is immutable.</p> + * + * @since 1.7 + * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a> + */ +public class IntersectionResult { + /** + * The size of set A. + */ + private final int sizeA; + /** + * The size of set B. + */ + private final int sizeB; + /** + * The size of the intersection between set A and B. + */ + private final int intersection; + + /** + * Create the results for an intersection between two sets. + * + * @param sizeA the size of set A ({@code |A|}) + * @param sizeB the size of set B ({@code |B|}) + * @param intersection the size of the intersection of A and B (<code>|A ∩ B|</code>) + * @throws IllegalArgumentException if the sizes are negative or the intersection is greater + * than the minimum of the two set sizes + */ + public IntersectionResult(final int sizeA, final int sizeB, final int intersection) { + if (sizeA < 0) { + throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA); + } + if (sizeB < 0) { + throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB); + } + if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) { + throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection); + } + this.sizeA = sizeA; + this.sizeB = sizeB; + this.intersection = intersection; + } + + /** + * Get the size of set A. + * + * @return |A| + */ + public int getSizeA() { + return sizeA; + } + + /** + * Get the size of set B. + * + * @return |B| + */ + public int getSizeB() { + return sizeB; + } + + /** + * Get the size of the intersection between set A and B. + * + * @return <code>|A ∩ B|</code> + */ + public int getIntersection() { + return intersection; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final IntersectionResult result = (IntersectionResult) o; + return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection; + } + + @Override + public int hashCode() { + return Objects.hash(sizeA, sizeB, intersection); + } + + @Override + public String toString() { + return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection; + } +} diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java new file mode 100644 index 0000000..bbdd7b8 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.function.Function; + +/** + * Measures the intersection of two sets created from a pair of character sequences. + * + * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage + * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements + * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p> + * + * @param <T> the type of the elements extracted from the character sequence + * @since 1.7 + * @see Set + * @see HashMap + */ +public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> { + /** The converter used to create the elements from the characters. */ + private final Function<CharSequence, Collection<T>> converter; + + // The following is adapted from commons-collections for a Bag. + // A Bag is a collection that can store the count of the number + // of copies of each element. + + /** + * Mutable counter class for storing the count of elements. + */ + private static class BagCount { + /** The count. This is initialised to 1 upon construction. */ + int count = 1; + } + + /** + * A minimal implementation of a Bag that can store elements and a count. + * + * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not + * even have to know its own size. + */ + private class TinyBag { + /** The backing map. */ + private final Map<T, BagCount> map; + + /** + * Create a new tiny bag. + * + * @param initialCapacity the initial capacity + */ + TinyBag(int initialCapacity) { + map = new HashMap<>(initialCapacity); + } + + /** + * Adds a new element to the bag, incrementing its count in the underlying map. + * + * @param object the object to add + */ + void add(T object) { + final BagCount mut = map.get(object); + if (mut == null) { + map.put(object, new BagCount()); + } else { + mut.count++; + } + } + + /** + * Returns the number of occurrence of the given element in this bag by + * looking up its count in the underlying map. + * + * @param object the object to search for + * @return the number of occurrences of the object, zero if not found + */ + int getCount(final Object object) { + final BagCount count = map.get(object); + if (count != null) { + return count.count; + } + return 0; + } + + /** + * Returns a Set view of the mappings contained in this bag. + * + * @return the Set view + */ + Set<Entry<T, BagCount>> entrySet() { + return map.entrySet(); + } + + /** + * Get the number of unique elements in the bag. + * + * @return the unique element size + */ + int uniqueElementSize() { + return map.size(); + } + } + + /** + * Create a new intersection similarity using the provided converter. + * + * <p>If the converter returns a {@link Set} then the intersection result will + * not include duplicates. Any other {@link Collection} is used to produce a result + * that will include duplicates in the intersect and union. + * + * @param converter the converter used to create the elements from the characters + * @throws IllegalArgumentException if the converter is null + */ + public IntersectionSimilarity(Function<CharSequence, Collection<T>> converter) { + if (converter == null) { + throw new IllegalArgumentException("Converter must not be null"); + } + this.converter = converter; + } + + /** + * Calculates the intersection of two character sequences passed as input. + * + * @param left first character sequence + * @param right second character sequence + * @return the intersection result + * @throws IllegalArgumentException if either input sequence is {@code null} + */ + @Override + public IntersectionResult apply(final CharSequence left, final CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + + // Create the elements from the sequences + final Collection<T> objectsA = converter.apply(left); + final Collection<T> objectsB = converter.apply(right); + final int sizeA = objectsA.size(); + final int sizeB = objectsB.size(); + + // Short-cut if either collection is empty + if (Math.min(sizeA, sizeB) == 0) { + // No intersection + return new IntersectionResult(sizeA, sizeB, 0); + } + + // Intersection = count the number of shared elements + int intersection; + if (objectsA instanceof Set && objectsB instanceof Set) { + // If a Set then the elements will only have a count of 1. + // Iterate over the smaller set. + intersection = (sizeA < sizeB) + ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB) + : getIntersection((Set<T>) objectsB, (Set<T>) objectsA); + } else { + // Create a bag for each collection + final TinyBag bagA = toBag(objectsA); + final TinyBag bagB = toBag(objectsB); + // Iterate over the smaller number of unique elements + intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize()) + ? getIntersection(bagA, bagB) + : getIntersection(bagB, bagA); + } + + return new IntersectionResult(sizeA, sizeB, intersection); + } + + /** + * Convert the collection to a bag. The bag will contain the count of each element + * in the collection. + * + * @param objects the objects + * @return the bag + */ + private TinyBag toBag(Collection<T> objects) { + final TinyBag bag = new TinyBag(objects.size()); + for (T t : objects) { + bag.add(t); + } + return bag; + } + + /** + * Compute the intersection between two sets. This is the count of all the elements + * that are within both sets. + * + * @param <T> the type of the elements in the set + * @param setA the set A + * @param setB the set B + * @return the intersection + */ + private static <T> int getIntersection(Set<T> setA, Set<T> setB) { + int intersection = 0; + for (T element : setA) { + if (setB.contains(element)) { + intersection++; + } + } + return intersection; + } + + /** + * Compute the intersection between two bags. This is the sum of the minimum + * count of each element that is within both sets. + * + * @param bagA the bag A + * @param bagB the bag B + * @return the intersection + */ + private int getIntersection(TinyBag bagA, TinyBag bagB) { + int intersection = 0; + for (Entry<T, BagCount> entry : bagA.entrySet()) { + final T element = entry.getKey(); + final int count = entry.getValue().count; + // The intersection of this entry in both bags is the minimum count + intersection += Math.min(count, bagB.getCount(element)); + } + return intersection; + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java new file mode 100644 index 0000000..c2477e0 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.concurrent.ThreadLocalRandom; + +/** + * Unit tests for {@link IntersectionResult}. + */ +public class IntersectionResultTest { + @Test + public void testNewIntersectionResult_WithZeros() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = 0; + new IntersectionResult(sizeA, sizeB, intersection); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeA() { + final int sizeA = -1; + final int sizeB = 0; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeB() { + final int sizeA = 0; + final int sizeB = -1; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeIntersection() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = -1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() { + final int sizeA = 1; + final int sizeB = 2; + final int intersection = Math.max(sizeA, sizeB) + 1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeB, sizeA, intersection); + }); + } + + @Test + public void testProperties() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 1024; + for (int i = 0; i < 5; i++) { + // Ensure the min is above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + Assertions.assertEquals(sizeA, result.getSizeA()); + Assertions.assertEquals(sizeB, result.getSizeB()); + Assertions.assertEquals(intersection, result.getIntersection()); + } + } + + @Test + public void testEquals() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(0, 0, 0), + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + + // Test a different instance with same values + Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0))); + + final Object something = new Object(); + for (int i = 0; i < results.length; i++) { + Assertions.assertFalse(results[i].equals(something)); + Assertions.assertFalse(results[i].equals(null)); + for (int j = 0; j < results.length; j++) { + Assertions.assertTrue(results[i].equals(results[j]) == (i == j)); + } + } + } + + @Test + public void testHashCode() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + final HashMap<IntersectionResult, Integer> map = new HashMap<>(); + final int offset = 123; + for (int i = 0; i < results.length; i++) { + map.put(results[i], i + offset); + } + for (int i = 0; i < results.length; i++) { + Assertions.assertEquals(i + offset, map.get(results[i])); + } + } + + @Test + public void testToString() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 9; + for (int i = 0; i < 5; i++) { + // Ensure the min is above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + final String string = result.toString(); + // Not perfect as this will match substrings too. The chance of error + // is limited by restricting the numbers to a max of 10. + Assertions.assertTrue(string.contains(String.valueOf(sizeA))); + Assertions.assertTrue(string.contains(String.valueOf(sizeB))); + Assertions.assertTrue(string.contains(String.valueOf(intersection))); + } + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java new file mode 100644 index 0000000..2af8c8e --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Function; +import java.util.regex.Pattern; + +/** + * Unit tests for {@link IntersectionSimilarity}. + */ +public class IntersectionSimilarityTest { + @Test + public void testIntersectionUsingSetCharacter() { + // Compute using single characters. + // This test uses a set and so should not allow duplicates. + final IntersectionSimilarity<Character> similarity = + new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet); + + // Expected: + // size A or B = count of unique characters (exclude duplicates) + // intersection = count of unique matching characters (exclude duplicates) + assertIntersection(similarity, "", "", 0, 0, 0); + assertIntersection(similarity, "a", "", 1, 0, 0); + assertIntersection(similarity, "a", "a", 1, 1, 1); + assertIntersection(similarity, "a", "b", 1, 1, 0); + assertIntersection(similarity, "aa", "ab", 1, 2, 1); + assertIntersection(similarity, "ab", "ab", 2, 2, 2); + assertIntersection(similarity, "aaba", "abaa", 2, 2, 2); + assertIntersection(similarity, "aaaa", "aa", 1, 1, 1); + assertIntersection(similarity, "aa", "aaaa", 1, 1, 1); + assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1); + assertIntersection(similarity, "aabab", "ababa", 2, 2, 2); + assertIntersection(similarity, "the same", "the same", 7, 7, 7); + assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11); + } + + @Test + public void testIntersectionUsingListCharacter() { + // Compute using single characters. + // This test uses a list and so duplicates should be matched. + final IntersectionSimilarity<Character> similarity = + new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList); + + // Expected: + // size A or B = sequence length + // intersection = count of matching characters (include duplicates) + assertIntersection(similarity, "", "", 0, 0, 0); + assertIntersection(similarity, "a", "", 1, 0, 0); + assertIntersection(similarity, "a", "a", 1, 1, 1); + assertIntersection(similarity, "a", "b", 1, 1, 0); + assertIntersection(similarity, "aa", "ab", 2, 2, 1); + assertIntersection(similarity, "ab", "ab", 2, 2, 2); + assertIntersection(similarity, "aaba", "abaa", 4, 4, 4); + assertIntersection(similarity, "aaaa", "aa", 4, 2, 2); + assertIntersection(similarity, "aa", "aaaa", 2, 4, 2); + assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3); + assertIntersection(similarity, "aabab", "ababa", 5, 5, 5); + assertIntersection(similarity, "the same", "the same", 8, 8, 8); + assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11); + } + + @Test + public void testIntersectionUsingSetBigrams() { + // Compute using pairs of characters (bigrams). + // This can be done using a 32-bit int to store two 16-bit characters. + // This test uses a set and so should not allow duplicates. + final IntersectionSimilarity<Integer> similarity = + new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet); + + // Expected: + // size A or B = count of unique bigrams (exclude duplicates) + // intersection = count of unique matching bigrams (exclude duplicates) + assertIntersection(similarity, "", "", 0, 0, 0); + assertIntersection(similarity, "a", "", 0, 0, 0); + assertIntersection(similarity, "a", "a", 0, 0, 0); + assertIntersection(similarity, "a", "b", 0, 0, 0); + assertIntersection(similarity, "aa", "ab", 1, 1, 0); + assertIntersection(similarity, "ab", "ab", 1, 1, 1); + assertIntersection(similarity, "aaba", "abaa", 3, 3, 3); + assertIntersection(similarity, "aaaa", "aa", 1, 1, 1); + assertIntersection(similarity, "aa", "aaaa", 1, 1, 1); + assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1); + assertIntersection(similarity, "aabab", "ababa", 3, 2, 2); + assertIntersection(similarity, "the same", "the same", 7, 7, 7); + assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8); + } + + @Test + public void testIntersectionUsingListBigrams() { + // Compute using pairs of characters (bigrams). + // This can be done using a 32-bit int to store two 16-bit characters. + // This test uses a list and so duplicates should be matched. + final IntersectionSimilarity<Integer> similarity = + new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList); + + // Expected: + // size A or B = sequence length - 1 + // intersection = count of matching bigrams (include duplicates) + assertIntersection(similarity, "", "", 0, 0, 0); + assertIntersection(similarity, "a", "", 0, 0, 0); + assertIntersection(similarity, "a", "a", 0, 0, 0); + assertIntersection(similarity, "a", "b", 0, 0, 0); + assertIntersection(similarity, "aa", "ab", 1, 1, 0); + assertIntersection(similarity, "ab", "ab", 1, 1, 1); + assertIntersection(similarity, "aaba", "abaa", 3, 3, 3); + assertIntersection(similarity, "aaaa", "aa", 3, 1, 1); + assertIntersection(similarity, "aa", "aaaa", 1, 3, 1); + assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2); + assertIntersection(similarity, "aabab", "ababa", 4, 4, 3); + assertIntersection(similarity, "the same", "the same", 7, 7, 7); + assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8); + } + + @Test + public void testIntersectionUsingSetCharacterListCharacter() { + // Compute using a custom converter that returns a Set and a List. + // This is an edge-case test. + final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>(); + final String sequence1 = "aabbccdd"; + final String sequence2 = "aaaaaabbbfffff"; + converter.put(sequence1, toCharacterSet(sequence1)); + converter.put(sequence2, toCharacterList(sequence2)); + final IntersectionSimilarity<Character> similarity = + new IntersectionSimilarity<>(converter::get); + + // Expected: + // size A = count of unique characters (exclude duplicates) + // size B = sequence length + // intersection = count of matching characters (exclude duplicates) + assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2); + assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2); + } + + /** + * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s. + * + * @param sequence the sequence + * @return the set + */ + private static Set<Character> toCharacterSet(CharSequence sequence) { + final int length = sequence.length(); + final Set<Character> set = new HashSet<>(length); + for (int i = 0; i < length; i++) { + set.add(sequence.charAt(i)); + } + return set; + } + + /** + * Convert the {@link CharSequence} to a {@link List} of {@link Character}s. + * + * @param sequence the sequence + * @return the list + */ + private static List<Character> toCharacterList(CharSequence sequence) { + final int length = sequence.length(); + final List<Character> list = new ArrayList<>(length); + for (int i = 0; i < length; i++) { + list.add(sequence.charAt(i)); + } + return list; + } + + /** + * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters). + * These are represented using 2 16-bit chars packed into a 32-bit int. + * + * @param sequence the sequence + * @return the set + */ + private static Set<Integer> toBigramSet(CharSequence sequence) { + final int length = sequence.length(); + final Set<Integer> set = new HashSet<>(length); + if (length > 1) { + char ch2 = sequence.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = sequence.charAt(i); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return set; + } + + /** + * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters). + * These are represented using 2 16-bit chars packed into a 32-bit int. + * + * @param sequence the sequence + * @return the list + */ + private static List<Integer> toBigramList(CharSequence sequence) { + final int length = sequence.length(); + final List<Integer> list = new ArrayList<>(length); + if (length > 1) { + char ch2 = sequence.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = sequence.charAt(i); + list.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return list; + } + + private static <T> void assertIntersection(IntersectionSimilarity<T> similarity, CharSequence cs1, CharSequence cs2, + int sizeA, int sizeB, int intersection) { + final IntersectionResult result = similarity.apply(cs1, cs2); + assertEquals(sizeA, result.getSizeA(), "Size A error"); + assertEquals(sizeB, result.getSizeB(), "Size B error"); + assertEquals(intersection, result.getIntersection(), "Intersection error"); + } + + @Test + public void testF1ScoreUsingListWordBigrams() { + // Example of a word letter pairs algorithm by Simon White: + // http://www.catalysoft.com/articles/StrikeAMatch.html + // This splits into words using whitespace and then computes uppercase + // bigrams for each word. + + // Split on whitespace + final Pattern pattern = Pattern.compile("\\s+"); + + // Compute using pairs of characters (bigrams) for each word. + // This can be done using a 32-bit int to store two 16-bit characters + final Function<CharSequence, Collection<Integer>> converter = cs -> { + final List<Integer> set = new ArrayList<>(); + for (String word : pattern.split(cs)) { + if (word.length() > 1) { + // The strings are converted to upper case + char ch2 = Character.toUpperCase(word.charAt(0)); + for (int i = 1; i < word.length(); i++) { + final char ch1 = ch2; + ch2 = Character.toUpperCase(word.charAt(i)); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + } + return set; + }; + final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter); + + String bookTitle; + final String search1 = "Web Database Applications"; + final String search2 = "PHP Web Applications"; + final String search3 = "Web Aplications"; + bookTitle = "Web Database Applications with PHP & MySQL"; + assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Creating Database Web Applications with PHP and ASP"; + assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Building Database Applications on the Web Using PHP3"; + assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Building Web Database Applications with Visual Studio 6"; + assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Web Application Development With PHP"; + assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection"; + assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing"; + assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "How to Find a Scholarship Online"; + assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3))); + } + + private static int toF1ScorePercent(IntersectionResult result) { + final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB()); + // Convert to percentage + return (int) Math.round(value * 100); + } + + @Test + public void testConstructorWithNullConverterThrows() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(null); + }); + } + + @Test + public void testApplyNullNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null); + }); + } + + @Test + public void testApplyStringNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null); + }); + } + + @Test + public void testApplyNullString() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); + }); + } +}