Repository: commons-text Updated Branches: refs/heads/master 8bda46197 -> 6f6da3467
TEXT-26: adding JaroWinkler changes from LUCENE-1297 Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/7059ba3a Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/7059ba3a Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/7059ba3a Branch: refs/heads/master Commit: 7059ba3a4c746a867192d3d7044871e8d416b204 Parents: 8bda461 Author: Rob Tompkins <chtom...@gmail.com> Authored: Mon Dec 26 11:31:31 2016 -0500 Committer: Rob Tompkins <chtom...@gmail.com> Committed: Mon Dec 26 11:31:31 2016 -0500 ---------------------------------------------------------------------- .../text/similarity/JaroWinklerDistance.java | 320 ++++--------------- .../similarity/JaroWinklerDistanceTest.java | 8 +- 2 files changed, 60 insertions(+), 268 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/7059ba3a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java index 403d8a7..33c6a2c 100644 --- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java @@ -16,6 +16,8 @@ */ package org.apache.commons.text.similarity; +import java.util.Arrays; + /** * A similarity algorithm indicating the percentage of matched characters between two character sequences. * @@ -63,10 +65,10 @@ public class JaroWinklerDistance implements SimilarityScore<Double> { * distance.apply("hippo", "elephant") = 0.44 * distance.apply("hippo", "zzzzzzzz") = 0.0 * distance.apply("hello", "hallo") = 0.88 - * distance.apply("ABC Corporation", "ABC Corp") = 0.91 - * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93 - * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 - * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 + * distance.apply("ABC Corporation", "ABC Corp") = 0.93 + * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 + * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 + * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 * </pre> * * @param left the first String, must not be null @@ -83,287 +85,77 @@ public class JaroWinklerDistance implements SimilarityScore<Double> { throw new IllegalArgumentException("Strings must not be null"); } - final double jaro = score(left, right); - final int cl = commonPrefixLength(left, right); - final double matchScore = Math.round((jaro + defaultScalingFactor - * cl * (1.0 - jaro)) * percentageRoundValue) / percentageRoundValue; - - return matchScore; - } - - /** - * Calculates the number of characters from the beginning of the strings - * that match exactly one-to-one, up to a maximum of four (4) characters. - * - * @param first The first string. - * @param second The second string. - * @return A number between 0 and 4. - */ - private static int commonPrefixLength(final CharSequence first, - final CharSequence second) { - final int result = getCommonPrefix(first.toString(), second.toString()) - .length(); - - // Limit the result to 4. - return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result; - } - - /** - * Compares all Strings in an array and returns the initial sequence of - * characters that is common to all of them. - * - * <p> - * For example, - * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -> "i am a "</code> - * </p> - * - * <pre> - * getCommonPrefix(null) = "" - * getCommonPrefix(new String[] {}) = "" - * getCommonPrefix(new String[] {"abc"}) = "abc" - * getCommonPrefix(new String[] {null, null}) = "" - * getCommonPrefix(new String[] {"", ""}) = "" - * getCommonPrefix(new String[] {"", null}) = "" - * getCommonPrefix(new String[] {"abc", null, null}) = "" - * getCommonPrefix(new String[] {null, null, "abc"}) = "" - * getCommonPrefix(new String[] {"", "abc"}) = "" - * getCommonPrefix(new String[] {"abc", ""}) = "" - * getCommonPrefix(new String[] {"abc", "abc"}) = "abc" - * getCommonPrefix(new String[] {"abc", "a"}) = "a" - * getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab" - * getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab" - * getCommonPrefix(new String[] {"abcde", "xyz"}) = "" - * getCommonPrefix(new String[] {"xyz", "abcde"}) = "" - * getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a " - * </pre> - * - * @param strs array of String objects, entries may be null - * @return the initial sequence of characters that are common to all Strings - * in the array; empty String if the array is null, the elements are - * all null or if there is no common prefix. - */ - public static String getCommonPrefix(final String... strs) { - if (strs == null || strs.length == 0) { - return ""; - } - final int smallestIndexOfDiff = indexOfDifference(strs); - if (smallestIndexOfDiff == INDEX_NOT_FOUND) { - // all strings were identical - if (strs[0] == null) { - return ""; - } - return strs[0]; - } else if (smallestIndexOfDiff == 0) { - // there were no common initial characters - return ""; - } else { - // we found a common initial character sequence - return strs[0].substring(0, smallestIndexOfDiff); + int[] mtp = matches(left, right); + double m = mtp[0]; + if (m == 0) { + return 0D; } + double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3; + double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); + return Math.round(jw * percentageRoundValue) / percentageRoundValue; } /** - * This method returns the Jaro-Winkler score for string matching. + * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array. * * @param first the first string to be matched * @param second the second string to be machted - * @return matching score without scaling factor impact + * @return mtp array containing: matches, transpositions, prefix, and max length */ - protected static double score(final CharSequence first, - final CharSequence second) { - String shorter; - String longer; - - // Determine which String is longer. + protected static int[] matches(final CharSequence first, final CharSequence second) { + CharSequence max, min; if (first.length() > second.length()) { - longer = first.toString().toLowerCase(); - shorter = second.toString().toLowerCase(); + max = first; + min = second; } else { - longer = second.toString().toLowerCase(); - shorter = first.toString().toLowerCase(); + max = second; + min = first; } - - // Calculate the half length() distance of the shorter String. - final int halflength = shorter.length() / 2 + 1; - - // Find the set of matching characters between the shorter and longer - // strings. Note that - // the set of matching characters may be different depending on the - // order of the strings. - final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, - halflength); - final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, - halflength); - - // If one or both of the sets of common characters is empty, then - // there is no similarity between the two strings. - if (m1.length() == 0 || m2.length() == 0) { - return 0.0; + int range = Math.max(max.length() / 2 - 1, 0); + int[] matchIndexes = new int[min.length()]; + Arrays.fill(matchIndexes, -1); + boolean[] matchFlags = new boolean[max.length()]; + int matches = 0; + for (int mi = 0; mi < min.length(); mi++) { + char c1 = min.charAt(mi); + for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { + if (!matchFlags[xi] && c1 == max.charAt(xi)) { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } } - - // If the set of common characters is not the same size, then - // there is no similarity between the two strings, either. - if (m1.length() != m2.length()) { - return 0.0; + char[] ms1 = new char[matches]; + char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < min.length(); i++) { + if (matchIndexes[i] != -1) { + ms1[si] = min.charAt(i); + si++; + } + } + for (int i = 0, si = 0; i < max.length(); i++) { + if (matchFlags[i]) { + ms2[si] = max.charAt(i); + si++; + } } - - // Calculate the number of transposition between the two sets - // of common characters. - final int transpositions = transpositions(m1, m2); - - final double defaultDenominator = 3.0; - - // Calculate the distance. - final double dist = (m1.length() / ((double) shorter.length()) - + m2.length() / ((double) longer.length()) + (m1.length() - transpositions) - / ((double) m1.length())) / defaultDenominator; - return dist; - } - - /** - * Calculates the number of transposition between two strings. - * - * @param first The first string. - * @param second The second string. - * @return The number of transposition between the two strings. - */ - protected static int transpositions(final CharSequence first, - final CharSequence second) { int transpositions = 0; - for (int i = 0; i < first.length(); i++) { - if (first.charAt(i) != second.charAt(i)) { + for (int mi = 0; mi < ms1.length; mi++) { + if (ms1[mi] != ms2[mi]) { transpositions++; } } - return transpositions / 2; - } - - /** - * Compares all CharSequences in an array and returns the index at which the - * CharSequences begin to differ. - * - * <p> - * For example, - * <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -> 7</code> - * </p> - * - * <pre> - * distance.indexOfDifference(null) = -1 - * distance.indexOfDifference(new String[] {}) = -1 - * distance.indexOfDifference(new String[] {"abc"}) = -1 - * distance.indexOfDifference(new String[] {null, null}) = -1 - * distance.indexOfDifference(new String[] {"", ""}) = -1 - * distance.indexOfDifference(new String[] {"", null}) = 0 - * distance.indexOfDifference(new String[] {"abc", null, null}) = 0 - * distance.indexOfDifference(new String[] {null, null, "abc"}) = 0 - * distance.indexOfDifference(new String[] {"", "abc"}) = 0 - * distance.indexOfDifference(new String[] {"abc", ""}) = 0 - * distance.indexOfDifference(new String[] {"abc", "abc"}) = -1 - * distance.indexOfDifference(new String[] {"abc", "a"}) = 1 - * distance.indexOfDifference(new String[] {"ab", "abxyz"}) = 2 - * distance.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2 - * distance.indexOfDifference(new String[] {"abcde", "xyz"}) = 0 - * distance.indexOfDifference(new String[] {"xyz", "abcde"}) = 0 - * distance.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7 - * </pre> - * - * @param css array of CharSequences, entries may be null - * @return the index where the strings begin to differ; -1 if they are all - * equal - */ - protected static int indexOfDifference(final CharSequence... css) { - if (css == null || css.length <= 1) { - return INDEX_NOT_FOUND; - } - boolean anyStringNull = false; - boolean allStringsNull = true; - final int arrayLen = css.length; - int shortestStrLen = Integer.MAX_VALUE; - int longestStrLen = 0; - - // find the min and max string lengths; this avoids checking to make - // sure we are not exceeding the length of the string each time through - // the bottom loop. - for (int i = 0; i < arrayLen; i++) { - if (css[i] == null) { - anyStringNull = true; - shortestStrLen = 0; + int prefix = 0; + for (int mi = 0; mi < min.length(); mi++) { + if (first.charAt(mi) == second.charAt(mi)) { + prefix++; } else { - allStringsNull = false; - shortestStrLen = Math.min(css[i].length(), shortestStrLen); - longestStrLen = Math.max(css[i].length(), longestStrLen); - } - } - - // handle lists containing all nulls or all empty strings - if (allStringsNull || longestStrLen == 0 && !anyStringNull) { - return INDEX_NOT_FOUND; - } - - // handle lists containing some nulls or some empty strings - if (shortestStrLen == 0) { - return 0; - } - - // find the position with the first difference across all strings - int firstDiff = -1; - for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) { - final char comparisonChar = css[0].charAt(stringPos); - for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) { - if (css[arrayPos].charAt(stringPos) != comparisonChar) { - firstDiff = stringPos; - break; - } - } - if (firstDiff != -1) { break; } } - - if (firstDiff == -1 && shortestStrLen != longestStrLen) { - // we compared all of the characters up to the length of the - // shortest string and didn't find a match, but the string lengths - // vary, so return the length of the shortest string. - return shortestStrLen; - } - return firstDiff; - } - - /** - * Gets a set of matching characters between two strings. - * - * <p> - * Two characters from the first string and the second string are - * considered matching if the character's respective positions are no - * farther than the limit value. - * </p> - * - * @param first The first string. - * @param second The second string. - * @param limit The maximum distance to consider. - * @return A string contain the set of common characters. - */ - protected static String getSetOfMatchingCharacterWithin( - final CharSequence first, final CharSequence second, final int limit) { - final StringBuilder common = new StringBuilder(); - final StringBuilder copy = new StringBuilder(second); - - for (int i = 0; i < first.length(); i++) { - final char ch = first.charAt(i); - boolean found = false; - - // See if the character is within the limit positions away from the - // original position of that character. - for (int j = Math.max(0, i - limit); !found - && j < Math.min(i + limit, second.length()); j++) { - if (copy.charAt(j) == ch) { - found = true; - common.append(ch); - copy.setCharAt(j, '*'); - } - } - } - return common.toString(); + return new int[] { matches, transpositions / 2, prefix, max.length() }; } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/7059ba3a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java index 7548e6e..84276f1 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java @@ -38,10 +38,10 @@ public class JaroWinklerDistanceTest { assertEquals(0.93d, (double) distance.apply("frog", "fog"), 0.0d); assertEquals(0.0d, (double) distance.apply("fly", "ant"), 0.0d); assertEquals(0.44d, (double) distance.apply("elephant", "hippo"), 0.0d); - assertEquals(0.91d, (double) distance.apply("ABC Corporation", "ABC Corp"), 0.0d); - assertEquals(0.93d, (double) distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); - assertEquals(0.94d, (double) distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); - assertEquals(0.9d, (double) distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.93d, (double) distance.apply("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.95d, (double) distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); + assertEquals(0.92d, (double) distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.88d, (double) distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); } @Test(expected = IllegalArgumentException.class)