Repository: commons-text
Updated Branches:
  refs/heads/master 8bda46197 -> 6f6da3467


TEXT-26: adding JaroWinkler changes from LUCENE-1297


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/7059ba3a
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/7059ba3a
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/7059ba3a

Branch: refs/heads/master
Commit: 7059ba3a4c746a867192d3d7044871e8d416b204
Parents: 8bda461
Author: Rob Tompkins <chtom...@gmail.com>
Authored: Mon Dec 26 11:31:31 2016 -0500
Committer: Rob Tompkins <chtom...@gmail.com>
Committed: Mon Dec 26 11:31:31 2016 -0500

----------------------------------------------------------------------
 .../text/similarity/JaroWinklerDistance.java    | 320 ++++---------------
 .../similarity/JaroWinklerDistanceTest.java     |   8 +-
 2 files changed, 60 insertions(+), 268 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/7059ba3a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java 
b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index 403d8a7..33c6a2c 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -16,6 +16,8 @@
  */
 package org.apache.commons.text.similarity;
 
+import java.util.Arrays;
+
 /**
  * A similarity algorithm indicating the percentage of matched characters 
between two character sequences.
  *
@@ -63,10 +65,10 @@ public class JaroWinklerDistance implements 
SimilarityScore<Double> {
      * distance.apply("hippo", "elephant") = 0.44
      * distance.apply("hippo", "zzzzzzzz") = 0.0
      * distance.apply("hello", "hallo")    = 0.88
-     * distance.apply("ABC Corporation", "ABC Corp") = 0.91
-     * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") 
= 0.93
-     * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens 
Fitness") = 0.94
-     * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.9
+     * distance.apply("ABC Corporation", "ABC Corp") = 0.93
+     * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") 
= 0.95
+     * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens 
Fitness") = 0.92
+     * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.88
      * </pre>
      *
      * @param left the first String, must not be null
@@ -83,287 +85,77 @@ public class JaroWinklerDistance implements 
SimilarityScore<Double> {
             throw new IllegalArgumentException("Strings must not be null");
         }
 
-        final double jaro = score(left, right);
-        final int cl = commonPrefixLength(left, right);
-        final double matchScore = Math.round((jaro + defaultScalingFactor
-                * cl * (1.0 - jaro)) * percentageRoundValue) / 
percentageRoundValue;
-
-        return matchScore;
-    }
-
-    /**
-     * Calculates the number of characters from the beginning of the strings
-     * that match exactly one-to-one, up to a maximum of four (4) characters.
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @return A number between 0 and 4.
-     */
-    private static int commonPrefixLength(final CharSequence first,
-            final CharSequence second) {
-        final int result = getCommonPrefix(first.toString(), second.toString())
-                .length();
-
-        // Limit the result to 4.
-        return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result;
-    }
-
-    /**
-     * Compares all Strings in an array and returns the initial sequence of
-     * characters that is common to all of them.
-     *
-     * <p>
-     * For example,
-     * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) 
-&gt; "i am a "</code>
-     * </p>
-     *
-     * <pre>
-     * getCommonPrefix(null) = ""
-     * getCommonPrefix(new String[] {}) = ""
-     * getCommonPrefix(new String[] {"abc"}) = "abc"
-     * getCommonPrefix(new String[] {null, null}) = ""
-     * getCommonPrefix(new String[] {"", ""}) = ""
-     * getCommonPrefix(new String[] {"", null}) = ""
-     * getCommonPrefix(new String[] {"abc", null, null}) = ""
-     * getCommonPrefix(new String[] {null, null, "abc"}) = ""
-     * getCommonPrefix(new String[] {"", "abc"}) = ""
-     * getCommonPrefix(new String[] {"abc", ""}) = ""
-     * getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
-     * getCommonPrefix(new String[] {"abc", "a"}) = "a"
-     * getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
-     * getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
-     * getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
-     * getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
-     * getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i 
am a "
-     * </pre>
-     *
-     * @param strs array of String objects, entries may be null
-     * @return the initial sequence of characters that are common to all 
Strings
-     *         in the array; empty String if the array is null, the elements 
are
-     *         all null or if there is no common prefix.
-     */
-    public static String getCommonPrefix(final String... strs) {
-        if (strs == null || strs.length == 0) {
-            return "";
-        }
-        final int smallestIndexOfDiff = indexOfDifference(strs);
-        if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
-            // all strings were identical
-            if (strs[0] == null) {
-                return "";
-            }
-            return strs[0];
-        } else if (smallestIndexOfDiff == 0) {
-            // there were no common initial characters
-            return "";
-        } else {
-            // we found a common initial character sequence
-            return strs[0].substring(0, smallestIndexOfDiff);
+        int[] mtp = matches(left, right);
+        double m = mtp[0];
+        if (m == 0) {
+            return 0D;
         }
+        double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / 
m)) / 3;
+        double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / 
mtp[3]) * mtp[2] * (1D - j);
+        return Math.round(jw * percentageRoundValue) / percentageRoundValue;
     }
 
     /**
-     * This method returns the Jaro-Winkler score for string matching.
+     * This method returns the Jaro-Winkler string matches, transpositions, 
prefix, max array.
      *
      * @param first the first string to be matched
      * @param second the second string to be machted
-     * @return matching score without scaling factor impact
+     * @return mtp array containing: matches, transpositions, prefix, and max 
length
      */
-    protected static double score(final CharSequence first,
-            final CharSequence second) {
-        String shorter;
-        String longer;
-
-        // Determine which String is longer.
+    protected static int[] matches(final CharSequence first, final 
CharSequence second) {
+        CharSequence max, min;
         if (first.length() > second.length()) {
-            longer = first.toString().toLowerCase();
-            shorter = second.toString().toLowerCase();
+            max = first;
+            min = second;
         } else {
-            longer = second.toString().toLowerCase();
-            shorter = first.toString().toLowerCase();
+            max = second;
+            min = first;
         }
-
-        // Calculate the half length() distance of the shorter String.
-        final int halflength = shorter.length() / 2 + 1;
-
-        // Find the set of matching characters between the shorter and longer
-        // strings. Note that
-        // the set of matching characters may be different depending on the
-        // order of the strings.
-        final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
-                halflength);
-        final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
-                halflength);
-
-        // If one or both of the sets of common characters is empty, then
-        // there is no similarity between the two strings.
-        if (m1.length() == 0 || m2.length() == 0) {
-            return 0.0;
+        int range = Math.max(max.length() / 2 - 1, 0);
+        int[] matchIndexes = new int[min.length()];
+        Arrays.fill(matchIndexes, -1);
+        boolean[] matchFlags = new boolean[max.length()];
+        int matches = 0;
+        for (int mi = 0; mi < min.length(); mi++) {
+            char c1 = min.charAt(mi);
+            for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 
1, max.length()); xi < xn; xi++) {
+                if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+                    matchIndexes[mi] = xi;
+                    matchFlags[xi] = true;
+                    matches++;
+                    break;
+                }
+            }
         }
-
-        // If the set of common characters is not the same size, then
-        // there is no similarity between the two strings, either.
-        if (m1.length() != m2.length()) {
-            return 0.0;
+        char[] ms1 = new char[matches];
+        char[] ms2 = new char[matches];
+        for (int i = 0, si = 0; i < min.length(); i++) {
+            if (matchIndexes[i] != -1) {
+                ms1[si] = min.charAt(i);
+                si++;
+            }
+        }
+        for (int i = 0, si = 0; i < max.length(); i++) {
+            if (matchFlags[i]) {
+                ms2[si] = max.charAt(i);
+                si++;
+            }
         }
-
-        // Calculate the number of transposition between the two sets
-        // of common characters.
-        final int transpositions = transpositions(m1, m2);
-
-        final double defaultDenominator = 3.0;
-
-        // Calculate the distance.
-        final double dist = (m1.length() / ((double) shorter.length())
-                + m2.length() / ((double) longer.length()) + (m1.length() - 
transpositions)
-                / ((double) m1.length())) / defaultDenominator;
-        return dist;
-    }
-
-    /**
-     * Calculates the number of transposition between two strings.
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @return The number of transposition between the two strings.
-     */
-    protected static int transpositions(final CharSequence first,
-            final CharSequence second) {
         int transpositions = 0;
-        for (int i = 0; i < first.length(); i++) {
-            if (first.charAt(i) != second.charAt(i)) {
+        for (int mi = 0; mi < ms1.length; mi++) {
+            if (ms1[mi] != ms2[mi]) {
                 transpositions++;
             }
         }
-        return transpositions / 2;
-    }
-
-    /**
-     * Compares all CharSequences in an array and returns the index at which 
the
-     * CharSequences begin to differ.
-     *
-     * <p>
-     * For example,
-     * <code>indexOfDifference(new String[] {"i am a machine", "i am a 
robot"}) -&gt; 7</code>
-     * </p>
-     *
-     * <pre>
-     * distance.indexOfDifference(null) = -1
-     * distance.indexOfDifference(new String[] {}) = -1
-     * distance.indexOfDifference(new String[] {"abc"}) = -1
-     * distance.indexOfDifference(new String[] {null, null}) = -1
-     * distance.indexOfDifference(new String[] {"", ""}) = -1
-     * distance.indexOfDifference(new String[] {"", null}) = 0
-     * distance.indexOfDifference(new String[] {"abc", null, null}) = 0
-     * distance.indexOfDifference(new String[] {null, null, "abc"}) = 0
-     * distance.indexOfDifference(new String[] {"", "abc"}) = 0
-     * distance.indexOfDifference(new String[] {"abc", ""}) = 0
-     * distance.indexOfDifference(new String[] {"abc", "abc"}) = -1
-     * distance.indexOfDifference(new String[] {"abc", "a"}) = 1
-     * distance.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
-     * distance.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
-     * distance.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
-     * distance.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
-     * distance.indexOfDifference(new String[] {"i am a machine", "i am a 
robot"}) = 7
-     * </pre>
-     *
-     * @param css array of CharSequences, entries may be null
-     * @return the index where the strings begin to differ; -1 if they are all
-     *         equal
-     */
-    protected static int indexOfDifference(final CharSequence... css) {
-        if (css == null || css.length <= 1) {
-            return INDEX_NOT_FOUND;
-        }
-        boolean anyStringNull = false;
-        boolean allStringsNull = true;
-        final int arrayLen = css.length;
-        int shortestStrLen = Integer.MAX_VALUE;
-        int longestStrLen = 0;
-
-        // find the min and max string lengths; this avoids checking to make
-        // sure we are not exceeding the length of the string each time through
-        // the bottom loop.
-        for (int i = 0; i < arrayLen; i++) {
-            if (css[i] == null) {
-                anyStringNull = true;
-                shortestStrLen = 0;
+        int prefix = 0;
+        for (int mi = 0; mi < min.length(); mi++) {
+            if (first.charAt(mi) == second.charAt(mi)) {
+                prefix++;
             } else {
-                allStringsNull = false;
-                shortestStrLen = Math.min(css[i].length(), shortestStrLen);
-                longestStrLen = Math.max(css[i].length(), longestStrLen);
-            }
-        }
-
-        // handle lists containing all nulls or all empty strings
-        if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
-            return INDEX_NOT_FOUND;
-        }
-
-        // handle lists containing some nulls or some empty strings
-        if (shortestStrLen == 0) {
-            return 0;
-        }
-
-        // find the position with the first difference across all strings
-        int firstDiff = -1;
-        for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
-            final char comparisonChar = css[0].charAt(stringPos);
-            for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
-                if (css[arrayPos].charAt(stringPos) != comparisonChar) {
-                    firstDiff = stringPos;
-                    break;
-                }
-            }
-            if (firstDiff != -1) {
                 break;
             }
         }
-
-        if (firstDiff == -1 && shortestStrLen != longestStrLen) {
-            // we compared all of the characters up to the length of the
-            // shortest string and didn't find a match, but the string lengths
-            // vary, so return the length of the shortest string.
-            return shortestStrLen;
-        }
-        return firstDiff;
-    }
-
-    /**
-     * Gets a set of matching characters between two strings.
-     *
-     * <p>
-     * Two characters from the first string and the second string are
-     * considered matching if the character's respective positions are no
-     * farther than the limit value.
-     * </p>
-     *
-     * @param first The first string.
-     * @param second The second string.
-     * @param limit The maximum distance to consider.
-     * @return A string contain the set of common characters.
-     */
-    protected static String getSetOfMatchingCharacterWithin(
-            final CharSequence first, final CharSequence second, final int 
limit) {
-        final StringBuilder common = new StringBuilder();
-        final StringBuilder copy = new StringBuilder(second);
-
-        for (int i = 0; i < first.length(); i++) {
-            final char ch = first.charAt(i);
-            boolean found = false;
-
-            // See if the character is within the limit positions away from the
-            // original position of that character.
-            for (int j = Math.max(0, i - limit); !found
-                    && j < Math.min(i + limit, second.length()); j++) {
-                if (copy.charAt(j) == ch) {
-                    found = true;
-                    common.append(ch);
-                    copy.setCharAt(j, '*');
-                }
-            }
-        }
-        return common.toString();
+        return new int[] { matches, transpositions / 2, prefix, max.length() };
     }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/7059ba3a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java 
b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index 7548e6e..84276f1 100644
--- 
a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ 
b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -38,10 +38,10 @@ public class JaroWinklerDistanceTest {
         assertEquals(0.93d, (double) distance.apply("frog", "fog"), 0.0d);
         assertEquals(0.0d, (double) distance.apply("fly", "ant"), 0.0d);
         assertEquals(0.44d, (double) distance.apply("elephant", "hippo"), 
0.0d);
-        assertEquals(0.91d, (double) distance.apply("ABC Corporation", "ABC 
Corp"), 0.0d);
-        assertEquals(0.93d, (double) distance.apply("D N H Enterprises Inc", 
"D & H Enterprises, Inc."), 0.0d);
-        assertEquals(0.94d, (double) distance.apply("My Gym Children's Fitness 
Center", "My Gym. Childrens Fitness"), 0.0d);
-        assertEquals(0.9d, (double) distance.apply("PENNSYLVANIA", 
"PENNCISYLVNIA"), 0.0d);
+        assertEquals(0.93d, (double) distance.apply("ABC Corporation", "ABC 
Corp"), 0.0d);
+        assertEquals(0.95d, (double) distance.apply("D N H Enterprises Inc", 
"D & H Enterprises, Inc."), 0.0d);
+        assertEquals(0.92d, (double) distance.apply("My Gym Children's Fitness 
Center", "My Gym. Childrens Fitness"), 0.0d);
+        assertEquals(0.88d, (double) distance.apply("PENNSYLVANIA", 
"PENNCISYLVNIA"), 0.0d);
     }
 
     @Test(expected = IllegalArgumentException.class)

Reply via email to