Fix [TEXT-130] JaroWinklerDistance: Wrong results because the transposition count was halved with integer division, truncating it before use in the floating-point Jaro formula
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/4d064dec Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/4d064dec Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/4d064dec Branch: refs/heads/master Commit: 4d064decbf7828918ca59b70d7fca19b7da955ec Parents: 70150fb Author: Jan Martin Keil <jan-martin.k...@uni-jena.de> Authored: Thu Aug 2 22:55:00 2018 +0200 Committer: Jan Martin Keil <jan-martin.k...@uni-jena.de> Committed: Thu Aug 2 22:55:00 2018 +0200 ---------------------------------------------------------------------- .../commons/text/similarity/JaroWinklerDistance.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d064dec/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java index 0ffb1ad..915cd5c 100644 --- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java @@ -85,17 +85,17 @@ public class JaroWinklerDistance implements SimilarityScore<Double> { if (m == 0) { return 0D; } - final double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3; + final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3; final double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); return jw; } /** - * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array. 
+ * This method returns the Jaro-Winkler string matches, half transpositions, prefix, max array. * * @param first the first string to be matched * @param second the second string to be matched - * @return mtp array containing: matches, transpositions, prefix, and max length + * @return mtp array containing: matches, half transpositions, prefix, and max length */ protected static int[] matches(final CharSequence first, final CharSequence second) { CharSequence max, min; @@ -136,10 +136,10 @@ public class JaroWinklerDistance implements SimilarityScore<Double> { si++; } } - int transpositions = 0; + int halfTranspositions = 0; for (int mi = 0; mi < ms1.length; mi++) { if (ms1[mi] != ms2[mi]) { - transpositions++; + halfTranspositions++; } } int prefix = 0; @@ -150,7 +150,7 @@ public class JaroWinklerDistance implements SimilarityScore<Double> { break; } } - return new int[] {matches, transpositions / 2, prefix, max.length()}; + return new int[] {matches, halfTranspositions, prefix, max.length()}; } }