Author: britter
Date: Wed May 7 18:42:33 2014
New Revision: 1593112
URL: http://svn.apache.org/r1593112
Log:
LANG-999: Add fuzzy String matching logic to StringUtils. This also closes #20
from github. Thanks to Ben Ripkens.
Modified:
commons/proper/lang/trunk/src/changes/changes.xml
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Modified: commons/proper/lang/trunk/src/changes/changes.xml
URL:
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/changes/changes.xml?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/changes/changes.xml [utf-8] (original)
+++ commons/proper/lang/trunk/src/changes/changes.xml [utf-8] Wed May 7
18:42:33 2014
@@ -22,6 +22,7 @@
<body>
<release version="3.4" date="tba" description="tba">
+ <action issue="LANG-999" type="add" dev="britter" due-to="Ben Ripkens">Add
fuzzy String matching logic to StringUtils</action>
<action issue="LANG-1006" type="update" dev="britter" due-to="Thiago
Andrade">Add wrap (with String or char) to StringUtils</action>
<action issue="LANG-1005" type="update" dev="britter" due-to="Michael
Osipov">Extend DurationFormatUtils#formatDurationISO default pattern to match
#formatDurationHMS</action>
<action issue="LANG-1007" type="update" dev="britter" due-to="Thiago
Andrade">Fixing NumberUtils JAVADoc comments for max methods</action>
Modified:
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
URL:
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
---
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
(original)
+++
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
Wed May 7 18:42:33 2014
@@ -7073,6 +7073,85 @@ public class StringUtils {
}
/**
+ * <p>Determine the fuzzy score which indicates the similarity between two
Strings.</p>
+ *
+ * <p>This string matching algorithm is similar to the algorithms of
editors such as Sublime Text,
+ * TextMate, Atom and others. One point is given for every matched
character. Subsequent
+ * matches yield two bonus points. A higher score indicates a higher
similarity.</p>
+ *
+ * <pre>
+ * StringUtils.getFuzzyDistance(null, null, null)
= IllegalArgumentException
+ * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)
= 0
+ * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)
= 0
+ * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)
= 1
+ * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)
= 1
+ * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)
= 2
+ * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)
= 4
+ * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf",
Locale.ENGLISH) = 3
+ * </pre>
+ *
+ * @param term a full term that should be matched against, must not be null
+ * @param query the query that will be matched against a term, must not be
null
+ * @param locale This string matching logic is case insensitive. A locale
is necessary to normalize
+ * both Strings to lower case.
+ * @return result score
+ * @throws IllegalArgumentException if either String input {@code null} or
Locale input {@code null}
+ * @since 3.4
+ */
+ public static int getFuzzyDistance(final CharSequence term, final
CharSequence query, final Locale locale) {
+ if (term == null || query == null) {
+ throw new IllegalArgumentException("Strings must not be null");
+ } else if (locale == null) {
+ throw new IllegalArgumentException("Locale must not be null");
+ }
+
+ // fuzzy logic is case insensitive. We normalize the Strings to lower
+ // case right from the start. Turning characters to lower case
+ // via Character.toLowerCase(char) is unfortunately insufficient
+ // as it does not accept a locale.
+ final String termLowerCase = term.toString().toLowerCase(locale);
+ final String queryLowerCase = query.toString().toLowerCase(locale);
+
+ // the resulting score
+ int score = 0;
+
+ // the position in the term which will be scanned next for potential
+ // query character matches
+ int termIndex = 0;
+
+ // index of the previously matched character in the term
+ int previousMatchingCharacterIndex = Integer.MIN_VALUE;
+
+ for (int queryIndex = 0; queryIndex < queryLowerCase.length();
queryIndex++) {
+ char queryChar = queryLowerCase.charAt(queryIndex);
+
+ boolean termCharacterMatchFound = false;
+ for (; termIndex < termLowerCase.length() &&
!termCharacterMatchFound; termIndex++) {
+ char termChar = termLowerCase.charAt(termIndex);
+
+ if (queryChar == termChar) {
+ // simple character matches result in one point
+ score++;
+
+ // subsequent character matches further improve
+ // the score.
+ if (previousMatchingCharacterIndex + 1 == termIndex) {
+ score += 2;
+ }
+
+ previousMatchingCharacterIndex = termIndex;
+
+ // we can leave the nested loop. Every character in the
+ // query can match at most one character in the term.
+ termCharacterMatchFound = true;
+ }
+ }
+ }
+
+ return score;
+ }
+
+ /**
* Gets a set of matching characters between two strings.
*
* <p><Two characters from the first string and the second string are
considered matching if the character's
Modified:
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
URL:
http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java?rev=1593112&r1=1593111&r2=1593112&view=diff
==============================================================================
---
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
(original)
+++
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Wed May 7 18:42:33 2014
@@ -2018,6 +2018,37 @@ public class StringUtilsTest {
StringUtils.getJaroWinklerDistance(null, "clear");
}
+ @Test
+ public void testGetFuzzyDistance() throws Exception {
+ assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));
+ assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b",
Locale.ENGLISH));
+ assertEquals(1, StringUtils.getFuzzyDistance("Room", "o",
Locale.ENGLISH));
+ assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w",
Locale.ENGLISH));
+ assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws",
Locale.ENGLISH));
+ assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo",
Locale.ENGLISH));
+ assertEquals(3, StringUtils.getFuzzyDistance("Apache Software
Foundation", "asf", Locale.ENGLISH));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_NullNullNull() throws Exception {
+ StringUtils.getFuzzyDistance(null, null, null);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
+ StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_NullStringLocale() throws Exception {
+ StringUtils.getFuzzyDistance(null, "clear", Locale.ENGLISH);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testGetFuzzyDistance_StringStringNull() throws Exception {
+ StringUtils.getFuzzyDistance(" ", "clear", null);
+ }
+
/**
* A sanity check for {@link StringUtils#EMPTY}.
*/