http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java new file mode 100644 index 0000000..00b2689 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java @@ -0,0 +1,519 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Arrays; + +/** + * An algorithm for measuring the difference between two character sequences. + * + * <p> + * This is the number of changes needed to change one sequence into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * </p> + * + * @since 1.0 + */ +public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { + + /** + * Default instance. + */ + private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance(); + /** + * Threshold. + */ + private final Integer threshold; + + /** + * <p> + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. + * </p> + * + * @see LevenshteinDetailedDistance#getDefaultInstance() + */ + public LevenshteinDetailedDistance() { + this(null); + } + + /** + * If the threshold is not null, distance calculations will be limited to a maximum length. + * + * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> + * + * @param threshold If this is null then distances calculations will not be limited. This may not be negative. + */ + public LevenshteinDetailedDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * distance.apply(null, *) = IllegalArgumentException + * distance.apply(*, null) = IllegalArgumentException + * distance.apply("","") = 0 + * distance.apply("","a") = 1 + * distance.apply("aaapppp", "") = 7 + * distance.apply("frog", "fog") = 1 + * distance.apply("fly", "ant") = 3 + * distance.apply("elephant", "hippo") = 7 + * distance.apply("hippo", "elephant") = 7 + * distance.apply("hippo", "zzzzzzzz") = 8 + * distance.apply("hello", "hallo") = 1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public LevenshteinResults apply(final CharSequence left, final CharSequence right) { + if (threshold != null) { + return limitedCompare(left, right, threshold); + } + return unlimitedCompare(left, right); + } + + /** + * Gets the default instance. + * + * @return the default instace + */ + public static LevenshteinDetailedDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + /** + * Gets the distance threshold. + * + * @return the distance threshold + */ + public Integer getThreshold() { + return threshold; + } + + /** + * Find the Levenshtein distance between two CharSequences if it's less than or + * equal to a given threshold. + * + * <p> + * This implementation follows from Algorithms on Strings, Trees and + * Sequences by Dan Gusfield and Chas Emerick's implementation of the + * Levenshtein distance algorithm from <a + * href="http://www.merriampark.com/ld.htm" + * >http://www.merriampark.com/ld.htm</a> + * </p> + * + * <pre> + * limitedCompare(null, *, *) = IllegalArgumentException + * limitedCompare(*, null, *) = IllegalArgumentException + * limitedCompare(*, *, -1) = IllegalArgumentException + * limitedCompare("","", 0) = 0 + * limitedCompare("aaapppp", "", 8) = 7 + * limitedCompare("aaapppp", "", 7) = 7 + * limitedCompare("aaapppp", "", 6)) = -1 + * limitedCompare("elephant", "hippo", 7) = 7 + * limitedCompare("elephant", "hippo", 6) = -1 + * limitedCompare("hippo", "elephant", 7) = 7 + * limitedCompare("hippo", "elephant", 6) = -1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @param threshold the target threshold, must not be negative + * @return result distance, or -1 + */ + private static LevenshteinResults limitedCompare(CharSequence left, + CharSequence right, + final int threshold) { //NOPMD + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + + /* + * This implementation only computes the distance if it's less than or + * equal to the threshold value, returning -1 if it's greater. The + * advantage is performance: unbounded distance is O(nm), but a bound of + * k allows us to reduce it to O(km) time by only computing a diagonal + * stripe of width 2k + 1 of the cost table. It is also possible to use + * this to compute the unbounded Levenshtein distance by starting the + * threshold at 1 and doubling each time until the distance is found; + * this is O(dm), where d is the distance. + * + * One subtlety comes from needing to ignore entries on the border of + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry + * to the left of the leftmost member We must ignore the entry above the + * rightmost member + * + * Another subtlety comes from our stripe running off the matrix if the + * strings aren't of the same size. Since string s is always swapped to + * be the shorter of the two, the stripe will always run off to the + * upper right instead of the lower left of the matrix. + * + * As a concrete example, suppose s is of length 5, t is of length 7, + * and our threshold is 1. In this case we're going to walk a stripe of + * length 3. The matrix would look like so: + * + * <pre> + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + * </pre> + * + * Note how the stripe leads off the table as there is no possible way + * to turn a string of length 5 into one of length 7 in edit distance of + * 1. + * + * Additionally, this implementation decreases memory usage by using two + * single-dimensional arrays and swapping them back and forth instead of + * allocating an entire n by m matrix. This requires a few minor + * changes, such as immediately returning when it's detected that the + * stripe has run off the matrix and initially filling the arrays with + * large values so that entries we don't compute are ignored. + * + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for + * some discussion. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + // if one string is empty, the edit distance is necessarily the length of the other + if (n == 0) { + return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); + } else if (m == 0) { + return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); + } + + boolean swapped = false; + if (n > m) { + // swap the two strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + swapped = true; + } + + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; // placeholder to assist in swapping p and d + final int[][] matrix = new int[m + 1][n + 1]; + + //filling the first row and first column values in the matrix + for (int index = 0; index <= n; index++) { + matrix[0][index] = index; + } + for (int index = 0; index <= m; index++) { + matrix[index][0] = index; + } + + // fill in starting table values + final int boundary = Math.min(n, threshold) + 1; + for (int i = 0; i < boundary; i++) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); + + // iterates through t + for (int j = 1; j <= m; j++) { + final char rightJ = right.charAt(j - 1); // jth character of right + d[0] = j; + + // compute stripe indices, constrain to array size + final int min = Math.max(1, j - threshold); + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( + n, j + threshold); + + // the stripe may lead off of the table if s and t are of different sizes + if (min > max) { + return new LevenshteinResults(-1, 0, 0, 0); + } + + // ignore entry left of leftmost + if (min > 1) { + d[min - 1] = Integer.MAX_VALUE; + } + + // iterates through [min, max] in s + for (int i = min; i <= max; i++) { + if (left.charAt(i - 1) == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally left and up + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); + } + matrix[j][i] = d[i]; + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + + // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance + if (p[n] <= threshold) { + return findDetailedResults(left, right, matrix, swapped); + } + return new LevenshteinResults(-1, 0, 0, 0); + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * unlimitedCompare(null, *) = IllegalArgumentException + * unlimitedCompare(*, null) = IllegalArgumentException + * unlimitedCompare("","") = 0 + * unlimitedCompare("","a") = 1 + * unlimitedCompare("aaapppp", "") = 7 + * unlimitedCompare("frog", "fog") = 1 + * unlimitedCompare("fly", "ant") = 3 + * unlimitedCompare("elephant", "hippo") = 7 + * unlimitedCompare("hippo", "elephant") = 7 + * unlimitedCompare("hippo", "zzzzzzzz") = 8 + * unlimitedCompare("hello", "hallo") = 1 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, + we maintain two single-dimensional arrays of length s.length() + 1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + if (n == 0) { + return new LevenshteinResults(m, m, 0, 0); + } else if (m == 0) { + return new LevenshteinResults(n, 0, n, 0); + } + boolean swapped = false; + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + swapped = true; + } + + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; //placeholder to assist in swapping p and d + final int[][] matrix = new int[m + 1][n + 1]; + + // filling the first row and first column values in the matrix + for (int index = 0; index <= n; index++) { + matrix[0][index] = index; + } + for (int index = 0; index <= m; index++) { + matrix[index][0] = index; + } + + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right + + char rightJ; // jth character of right + + int cost; // cost + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + rightJ = right.charAt(j - 1); + d[0] = j; + + for (i = 1; i <= n; i++) { + cost = left.charAt(i - 1) == rightJ ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + //filling the matrix + matrix[j][i] = d[i]; + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + return findDetailedResults(left, right, matrix, swapped); + } + + /** + * Finds count for each of the three [insert, delete, substitute] operations + * needed. This is based on the matrix formed based on the two character + * sequence. + * + * @param left character sequence which need to be converted from + * @param right character sequence which need to be converted to + * @param matrix two dimensional array containing + * @param swapped tells whether the value for left character sequence and right + * character sequence were swapped to save memory + * @return result object containing the count of insert, delete and substitute and total count needed + */ + private static LevenshteinResults findDetailedResults(final CharSequence left, + final CharSequence right, + final int[][] matrix, + final boolean swapped) { + + int delCount = 0; + int addCount = 0; + int subCount = 0; + + int rowIndex = right.length(); + int columnIndex = left.length(); + + int dataAtLeft = 0; + int dataAtTop = 0; + int dataAtDiagonal = 0; + int data = 0; + boolean deleted = false; + boolean added = false; + + while (rowIndex >= 0 && columnIndex >= 0) { + + if (columnIndex == 0) { + dataAtLeft = -1; + } else { + dataAtLeft = matrix[rowIndex][columnIndex - 1]; + } + if (rowIndex == 0) { + dataAtTop = -1; + } else { + dataAtTop = matrix[rowIndex - 1][columnIndex]; + } + if (rowIndex > 0 && columnIndex > 0) { + dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; + } else { + dataAtDiagonal = -1; + } + if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { + break; + } + data = matrix[rowIndex][columnIndex]; + + // case in which the character at left and right are the same, + // in this case none of the counters will be incremented. + if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) { + columnIndex--; + rowIndex--; + continue; + } + + // handling insert and delete cases. + deleted = false; + added = false; + if (data - 1 == dataAtLeft && (data <= dataAtDiagonal && data <= dataAtTop) + || (dataAtDiagonal == -1 && dataAtTop == -1)) { // NOPMD + columnIndex--; + if (swapped) { + addCount++; + added = true; + } else { + delCount++; + deleted = true; + } + } else if (data - 1 == dataAtTop && (data <= dataAtDiagonal && data <= dataAtLeft) + || (dataAtDiagonal == -1 && dataAtLeft == -1)) { // NOPMD + rowIndex--; + if (swapped) { + delCount++; + deleted = true; + } else { + addCount++; + added = true; + } + } + + // substituted case + if (!added && !deleted) { + subCount++; + columnIndex--; + rowIndex--; + } + } + return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); + } +}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java new file mode 100644 index 0000000..f3ef298 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Arrays; + +/** + * An algorithm for measuring the difference between two character sequences. + * + * <p> + * This is the number of changes needed to change one sequence into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * </p> + * + * <p> + * This code has been adapted from Apache Commons Lang 3.3. + * </p> + * + * @since 1.0 + */ +public class LevenshteinDistance implements EditDistance<Integer> { + + /** + * Default instance. + */ + private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance(); + + /** + * Threshold. + */ + private final Integer threshold; + + /** + * <p> + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. + * </p> + * + * @see LevenshteinDistance#getDefaultInstance() + */ + public LevenshteinDistance() { + this(null); + } + + /** + * <p> + * If the threshold is not null, distance calculations will be limited to a maximum length. + * If the threshold is null, the unlimited version of the algorithm will be used. + * </p> + * + * @param threshold + * If this is null then distances calculations will not be limited. + * This may not be negative. + */ + public LevenshteinDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * distance.apply(null, *) = IllegalArgumentException + * distance.apply(*, null) = IllegalArgumentException + * distance.apply("","") = 0 + * distance.apply("","a") = 1 + * distance.apply("aaapppp", "") = 7 + * distance.apply("frog", "fog") = 1 + * distance.apply("fly", "ant") = 3 + * distance.apply("elephant", "hippo") = 7 + * distance.apply("hippo", "elephant") = 7 + * distance.apply("hippo", "zzzzzzzz") = 8 + * distance.apply("hello", "hallo") = 1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public Integer apply(final CharSequence left, final CharSequence right) { + if (threshold != null) { + return limitedCompare(left, right, threshold); + } + return unlimitedCompare(left, right); + } + + /** + * Gets the default instance. + * + * @return the default instace + */ + public static LevenshteinDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + /** + * Gets the distance threshold. + * + * @return the distance threshold + */ + public Integer getThreshold() { + return threshold; + } + + /** + * Find the Levenshtein distance between two CharSequences if it's less than or + * equal to a given threshold. + * + * <p> + * This implementation follows from Algorithms on Strings, Trees and + * Sequences by Dan Gusfield and Chas Emerick's implementation of the + * Levenshtein distance algorithm from <a + * href="http://www.merriampark.com/ld.htm" + * >http://www.merriampark.com/ld.htm</a> + * </p> + * + * <pre> + * limitedCompare(null, *, *) = IllegalArgumentException + * limitedCompare(*, null, *) = IllegalArgumentException + * limitedCompare(*, *, -1) = IllegalArgumentException + * limitedCompare("","", 0) = 0 + * limitedCompare("aaapppp", "", 8) = 7 + * limitedCompare("aaapppp", "", 7) = 7 + * limitedCompare("aaapppp", "", 6)) = -1 + * limitedCompare("elephant", "hippo", 7) = 7 + * limitedCompare("elephant", "hippo", 6) = -1 + * limitedCompare("hippo", "elephant", 7) = 7 + * limitedCompare("hippo", "elephant", 6) = -1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @param threshold the target threshold, must not be negative + * @return result distance, or -1 + */ + private static int limitedCompare(CharSequence left, CharSequence right, final int threshold) { // NOPMD + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + + /* + * This implementation only computes the distance if it's less than or + * equal to the threshold value, returning -1 if it's greater. The + * advantage is performance: unbounded distance is O(nm), but a bound of + * k allows us to reduce it to O(km) time by only computing a diagonal + * stripe of width 2k + 1 of the cost table. It is also possible to use + * this to compute the unbounded Levenshtein distance by starting the + * threshold at 1 and doubling each time until the distance is found; + * this is O(dm), where d is the distance. + * + * One subtlety comes from needing to ignore entries on the border of + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry + * to the left of the leftmost member We must ignore the entry above the + * rightmost member + * + * Another subtlety comes from our stripe running off the matrix if the + * strings aren't of the same size. Since string s is always swapped to + * be the shorter of the two, the stripe will always run off to the + * upper right instead of the lower left of the matrix. + * + * As a concrete example, suppose s is of length 5, t is of length 7, + * and our threshold is 1. In this case we're going to walk a stripe of + * length 3. The matrix would look like so: + * + * <pre> + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + * </pre> + * + * Note how the stripe leads off the table as there is no possible way + * to turn a string of length 5 into one of length 7 in edit distance of + * 1. + * + * Additionally, this implementation decreases memory usage by using two + * single-dimensional arrays and swapping them back and forth instead of + * allocating an entire n by m matrix. This requires a few minor + * changes, such as immediately returning when it's detected that the + * stripe has run off the matrix and initially filling the arrays with + * large values so that entries we don't compute are ignored. + * + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for + * some discussion. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? m : -1; + } else if (m == 0) { + return n <= threshold ? n : -1; + } + + if (n > m) { + // swap the two strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + } + + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; // placeholder to assist in swapping p and d + + // fill in starting table values + final int boundary = Math.min(n, threshold) + 1; + for (int i = 0; i < boundary; i++) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); + + // iterates through t + for (int j = 1; j <= m; j++) { + final char rightJ = right.charAt(j - 1); // jth character of right + d[0] = j; + + // compute stripe indices, constrain to array size + final int min = Math.max(1, j - threshold); + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( + n, j + threshold); + + // the stripe may lead off of the table if s and t are of different + // sizes + if (min > max) { + return -1; + } + + // ignore entry left of leftmost + if (min > 1) { + d[min - 1] = Integer.MAX_VALUE; + } + + // iterates through [min, max] in s + for (int i = min; i <= max; i++) { + if (left.charAt(i - 1) == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); + } + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + + // if p[n] is greater than the threshold, there's no guarantee on it + // being the correct + // distance + if (p[n] <= threshold) { + return p[n]; + } + return -1; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm"> + * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p> + * + * <p>This implementation only need one single-dimensional arrays of length s.length() + 1</p> + * + * <pre> + * unlimitedCompare(null, *) = IllegalArgumentException + * unlimitedCompare(*, null) = IllegalArgumentException + * unlimitedCompare("","") = 0 + * unlimitedCompare("","a") = 1 + * unlimitedCompare("aaapppp", "") = 7 + * unlimitedCompare("frog", "fog") = 1 + * unlimitedCompare("fly", "ant") = 3 + * unlimitedCompare("elephant", "hippo") = 7 + * unlimitedCompare("hippo", "elephant") = 7 + * unlimitedCompare("hippo", "zzzzzzzz") = 8 + * unlimitedCompare("hello", "hallo") = 1 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static int unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + This implementation use two variable to record the previous cost counts, + So this implementation use less memory than previous impl. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } + + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + } + + int[] p = new int[n + 1]; + + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right + int upperLeft; + int upper; + + char rightJ; // jth character of right + int cost; // cost + + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + upperLeft = p[0]; + rightJ = right.charAt(j - 1); + p[0] = j; + + for (i = 1; i <= n; i++) { + upper = p[i]; + cost = left.charAt(i - 1) == rightJ ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upperLeft + cost); + upperLeft = upper; + } + } + + return p[n]; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java new file mode 100644 index 0000000..e6c666f --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Objects; + +/** + * Container class to store Levenshtein distance between two character sequences. + * + * <p>Stores the count of insert, deletion and substitute operations needed to + * change one character sequence into another.</p> + * + * <p>This class is immutable.</p> + * + * @since 1.0 + */ +public class LevenshteinResults { + /** + * Edit distance. + */ + private final Integer distance; + /** + * Insert character count. + */ + private final Integer insertCount; + /** + * Delete character count. + */ + private final Integer deleteCount; + /** + * Substitute character count. + */ + private final Integer substituteCount; + + /** + * Create the results for a detailed Levenshtein distance. + * + * @param distance distance between two character sequences. + * @param insertCount insert character count + * @param deleteCount delete character count + * @param substituteCount substitute character count + */ + public LevenshteinResults(final Integer distance, final Integer insertCount, final Integer deleteCount, + final Integer substituteCount) { + this.distance = distance; + this.insertCount = insertCount; + this.deleteCount = deleteCount; + this.substituteCount = substituteCount; + } + + /** + * Get the distance between two character sequences. + * + * @return distance between two character sequence + */ + public Integer getDistance() { + return distance; + } + + /** + * Get the number of insertion needed to change one character sequence into another. + * + * @return insert character count + */ + public Integer getInsertCount() { + return insertCount; + } + + /** + * Get the number of character deletion needed to change one character sequence to other. + * + * @return delete character count + */ + public Integer getDeleteCount() { + return deleteCount; + } + + /** + * Get the number of character substitution needed to change one character sequence into another. + * + * @return substitute character count + */ + public Integer getSubstituteCount() { + return substituteCount; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final LevenshteinResults result = (LevenshteinResults) o; + return Objects.equals(distance, result.distance) && Objects.equals(insertCount, result.insertCount) + && Objects.equals(deleteCount, result.deleteCount) + && Objects.equals(substituteCount, result.substituteCount); + } + + @Override + public int hashCode() { + return Objects.hash(distance, insertCount, deleteCount, substituteCount); + } + + @Override + public String toString() { + return "Distance: " + distance + ", Insert: " + insertCount + ", Delete: " + deleteCount + ", Substitute: " + + substituteCount; + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java new file mode 100644 index 0000000..cc5bb45 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * A similarity algorithm indicating the length of the longest common subsequence between two strings. + * + * <p> + * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in + * common. Two strings that are entirely different, return a value of 0, and two strings that return a value + * of the commonly shared length implies that the strings are completely the same in value and position. + * <i>Note.</i> Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input + * <code>CharSequence</code>'s <code>left</code> and <code>right</code> respectively, the runtime of the + * algorithm is <i>O(m*n)</i>. + * </p> + * + * <p> + * This implementation is based on the Longest Commons Substring algorithm + * from <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> + * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. + * </p> + * + * <p>For further reading see:</p> + * + * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> + * + * @since 1.0 + */ +public class LongestCommonSubsequence implements SimilarityScore<Integer> { + + /** + * Calculates longestCommonSubsequence similarity score of two <code>CharSequence</code>'s passed as + * input. + * + * @param left first character sequence + * @param right second character sequence + * @return longestCommonSubsequenceLength + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Integer apply(final CharSequence left, final CharSequence right) { + // Quick return for invalid inputs + if (left == null || right == null) { + throw new IllegalArgumentException("Inputs must not be null"); + } + return logestCommonSubsequence(left, right).length(); + } + + /** + * + * Computes the longestCommonSubsequence between the two <code>CharSequence</code>'s passed as + * input. + * + * <p> + * Note, a substring and + * subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and + * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However, + * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a + * subsequence need not have adjacent characters. + * </p> + * + * <p> + * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that + * can be derived from another sequence by deleting some elements without changing the order of the remaining + * elements. + * </p> + * + * @param left first character sequence + * @param right second character sequence + * @return lcsLengthArray + * @throws IllegalArgumentException + * if either String input {@code null} + */ + public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { + // Quick return + if (left == null || right == null) { + throw new IllegalArgumentException("Inputs must not be null"); + } + StringBuilder longestCommonSubstringArray = new StringBuilder(Math.max(left.length(), right.length())); + int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, right); + int i = left.length() - 1; + int j = right.length() - 1; + int k = lcsLengthArray[left.length()][right.length()] - 1; + while (k >= 0) { + if (left.charAt(i) == right.charAt(j)) { + longestCommonSubstringArray.append(left.charAt(i)); + i = i - 1; + j = j - 1; + k = k - 1; + } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) { + i = i - 1; + } else { + j = j - 1; + } + } + return longestCommonSubstringArray.reverse().toString(); + } + + /** + * + * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the + * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being + * O(m*n), where m=left.length() and n=right.length(). + * + * @param left first character sequence + * @param right second character sequence + * @return lcsLengthArray + */ + public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { + int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; + for (int i = 0; i < left.length(); i++) { + for (int j = 0; j < right.length(); j++) { + if (i == 0) { + lcsLengthArray[i][j] = 0; + } + if (j == 0) { + lcsLengthArray[i][j] = 0; + } + if (left.charAt(i) == right.charAt(j)) { + lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; + } else { + lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); + } + } + } + return lcsLengthArray; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java new file mode 100644 index 0000000..a33f8b3 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequenceDistance.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * An edit distance algorithm based on the length of the longest common subsequence between two strings. + * + * <p> + * This code is directly based upon the implementation in {@link LongestCommonSubsequence}. + * </p> + * + * <p> + * For reference see: <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem"> + * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>. + * </p> + * + * <p>For further reading see:</p> + * + * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p> + * + * @since 1.0 + */ +public class LongestCommonSubsequenceDistance implements EditDistance<Integer> { + + /** + * Object for calculating the longest common subsequence that we can then normalize in apply. + */ + private final LongestCommonSubsequence longestCommonSubsequence = new LongestCommonSubsequence(); + + /** + * Calculates an edit distance between two <code>CharSequence</code>'s <code>left</code> and + * <code>right</code> as: <code>left.length() + right.length() - 2 * LCS(left, right)</code>, where + * <code>LCS</code> is given in {@link LongestCommonSubsequence#apply(CharSequence, CharSequence)}. + * + * @param left first character sequence + * @param right second character sequence + * @return distance + * @throws IllegalArgumentException + * if either String input {@code null} + */ + @Override + public Integer apply(final CharSequence left, final CharSequence right) { + // Quick return for invalid inputs + if (left == null || right == null) { + throw new IllegalArgumentException("Inputs must not be null"); + } + return left.length() + right.length() - 2 * longestCommonSubsequence.apply(left, right); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java new file mode 100644 index 0000000..1c9e268 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A simple word tokenizer that utilizes regex to find words. It applies a regex + * {@code}(\w)+{@code} over the input text to extract words from a given character + * sequence. + * + * @since 1.0 + */ +class RegexTokenizer implements Tokenizer<CharSequence> { + + /** + * {@inheritDoc} + * + * @throws IllegalArgumentException if the input text is blank + */ + @Override + public CharSequence[] tokenize(final CharSequence text) { + if (text == null || text.toString().trim().equals("")) { + throw new IllegalArgumentException("Invalid text"); + } + final Pattern pattern = Pattern.compile("(\\w)+"); + final Matcher matcher = pattern.matcher(text.toString()); + final List<String> tokens = new ArrayList<>(); + while (matcher.find()) { + tokens.add(matcher.group(0)); + } + return tokens.toArray(new String[0]); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java b/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java new file mode 100644 index 0000000..bca8d05 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/SimilarityScore.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * Interface for the concept of a string similarity score. + * + * <p> + * A string similarity score is intended to have <i>some</i> of the properties of a metric, yet + * allowing for exceptions, namely the Jaro-Winkler similarity score. + * </p> + * <p> + * We Define a SimilarityScore to be a function <code>d: [X * X] -> [0, INFINITY)</code> with the + * following properties: + * </p> + * <ul> + * <li><code>d(x,y) >= 0</code>, non-negativity or separation axiom</li> + * <li><code>d(x,y) == d(y,x)</code>, symmetry.</li> + * </ul> + * + * <p> + * Notice, these are two of the properties that contribute to d being a metric. + * </p> + * + * + * <p> + * Further, this intended to be BiFunction<CharSequence, CharSequence, R>. + * The <code>apply</code> method + * accepts a pair of {@link CharSequence} parameters + * and returns an <code>R</code> type similarity score. We have ommitted the explicit + * statement of extending BiFunction due to it only being implemented in Java 1.8, and we + * wish to maintain Java 1.7 compatibility. + * </p> + * + * @param <R> The type of similarity score unit used by this EditDistance. + * @since 1.0 + */ +public interface SimilarityScore<R> { + + /** + * Compares two CharSequences. + * + * @param left the first CharSequence + * @param right the second CharSequence + * @return the similarity score between two CharSequences + */ + R apply(CharSequence left, CharSequence right); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/SimilarityScoreFrom.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/SimilarityScoreFrom.java b/src/main/java/org/apache/commons/text/similarity/SimilarityScoreFrom.java new file mode 100644 index 0000000..b58ea4a --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/SimilarityScoreFrom.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * <p> + * This stores a {@link SimilarityScore} implementation and a {@link CharSequence} "left" string. + * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the + * comparison function for the pair of strings. + * </p> + * + * <p> + * The following is an example which finds the most similar string: + * </p> + * <pre> + * SimilarityScore<Integer> similarityScore = new LevenshteinDistance(); + * String target = "Apache"; + * SimilarityScoreFrom<Integer> similarityScoreFrom = + * new SimilarityScoreFrom<Integer>(similarityScore, target); + * String mostSimilar = null; + * Integer shortestDistance = null; + * + * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { + * Integer distance = similarityScoreFrom.apply(test); + * if (shortestDistance == null || distance < shortestDistance) { + * shortestDistance = distance; + * mostSimilar = test; + * } + * } + * + * System.out.println("The string most similar to \"" + target + "\" " + * + "is \"" + mostSimilar + "\" because " + * + "its distance is only " + shortestDistance + "."); + * </pre> + * + * @param <R> This is the type of similarity score used by the SimilarityScore function. + * @since 1.0 + */ +public class SimilarityScoreFrom<R> { + + /** + * Similarity score. + */ + private final SimilarityScore<R> similarityScore; + /** + * Left parameter used in distance function. + */ + private final CharSequence left; + + /** + * <p>This accepts the similarity score implementation and the "left" string.</p> + * + * @param similarityScore This may not be null. + * @param left This may be null here, + * but the SimilarityScore#compare(CharSequence left, CharSequence right) + * implementation may not accept nulls. + */ + public SimilarityScoreFrom(final SimilarityScore<R> similarityScore, final CharSequence left) { + if (similarityScore == null) { + throw new IllegalArgumentException("The edit distance may not be null."); + } + + this.similarityScore = similarityScore; + this.left = left; + } + + /** + * <p> + * This compares "left" field against the "right" parameter + * using the "similarity score" implementation. + * </p> + * + * @param right the second CharSequence + * @return the similarity score between two CharSequences + */ + public R apply(final CharSequence right) { + return similarityScore.apply(left, right); + } + + /** + * Gets the left parameter. + * + * @return the left parameter + */ + public CharSequence getLeft() { + return left; + } + + /** + * Gets the edit distance. + * + * @return the edit distance + */ + public SimilarityScore<R> getSimilarityScore() { + return similarityScore; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/Tokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java new file mode 100644 index 0000000..fa8fda4 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * A tokenizer. Can produce arrays of tokens from a given type. + * + * @param <T> given type + * @since 1.0 + */ +interface Tokenizer<T> { + + /** + * Returns an array of tokens. + * + * @param text input text + * @return array of tokens + */ + T[] tokenize(CharSequence text); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/similarity/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java new file mode 100644 index 0000000..703e427 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>Provides algorithms for string similarity.</p> + * + * <p>The algorithms that implement the EditDistance interface follow the same + * simple principle: the more similar (closer) strings are, lower is the distance. + * For example, the words house and hose are closer than house and trousers.</p> + * + * <p>The following algorithms are available at the moment:</p> + * + * <ul> + * <li>{@link org.apache.commons.text.similarity.CosineDistance Cosine Distance}</li> + * <li>{@link org.apache.commons.text.similarity.CosineSimilarity Cosine Similarity}</li> + * <li>{@link org.apache.commons.text.similarity.FuzzyScore Fuzzy Score}</li> + * <li>{@link org.apache.commons.text.similarity.HammingDistance Hamming Distance}</li> + * <li>{@link org.apache.commons.text.similarity.JaroWinklerDistance Jaro-Winkler Distance}</li> + * <li>{@link org.apache.commons.text.similarity.LevenshteinDistance Levenshtein Distance}</li> + * <li>{@link org.apache.commons.text.similarity.LongestCommonSubsequenceDistance + * Longest Commons Subsequence Distance}</li> + * </ul> + * + * <p>The {@link org.apache.commons.text.similarity.CosineDistance Cosine Distance} + * utilises a {@link org.apache.commons.text.similarity.RegexTokenizer regular expression tokenizer (\w+)}. + * And the {@link org.apache.commons.text.similarity.LevenshteinDistance Levenshtein Distance}'s + * behaviour can be changed to take into consideration a maximum throughput.</p> + * + * @since 1.0 + */ +package org.apache.commons.text.similarity; http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/translate/AggregateTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/AggregateTranslator.java b/src/main/java/org/apache/commons/text/translate/AggregateTranslator.java new file mode 100644 index 0000000..010ab2a --- /dev/null +++ b/src/main/java/org/apache/commons/text/translate/AggregateTranslator.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.translate; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; + +/** + * Executes a sequence of translators one after the other. Execution ends whenever + * the first translator consumes codepoints from the input. + * + * @since 1.0 + */ +public class AggregateTranslator extends CharSequenceTranslator { + + private final List<CharSequenceTranslator> translators = new ArrayList<>(); + + /** + * Specify the translators to be used at creation time. + * + * @param translators CharSequenceTranslator array to aggregate + */ + public AggregateTranslator(final CharSequenceTranslator... translators) { + if (translators != null) { + for (CharSequenceTranslator translator : translators) { + if (translator != null) { + this.translators.add(translator); + } + } + } + } + + /** + * The first translator to consume codepoints from the input is the 'winner'. + * Execution stops with the number of consumed codepoints being returned. + * {@inheritDoc} + */ + @Override + public int translate(final CharSequence input, final int index, final Writer out) throws IOException { + for (final CharSequenceTranslator translator : translators) { + final int consumed = translator.translate(input, index, out); + if(consumed != 0) { + return consumed; + } + } + return 0; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java b/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java new file mode 100644 index 0000000..baec844 --- /dev/null +++ b/src/main/java/org/apache/commons/text/translate/CharSequenceTranslator.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.translate; + +import java.io.IOException; +import java.io.StringWriter; +import java.io.Writer; +import java.util.Locale; + +/** + * An API for translating text. + * Its core use is to escape and unescape text. Because escaping and unescaping + * is completely contextual, the API does not present two separate signatures. + * + * @since 1.0 + */ +public abstract class CharSequenceTranslator { + + static final char[] HEX_DIGITS = new char[] {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'}; + + /** + * Translate a set of codepoints, represented by an int index into a CharSequence, + * into another set of codepoints. The number of codepoints consumed must be returned, + * and the only IOExceptions thrown must be from interacting with the Writer so that + * the top level API may reliably ignore StringWriter IOExceptions. + * + * @param input CharSequence that is being translated + * @param index int representing the current point of translation + * @param out Writer to translate the text to + * @return int count of codepoints consumed + * @throws IOException if and only if the Writer produces an IOException + */ + public abstract int translate(CharSequence input, int index, Writer out) throws IOException; + + /** + * Helper for non-Writer usage. + * @param input CharSequence to be translated + * @return String output of translation + */ + public final String translate(final CharSequence input) { + if (input == null) { + return null; + } + try { + final StringWriter writer = new StringWriter(input.length() * 2); + translate(input, writer); + return writer.toString(); + } catch (final IOException ioe) { + // this should never ever happen while writing to a StringWriter + throw new RuntimeException(ioe); + } + } + + /** + * Translate an input onto a Writer. This is intentionally final as its algorithm is + * tightly coupled with the abstract method of this class. + * + * @param input CharSequence that is being translated + * @param out Writer to translate the text to + * @throws IOException if and only if the Writer produces an IOException + */ + public final void translate(final CharSequence input, final Writer out) throws IOException { + if (out == null) { + throw new IllegalArgumentException("The Writer must not be null"); + } + if (input == null) { + return; + } + int pos = 0; + final int len = input.length(); + while (pos < len) { + final int consumed = translate(input, pos, out); + if (consumed == 0) { + // inlined implementation of Character.toChars(Character.codePointAt(input, pos)) + // avoids allocating temp char arrays and duplicate checks + final char c1 = input.charAt(pos); + out.write(c1); + pos++; + if (Character.isHighSurrogate(c1) && pos < len) { + final char c2 = input.charAt(pos); + if (Character.isLowSurrogate(c2)) { + out.write(c2); + pos++; + } + } + continue; + } + // contract with translators is that they have to understand codepoints + // and they just took care of a surrogate pair + for (int pt = 0; pt < consumed; pt++) { + pos += Character.charCount(Character.codePointAt(input, pos)); + } + } + } + + /** + * Helper method to create a merger of this translator with another set of + * translators. Useful in customizing the standard functionality. + * + * @param translators CharSequenceTranslator array of translators to merge with this one + * @return CharSequenceTranslator merging this translator with the others + */ + public final CharSequenceTranslator with(final CharSequenceTranslator... translators) { + final CharSequenceTranslator[] newArray = new CharSequenceTranslator[translators.length + 1]; + newArray[0] = this; + System.arraycopy(translators, 0, newArray, 1, translators.length); + return new AggregateTranslator(newArray); + } + + /** + * <p>Returns an upper case hexadecimal <code>String</code> for the given + * character.</p> + * + * @param codepoint The codepoint to convert. + * @return An upper case hexadecimal <code>String</code> + */ + public static String hex(final int codepoint) { + return Integer.toHexString(codepoint).toUpperCase(Locale.ENGLISH); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java b/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java new file mode 100644 index 0000000..3318261 --- /dev/null +++ b/src/main/java/org/apache/commons/text/translate/CodePointTranslator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.translate; + +import java.io.IOException; +import java.io.Writer; + +/** + * Helper subclass to CharSequenceTranslator to allow for translations that + * will replace up to one character at a time. + * + * @since 1.0 + */ +public abstract class CodePointTranslator extends CharSequenceTranslator { + + /** + * Implementation of translate that maps onto the abstract translate(int, Writer) method. + * {@inheritDoc} + */ + @Override + public final int translate(final CharSequence input, final int index, final Writer out) throws IOException { + final int codepoint = Character.codePointAt(input, index); + final boolean consumed = translate(codepoint, out); + return consumed ? 1 : 0; + } + + /** + * Translate the specified codepoint into another. + * + * @param codepoint int character input to translate + * @param out Writer to optionally push the translated output to + * @return boolean as to whether translation occurred or not + * @throws IOException if and only if the Writer produces an IOException + */ + public abstract boolean translate(int codepoint, Writer out) throws IOException; + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c7cf533d/src/main/java/org/apache/commons/text/translate/CsvTranslators.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/translate/CsvTranslators.java b/src/main/java/org/apache/commons/text/translate/CsvTranslators.java new file mode 100644 index 0000000..f11b9fe --- /dev/null +++ b/src/main/java/org/apache/commons/text/translate/CsvTranslators.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.translate; + +import org.apache.commons.lang3.CharUtils; +import org.apache.commons.lang3.StringUtils; + +import java.io.IOException; +import java.io.Writer; + +/** + * This class holds inner classes for escaping/unescaping Comma Separated Values. + */ +public class CsvTranslators { + + private static final char CSV_DELIMITER = ','; + private static final char CSV_QUOTE = '"'; + private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE); + private static final String CSV_ESCAPED_QUOTE_STR = CSV_QUOTE_STR + CSV_QUOTE_STR; + private static final char[] CSV_SEARCH_CHARS = + new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF}; + + private CsvTranslators() { } + + /** + * Translator for escaping Comma Separated Values. + */ + public static class CsvEscaper extends SinglePassTranslator { + + @Override + void translateWhole(final CharSequence input, final Writer out) throws IOException { + final String inputSting = input.toString(); + if (StringUtils.containsNone(inputSting, CSV_SEARCH_CHARS)) { + out.write(inputSting); + } else { + // input needs quoting + out.write(CSV_QUOTE); + out.write(StringUtils.replace(inputSting, CSV_QUOTE_STR, CSV_ESCAPED_QUOTE_STR)); + out.write(CSV_QUOTE); + } + } + } + + /** + * Translator for unescaping escaped Comma Separated Value entries. + */ + public static class CsvUnescaper extends SinglePassTranslator { + + @Override + void translateWhole(final CharSequence input, final Writer out) throws IOException { + // is input not quoted? + if (input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE) { + out.write(input.toString()); + return; + } + + // strip quotes + final String quoteless = input.subSequence(1, input.length() - 1).toString(); + + if (StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS)) { + // deal with escaped quotes; ie) "" + out.write(StringUtils.replace(quoteless, CSV_ESCAPED_QUOTE_STR, CSV_QUOTE_STR)); + } else { + out.write(input.toString()); + } + } + } +} \ No newline at end of file