Author: ggregory Date: Wed Nov 28 20:57:07 2012 New Revision: 1414916 URL: http://svn.apache.org/viewvc?rev=1414916&view=rev Log: <action dev="ggregory" type="add" issue="CODEC-161" due-to="crice">Add Match Rating Approach (MRA) phonetic algorithm encoder.</action>
Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java (with props) commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java (with props) Modified: commons/proper/codec/trunk/pom.xml commons/proper/codec/trunk/src/changes/changes.xml Modified: commons/proper/codec/trunk/pom.xml URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/pom.xml?rev=1414916&r1=1414915&r2=1414916&view=diff ============================================================================== --- commons/proper/codec/trunk/pom.xml (original) +++ commons/proper/codec/trunk/pom.xml Wed Nov 28 20:57:07 2012 @@ -190,6 +190,13 @@ limitations under the License. <role>Beider-Morse phonetic matching</role> </roles> </contributor> + <contributor> + <name>Colm Rice</name> + <email>colm_rice at hotmail dot com</email> + <roles> + <role>Submitted Match Rating Approach (MRA) phonetic encoder and tests [CODEC-161]</role> + </roles> + </contributor> </contributors> <!-- Codec should depend on very little --> <dependencies> Modified: commons/proper/codec/trunk/src/changes/changes.xml URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1414916&r1=1414915&r2=1414916&view=diff ============================================================================== --- commons/proper/codec/trunk/src/changes/changes.xml (original) +++ commons/proper/codec/trunk/src/changes/changes.xml Wed Nov 28 20:57:07 2012 @@ -48,6 +48,7 @@ The <action> type attribute can be add,u </release> --> <release version="1.8" date="TBA" description="Feature and fix release."> + <action dev="ggregory" type="add" issue="CODEC-161" due-to="crice">Add Match Rating Approach (MRA) phonetic algorithm encoder.</action> <action dev="ggregory" type="fix" issue="CODEC-163" due-to="leo141">ColognePhonetic encoder unneccessarily creates many char arrays on every loop run.</action> <action dev="sebb" type="fix" issue="CODEC-160">Base64.encodeBase64URLSafeString doesn't add padding characters at the end.</action> </release> Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java?rev=1414916&view=auto ============================================================================== --- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java (added) +++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java Wed Nov 28 20:57:07 2012 @@ -0,0 +1,421 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.codec.language; + +import java.util.Locale; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +/** + * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977. + * + * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> + * @since 1.8 + */ +public class MatchRatingApproachEncoder implements StringEncoder { + + private static final String SPACE = " "; + + private static final String EMPTY = ""; + + /** + * Constants used mainly for the min rating value. + */ + private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8, ELEVEN = 11, TWELVE = 12; + + /** + * The plain letter equivalent of the accented letters. + */ + private static final String PLAIN_ASCII = "AaEeIiOoUu" // grave + + "AaEeIiOoUuYy" // acute + + "AaEeIiOoUuYy" // circumflex + + "AaOoNn" // tilde + + "AaEeIiOoUuYy" // umlaut + + "Aa" // ring + + "Cc" // cedilla + + "OoUu"; // double acute + + /** + * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... + */ + private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" + "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" + "\u00C5\u00E5" + "\u00C7\u00E7" + + "\u0150\u0151\u0170\u0171"; + + /** + * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any + * spaces. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param name + * The name to be cleaned + * @return The cleaned name + */ + String cleanName(final String name) { + String upperName = name.toUpperCase(Locale.ENGLISH); + + String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; + for (String str : charsToTrim) { + upperName = upperName.replaceAll(str, EMPTY); + } + + upperName = removeAccents(upperName); + upperName = upperName.replaceAll("\\s+", EMPTY); + + return upperName; + } + + /** + * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy the requirements of the + * Encoder interface Throws an EncoderException if input object is not of type java.lang.String. + * + * @param pObject + * Object to encode + * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the + * String supplied. + * @throws EncoderException + * if the parameter supplied is not of type java.lang.String + */ + @Override + public final Object encode(final Object pObject) throws EncoderException { + if (!(pObject instanceof String)) { + throw new EncoderException("Parameter supplied to Match Rating Approach encoder is not of type java.lang.String"); + } + return encode((String) pObject); + } + + /** + * Encodes a String using the Match Rating Approach (MRA) algorithm. + * + * @param name + * String object to encode + * @return The MRA code corresponding to the String supplied + */ + @Override + public final String encode(String name) { + // Bulletproof for trivial input - NINO + if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) { + return EMPTY; + } + + // Preprocessing + name = cleanName(name); + + // BEGIN: Actual encoding part of the algorithm... + // 1. Delete all vowels unless the vowel begins the word + name = removeVowels(name); + + // 2. Remove second consonant from any double consonant + name = removeDoubleConsonants(name); + + // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters + name = getFirst3Last3(name); + + return name; + } + + /** + * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param name + * The string to get the substrings from + * @return Annexed first & last 3 letters of input word. + */ + String getFirst3Last3(final String name) { + int nameLength = name.length(); + + if (nameLength > SIX) { + String firstThree = name.substring(0, THREE); + String lastThree = name.substring(nameLength - THREE, nameLength); + return firstThree + lastThree; + } else { + return name; + } + } + + /** + * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the + * min rating. Values strictly from documentation. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param sumLength + * The length of 2 strings sent down + * @return The min rating value + */ + int getMinRating(final int sumLength) { + int minRating = 0; + + if (sumLength <= FOUR) { + minRating = FIVE; + } else if ((sumLength >= FIVE) && (sumLength <= SEVEN)) { + minRating = FOUR; + } else if ((sumLength >= EIGHT) && (sumLength <= ELEVEN)) { + minRating = THREE; + } else if (sumLength == TWELVE) { + minRating = TWO; + } else { + minRating = ONE; // docs said little here. + } + + return minRating; + } + + /** + * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the + * strings are cleaned in the same way as {@link #encode(String)}. + * + * @param name1 + * First of the 2 strings (names) to compare + * @param name2 + * Second of the 2 names to compare + * @return <code>true</code> if the encodings are identical <code>false</code> otherwise. + */ + public boolean isEncodeEquals(String name1, String name2) { + // Bulletproof for trivial input - NINO + if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) { + return false; + } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) { + return false; + } else if (name1.length() == 1 || name2.length() == 1) { + return false; + } else if (name1.equalsIgnoreCase(name2)) { + return true; + } + + // Preprocessing + name1 = cleanName(name1); + name2 = cleanName(name2); + + // Actual MRA Algorithm + + // 1. Remove vowels + name1 = removeVowels(name1); + name2 = removeVowels(name2); + + // 2. Remove double consonants + name1 = removeDoubleConsonants(name1); + name2 = removeDoubleConsonants(name2); + + // 3. Reduce down to 3 letters + name1 = getFirst3Last3(name1); + name2 = getFirst3Last3(name2); + + // 4. Check for length difference - if 3 or greater then no similarity + // comparison is done + if (Math.abs(name1.length() - name2.length()) >= THREE) { + return false; + } + + // 5. Obtain the minimum rating value by calculating the length sum of the + // encoded Strings and sending it down. + int sumLength = Math.abs(name1.length() + name2.length()); + int minRating = 0; + minRating = getMinRating(sumLength); + + // 6. Process the encoded Strings from left to right and remove any + // identical characters found from both Strings respectively. + int count = leftToRightThenRightToLeftProcessing(name1, name2); + + // 7. Each PNI item that has a similarity rating equal to or greater than + // the min is considered to be a good candidate match + return count >= minRating; + + } + + /** + * Determines if a letter is a vowel. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param letter + * The letter under investiagtion + * @return True if a vowel, else false + */ + boolean isVowel(String letter) { + return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") || letter.equalsIgnoreCase("I") || + letter.equalsIgnoreCase("U"); + } + + /** + * Processes the names from left to right (first) then right to left removing identical letters in same positions. + * Then subtracts the longer string that remains from 6 and returns this. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param name1 + * name2 + * @return + */ + int leftToRightThenRightToLeftProcessing(String name1, String name2) { + char[] name1Char = name1.toCharArray(); + char[] name2Char = name2.toCharArray(); + + int name1Size = name1.length() - 1; + int name2Size = name2.length() - 1; + + String name1LtRStart = EMPTY; + String name1LtREnd = EMPTY; + + String name2RtLStart = EMPTY; + String name2RtLEnd = EMPTY; + + for (int i = 0; i < name1Char.length; i++) { + if (i > name2Size) { + break; + } + + name1LtRStart = name1.substring(i, i + 1); + name1LtREnd = name1.substring(name1Size - i, (name1Size - i) + 1); + + name2RtLStart = name2.substring(i, i + 1); + name2RtLEnd = name2.substring(name2Size - i, (name2Size - i) + 1); + + // Left to right... + if (name1LtRStart.equals(name2RtLStart)) { + name1Char[i] = ' '; + name2Char[i] = ' '; + } + + // Right to left... + if (name1LtREnd.equals(name2RtLEnd)) { + name1Char[name1Size - i] = ' '; + name2Char[name2Size - i] = ' '; + } + } + + // Char arrays -> string & remove extraneous space + String strA = new String(name1Char).replaceAll("\\s+", EMPTY); + String strB = new String(name2Char).replaceAll("\\s+", EMPTY); + + // Final bit - subtract longest string from 6 and return this int value + if (strA.length() > strB.length()) { + return Math.abs(SIX - strA.length()); + } else { + return Math.abs(SIX - strB.length()); + } + } + + /** + * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved. + * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29 + * + * @param accentedWord + * The word that may have accents in it. + * @return De-accented word + */ + String removeAccents(final String accentedWord) { + if (accentedWord == null) { + return null; + } + + StringBuilder sb = new StringBuilder(); + int n = accentedWord.length(); + + for (int i = 0; i < n; i++) { + char c = accentedWord.charAt(i); + int pos = UNICODE.indexOf(c); + if (pos > -1) { + sb.append(PLAIN_ASCII.charAt(pos)); + } else { + sb.append(c); + } + } + + return sb.toString(); + } + + /** + * Replaces any double consonant pair with the single letter equivalent. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param name + * String to have double consonants removed + * @return Single consonant word + */ + String removeDoubleConsonants(String name) { + String[] dblCnstArray = new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", "TT", "VV", + "WW", "XX", "YY", "ZZ" }; + + String replacedName = name.toUpperCase(); + for (String dc : dblCnstArray) { + if (replacedName.contains(dc)) { + String singleLetter = dc.substring(0, 1); + replacedName = replacedName.replace(dc, singleLetter); + } + } + + return replacedName; + } + + /** + * Deletes all vowels unless the vowel begins the word. + * + * <h2>API Usage</h2> + * <p> + * Consider this method private, it is package protected for unit testing only. + * </p> + * + * @param name + * The name to have vowels removed + * @return De-voweled word + */ + String removeVowels(String name) { + // Extract first letter + String firstLetter = name.substring(0, 1); + + name = name.replaceAll("A", EMPTY); + name = name.replaceAll("E", EMPTY); + name = name.replaceAll("I", EMPTY); + name = name.replaceAll("O", EMPTY); + name = name.replaceAll("U", EMPTY); + + name = name.replaceAll("\\s{2,}\\b", SPACE); + + // return isVowel(firstLetter) ? (firstLetter + name) : name; + if (isVowel(firstLetter)) { + return (firstLetter + name); + } else { + return name; + } + } +} Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java ------------------------------------------------------------------------------ svn:keywords = Id Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java?rev=1414916&view=auto ============================================================================== --- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java (added) +++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java Wed Nov 28 20:57:07 2012 @@ -0,0 +1,426 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.codec.language; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.codec.StringEncoder; +import org.apache.commons.codec.StringEncoderAbstractTest; +import org.junit.Test; + +/** + * Series of tests for the Match Rating Approach algorithm. + * + * General naming nomeneclature for the test is of the form: + * GeneralMetadataOnTheTestArea_ActualTestValues_ExpectedResult + * + * An unusual value is indicated by the term "corner case" + */ +public class MatchRatingApproachEncoderTest extends StringEncoderAbstractTest { + + private MatchRatingApproachEncoder getMatchRatingApproachEncoder() { + return (MatchRatingApproachEncoder) this.getStringEncoder(); + } + + // ********** BEGIN REGION - TEST SUPPORT METHODS + + @Test + public final void testAccentRemoval_AllLower_SuccessfullyRemoved() { + assertEquals("aeiou", getMatchRatingApproachEncoder().removeAccents("áéÃóú")); + } + + @Test + public final void testAccentRemoval_WithSpaces_SuccessfullyRemovedAndSpacesInvariant() { + assertEquals("ae io u", getMatchRatingApproachEncoder().removeAccents("áé Ãó ú")); + } + + @Test + public final void testAccentRemoval_UpperandLower_SuccessfullyRemovedAndCaseInvariant() { + assertEquals("AeiOuu", getMatchRatingApproachEncoder().removeAccents("ÃeÃÃuu")); + } + + @Test + public final void testAccentRemoval_MixedWithUnusualChars_SuccessfullyRemovedAndUnusualcharactersInvariant() { + assertEquals("A-e'i.,o&u", getMatchRatingApproachEncoder().removeAccents("Ã-e'Ã.,ó&ú")); + } + + @Test + public final void testAccentRemoval_GerSpanFrenMix_SuccessfullyRemoved() { + assertEquals("aeouÃAEOUnNa", getMatchRatingApproachEncoder().removeAccents("äëöüÃÃÃÃÃñÃà ")); + } + + @Test + public final void testAccentRemoval_ComprehensiveAccentMix_AllSuccessfullyRemoved() { + assertEquals("E,E,E,E,U,U,I,I,A,A,O,e,e,e,e,u,u,i,i,a,a,o,c", + getMatchRatingApproachEncoder().removeAccents("Ã,Ã,Ã,Ã,Ã,Ã,Ã,Ã,Ã,Ã,Ã,è,é,ê,ë,û,ù,ï,î,à ,â,ô,ç")); + } + + @Test + public final void testAccentRemovalNormalString_NoChange() { + assertEquals("Colorless green ideas sleep furiously", getMatchRatingApproachEncoder().removeAccents("Colorless green ideas sleep furiously")); + } + + @Test + public final void testAccentRemoval_NINO_NoChange() { + assertEquals("", getMatchRatingApproachEncoder().removeAccents("")); + } + + @Test + public final void testRemoveSingleDoubleConsonants_BUBLE_RemovedSuccessfully() { + assertEquals("BUBLE", getMatchRatingApproachEncoder().removeDoubleConsonants("BUBBLE")); + } + + @Test + public final void testRemoveDoubleConsonants_MISSISSIPPI_RemovedSuccessfully() { + assertEquals("MISISIPI", getMatchRatingApproachEncoder().removeDoubleConsonants("MISSISSIPPI")); + } + + @Test + public final void testRemoveDoubleDoubleVowel_BEETLE_NotRemoved() { + assertEquals("BEETLE", getMatchRatingApproachEncoder().removeDoubleConsonants("BEETLE")); + } + + @Test + public final void testIsVowel_CapitalA_ReturnsTrue() { + assertEquals(true, getMatchRatingApproachEncoder().isVowel("A")); + } + + @Test + public final void testIsVowel_SmallD_ReturnsFalse() { + assertFalse(getMatchRatingApproachEncoder().isVowel("d")); + } + + @Test + public final void testRemoveVowel_ALESSANDRA_Returns_ALSSNDR() { + assertEquals("ALSSNDR", getMatchRatingApproachEncoder().removeVowels("ALESSANDRA")); + } + + @Test + public final void testRemoveVowel__AIDAN_Returns_ADN() { + assertEquals("ADN", getMatchRatingApproachEncoder().removeVowels("AIDAN")); + } + + @Test + public final void testRemoveVowel__DECLAN_Returns_DCLN() { + assertEquals("DCLN", getMatchRatingApproachEncoder().removeVowels("DECLAN")); + } + + @Test + public final void testGetFirstLast3__ALEXANDER_Returns_Aleder() { + assertEquals("Aleder", getMatchRatingApproachEncoder().getFirst3Last3("Alexzander")); + } + + @Test + public final void testGetFirstLast3_PETE_Returns_PETE() { + assertEquals("PETE", getMatchRatingApproachEncoder().getFirst3Last3("PETE")); + } + + @Test + public final void testleftTorightThenRightToLeft_ALEXANDER_ALEXANDRA_Returns4() { + assertEquals(4, getMatchRatingApproachEncoder().leftToRightThenRightToLeftProcessing("ALEXANDER", "ALEXANDRA")); + } + + @Test + public final void testleftTorightThenRightToLeft_EINSTEIN_MICHAELA_Returns0() { + assertEquals(0, getMatchRatingApproachEncoder().leftToRightThenRightToLeftProcessing("EINSTEIN", "MICHAELA")); + } + + @Test + public final void testGetMinRating_7_Return4_Successfully() { + assertEquals(4, getMatchRatingApproachEncoder().getMinRating(7)); + } + + @Test + public final void testGetMinRating_2_Returns5_Successfully() { + assertEquals(5, getMatchRatingApproachEncoder().getMinRating(2)); + } + + @Test + public final void testGetMinRating_2_Return1_Successfully() { + assertEquals(1, getMatchRatingApproachEncoder().getMinRating(13)); + } + + @Test + public final void testcleanName_SuccessfullyClean() { + assertEquals("THISISATEST", getMatchRatingApproachEncoder().cleanName("This-Ãs a t.,es &t")); + } + + // ***** END REGION - TEST SUPPORT METHODS + + // ***** BEGIN REGION - TEST GET MRA ENCODING + + @Test + public final void testGetEncoding_HARPER_HRPR() { + assertEquals("HRPR", getMatchRatingApproachEncoder().encode("HARPER")); + } + + @Test + public final void testGetEncoding_SMITH_to_SMTH() { + assertEquals("SMTH", getMatchRatingApproachEncoder().encode("Smith")); + } + + @Test + public final void testGetEncoding_SMYTH_to_SMYTH() { + assertEquals("SMYTH", getMatchRatingApproachEncoder().encode("Smyth")); + } + + @Test + public final void testGetEncoding_Space_to_Nothing() { + assertEquals("", getMatchRatingApproachEncoder().encode(" ")); + } + + @Test + public final void testGetEncoding_NoSpace_to_Nothing() { + assertEquals("", getMatchRatingApproachEncoder().encode("")); + } + + @Test + public final void testGetEncoding_Null_to_Nothing() { + assertEquals("", getMatchRatingApproachEncoder().encode(null)); + } + + @Test + public final void testGetEncoding_One_Letter_to_Nothing() { + assertEquals("", getMatchRatingApproachEncoder().encode("E")); + } + + // ***** END REGION - TEST GET MRA ENCODING + + // ***** BEGIN REGION - TEST GET MRA COMPARISONS + + @Test + public final void testCompare_SMITH_SMYTH_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("smith", "smyth")); + } + + @Test + public final void testCompare_BURNS_BOURNE_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Burns", "Bourne")); + } + + @Test + public final void testCompare_ShortNames_AL_ED_WorksButNoMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Al", "Ed")); + } + + @Test + public final void testCompare_CATHERINE_KATHRYN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Catherine", "Kathryn")); + } + + @Test + public final void testCompare_BRIAN_BRYAN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Brian", "Bryan")); + } + + @Test + public final void testCompare_SEAN_SHAUN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Séan", "Shaun")); + } + + @Test + public final void testCompare_COLM_COLIN_WithAccentsAndSymbolsAndSpaces_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Cólm. ", "C-olÃn")); + } + + @Test + public final void testCompare_STEPHEN_STEVEN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Stephen", "Steven")); + } + + @Test + public final void testCompare_STEVEN_STEFAN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Steven", "Stefan")); + } + + @Test + public final void testCompare_STEPHEN_STEFAN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Stephen", "Stefan")); + } + + @Test + public final void testCompare_SAM_SAMUEL_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sam", "Samuel")); + } + + @Test + public final void testCompare_MICKY_MICHAEL_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Micky", "Michael")); + } + + @Test + public final void testCompare_OONA_OONAGH_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Oona", "Oonagh")); + } + + @Test + public final void testCompare_SOPHIE_SOFIA_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sophie", "Sofia")); + } + + @Test + public final void testCompare_FRANCISZEK_FRANCES_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Franciszek", "Frances")); + } + + @Test + public final void testCompare_TOMASZ_TOM_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Tomasz", "tom")); + } + + @Test + public final void testCompare_SmallInput_CARK_Kl_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Kl", "Karl")); + } + + @Test + public final void testCompareNameToSingleLetter_KARL_C_DoesNotMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Karl", "C")); + } + + @Test + public final void testCompare_ZACH_ZAKARIA_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Zach", "Zacharia")); + } + + @Test + public final void testCompare_KARL_ALESSANDRO_DoesNotMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Karl", "Alessandro")); + } + + @Test + public final void testCompare_Forenames_UNA_OONAGH_ShouldSuccessfullyMatchButDoesNot() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Ãna", "Oonagh")); // Disappointing + } + + // ***** Begin Region - Test Get Encoding - Surnames + + @Test + public final void testCompare_Surname_OSULLIVAN_OSUILLEABHAIN_SuccessfulMatch() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("O'Sullivan", "à ' Súilleabháin")); + } + + @Test + public final void testCompare_LongSurnames_MORIARTY_OMUIRCHEARTAIGH_DoesNotSuccessfulMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Moriarty", "OMuircheartaigh")); + } + + @Test + public final void testCompare_LongSurnames_OMUIRCHEARTAIGH_OMIREADHAIGH_SuccessfulMatch() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("o'muireadhaigh", "à 'Muircheartaigh ")); + } + + @Test + public final void testCompare_Surname_COOPERFLYNN_SUPERLYN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Cooper-Flynn", "Super-Lyn")); + } + + @Test + public final void testCompare_Surname_HAILEY_HALLEY_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Hailey", "Halley")); + } + + // **** BEGIN YIDDISH/SLAVIC SECTION **** + + @Test + public final void testCompare_Surname_AUERBACH_UHRBACH_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Auerbach", "Uhrbach")); + } + + @Test + public final void testCompare_Surname_MOSKOWITZ_MOSKOVITZ_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Moskowitz", "Moskovitz")); + } + + @Test + public final void testCompare_Surname_LIPSHITZ_LIPPSZYC_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("LIPSHITZ", "LIPPSZYC")); + } + + @Test + public final void testCompare_Surname_LEWINSKY_LEVINSKI_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("LEWINSKY", "LEVINSKI")); + } + + @Test + public final void testCompare_Surname_SZLAMAWICZ_SHLAMOVITZ_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("SZLAMAWICZ", "SHLAMOVITZ")); + } + + @Test + public final void testCompare_Surname_ROSOCHOWACIEC_ROSOKHOVATSETS_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("R o s o ch o w a c ie c", " R o s o k ho v a ts e ts")); + } + + @Test + public final void testCompare_Surname_PRZEMYSL_PSHEMESHIL_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals(" P rz e m y s l", " P sh e m e sh i l")); + } + + // **** END YIDDISH/SLAVIC SECTION **** + + @Test + public final void testCompare_PETERSON_PETERS_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Peterson", "Peters")); + } + + @Test + public final void testCompare_MCGOWAN_MCGEOGHEGAN_SuccessfullyMatched() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("McGowan", "Mc Geoghegan")); + } + + @Test + public final void testCompare_SurnamesCornerCase_MURPHY_Space_NoMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", " ")); + } + + @Test + public final void testCompare_SurnamesCornerCase_MURPHY_NoSpace_NoMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", "")); + } + + @Test + public final void testCompare_SurnameCornerCase_Nulls_NoMatch() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals(null, null)); + } + + @Test + public final void testCompare_Surnames_MURPHY_LYNCH_NoMatchExpected() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", "Lynch")); + } + + @Test + public final void testCompare_Forenames_SEAN_JOHN_MatchExpected() { + assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sean", "John")); + } + + @Test + public final void testCompare_Forenames_SEAN_PETE_NoMatchExpected() { + assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Sean", "Pete")); + } + + @Override + protected StringEncoder createStringEncoder() { + return new MatchRatingApproachEncoder(); + } + + // ***** END REGION - TEST GET MRA COMPARISONS + +} Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java ------------------------------------------------------------------------------ svn:keywords = Id