Author: ggregory Date: Tue Nov 4 02:18:12 2014 New Revision: 1636486 URL: http://svn.apache.org/r1636486 Log: [CODEC-192] Add Daitch–Mokotoff Soundex.
Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java (with props) commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt (with props) commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java (with props) Modified: commons/proper/codec/trunk/src/changes/changes.xml Modified: commons/proper/codec/trunk/src/changes/changes.xml URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1636486&r1=1636485&r2=1636486&view=diff ============================================================================== --- commons/proper/codec/trunk/src/changes/changes.xml (original) +++ commons/proper/codec/trunk/src/changes/changes.xml Tue Nov 4 02:18:12 2014 @@ -43,6 +43,7 @@ The <action> type attribute can be add,u </properties> <body> <release version="1.10" date="DD Mmmm 2014" description="Feature and fix release."> + <action dev="ggregory" type="add" issue="CODEC-192" due-to="Thomas Neidhart">Add Daitch–Mokotoff Soundex</action> <action dev="tn" type="fix" issue="CODEC-185" due-to="Sean Busbey">Added clarification to javadoc of Base64 concerning the use of the urlSafe parameter</action> <action dev="tn" type="fix" issue="CODEC-191" due-to="Igor Savin">Added clarification to the javadoc of Base[32|64]OutputStream that it is mandatory to call close()</action> <action dev="ggregory" type="fix" issue="CODEC-188" due-to="Hendrik Saly">Add support for HMAC Message Authentication Code (MAC) digests</action> Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java?rev=1636486&view=auto ============================================================================== --- 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java (added) +++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java Tue Nov 4 02:18:12 2014 @@ -0,0 +1,554 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.codec.language;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.apache.commons.codec.CharEncoding;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a Daitch-Mokotoff Soundex value.
 * <p>
 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding
 * greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in
 * spelling.
 * <p>
 * The main differences compared to the other soundex variants are:
 * <ul>
 * <li>coded names are 6 digits long
 * <li>the initial character of the name is coded
 * <li>rules to encode multi-character n-grams
 * <li>multiple possible encodings for the same name (branching)
 * </ul>
 * <p>
 * This implementation supports branching, depending on the used method:
 * <ul>
 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
 * </ul>
 * <p>
 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
 * rules can be customized by overriding the default rules contained in the resource file
 * {@code org/apache/commons/codec/language/dmrules.txt}.
 * <p>
 * This class is thread-safe: the rule and folding tables are populated once in the static initializer and only read
 * afterwards, and an instance holds no mutable state.
 *
 * @see Soundex
 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
 *
 * @version $Id$
 * @since 1.10
 */
public class DaitchMokotoffSoundex implements StringEncoder {

    /**
     * Inner class representing a branch during DM soundex encoding.
     * <p>
     * A branch accumulates the digits of one candidate code. Equality and hash code are based solely on the digits
     * accumulated so far, so branches that converge to the same code collapse when stored in a {@link Set}.
     */
    private static final class Branch {
        private final StringBuilder builder;
        /** Lazily computed value of {@code builder.toString()}; reset to {@code null} whenever the builder changes. */
        private String cachedString;
        /** The replacement passed to the most recent {@link #processNextReplacement(String, boolean)} call. */
        private String lastReplacement;

        private Branch() {
            builder = new StringBuilder();
            lastReplacement = null;
            cachedString = null;
        }

        /**
         * Creates a new branch, identical to this branch.
         *
         * @return a new, identical branch
         */
        public Branch createBranch() {
            final Branch branch = new Branch();
            branch.builder.append(toString());
            branch.lastReplacement = this.lastReplacement;
            return branch;
        }

        @Override
        public boolean equals(final Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof Branch)) {
                return false;
            }

            return toString().equals(((Branch) other).toString());
        }

        /**
         * Finish this branch by appending '0's until the maximum code length has been reached.
         */
        public void finish() {
            while (builder.length() < MAX_LENGTH) {
                builder.append('0');
                cachedString = null;
            }
        }

        @Override
        public int hashCode() {
            return toString().hashCode();
        }

        /**
         * Process the next replacement to be added to this branch.
         * <p>
         * The replacement is appended unless the previous replacement already ends with it (adjacent identical codes
         * are coded only once). The code is truncated to the maximum code length.
         *
         * @param replacement
         *            the next replacement to append
         * @param forceAppend
         *            indicates if the default processing shall be overridden, i.e. the replacement is appended even
         *            if it repeats the previous one
         */
        public void processNextReplacement(final String replacement, final boolean forceAppend) {
            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;

            if (append && builder.length() < MAX_LENGTH) {
                builder.append(replacement);
                // remove all characters after the maximum length
                if (builder.length() > MAX_LENGTH) {
                    builder.delete(MAX_LENGTH, builder.length());
                }
                cachedString = null;
            }

            // remembered even when nothing was appended, so an uncoded replacement separates equal codes
            lastReplacement = replacement;
        }

        @Override
        public String toString() {
            if (cachedString == null) {
                cachedString = builder.toString();
            }
            return cachedString;
        }
    }

    /**
     * Inner class for storing rules.
     * <p>
     * A rule maps a pattern to three sets of replacements: one used at the start of a word, one used when the
     * pattern is followed by a vowel, and one used in all other cases. Each set may contain several alternatives
     * (separated by '|' in the rule file), which leads to branching.
     */
    private static final class Rule {
        private final String pattern;
        private final String[] replacementAtStart;
        private final String[] replacementBeforeVowel;
        private final String[] replacementDefault;

        private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
                final String replacementDefault) {
            this.pattern = pattern;
            this.replacementAtStart = replacementAtStart.split("\\|");
            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
            this.replacementDefault = replacementDefault.split("\\|");
        }

        public int getPatternLength() {
            return pattern.length();
        }

        /**
         * Selects the replacements that apply to this rule in the given context.
         *
         * @param context
         *            the remaining input, starting with the text matched by this rule
         * @param atStart
         *            whether the match is at the start of the word
         * @return the alternative replacements for the match
         */
        public String[] getReplacements(final String context, final boolean atStart) {
            if (atStart) {
                return replacementAtStart;
            }

            final int nextIndex = getPatternLength();
            final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
            if (nextCharIsVowel) {
                return replacementBeforeVowel;
            }

            return replacementDefault;
        }

        private boolean isVowel(final char ch) {
            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
        }

        public boolean matches(final String context) {
            return context.startsWith(pattern);
        }

        @Override
        public String toString() {
            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
        }
    }

    // static identifiers used during parsing of the rule file

    private static final String COMMENT = "//";
    private static final String DOUBLE_QUOTE = "\"";
    private static final String MULTILINE_COMMENT_END = "*/";
    private static final String MULTILINE_COMMENT_START = "/*";

    /** The resource file containing the replacement and folding rules */
    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";

    /** The code length of a DM soundex value. */
    private static final int MAX_LENGTH = 6;

    /** Folding rules. */
    private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();

    /** Transformation rules indexed by the first character of their pattern. */
    private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();

    static {
        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
        if (rulesIS == null) {
            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
        }

        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
        try {
            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
        } finally {
            // closing the scanner also closes the underlying stream, even if parsing fails
            scanner.close();
        }

        // sort RULES by pattern length in descending order, so that the longest matching pattern is tried first
        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
            final List<Rule> ruleList = rule.getValue();
            Collections.sort(ruleList, new Comparator<Rule>() {
                @Override
                public int compare(final Rule rule1, final Rule rule2) {
                    // pattern lengths are small and non-negative, the subtraction cannot overflow
                    return rule2.getPatternLength() - rule1.getPatternLength();
                }
            });
        }
    }

    /**
     * Parses the rule file.
     * <p>
     * Lines containing '=' are folding statements ({@code x=y}, both sides a single character); all other non-empty
     * lines are rules consisting of four whitespace-separated, optionally quoted parts. Text after {@code //} and
     * block comments that start at the beginning of a line are ignored.
     *
     * @param scanner
     *            the scanner providing the lines of the rule file
     * @param location
     *            the name of the rule file, used in error messages only
     * @param ruleMapping
     *            receives the parsed rules, indexed by the first character of their pattern
     * @param asciiFoldings
     *            receives the parsed folding statements
     * @throws IllegalArgumentException
     *             if a folding statement is malformed or a rule does not consist of four parts
     * @throws IllegalStateException
     *             if a rule cannot be created from an otherwise well-formed line
     */
    private static void parseRules(final Scanner scanner, final String location,
            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
        int currentLine = 0;
        boolean inMultilineComment = false;

        while (scanner.hasNextLine()) {
            currentLine++;
            final String rawLine = scanner.nextLine();
            String line = rawLine;

            if (inMultilineComment) {
                if (line.endsWith(MULTILINE_COMMENT_END)) {
                    inMultilineComment = false;
                }
                continue;
            }

            if (line.startsWith(MULTILINE_COMMENT_START)) {
                // a block comment that is closed on the same line must not swallow the following lines
                final boolean closedOnSameLine =
                        line.length() >= MULTILINE_COMMENT_START.length() + MULTILINE_COMMENT_END.length()
                                && line.endsWith(MULTILINE_COMMENT_END);
                inMultilineComment = !closedOnSameLine;
                continue;
            }

            // discard comments
            final int cmtI = line.indexOf(COMMENT);
            if (cmtI >= 0) {
                line = line.substring(0, cmtI);
            }

            // trim leading-trailing whitespace
            line = line.trim();

            if (line.length() == 0) {
                continue; // empty lines can be safely skipped
            }

            if (line.contains("=")) {
                // folding
                final String[] parts = line.split("=");
                if (parts.length != 2) {
                    throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
                            " parts: " + rawLine + " in " + location);
                }

                final String leftCharacter = parts[0];
                final String rightCharacter = parts[1];

                if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
                    throw new IllegalArgumentException("Malformed folding statement - " +
                            "patterns are not single characters: " + rawLine + " in " + location);
                }

                asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
            } else {
                // rule
                final String[] parts = line.split("\\s+");
                if (parts.length != 4) {
                    throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                            " parts: " + rawLine + " in " + location);
                }

                try {
                    final String pattern = stripQuotes(parts[0]);
                    final String replacement1 = stripQuotes(parts[1]);
                    final String replacement2 = stripQuotes(parts[2]);
                    final String replacement3 = stripQuotes(parts[3]);

                    // rules are indexed by the first pattern character, so the pattern must not be empty
                    if (pattern.length() == 0) {
                        throw new IllegalArgumentException("Malformed rule statement - pattern is empty");
                    }

                    final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
                    final char patternKey = r.pattern.charAt(0);
                    List<Rule> rules = ruleMapping.get(patternKey);
                    if (rules == null) {
                        rules = new ArrayList<Rule>();
                        ruleMapping.put(patternKey, rules);
                    }
                    rules.add(r);
                } catch (final IllegalArgumentException e) {
                    throw new IllegalStateException(
                            "Problem parsing line '" + currentLine + "' in " + location, e);
                }
            }
        }
    }

    /**
     * Removes one leading and one trailing double quote from the given string, if present.
     *
     * @param str
     *            the string to strip
     * @return the string without its enclosing double quotes
     */
    private static String stripQuotes(String str) {
        if (str.startsWith(DOUBLE_QUOTE)) {
            str = str.substring(1);
        }

        if (str.endsWith(DOUBLE_QUOTE)) {
            str = str.substring(0, str.length() - 1);
        }

        return str;
    }

    /** Whether to use ascii folding prior to encoding. */
    private final boolean folding;

    /**
     * Creates a new instance with ascii-folding enabled.
     */
    public DaitchMokotoffSoundex() {
        this(true);
    }

    /**
     * Creates a new instance.
     * <p>
     * With ascii-folding enabled, certain accented characters will be transformed to equivalent ascii characters,
     * e.g. {@code è -> e}.
     *
     * @param folding
     *            if ascii-folding shall be performed before encoding
     */
    public DaitchMokotoffSoundex(final boolean folding) {
        this.folding = folding;
    }

    /**
     * Performs a cleanup of the input string before the actual soundex transformation.
     * <p>
     * Removes all whitespace characters, converts the input to lower case and performs ascii folding if enabled.
     *
     * @param input
     *            the input string to cleanup
     * @return a cleaned up string
     */
    private String cleanup(final String input) {
        final StringBuilder sb = new StringBuilder();
        for (char ch : input.toCharArray()) {
            if (Character.isWhitespace(ch)) {
                continue;
            }

            ch = Character.toLowerCase(ch);
            if (folding && FOLDINGS.containsKey(ch)) {
                ch = FOLDINGS.get(ch);
            }
            sb.append(ch);
        }
        return sb.toString();
    }

    /**
     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
     * <p>
     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
     * EncoderException if the supplied object is not of type java.lang.String.
     * <p>
     * Characters for which no rule exists are skipped.
     *
     * @see #soundex(String)
     *
     * @param obj
     *            Object to encode
     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
     *         supplied.
     * @throws EncoderException
     *             if the parameter supplied is not of type java.lang.String
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (!(obj instanceof String)) {
            throw new EncoderException(
                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
        }
        return encode((String) obj);
    }

    /**
     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
     * <p>
     * Characters for which no rule exists are skipped.
     *
     * @see #soundex(String)
     *
     * @param source
     *            A String object to encode
     * @return A DM Soundex code corresponding to the String supplied, or {@code null} if the supplied String is
     *         {@code null}
     */
    @Override
    public String encode(final String source) {
        if (source == null) {
            return null;
        }
        return soundex(source, false)[0];
    }

    /**
     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
     * <p>
     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
     * separated by '|'.
     * <p>
     * Example: the name "AUERBACH" is encoded as both
     * <ul>
     * <li>097400</li>
     * <li>097500</li>
     * </ul>
     * Thus the result will be "097400|097500".
     * <p>
     * Characters for which no rule exists are skipped.
     *
     * @param source
     *            A String object to encode
     * @return A string containing a set of DM Soundex codes corresponding to the String supplied, or {@code null}
     *         if the supplied String is {@code null}
     */
    public String soundex(final String source) {
        // be consistent with encode(String): a null input yields null instead of a NullPointerException
        if (source == null) {
            return null;
        }

        final String[] branches = soundex(source, true);
        final StringBuilder sb = new StringBuilder();
        int index = 0;
        for (final String branch : branches) {
            sb.append(branch);
            if (++index < branches.length) {
                sb.append('|');
            }
        }
        return sb.toString();
    }

    /**
     * Perform the actual DM soundex algorithm on the input string.
     *
     * @param source
     *            A String object to encode
     * @param branching
     *            If branching shall be performed
     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
     *         selected branching mode, or {@code null} if the supplied String is {@code null}
     */
    private String[] soundex(final String source, final boolean branching) {
        if (source == null) {
            return null;
        }

        final String input = cleanup(source);

        // a set, so that branches which converge to the same code are kept only once (in insertion order)
        final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
        currentBranches.add(new Branch());

        // first character of the previously matched pattern; '\0' as long as nothing has been matched yet
        char lastChar = '\0';
        for (int index = 0; index < input.length(); index++) {
            final char ch = input.charAt(index);

            // ignore whitespace inside a name
            if (Character.isWhitespace(ch)) {
                continue;
            }

            final String inputContext = input.substring(index);
            final List<Rule> rules = RULES.get(ch);
            if (rules == null) {
                // no rule starts with this character: skip it
                continue;
            }

            // only needed (and only modified) when branching is enabled
            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.<Branch>emptyList();

            // special rule: occurrences of mn or nm are treated differently
            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

            // rules are sorted by descending pattern length, so the first match is the longest one
            for (final Rule rule : rules) {
                if (rule.matches(inputContext)) {
                    if (branching) {
                        nextBranches.clear();
                    }
                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
                    final boolean branchingRequired = replacements.length > 1 && branching;

                    for (final Branch branch : currentBranches) {
                        for (final String nextReplacement : replacements) {
                            // if we have multiple replacements, always create a new branch
                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

                            nextBranch.processNextReplacement(nextReplacement, force);

                            if (branching) {
                                nextBranches.add(nextBranch);
                            } else {
                                // without branching only the first replacement is used
                                break;
                            }
                        }
                    }

                    if (branching) {
                        currentBranches.clear();
                        currentBranches.addAll(nextBranches);
                    }
                    // skip the remaining characters of the matched pattern
                    index += rule.getPatternLength() - 1;
                    break;
                }
            }

            lastChar = ch;
        }

        final String[] result = new String[currentBranches.size()];
        int index = 0;
        for (final Branch branch : currentBranches) {
            branch.finish();
            result[index++] = branch.toString();
        }

        return result;
    }
}
Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java ------------------------------------------------------------------------------ svn:keywords = Id Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt?rev=1636486&view=auto ============================================================================== --- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt (added) +++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt Tue Nov 4 02:18:12 2014 @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Format +// "pattern" "replacement at start of word" "replacement before a vowel" "replacement in other cases" + +// Vowels + +"a" "0" "" "" +"e" "0" "" "" +"i" "0" "" "" +"o" "0" "" "" +"u" "0" "" "" + +// Consonants + +"b" "7" "7" "7" +"d" "3" "3" "3" +"f" "7" "7" "7" +"g" "5" "5" "5" +"h" "5" "5" "" +"k" "5" "5" "5" +"l" "8" "8" "8" +"m" "6" "6" "6" +"n" "6" "6" "6" +"p" "7" "7" "7" +"q" "5" "5" "5" +"r" "9" "9" "9" +"s" "4" "4" "4" +"t" "3" "3" "3" +"v" "7" "7" "7" +"w" "7" "7" "7" +"x" "5" "54" "54" +"y" "1" "" "" +"z" "4" "4" "4" + +// Romanian t-cedilla and t-comma should be equivalent +"ţ" "3|4" "3|4" "3|4" +"ț" "3|4" "3|4" "3|4" + +// Polish characters (e-ogonek and a-ogonek): default case branch either not coded or 6 +"ę" "" "" "|6" +"ą" "" "" "|6" + +// Other terms + +"schtsch" "2" "4" "4" +"schtsh" "2" "4" "4" +"schtch" "2" "4" "4" +"shtch" "2" "4" "4" +"shtsh" "2" "4" "4" +"stsch" "2" "4" "4" +"ttsch" "4" "4" "4" +"zhdzh" "2" "4" "4" +"shch" "2" "4" "4" +"scht" "2" "43" "43" +"schd" "2" "43" "43" +"stch" "2" "4" "4" +"strz" "2" "4" "4" +"strs" "2" "4" "4" +"stsh" "2" "4" "4" +"szcz" "2" "4" "4" +"szcs" "2" "4" "4" +"ttch" "4" "4" "4" +"tsch" "4" "4" "4" +"ttsz" "4" "4" "4" +"zdzh" "2" "4" "4" +"zsch" "4" "4" "4" +"chs" "5" "54" "54" +"csz" "4" "4" "4" +"czs" "4" "4" "4" +"drz" "4" "4" "4" +"drs" "4" "4" "4" +"dsh" "4" "4" "4" +"dsz" "4" "4" "4" +"dzh" "4" "4" "4" 
+"dzs" "4" "4" "4" +"sch" "4" "4" "4" +"sht" "2" "43" "43" +"szt" "2" "43" "43" +"shd" "2" "43" "43" +"szd" "2" "43" "43" +"tch" "4" "4" "4" +"trz" "4" "4" "4" +"trs" "4" "4" "4" +"tsh" "4" "4" "4" +"tts" "4" "4" "4" +"ttz" "4" "4" "4" +"tzs" "4" "4" "4" +"tsz" "4" "4" "4" +"zdz" "2" "4" "4" +"zhd" "2" "43" "43" +"zsh" "4" "4" "4" +"ai" "0" "1" "" +"aj" "0" "1" "" +"ay" "0" "1" "" +"au" "0" "7" "" +"cz" "4" "4" "4" +"cs" "4" "4" "4" +"ds" "4" "4" "4" +"dz" "4" "4" "4" +"dt" "3" "3" "3" +"ei" "0" "1" "" +"ej" "0" "1" "" +"ey" "0" "1" "" +"eu" "1" "1" "" +"fb" "7" "7" "7" +"ia" "1" "" "" +"ie" "1" "" "" +"io" "1" "" "" +"iu" "1" "" "" +"ks" "5" "54" "54" +"kh" "5" "5" "5" +"mn" "66" "66" "66" +"nm" "66" "66" "66" +"oi" "0" "1" "" +"oj" "0" "1" "" +"oy" "0" "1" "" +"pf" "7" "7" "7" +"ph" "7" "7" "7" +"sh" "4" "4" "4" +"sc" "2" "4" "4" +"st" "2" "43" "43" +"sd" "2" "43" "43" +"sz" "4" "4" "4" +"th" "3" "3" "3" +"ts" "4" "4" "4" +"tc" "4" "4" "4" +"tz" "4" "4" "4" +"ui" "0" "1" "" +"uj" "0" "1" "" +"uy" "0" "1" "" +"ue" "0" "1" "" +"zd" "2" "43" "43" +"zh" "4" "4" "4" +"zs" "4" "4" "4" + +// Branching cases + +"c" "4|5" "4|5" "4|5" +"ch" "4|5" "4|5" "4|5" +"ck" "5|45" "5|45" "5|45" +"rs" "4|94" "4|94" "4|94" +"rz" "4|94" "4|94" "4|94" +"j" "1|4" "|4" "|4" + + +// ASCII foldings + +ß=s +à=a +á=a +â=a +ã=a +ä=a +å=a +æ=a +ç=c +è=e +é=e +ê=e +ë=e +ì=i +í=i +î=i +ï=i +ð=d +ñ=n +ò=o +ó=o +ô=o +õ=o +ö=o +ø=o +ù=u +ú=u +û=u +ý=y +ý=y +þ=b +ÿ=y +ć=c +ł=l +ś=s +ż=z +ź=z Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt ------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt ------------------------------------------------------------------------------ svn:keywords = Id Added: 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java?rev=1636486&view=auto ============================================================================== --- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java (added) +++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java Tue Nov 4 02:18:12 2014 @@ -0,0 +1,159 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.codec.language;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoderAbstractTest;
import org.junit.Assert;
import org.junit.Test;

/**
 * Tests {@link DaitchMokotoffSoundex}.
 * <p>
 * Keep this file in UTF-8 encoding: several tests use non-ASCII string literals.
 * </p>
 *
 * @since 1.10
 */
public class DaitchMokotoffSoundexTest extends StringEncoderAbstractTest<DaitchMokotoffSoundex> {

    @Override
    protected DaitchMokotoffSoundex createStringEncoder() {
        return new DaitchMokotoffSoundex();
    }

    /**
     * Accented characters must produce the same code as their ascii-folded counterparts.
     */
    @Test
    public void testAccentedCharacterFolding() {
        Assert.assertEquals("294795", this.getStringEncoder().soundex("Straßburg"));
        Assert.assertEquals("294795", this.getStringEncoder().soundex("Strasburg"));

        Assert.assertEquals("095600", this.getStringEncoder().soundex("Éregon"));
        Assert.assertEquals("095600", this.getStringEncoder().soundex("Eregon"));
    }

    @Test
    public void testAdjacentCodes() {
        // AKSSOL
        // A-KS-S-O-L
        // 0-54-4---8 -> wrong
        // 0-54-----8 -> correct
        Assert.assertEquals("054800", this.getStringEncoder().soundex("AKSSOL"));

        // GERSCHFELD
        // G-E-RS-CH-F-E-L-D
        // 5--4/94-5/4-7-8-3 -> wrong
        // 5--4/94-5/--7-8-3 -> correct
        Assert.assertEquals("547830|545783|594783|594578", this.getStringEncoder().soundex("GERSCHFELD"));
    }

    /**
     * Same names as the second half of {@link #testSoundexBasic()}, but encoded without branching: only the first
     * code is returned.
     */
    @Test
    public void testEncodeBasic() {
        Assert.assertEquals("097400", this.getStringEncoder().encode("AUERBACH"));
        Assert.assertEquals("097400", this.getStringEncoder().encode("OHRBACH"));
        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPSHITZ"));
        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPPSZYC"));
        Assert.assertEquals("876450", this.getStringEncoder().encode("LEWINSKY"));
        Assert.assertEquals("876450", this.getStringEncoder().encode("LEVINSKI"));
        Assert.assertEquals("486740", this.getStringEncoder().encode("SZLAMAWICZ"));
        Assert.assertEquals("486740", this.getStringEncoder().encode("SHLAMOVITZ"));
    }

    @Test
    public void testEncodeIgnoreApostrophes() throws EncoderException {
        this.checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
                "OBri'en", "OBrie'n", "OBrien'" });
    }

    /**
     * Test data from http://www.myatt.demon.co.uk/sxalg.htm
     *
     * @throws EncoderException
     *             if encoding one of the variations fails
     */
    @Test
    public void testEncodeIgnoreHyphens() throws EncoderException {
        this.checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
                "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" });
    }

    @Test
    public void testEncodeIgnoreTrimmable() {
        Assert.assertEquals("746536", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
        Assert.assertEquals("746536", this.getStringEncoder().encode("Washington"));
    }

    /**
     * Examples from http://www.jewishgen.org/infofiles/soundex.html
     */
    @Test
    public void testSoundexBasic() {
        Assert.assertEquals("583600", this.getStringEncoder().soundex("GOLDEN"));
        Assert.assertEquals("087930", this.getStringEncoder().soundex("Alpert"));
        Assert.assertEquals("791900", this.getStringEncoder().soundex("Breuer"));
        Assert.assertEquals("579000", this.getStringEncoder().soundex("Haber"));
        Assert.assertEquals("665600", this.getStringEncoder().soundex("Mannheim"));
        Assert.assertEquals("664000", this.getStringEncoder().soundex("Mintz"));
        Assert.assertEquals("370000", this.getStringEncoder().soundex("Topf"));
        Assert.assertEquals("586660", this.getStringEncoder().soundex("Kleinmann"));
        Assert.assertEquals("769600", this.getStringEncoder().soundex("Ben Aron"));

        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("AUERBACH"));
        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("OHRBACH"));
        Assert.assertEquals("874400", this.getStringEncoder().soundex("LIPSHITZ"));
        Assert.assertEquals("874400|874500", this.getStringEncoder().soundex("LIPPSZYC"));
        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEWINSKY"));
        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEVINSKI"));
        Assert.assertEquals("486740", this.getStringEncoder().soundex("SZLAMAWICZ"));
        Assert.assertEquals("486740", this.getStringEncoder().soundex("SHLAMOVITZ"));
    }

    /**
     * Examples from http://www.avotaynu.com/soundex.htm
     */
    @Test
    public void testSoundexBasic2() {
        Assert.assertEquals("467000|567000", this.getStringEncoder().soundex("Ceniow"));
        Assert.assertEquals("467000", this.getStringEncoder().soundex("Tsenyuv"));
        Assert.assertEquals("587400|587500", this.getStringEncoder().soundex("Holubica"));
        Assert.assertEquals("587400", this.getStringEncoder().soundex("Golubitsa"));
        Assert.assertEquals("746480|794648", this.getStringEncoder().soundex("Przemysl"));
        Assert.assertEquals("746480", this.getStringEncoder().soundex("Pshemeshil"));
        Assert.assertEquals("944744|944745|944754|944755|945744|945745|945754|945755",
                this.getStringEncoder().soundex("Rosochowaciec"));
        Assert.assertEquals("945744", this.getStringEncoder().soundex("Rosokhovatsets"));
    }

    /**
     * Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
     */
    @Test
    public void testSoundexBasic3() {
        Assert.assertEquals("734000|739400", this.getStringEncoder().soundex("Peters"));
        Assert.assertEquals("734600|739460", this.getStringEncoder().soundex("Peterson"));
        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskowitz"));
        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskovitz"));
        Assert.assertEquals("154600|145460|454600|445460", this.getStringEncoder().soundex("Jackson"));
        Assert.assertEquals("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464",
                this.getStringEncoder().soundex("Jackson-Jackson"));
    }

    /**
     * Romanian t-cedilla and t-comma must be encoded identically.
     */
    @Test
    public void testSpecialRomanianCharacters() {
        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("ţamas")); // t-cedilla
        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("țamas")); // t-comma
    }

}
Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
------------------------------------------------------------------------------ svn:eol-style = native Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java ------------------------------------------------------------------------------ svn:keywords = Id