svn commit: r1636486 - in /commons/proper/codec/trunk/src: changes/ main/java/org/apache/commons/codec/language/ main/resources/org/apache/commons/codec/language/ test/java/org/apache/commons/codec/language/

ggregory Mon, 03 Nov 2014 18:19:32 -0800

Author: ggregory
Date: Tue Nov  4 02:18:12 2014
New Revision: 1636486

URL: http://svn.apache.org/r1636486
Log:
[CODEC-192] Add Daitch–Mokotoff Soundex.


Added:
    
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
   (with props)
    
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
   (with props)
    
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
   (with props)
Modified:
    commons/proper/codec/trunk/src/changes/changes.xml

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1636486&r1=1636485&r2=1636486&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Tue Nov  4 02:18:12 2014
@@ -43,6 +43,7 @@ The <action> type attribute can be add,u
   </properties>
   <body>
     <release version="1.10" date="DD Mmmm 2014" description="Feature and fix 
release.">
+      <action dev="ggregory" type="add" issue="CODEC-192" due-to="Thomas 
Neidhart">Add Daitch–Mokotoff Soundex</action>   
       <action dev="tn" type="fix" issue="CODEC-185" due-to="Sean Busbey">Added 
clarification to javadoc of Base64 concerning the use of the urlSafe 
parameter</action>   
       <action dev="tn" type="fix" issue="CODEC-191" due-to="Igor Savin">Added 
clarification to the javadoc of Base[32|64]OutputStream that it is mandatory to 
call close()</action>   
       <action dev="ggregory" type="fix" issue="CODEC-188" due-to="Hendrik 
Saly">Add support for HMAC Message Authentication Code (MAC) digests</action>   

Added: 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java?rev=1636486&view=auto
==============================================================================
--- 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
 (added)
+++ 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
 Tue Nov  4 02:18:12 2014
@@ -0,0 +1,554 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+
+import org.apache.commons.codec.CharEncoding;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Daitch-Mokotoff Soundex value.
+ * <p>
+ * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater
+ * accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling.
+ * <p>
+ * The main differences compared to the other soundex variants are:
+ * <ul>
+ * <li>coded names are 6 digits long
+ * <li>the initial character of the name is coded
+ * <li>rules to encode multi-character n-grams
+ * <li>multiple possible encodings for the same name (branching)
+ * </ul>
+ * <p>
+ * This implementation supports branching, depending on the used method:
+ * <ul>
+ * <li>{@link #encode(String)} - branching disabled, only the first code will 
be returned
+ * <li>{@link #soundex(String)} - branching enabled, all codes will be 
returned, separated by '|'
+ * </ul>
+ * <p>
+ * Note: this implementation has additional branching rules compared to the 
original description of the algorithm. The
+ * rules can be customized by overriding the default rules contained in the 
resource file
+ * {@code org/apache/commons/codec/language/dmrules.txt}.
+ * <p>
+ * This class is thread-safe.
+ *
+ * @see Soundex
+ * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+ * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+ *
+ * @version $Id$
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundex implements StringEncoder {
+
+    /**
+     * Inner class representing a branch during DM soundex encoding.
+     */
+    private static final class Branch {
+        private final StringBuilder builder;
+        private String cachedString;
+        private String lastReplacement;
+
+        private Branch() {
+            builder = new StringBuilder();
+            lastReplacement = null;
+            cachedString = null;
+        }
+
+        /**
+         * Creates a new branch, identical to this branch.
+         *
+         * @return a new, identical branch
+         */
+        public Branch createBranch() {
+            final Branch branch = new Branch();
+            branch.builder.append(toString());
+            branch.lastReplacement = this.lastReplacement;
+            return branch;
+        }
+
+        @Override
+        public boolean equals(final Object other) {
+            if (this == other) {
+                return true;
+            }
+            if (!(other instanceof Branch)) {
+                return false;
+            }
+
+            return toString().equals(((Branch) other).toString());
+        }
+
+        /**
+         * Finish this branch by appending '0's until the maximum code length 
has been reached.
+         */
+        public void finish() {
+            while (builder.length() < MAX_LENGTH) {
+                builder.append('0');
+                cachedString = null;
+            }
+        }
+
+        @Override
+        public int hashCode() {
+            return toString().hashCode();
+        }
+
+        /**
+         * Process the next replacement to be added to this branch.
+         *
+         * @param replacement
+         *            the next replacement to append
+         * @param forceAppend
+         *            indicates if the default processing shall be overridden
+         */
+        public void processNextReplacement(final String replacement, final 
boolean forceAppend) {
+            final boolean append = lastReplacement == null || 
!lastReplacement.endsWith(replacement) || forceAppend;
+
+            if (append && builder.length() < MAX_LENGTH) {
+                builder.append(replacement);
+                // remove all characters after the maximum length
+                if (builder.length() > MAX_LENGTH) {
+                    builder.delete(MAX_LENGTH, builder.length());
+                }
+                cachedString = null;
+            }
+
+            lastReplacement = replacement;
+        }
+
+        @Override
+        public String toString() {
+            if (cachedString == null) {
+                cachedString = builder.toString();
+            }
+            return cachedString;
+        }
+    }
+
+    // static identifiers used during parsing of the rule file
+
+    /**
+     * Inner class for storing rules.
+     */
+    private static final class Rule {
+        private final String pattern;
+        private final String[] replacementAtStart;
+        private final String[] replacementBeforeVowel;
+        private final String[] replacementDefault;
+
+        protected Rule(final String pattern, final String replacementAtStart, 
final String replacementBeforeVowel,
+                final String replacementDefault) {
+            this.pattern = pattern;
+            this.replacementAtStart = replacementAtStart.split("\\|");
+            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
+            this.replacementDefault = replacementDefault.split("\\|");
+        }
+
+        public int getPatternLength() {
+            return pattern.length();
+        }
+
+        public String[] getReplacements(final String context, final boolean 
atStart) {
+            if (atStart) {
+                return replacementAtStart;
+            }
+
+            final int nextIndex = getPatternLength();
+            final boolean nextCharIsVowel = nextIndex < context.length() ? 
isVowel(context.charAt(nextIndex)) : false;
+            if (nextCharIsVowel) {
+                return replacementBeforeVowel;
+            }
+
+            return replacementDefault;
+        }
+
+        private boolean isVowel(final char ch) {
+            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 
'u';
+        }
+
+        public boolean matches(final String context) {
+            return context.startsWith(pattern);
+        }
+
+        @Override
+        public String toString() {
+            return String.format("%s=(%s,%s,%s)", pattern, 
Arrays.asList(replacementAtStart),
+                    Arrays.asList(replacementBeforeVowel), 
Arrays.asList(replacementDefault));
+        }
+    }
+
+    private static final String COMMENT = "//"; // starts a line comment in the rule file
+    private static final String DOUBLE_QUOTE = "\""; // optional delimiter stripped from rule tokens
+    /** Folding rules: maps a character to the character that replaces it during cleanup. */
+    private static final Map<Character, Character> FOLDINGS = new 
HashMap<Character, Character>();
+
+    /** The code length of a DM soundex value. */
+    private static final int MAX_LENGTH = 6;
+    private static final String MULTILINE_COMMENT_END = "*/"; // closes a block comment in the rule file
+
+    private static final String MULTILINE_COMMENT_START = "/*"; // opens a block comment in the rule file
+
+    /** The resource file containing the replacement and folding rules. */
+    private static final String RESOURCE_FILE = 
"org/apache/commons/codec/language/dmrules.txt";
+
+    /** Transformation rules indexed by the first character of their pattern. */
+    private static final Map<Character, List<Rule>> RULES = new 
HashMap<Character, List<Rule>>();
+
+    // Loads the rule resource once when the class is initialised and orders every rule bucket so that the
+    // longest pattern comes first; the encoder applies the first rule that matches.
+    static {
+        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
+        if (rulesIS == null) {
+            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
+        }
+
+        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
+        try {
+            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+        } finally {
+            // release the scanner (and with it the resource stream) even if the rule file is malformed
+            scanner.close();
+        }
+
+        // sort RULES by pattern length in descending order
+        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
+            final List<Rule> ruleList = rule.getValue();
+            Collections.sort(ruleList, new Comparator<Rule>() {
+                @Override
+                public int compare(final Rule rule1, final Rule rule2) {
+                    // pattern lengths are non-negative, so the subtraction cannot overflow
+                    return rule2.getPatternLength() - rule1.getPatternLength();
+                }
+            });
+        }
+    }
+
+    private static void parseRules(final Scanner scanner, final String 
location,
+            final Map<Character, List<Rule>> ruleMapping, final Map<Character, 
Character> asciiFoldings) {
+        int currentLine = 0;
+        boolean inMultilineComment = false;
+
+        while (scanner.hasNextLine()) {
+            currentLine++;
+            final String rawLine = scanner.nextLine();
+            String line = rawLine;
+
+            if (inMultilineComment) {
+                if (line.endsWith(MULTILINE_COMMENT_END)) {
+                    inMultilineComment = false;
+                }
+                continue;
+            }
+
+            if (line.startsWith(MULTILINE_COMMENT_START)) {
+                inMultilineComment = true;
+            } else {
+                // discard comments
+                final int cmtI = line.indexOf(COMMENT);
+                if (cmtI >= 0) {
+                    line = line.substring(0, cmtI);
+                }
+
+                // trim leading-trailing whitespace
+                line = line.trim();
+
+                if (line.length() == 0) {
+                    continue; // empty lines can be safely skipped
+                }
+
+                if (line.contains("=")) {
+                    // folding
+                    final String[] parts = line.split("=");
+                    if (parts.length != 2) {
+                        throw new IllegalArgumentException("Malformed folding 
statement split into " + parts.length +
+                                " parts: " + rawLine + " in " + location);
+                    } else {
+                        final String leftCharacter = parts[0];
+                        final String rightCharacter = parts[1];
+
+                        if (leftCharacter.length() != 1 || 
rightCharacter.length() != 1) {
+                            throw new IllegalArgumentException("Malformed 
folding statement - " +
+                                    "patterns are not single characters: " + 
rawLine + " in " + location);
+                        }
+
+                        asciiFoldings.put(leftCharacter.charAt(0), 
rightCharacter.charAt(0));
+                    }
+                } else {
+                    // rule
+                    final String[] parts = line.split("\\s+");
+                    if (parts.length != 4) {
+                        throw new IllegalArgumentException("Malformed rule 
statement split into " + parts.length +
+                                " parts: " + rawLine + " in " + location);
+                    } else {
+                        try {
+                            final String pattern = stripQuotes(parts[0]);
+                            final String replacement1 = stripQuotes(parts[1]);
+                            final String replacement2 = stripQuotes(parts[2]);
+                            final String replacement3 = stripQuotes(parts[3]);
+
+                            final Rule r = new Rule(pattern, replacement1, 
replacement2, replacement3);
+                            final char patternKey = r.pattern.charAt(0);
+                            List<Rule> rules = ruleMapping.get(patternKey);
+                            if (rules == null) {
+                                rules = new ArrayList<Rule>();
+                                ruleMapping.put(patternKey, rules);
+                            }
+                            rules.add(r);
+                        } catch (final IllegalArgumentException e) {
+                            throw new IllegalStateException(
+                                    "Problem parsing line '" + currentLine + 
"' in " + location, e);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    private static String stripQuotes(String str) {
+        if (str.startsWith(DOUBLE_QUOTE)) {
+            str = str.substring(1);
+        }
+
+        if (str.endsWith(DOUBLE_QUOTE)) {
+            str = str.substring(0, str.length() - 1);
+        }
+
+        return str;
+    }
+
+    /** Whether to use ascii folding prior to encoding. */
+    private final boolean folding;
+
+    /**
+     * Creates a new instance with ascii-folding enabled.
+     * <p>
+     * Equivalent to {@code new DaitchMokotoffSoundex(true)}.
+     */
+    public DaitchMokotoffSoundex() {
+        this(true);
+    }
+
+    /**
+     * Creates a new instance.
+     * <p>
+     * With ascii-folding enabled, certain accented characters will be transformed to equivalent ascii
+     * characters before encoding, e.g. è -&gt; e.
+     *
+     * @param folding
+     *            if ascii-folding shall be performed before encoding
+     */
+    public DaitchMokotoffSoundex(final boolean folding) {
+        this.folding = folding;
+    }
+
+    /**
+     * Normalises the input string before the actual soundex transformation.
+     * <p>
+     * Whitespace characters are dropped, every remaining character is converted to lower case and, if
+     * ascii folding is enabled, replaced by its folded counterpart when one is defined.
+     *
+     * @param input
+     *            the input string to cleanup
+     * @return a cleaned up string
+     */
+    private String cleanup(final String input) {
+        final int length = input.length();
+        final StringBuilder cleaned = new StringBuilder();
+        for (int i = 0; i < length; i++) {
+            final char raw = input.charAt(i);
+            if (!Character.isWhitespace(raw)) {
+                char normalized = Character.toLowerCase(raw);
+                if (folding && FOLDINGS.containsKey(normalized)) {
+                    normalized = FOLDINGS.get(normalized);
+                }
+                cleaned.append(normalized);
+            }
+        }
+        return cleaned.toString();
+    }
+
+    // -- BEGIN STATIC METHODS --//
+
+    /**
+     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+     * <p>
+     * This method exists to satisfy the Encoder interface. Anything that is not a java.lang.String,
+     * including {@code null}, is rejected with an EncoderException.
+     *
+     * @see #soundex(String)
+     *
+     * @param obj
+     *            Object to encode
+     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the
+     *         String supplied.
+     * @throws EncoderException
+     *             if the parameter supplied is not of type java.lang.String
+     */
+    @Override
+    public Object encode(final Object obj) throws EncoderException {
+        if (obj instanceof String) {
+            return encode((String) obj);
+        }
+        throw new EncoderException(
+                "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+    }
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
+     * <p>
+     * Only the first code is returned, even if the name could be encoded in several ways.
+     *
+     * @see #soundex(String)
+     *
+     * @param source
+     *            A String object to encode
+     * @return A DM Soundex code corresponding to the String supplied, or {@code null} if {@code source} is
+     *         {@code null}
+     */
+    @Override
+    public String encode(final String source) {
+        return source == null ? null : soundex(source, false)[0];
+    }
+
+    // -- BEGIN INNER CLASSES --//
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
+     * <p>
+     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+     * separated by '|'.
+     * <p>
+     * Example: the name "AUERBACH" is encoded as both
+     * <ul>
+     * <li>097400</li>
+     * <li>097500</li>
+     * </ul>
+     * Thus the result will be "097400|097500".
+     *
+     * @param source
+     *            A String object to encode
+     * @return A string containing a set of DM Soundex codes corresponding to the String supplied, or
+     *         {@code null} if {@code source} is {@code null}
+     */
+    public String soundex(final String source) {
+        if (source == null) {
+            // mirror encode(String): null in, null out instead of a NullPointerException
+            return null;
+        }
+
+        final String[] branches = soundex(source, true);
+        final StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < branches.length; i++) {
+            if (i > 0) {
+                sb.append('|');
+            }
+            sb.append(branches[i]);
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Perform the actual DM soundex algorithm on the input string.
+     * <p>
+     * The cleaned-up input is scanned from left to right. At every position the rules registered for the
+     * current character are tried in order and the first one whose pattern matches is applied; the scan then
+     * continues after the matched pattern. Characters without any rule are skipped. With branching enabled,
+     * a rule with several replacement alternatives forks every current branch once per alternative;
+     * otherwise only the first alternative is used.
+     *
+     * @param source
+     *            A String object to encode
+     * @param branching
+     *            If branching shall be performed
+     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
+     *         selected branching mode, or {@code null} if {@code source} is {@code null}
+     */
+    private String[] soundex(final String source, final boolean branching) {
+        if (source == null) {
+            return null;
+        }
+
+        final String input = cleanup(source);
+
+        final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
+        currentBranches.add(new Branch());
+
+        // '\0' signals that no character with rules has been seen yet, i.e. the next match is "at start"
+        char lastChar = '\0';
+        for (int index = 0; index < input.length(); index++) {
+            final char ch = input.charAt(index);
+
+            // ignore whitespace inside a name
+            if (Character.isWhitespace(ch)) {
+                continue;
+            }
+
+            final String inputContext = input.substring(index);
+            final List<Rule> rules = RULES.get(ch);
+            if (rules == null) {
+                // no rule starts with this character: skip it without touching lastChar
+                continue;
+            }
+
+            // the list is only modified when branching is enabled, so the shared immutable empty list is safe
+            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.<Branch>emptyList();
+
+            // special rule: occurrences of mn or nm are treated differently
+            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+
+            for (final Rule rule : rules) {
+                if (rule.matches(inputContext)) {
+                    if (branching) {
+                        nextBranches.clear();
+                    }
+                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
+                    final boolean branchingRequired = replacements.length > 1 && branching;
+
+                    for (final Branch branch : currentBranches) {
+                        for (final String nextReplacement : replacements) {
+                            // if we have multiple replacements, always create a new branch
+                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
+
+                            nextBranch.processNextReplacement(nextReplacement, force);
+
+                            if (branching) {
+                                nextBranches.add(nextBranch);
+                            } else {
+                                // without branching only the first alternative is used
+                                break;
+                            }
+                        }
+                    }
+
+                    if (branching) {
+                        // re-adding through the set drops branches that ended up with identical codes
+                        currentBranches.clear();
+                        currentBranches.addAll(nextBranches);
+                    }
+                    // skip the rest of the matched pattern; the loop increment moves past its last character
+                    index += rule.getPatternLength() - 1;
+                    break;
+                }
+            }
+
+            // note: this is the first character of the matched pattern
+            lastChar = ch;
+        }
+
+        final String[] result = new String[currentBranches.size()];
+        int index = 0;
+        for (final Branch branch : currentBranches) {
+            branch.finish();
+            result[index++] = branch.toString();
+        }
+
+        return result;
+    }
+}

Propchange: 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: 
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt?rev=1636486&view=auto
==============================================================================
--- 
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
 (added)
+++ 
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
 Tue Nov  4 02:18:12 2014
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Format
+// "pattern" "replacement at start of word" "replacement before a vowel" 
"replacement in other cases"
+
+// Vowels
+
+"a" "0" "" ""
+"e" "0" "" ""
+"i" "0" "" ""
+"o" "0" "" ""
+"u" "0" "" ""
+
+// Consonants
+
+"b" "7" "7" "7"
+"d" "3" "3" "3"
+"f" "7" "7" "7"
+"g" "5" "5" "5"
+"h" "5" "5" ""
+"k" "5" "5" "5"
+"l" "8" "8" "8"
+"m" "6" "6" "6"
+"n" "6" "6" "6"
+"p" "7" "7" "7"
+"q" "5" "5" "5"
+"r" "9" "9" "9"
+"s" "4" "4" "4"
+"t" "3" "3" "3"
+"v" "7" "7" "7"
+"w" "7" "7" "7"
+"x" "5" "54" "54"
+"y" "1" "" ""
+"z" "4" "4" "4"
+
+// Romanian t-cedilla and t-comma should be equivalent
+"ţ" "3|4" "3|4" "3|4"
+"ț" "3|4" "3|4" "3|4"
+
+// Polish characters (e-ogonek and a-ogonek): default case branch either not 
coded or 6
+"ę" "" "" "|6"
+"ą" "" "" "|6"
+
+// Other terms
+
+"schtsch" "2" "4" "4"
+"schtsh" "2" "4" "4"
+"schtch" "2" "4" "4"
+"shtch" "2" "4" "4"
+"shtsh" "2" "4" "4"
+"stsch" "2" "4" "4"
+"ttsch" "4" "4" "4"
+"zhdzh" "2" "4" "4"
+"shch" "2" "4" "4"
+"scht" "2" "43" "43"
+"schd" "2" "43" "43"
+"stch" "2" "4" "4"
+"strz" "2" "4" "4"
+"strs" "2" "4" "4"
+"stsh" "2" "4" "4"
+"szcz" "2" "4" "4"
+"szcs" "2" "4" "4"
+"ttch" "4" "4" "4"
+"tsch" "4" "4" "4"
+"ttsz" "4" "4" "4"
+"zdzh" "2" "4" "4"
+"zsch" "4" "4" "4"
+"chs" "5" "54" "54"
+"csz" "4" "4" "4"
+"czs" "4" "4" "4"
+"drz" "4" "4" "4"
+"drs" "4" "4" "4"
+"dsh" "4" "4" "4"
+"dsz" "4" "4" "4"
+"dzh" "4" "4" "4"
+"dzs" "4" "4" "4"
+"sch" "4" "4" "4"
+"sht" "2" "43" "43"
+"szt" "2" "43" "43"
+"shd" "2" "43" "43"
+"szd" "2" "43" "43"
+"tch" "4" "4" "4"
+"trz" "4" "4" "4"
+"trs" "4" "4" "4"
+"tsh" "4" "4" "4"
+"tts" "4" "4" "4"
+"ttz" "4" "4" "4"
+"tzs" "4" "4" "4"
+"tsz" "4" "4" "4"
+"zdz" "2" "4" "4"
+"zhd" "2" "43" "43"
+"zsh" "4" "4" "4"
+"ai" "0" "1" ""
+"aj" "0" "1" ""
+"ay" "0" "1" ""
+"au" "0" "7" ""
+"cz" "4" "4" "4"
+"cs" "4" "4" "4"
+"ds" "4" "4" "4"
+"dz" "4" "4" "4"
+"dt" "3" "3" "3"
+"ei" "0" "1" ""
+"ej" "0" "1" ""
+"ey" "0" "1" ""
+"eu" "1" "1" ""
+"fb" "7" "7" "7"
+"ia" "1" "" ""
+"ie" "1" "" ""
+"io" "1" "" ""
+"iu" "1" "" ""
+"ks" "5" "54" "54"
+"kh" "5" "5" "5"
+"mn" "66" "66" "66"
+"nm" "66" "66" "66"
+"oi" "0" "1" ""
+"oj" "0" "1" ""
+"oy" "0" "1" ""
+"pf" "7" "7" "7"
+"ph" "7" "7" "7"
+"sh" "4" "4" "4"
+"sc" "2" "4" "4"
+"st" "2" "43" "43"
+"sd" "2" "43" "43"
+"sz" "4" "4" "4"
+"th" "3" "3" "3"
+"ts" "4" "4" "4"
+"tc" "4" "4" "4"
+"tz" "4" "4" "4"
+"ui" "0" "1" ""
+"uj" "0" "1" ""
+"uy" "0" "1" ""
+"ue" "0" "1" ""
+"zd" "2" "43" "43"
+"zh" "4" "4" "4"
+"zs" "4" "4" "4"
+
+// Branching cases
+
+"c" "4|5" "4|5" "4|5"
+"ch" "4|5" "4|5" "4|5"
+"ck" "5|45" "5|45" "5|45"
+"rs" "4|94" "4|94" "4|94"
+"rz" "4|94" "4|94" "4|94"
+"j" "1|4" "|4" "|4"
+
+
+// ASCII foldings
+
+ß=s
+à=a
+á=a
+â=a
+ã=a
+ä=a
+å=a
+æ=a
+ç=c
+è=e
+é=e
+ê=e
+ë=e
+ì=i
+í=i
+î=i
+ï=i
+ð=d
+ñ=n
+ò=o
+ó=o
+ô=o
+õ=o
+ö=o
+ø=o
+ù=u
+ú=u
+û=u
+ý=y
+ý=y
+þ=b
+ÿ=y
+ć=c
+ł=l
+ś=s
+ż=z
+ź=z

Propchange: 
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
------------------------------------------------------------------------------
    svn:keywords = Id

Added: 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
URL: 
http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java?rev=1636486&view=auto
==============================================================================
--- 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
 (added)
+++ 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
 Tue Nov  4 02:18:12 2014
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoderAbstractTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests {@link DaitchMokotoffSoundex}.
+ * <p>
+ * Keep this file in UTF-8 encoding for proper Javadoc processing.
+ * </p>
+ *
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundexTest extends StringEncoderAbstractTest<DaitchMokotoffSoundex> {
+
+    /**
+     * Supplies the encoder instance exercised by this test class.
+     *
+     * @return a new {@link DaitchMokotoffSoundex} created with its no-argument constructor
+     */
+    @Override
+    protected DaitchMokotoffSoundex createStringEncoder() {
+        return new DaitchMokotoffSoundex();
+    }
+
+    /**
+     * Checks that a name written with an accented character yields the same code as its plain ASCII spelling.
+     */
+    @Test
+    public void testAccentedCharacterFolding() {
+        Assert.assertEquals("294795", this.getStringEncoder().soundex("Straßburg"));
+        Assert.assertEquals("294795", this.getStringEncoder().soundex("Strasburg"));
+
+        Assert.assertEquals("095600", this.getStringEncoder().soundex("Éregon"));
+        Assert.assertEquals("095600", this.getStringEncoder().soundex("Eregon"));
+    }
+
+    /**
+     * Checks that a code digit is not emitted twice when adjacent rules produce the same digit.
+     */
+    @Test
+    public void testAdjacentCodes() {
+        // AKSSOL
+        // A-KS-S-O-L
+        // 0-54-4---8 -> wrong
+        // 0-54-----8 -> correct
+        Assert.assertEquals("054800", this.getStringEncoder().soundex("AKSSOL"));
+
+        // GERSCHFELD
+        // G-E-RS-CH-F-E-L-D
+        // 5--4/94-5/4-7-8-3 -> wrong
+        // 5--4/94-5/--7-8-3 -> correct
+        Assert.assertEquals("547830|545783|594783|594578", this.getStringEncoder().soundex("GERSCHFELD"));
+    }
+
+    /**
+     * Checks {@code encode(String)} against the names also used in {@link #testSoundexBasic()}.
+     * <p>
+     * Each expectation here is a single code, even for names such as AUERBACH for which
+     * {@code soundex(String)} is expected to return several codes separated by {@code |}.
+     * </p>
+     */
+    @Test
+    public void testEncodeBasic() {
+        // same names as in testSoundexBasic(), but without branching
+        Assert.assertEquals("097400", this.getStringEncoder().encode("AUERBACH"));
+        Assert.assertEquals("097400", this.getStringEncoder().encode("OHRBACH"));
+        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPSHITZ"));
+        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPPSZYC"));
+        Assert.assertEquals("876450", this.getStringEncoder().encode("LEWINSKY"));
+        Assert.assertEquals("876450", this.getStringEncoder().encode("LEVINSKI"));
+        Assert.assertEquals("486740", this.getStringEncoder().encode("SZLAMAWICZ"));
+        Assert.assertEquals("486740", this.getStringEncoder().encode("SHLAMOVITZ"));
+    }
+
+    /**
+     * Checks that an apostrophe at any position in the name does not change the resulting code.
+     *
+     * @throws EncoderException
+     *             if one of the spellings cannot be encoded
+     */
+    @Test
+    public void testEncodeIgnoreApostrophes() throws EncoderException {
+        this.checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
+                "OBri'en", "OBrie'n", "OBrien'" });
+    }
+
+    /**
+     * Checks that a hyphen at any position in the name does not change the resulting code.
+     * <p>
+     * Test data from http://www.myatt.demon.co.uk/sxalg.htm
+     * </p>
+     *
+     * @throws EncoderException
+     *             if one of the spellings cannot be encoded
+     */
+    @Test
+    public void testEncodeIgnoreHyphens() throws EncoderException {
+        this.checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
+                "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" });
+    }
+
+    /**
+     * Checks that leading and trailing whitespace (space, tab, line feed, carriage return) does not change the code.
+     */
+    @Test
+    public void testEncodeIgnoreTrimmable() {
+        Assert.assertEquals("746536", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
+        Assert.assertEquals("746536", this.getStringEncoder().encode("Washington"));
+    }
+
+    /**
+     * Examples from http://www.jewishgen.org/infofiles/soundex.html
+     */
+    @Test
+    public void testSoundexBasic() {
+        Assert.assertEquals("583600", this.getStringEncoder().soundex("GOLDEN"));
+        Assert.assertEquals("087930", this.getStringEncoder().soundex("Alpert"));
+        Assert.assertEquals("791900", this.getStringEncoder().soundex("Breuer"));
+        Assert.assertEquals("579000", this.getStringEncoder().soundex("Haber"));
+        Assert.assertEquals("665600", this.getStringEncoder().soundex("Mannheim"));
+        Assert.assertEquals("664000", this.getStringEncoder().soundex("Mintz"));
+        Assert.assertEquals("370000", this.getStringEncoder().soundex("Topf"));
+        Assert.assertEquals("586660", this.getStringEncoder().soundex("Kleinmann"));
+        Assert.assertEquals("769600", this.getStringEncoder().soundex("Ben Aron"));
+
+        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("AUERBACH"));
+        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("OHRBACH"));
+        Assert.assertEquals("874400", this.getStringEncoder().soundex("LIPSHITZ"));
+        Assert.assertEquals("874400|874500", this.getStringEncoder().soundex("LIPPSZYC"));
+        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEWINSKY"));
+        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEVINSKI"));
+        Assert.assertEquals("486740", this.getStringEncoder().soundex("SZLAMAWICZ"));
+        Assert.assertEquals("486740", this.getStringEncoder().soundex("SHLAMOVITZ"));
+    }
+
+    /**
+     * Examples from http://www.avotaynu.com/soundex.htm
+     */
+    @Test
+    public void testSoundexBasic2() {
+        Assert.assertEquals("467000|567000", this.getStringEncoder().soundex("Ceniow"));
+        Assert.assertEquals("467000", this.getStringEncoder().soundex("Tsenyuv"));
+        Assert.assertEquals("587400|587500", this.getStringEncoder().soundex("Holubica"));
+        Assert.assertEquals("587400", this.getStringEncoder().soundex("Golubitsa"));
+        Assert.assertEquals("746480|794648", this.getStringEncoder().soundex("Przemysl"));
+        Assert.assertEquals("746480", this.getStringEncoder().soundex("Pshemeshil"));
+        Assert.assertEquals("944744|944745|944754|944755|945744|945745|945754|945755",
+                this.getStringEncoder().soundex("Rosochowaciec"));
+        Assert.assertEquals("945744", this.getStringEncoder().soundex("Rosokhovatsets"));
+    }
+
+    /**
+     * Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+     */
+    @Test
+    public void testSoundexBasic3() {
+        Assert.assertEquals("734000|739400", this.getStringEncoder().soundex("Peters"));
+        Assert.assertEquals("734600|739460", this.getStringEncoder().soundex("Peterson"));
+        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskowitz"));
+        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskovitz"));
+        Assert.assertEquals("154600|145460|454600|445460", this.getStringEncoder().soundex("Jackson"));
+        Assert.assertEquals("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464",
+                this.getStringEncoder().soundex("Jackson-Jackson"));
+    }
+
+    /**
+     * Checks that both Romanian spellings of the accented t (cedilla and comma below) yield the same codes.
+     */
+    @Test
+    public void testSpecialRomanianCharacters() {
+        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("ţamas")); // t-cedilla
+        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("țamas")); // t-comma
+    }
+
+}

Propchange: 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
------------------------------------------------------------------------------
    svn:keywords = Id

svn commit: r1636486 - in /commons/proper/codec/trunk/src: changes/ main/java/org/apache/commons/codec/language/ main/resources/org/apache/commons/codec/language/ test/java/org/apache/commons/codec/language/

Reply via email to