On Mon, Aug 11, 2008 at 12:13 PM, Dave Webster <[EMAIL PROTECTED]> wrote:
> Thanks, Timothy. I'm pretty sure that there is no such thing as a "beautiful"
> implementation of double-metaphone but I would personally like to have a copy
> of your python implementation. I have a fairly elegant version of the
> original
> metaphone algorithm I wrote myself (in PERL, many years ago) but I've
> never found
> the time to reverse-engineer the original C++ code for double-metaphone and
> "pythonize" it.
>
> On Mon, Aug 11, 2008 at 2:08 PM, Timothy Grant <[EMAIL PROTECTED]> wrote:
>> On Mon, Aug 11, 2008 at 8:44 AM, Casey <[EMAIL PROTECTED]> wrote:
>>> My first thought is that you should be looking at implementations of
>>> Hamming Distance. If you are actually looking for something like
>>> SOUNDEX you might also want to look at the double metaphor algorithm,
>>> which is significantly harder to implement but provides better
>>> matching and is less susceptible to differences based on name origins.
>>> --
>>> http://mail.python.org/mailman/listinfo/python-list
>>>
>>
>> I responded in the thread of the poster's original message on this
>> subject, but will do the same here. I have a horribly ugly version of
>> the double-metaphone algorithm in python that does work, and may be of
>> some use in solving this problem.
>>
>> --
>> Stand Fast,
>> tjg. [Timothy Grant]
>>
>
This is truly cringe-worthy, and pretty much a direct port of the C++
code. It need unit tests (which are on my "to-do someday" list) but
even though it's ugly it does work and I have managed to do real work
with it.
--
Stand Fast,
tjg. [Timothy Grant]
#
# DMetaph.py
#
# Copyright 2006 by Timothy (rhacer) Grant
#
# Based on an algorithm and code by Lawrence Philips
#
# This code is licensed under the terms of the GNU Public License v2
#
import re
DEBUG = False
class DMetaph(str):
"""
DMetaph(<word>) creates a Double Metaphone encoding of the word passed in.
"""
def __init__(self, s):
self.s = s.upper() + ' ' # Padding allows safe indexing beyond the end
self.length = len(s)
self.last = self.length - 1
self.current = 0
self.primary = ""
self.secondary = ""
self.alternate = False
if self.StringAt(0, 2, 'GN', 'KN', 'PN', 'WR', 'PS'):
self.current += 1
if self.s[0] == 'X':
self.MetaphAdd('S')
self.current += 1
while len(self.primary) < 4 or len(self.secondary) < 4:
if DEBUG: print "processing character: %s current: %s length: %s" % (
self.s[self.current], self.current, self.length)
if self.current >= self.length:
break
self.ProcessString()
self.metaph = self.primary
self.metaph2 = ''
if len(self.metaph) > 4:
self.metaph = self.metaph[:4]
if self.alternate:
self.metaph2 = self.secondary
if len(self.metaph2) > 4:
self.metaph2 = self.metaph2[:4]
def SlavoGermanic(self):
if ('W' in self.s or 'K' in self.s
or 'CZ' in self.s or 'WITZ' in self.s):
return True
return False
def MetaphAdd(self, main, alt=None):
if main:
self.primary += main
if alt:
self.alternate = True
if (alt[0] <> ' '):
self.secondary += alt
else:
if (main[0] <> ' '):
self.secondary += main
def IsVowel(self, at):
if (at < 0) or (at > self.length):
return False
if self.s[at] in 'AEIOUY':
return True
return False
def StringAt(self, start, length, *sub_strings):
if self.s[start:start + length] in sub_strings:
return True
return False
def print_diags(self, section):
print"section: %s current: %s" % (section, self.current)
def ProcessString(self):
if self.IsVowel(self.current):
if DEBUG: self.print_diags('VOWEL')
if self.current == 0:
self.MetaphAdd('A')
self.current += 1
return
elif self.s[self.current] == 'B':
if DEBUG: self.print_diags('B')
self.MetaphAdd('P')
if self.s[self.current + 1] == 'B':
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'C':
if DEBUG: self.print_diags('C')
if (self.current > 1
and not self.IsVowel(self.current - 2)
and self.StringAt(self.current - 1, 3, 'ACH')
and self.s[self.current + 2] <> 'I'
and self.s[self.current + 2] <> 'E'
or self.StringAt(self.current - 2, 6, 'BACHER', 'MACHER')):
self.MetaphAdd('K')
self.current += 2
return
if self.current == 0 and self.StringAt(self.current, 6, 'CAESAR'):
self.MetaphAdd('S')
self.current += 2
return
if self.StringAt(self.current, 4, 'CHIA'):
self.MetaphAdd('K')
self.current += 2
return
if self.StringAt(self.current, 2, 'CH'):
if self.current > 0 and self.StringAt(self.current, 4, 'CHAE'):
self.MetaphAdd('K', 'X')
self.current += 2
return
if (self.current == 0
and (self.StringAt(self.current + 1, 5, 'HARAC', 'HARIS')
or self.StringAt(self.current + 1, 3, 'HOR', 'HYM', 'HIA', 'HEM'))
and not self.StringAt(0, 5, 'CHORE')):
self.MetaphAdd('K')
self.current += 2
return
if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH'))
or self.StringAt(self.current - 2, 6, 'ORCHES', 'ARCHIT', 'ORCHID')
or self.s[self.current - 2] in 'TS'
or (self.s[self.current - 1] in 'AOUE' or self.current == 0)
and self.s[self.current + 2] in 'LRNMBHFVW '):
self.MetaphAdd('K')
else:
if self.current > 0:
if self.StringAt(0, 2, 'MC'):
self.MetaphAdd('K')
else:
self.MetaphAdd('X', 'K')
else:
self.MetaphAdd('X')
self.current += 2
return
if (self.StringAt(self.current, 2, 'CZ')
and not self.StringAt(self.current - 2, 4, 'WICZ')):
self.MetaphAdd('S', 'X')
self.current += 2
return
if self.StringAt(self.current + 1, 3, 'CIA'):
self.MetaphAdd('X')
self.current += 3
return
if (self.StringAt(self.current, 2, 'CC')
and not (self.current == 1 and self.s[0] == 'M')):
if (self.s[self.current + 2] in 'IEH'
and not self.StringAt(self.current + 2, 2, 'HU')):
if (self.current == 1 and self.s[self.current - 1] == 'A'
or self.StringAt(self.current - 1, 5, 'UCCEE', 'UCCES')):
self.MetaphAdd('KS')
else:
self.MetaphAdd('X')
self.current += 3
return
else:
self.MetaphAdd('K')
self.current += 2
return
if self.StringAt(self.current, 2, 'CK', 'CG', 'CQ'):
self.MetaphAdd('K')
self.current += 2
return
if self.StringAt(self.current, 2, 'CI', 'CE', 'CY'):
if self.StringAt(self.current, 3, 'CIO', 'CIE', 'CIA'):
self.MetaphAdd('S', 'X')
else:
self.MetaphAdd('S')
self.current += 2
return
self.MetaphAdd('K')
if self.StringAt(self.current + 1, 2, ' C', ' Q', ' G'):
self.current += 3
else:
if (self.s[self.current + 1] in 'CKQ'
and not self.StringAt(self.current + 1, 2, 'CE', 'CI')):
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'D':
if DEBUG: self.print_diags('D')
if self.StringAt(self.current, 2, 'DG'):
if self.s[self.current + 2] in 'IEY':
self.MetaphAdd('J')
self.current += 3
return
else:
self.MetaphAdd('TK')
self.current += 2
if self.StringAt(self.current, 2, 'DT', 'DD'):
self.MetaphAdd('T')
self.current += 2
return
self.MetaphAdd('T')
self.current += 1
return
elif self.s[self.current] == 'F':
if DEBUG: self.print_diags('F')
if self.s[self.current + 1] == 'F':
self.current += 2
else:
self.current += 1
self.MetaphAdd('F')
return
elif self.s[self.current] == 'G':
if DEBUG: self.print_diags('G')
if self.s[self.current + 1] == 'H':
if self.current > 0 and not self.IsVowel(self.current - 1):
self.MetaphAdd('K')
self.current += 2
return
if self.current < 3:
if self.current == 0:
if self.s[self.current + 2] == 'I':
self.MetaphAdd('J')
else:
self.MetaphAdd('K')
self.current += 2
return
if ((self.current > 1 and self.s[self.current - 2] in 'BHD')
or (self.current > 2 and self.s[self.current - 3] in 'BHD')
or (self.current > 3 and self.s[self.current - 4] in 'BH')):
self.current += 2
return
else:
if (self.current > 2
and self.s[self.current - 1] == 'U'
and self.s[self.current - 3] in 'CGLRT'):
self.MetaphAdd('F')
else:
if self.current > 0 and self.s[self.current - 1] <> 'I':
self.MetaphAdd('K')
self.current += 2
return
if self.s[self.current + 1] == 'N':
if self.current == 1 and self.IsVowel(0) and not self.SlavoGermanic():
self.MetaphAdd('KN', 'N')
else:
if (not self.StringAt(self.current + 2, 2, 'EY')
and self.s[self.current + 1] <> 'Y'
and not self.SlavoGermanic()):
self.MetaphAdd('N', 'KN')
else:
self.MetaphAdd('KN')
self.current += 2
return
if self.StringAt(self.current + 1, 2, 'LI') and not self.SlavoGermanic():
self.MetaphAdd('KL', 'L')
self.current += 2
return
if ((self.current == 0
and self.s[self.current + 1] == 'Y')
or self.StringAt(self.current + 1, 2, 'ES', 'EP', 'EB',
'EL', 'EY', 'IB',
'IL', 'IN', 'IE',
'EI', 'ER')):
self.MetaphAdd('K', 'J')
self.current += 2
return
if (self.StringAt(self.current + 1, 2, 'ER')
or self.s[self.current + 1] == 'Y'
and not self.StringAt(0, 6, 'DANGER', 'RANGER', 'MANGER')
and not self.s[self.current - 1] in 'EI'
and not self.StringAt(self.current - 1, 3, 'RGY', 'OGY')):
self.MetaphAdd('K', 'J')
self.current += 2
return
if (self.s[self.current + 1] in 'EIY'
or self.StringAt(self.current - 1, 4, 'AGGI', 'OGGI')):
if ((self.StringAt(0, 4, 'VAN ', 'VON ') or self.StringAt(0, 3, 'SCH'))
or self.StringAt(self.current + 1, 2, 'ET')):
self.MetaphAdd('K')
else:
if self.StringAt(self.current + 1, 4, 'IER'):
self.MetaphAdd('J')
else:
self.MetaphAdd('J', 'K')
self.current += 2
return
if self.s[self.current + 1] == 'G':
self.current += 2
else:
self.current += 1
self.MetaphAdd('K')
return
elif self.s[self.current] == 'H':
if DEBUG: self.print_diags('H')
if ((self.current == 0 or self.IsVowel(self.current - 1))
and self.IsVowel(self.current + 1)):
self.MetaphAdd('H')
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'J':
if DEBUG: self.print_diags('J')
if self.StringAt(self.current, 4, 'JOSE') or self.StringAt(0, 4, 'SAN '):
if ((self.current == 0 and self.s[self.current + 4] == ' ')
or self.StringAt(0, 4, 'SAN ')):
self.MetaphAdd('H')
else:
self.MetaphAdd('J', 'H')
self.current += 1
return
if self.current == 0 and not self.StringAt(self.current, 4, 'JOSE'):
self.MetaphAdd('J', 'A')
else:
if (self.IsVowel(self.current - 1)
and not self.SlavoGermanic()
and (self.s[self.current + 1] == 'A'
or self.s[self.current + 1] == 'O')):
self.MetaphAdd('J', 'H')
else:
if self.current == self.last:
self.MetaphAdd('J', ' ')
else:
if (not self.s[self.current + 1] in 'LTKSNMBZ'
and not self.s[self.current - 1] in 'SKL'):
self.MetaphAdd('J')
if self.s[self.current + 1] == 'J':
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'K':
if DEBUG: self.print_diags('K')
if self.s[self.current + 1] == 'K':
self.current += 2
else:
self.current += 1
self.MetaphAdd('K')
return
elif self.s[self.current] == 'L':
if DEBUG: self.print_diags('L')
if self.s[self.current + 1] == 'L':
if ((self.current == self.length - 3
and self.StringAt(self.current - 1, 4, 'ILLO', 'ILLA', 'ALLE'))
or ((self.StringAt(self.last - 1, 2, 'AS', 'OS')
or self.StringAt(self.last, 1, 'A', 'O'))
and self.StringAt(self.current - 1, 4, 'ALLE'))):
self.MetaphAdd('L', ' ')
self.current += 2
return
self.current += 2
else:
self.current += 1
self.MetaphAdd('L')
return
elif self.s[self.current] == 'M':
if DEBUG: self.print_diags('M')
if ((self.StringAt(self.current - 1, 3, 'UMB')
and ((self.current + 1 == self.last)
or self.StringAt(self.current + 2, 2, 'ER')))
or (self.s[self.current + 1] == 'M')):
self.current += 2
else:
self.current += 1
self.MetaphAdd('M')
return
elif self.s[self.current] == 'N':
if DEBUG: self.print_diags('N')
if self.s[self.current + 1] == 'N':
self.current += 2
else:
self.current += 1
self.MetaphAdd('N')
return
elif self.s[self.current] == 'P':
if DEBUG: self.print_diags('P')
if self.s[self.current + 1] == 'H':
self.MetaphAdd('F')
self.current += 2
return
if self.s[self.current + 1] in 'PB':
self.current += 2
else:
self.current += 1
self.MetaphAdd('P')
return
elif self.s[self.current] == 'Q':
if DEBUG: self.print_diags('Q')
if self.s[self.current + 1] == 'Q':
self.current += 2
else:
self.current += 1
self.MetaphAdd('K')
return
elif self.s[self.current] == 'R':
if DEBUG: self.print_diags('R')
if (self.current == self.last
and not self.SlavoGermanic()
and self.StringAt(self.current - 2, 2, 'IE')
and not self.StringAt(self.current - 4, 2, 'ME', 'MA')):
self.MetaphAdd('','R')
else:
self.MetaphAdd('R')
if self.s[self.current + 1] == 'R':
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'S':
if DEBUG: self.print_diags('S')
if self.StringAt(self.current + 1, 3, 'ISL', 'YSL'):
self.current += 1
return
if self.current == 0 and self.StringAt(self.current, 5, 'SUGAR'):
self.MetaphAdd('X', 'S')
self.current += 1
if self.StringAt(self.current, 2, 'SH'):
if self.StringAt(self.current + 1, 4, 'HEIM', 'HOEK', 'HOLM', 'HOLZ'):
self.MetaphAdd('S')
else:
self.MetaphAdd('X')
self.current += 2
return
if (self.StringAt(self.current, 3, 'SIO', 'SIA')
or self.StringAt(self.current, 4, 'SIAN')):
if not self.SlavoGermanic():
self.MetaphAdd('S', 'X')
else:
self.MetaphAdd('S')
self.current += 3
return
if ((self.current == 0
and self.s[self.current + 1] in 'MNLW')
or self.s[self.current + 1] == 'Z'):
self.MetaphAdd('S', 'X')
if self.s[self.current + 1] == 'Z':
self.current += 2
else:
self.current += 1
return
if self.StringAt(self.current, 2, 'SC'):
if self.s[self.current + 2] == 'H':
if self.StringAt(self.current + 3, 2, 'OO', 'ER', 'EN',
'UY', 'ED', 'EM'):
if self.StringAt(self.current + 3, 2, 'ER', 'EN'):
self.MetaphAdd('X', 'SK')
else:
self.MetaphAdd('SK')
self.current += 3
return
else:
if (self.current == 0 and not self.IsVowel(3) and self.s[3] <> 'W'):
self.MetaphAdd('X', 'S')
else:
self.MetaphAdd('X')
self.current += 3
return
if self.s[self.current + 2] in 'IEY':
self.MetaphAdd('S')
self.current += 3
return
self.MetaphAdd('SK')
self.current += 3
return
if (self.current == self.last
and self.StringAt(self.current - 2, 2, 'AI', 'OI')):
self.MetaphAdd('', 'S')
else:
self.MetaphAdd('S')
if self.s[self.current + 1] in 'SZ':
self.current += 2
else:
self.current += 1
return
elif self.s[self.current] == 'T':
if DEBUG: self.print_diags('T')
if self.StringAt(self.current, 4, 'TION'):
self.MetaphAdd('X')
self.current += 3
return
if self.StringAt(self.current, 3, 'TIA', 'TCH'):
self.MetaphAdd('X')
self.current += 3
return
if (self.StringAt(self.current, 2, 'TH')
or self.StringAt(self.current, 3, 'TTH')):
if (self.StringAt(self.current + 2, 2, 'OM', 'AM')
or self.StringAt(0, 4, 'VAN ', 'VON ')
or self.StringAt(0, 3, 'SCH')):
self.MetaphAdd('T')
else:
self.MetaphAdd('0', 'T')
self.current += 2
return
if self.s[self.current + 1] in 'TD':
self.current += 2
else:
self.current += 1
self.MetaphAdd('T')
return
elif self.s[self.current] == 'V':
if DEBUG: self.print_diags('V')
if self.s[self.current + 1] == 'V':
self.current += 2
else:
self.current += 1
self.MetaphAdd('F')
return
elif self.s[self.current] == 'W':
if DEBUG: self.print_diags('W')
if self.StringAt(self.current, 2, 'WR'):
self.MetaphAdd('R')
self.current += 2
return
if (self.current == 0
and (self.IsVowel(self.current + 1)
or self.StringAt(self.current, 2, 'WH'))):
if self.IsVowel(self.current + 1):
self.MetaphAdd('A', 'F')
else:
self.MetaphAdd('A')
if ((self.current == self.last
and self.IsVowel(self.current - 1))
or self.StringAt(self.current - 1, 5, 'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY')
or self.StringAt(0, 3, 'SCH')):
self.MetaphAdd('', 'F')
self.current += 1
return
if self.StringAt(self.current, 4, 'WICZ', 'WITZ'):
self.MetaphAdd('TS', 'FX')
self.current += 4
return
self.current += 1
return
elif self.s[self.current] == 'X':
if DEBUG: self.print_diags('X')
if (not (self.current == self.last
and (self.StringAt(self.current - 3, 3, 'IAU', 'EAU')
or self.StringAt(self.current - 2, 2, 'AU', 'OU')))):
self.MetaphAdd('KS')
if self.s[self.current + 1] in 'CX':
self.current += 2
else:
self.current += 1
elif self.s[self.current] == 'Z':
if DEBUG: self.print_diags('Z')
if self.s[self.current + 1] == 'H':
self.MetaphAdd('J')
self.current += 2
return
else:
if (self.StringAt(self.current + 1, 2, 'ZO', 'ZI', 'ZA')
or (self.SlavoGermanic()
and (self.current > 0
and self.s[self.current - 1] <> 'T'))):
self.MetaphAdd('S', 'TS')
else:
self.MetaphAdd('S')
if self.s[self.current + 1] == 'Z':
self.current += 2
else:
self.current += 1
else:
self.current += 1
def __str__(self):
if self.metaph2:
return "<%s: %s, %s>" % (self.s.strip(), self.metaph, self.metaph2)
else:
return "<%s: %s>" % (self.s.strip(), self.metaph)
PUNCTUATION = """\.\,\?\-\(\)\:\;\"\'\!\#"""
def metaphonize_string(words):
words = re.sub(r'[%s]' % PUNCTUATION, '', words)
metaphones = []
for word in words.split():
w = DMetaph(word)
metaphones.append(w.metaph)
return metaphones
if __name__ == '__main__':
for word in open('/usr/share/dict/words', 'rb'):
w = DMetaph(word)
print w
print metaphonize_string('Now is the time for all good men to come to the aid of their country')
--
http://mail.python.org/mailman/listinfo/python-list