Bug#330990: iso-codes .pot msgid strings contain non-ASCII characters

Paul Eggert Fri, 30 Sep 2005 15:03:16 -0700

Package: iso-codes
Version: 0.48-1

The .pot files in iso-codes sometimes use msgids that are not ASCII.
However, the GNU gettext manual says:


      Note that the MSGID argument to `gettext' is not subject to
   character set conversion.  Also, when `gettext' does not find a
   translation for MSGID, it returns MSGID unchanged - independently of
   the current output character set.  It is therefore recommended that all
   MSGIDs be US-ASCII strings.

For example, if gettext("Åland Islands") finds no translation and
returns its argument (which is UTF-8) unchanged in an EUC-JIS locale,
a corrupted string will be sent to the user.

Here is a proposed patch, which causes the msgid to be the ASCII
approximation "Aaland Islands" instead.

2005-09-30  Paul Eggert  <[EMAIL PROTECTED]>

        * iso2pot.py: Convert msgids to an ASCII approximation if they
        are not already ASCII.

diff -pru -I iso-codes- --exclude='*.po' iso-codes-0.48-tmp/iso2pot.py iso-codes-0.48-fix/iso2pot.py
--- iso-codes-0.48-tmp/iso2pot.py       2004-06-15 13:36:43.000000000 -0700
+++ iso-codes-0.48-fix/iso2pot.py       2005-09-30 14:36:26.000000000 -0700
@@ -8,7 +8,7 @@
 
 from xml.sax import saxutils, make_parser, saxlib, saxexts, ContentHandler
 from xml.sax.handler import feature_namespaces
-import sys, os, getopt, urllib2, locale, time
+import sys, os, getopt, urllib2, locale, time, codecs
 
 class printPot(saxutils.DefaultHandler):
     def __init__(self, nameslist,comment, ofile):
@@ -21,6 +21,57 @@ class printPot(saxutils.DefaultHandler):
         self.ofile = ofile
         self.done = {}
 
+        def asciiize(exc):
+            """Convert to an ASCII approximation of the input.
+            """
+            codepoint2ascii = {
+                0x00C5 : 'Aa',
+                0x00DA : 'U',
+                0x00E1 : 'a',
+                0x00E2 : 'a',
+                0x00E4 : 'a',
+                0x00E5 : 'aa',
+                0x00E7 : 'c',
+                0x00E8 : 'e',
+                0x00E9 : 'e',
+                0x00EB : 'e',
+                0x00ED : 'i',
+                0x00EF : 'i',
+                0x00C1 : 'A',
+                0x00F1 : 'n',
+                0x00F3 : 'o',
+                0x00F4 : 'o',
+                0x00F6 : 'o',
+                0x00F8 : 'o',
+                0x00FA : 'u',
+                0x00FD : 'y',
+                0x0101 : 'a',
+                0x010D : 'c',
+                0x011B : 'e',
+                0x012B : 'i',
+                0x0141 : 'L',
+                0x0142 : 'l',
+                0x0148 : 'n',
+                0x0151 : 'o',
+                0x0159 : 'r',
+                0x015A : 'S',
+                0x015B : 's',
+                0x015E : 'S',
+                0x015F : 's',
+                0x0161 : 's',
+                0x0162 : 'T',
+                0x0163 : 't',
+                0x016B : 'u',
+                0x017D : 'Z',
+            }
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                l.append(codepoint2ascii[ord(c)])
+            return (u''.join(l), exc.end)
+
+        codecs.register_error('iso2pot.asciiize', asciiize)
 
     def startElement(self, name, attrs):
         # Get the name attributes
@@ -28,7 +79,7 @@ class printPot(saxutils.DefaultHandler):
                n = attrs.get(aname, None)
                c = attrs.get(self.comment, None)
                if type(n) == unicode:
-                   n = n.encode('UTF-8')
+                   n = n.encode('ascii', 'iso2pot.asciiize')
                if type(c) == unicode:
                    c = c.encode('UTF-8')
                if n != None and not self.done.has_key(n):

Bug#330990: iso-codes .pot msgid strings contain non-ASCII characters

Reply via email to