Bug#301924: python-docutils: rst parser cannot treat CJK characters nicely with table

Kazuo Moriwaka Mon, 28 Mar 2005 22:25:08 -0800

Package: python-docutils
Version: 0.3.7-2
Severity: normal
Tags: patch

The rst parser does not take the display width of CJK characters into account.
This causes problems in CJK environments.


For example, one Chinese character is as wide as two ASCII
characters, but docutils only counts the number of characters.

All markup except tables works. For table markup, this problem
makes marking up tables very difficult.

For example, if 'CC' stands for one Chinese character and
'a' for one ASCII character, a reST table looks like this:

+----+----+
|CCCC  |CCCCCC |
|CCaa |aaCC |
|CCCC  |CCaCC |
|aaaa|CCCC  |
+----+----+

Here is a patch for this problem. I got it from
http://city.plala.jp/download/rst/ .
This patch has been in use for about one year and it appears
to have no problems.

Its license is:

> Copyright (C) 2004 by Matsumoto,Tadashi
> (E-Mail Address: [EMAIL PROTECTED])
> 
> Everyone is permitted to do anything on this program
> including copying, modifying, improving,
> as long as you don't try to pretend that you wrote it.
> i.e., the above copyright notice has to appear in all copies.
> Binary distribution requires original version messages.
> You don't have to ask before copying, redistribution or publishing.
> THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.

regards,

-- System Information:
Debian Release: 3.1
  APT prefers testing
  APT policy: (101, 'testing')
Architecture: i386 (i686)
Kernel: Linux 2.6.8-1-686
Locale: LANG=ja_JP.eucJP, LC_CTYPE=ja_JP.eucJP (charmap=EUC-JP)

Versions of packages python-docutils depends on:
ii  python                        2.3.5-1    An interactive high-level object-o
ii  python2.3-docutils            0.3.7-2    Dependency package for python-docu

-- no debconf information

diff -urP /usr/lib/site-python/docutils/parsers/rst/adjusttable.py 
docutils/parsers/rst/adjusttable.py
--- /usr/lib/site-python/docutils/parsers/rst/adjusttable.py    1970-01-01 
09:00:00.000000000 +0900
+++ docutils/parsers/rst/adjusttable.py 2005-03-28 16:25:20.276759120 +0900
@@ -0,0 +1,85 @@
+"""
+adjusttable.py
+
+Copyright (C) 2004 by Matsumoto,Tadashi
+(E-Mail Address: [EMAIL PROTECTED])
+
+Everyone is permitted to do anything on this program
+including copying, modifying, improving,
+as long as you don't try to pretend that you wrote it.
+i.e., the above copyright notice has to appear in all copies.
+Binary distribution requires original version messages.
+You don't have to ask before copying, redistribution or publishing.
+THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
+"""
+from cStringIO import StringIO
+
+jplib_exists = 1
+try:
+    import jplib
+except:
+    jplib_exists = 0
+
+if jplib_exists:
+    is7bit = jplib.is7bit
+    kanjiwidth = jplib.width
+else:
+    import re
+    is7bit = re.compile('^[\000-\177]*$').match
+    def kanjiwidth(u):
+        w = 0
+        for c in u:
+           if ord(c) > 255:
+               w += 2
+           else:
+               w += 1
+        return w 
+
+def adjustgridtable(lines):
+    rlines = []
+    for line in lines:
+        rline = []
+        for col in line.split('|'):
+            if is7bit(col):
+                rline.append(col)
+            else:
+                diff = kanjiwidth(col) - len(col)
+                if diff:
+                    rline.append(col+' '*diff)
+        rline = '|'.join(rline)
+        rlines.append(rline)
+    return rlines
+
+def adjustsimpletable(lines, columns):
+    rlines = []
+    limit = len(columns)
+    for line in lines:
+        if is7bit(line):
+            rlines.append(line)
+            continue
+        f = StringIO()
+        i = 0
+        start, end = columns[0]
+        width = 0
+        kanji = 0
+        for c in line:
+            f.write(c.encode('utf-8'))
+            if ord(c) > 255:
+                width += 2
+                kanji += 1
+            else:
+                width +=1
+            if width <= end:
+                continue
+            else:
+                i += 1
+                if i < limit:
+                    if kanji:
+                        f.write(' '*kanji)
+                        kanji = 0
+                    start, end = columns[i]
+                else:
+                    end = len(line)*2 + 1
+        rline = f.getvalue()
+        rlines.append(rline.decode('utf-8'))
+    return rlines
diff -urP /usr/lib/site-python/docutils/parsers/rst/states.py 
docutils/parsers/rst/states.py
--- /usr/lib/site-python/docutils/parsers/rst/states.py 2004-11-26 
17:27:55.000000000 +0900
+++ docutils/parsers/rst/states.py      2005-03-28 16:25:20.275759272 +0900
@@ -1,7 +1,7 @@
 # Author: David Goodger
 # Contact: [EMAIL PROTECTED]
-# Revision: $Revision: 1.86 $
-# Date: $Date: 2004/11/06 19:52:19 $
+# Revision: $Revision: 1.2.10.7 $
+# Date: $Date: 2005/01/07 13:26:03 $
 # Copyright: This module has been placed in the public domain.
 
 """
@@ -117,6 +117,7 @@
 from docutils.utils import escape2null, unescape
 from docutils.parsers.rst import directives, languages, tableparser, roles
 from docutils.parsers.rst.languages import en as _fallback_language_module
+from docutils.parsers.rst import adjusttable
 
 
 class MarkupError(DataError): pass
@@ -1551,6 +1552,7 @@
         blank_finish = 1
         try:
             block = self.state_machine.get_text_block(flush_left=1)
+            block.data = adjusttable.adjustgridtable(block)
         except statemachine.UnexpectedIndentationError, instance:
             block, source, lineno = instance.args
             messages.append(self.reporter.error('Unexpected indentation.',
diff -urP /usr/lib/site-python/docutils/parsers/rst/tableparser.py 
docutils/parsers/rst/tableparser.py
--- /usr/lib/site-python/docutils/parsers/rst/tableparser.py    2003-07-06 
05:38:46.000000000 +0900
+++ docutils/parsers/rst/tableparser.py 2005-03-28 16:25:20.276759120 +0900
@@ -1,7 +1,7 @@
 # Author: David Goodger
 # Contact: [EMAIL PROTECTED]
-# Revision: $Revision: 1.9 $
-# Date: $Date: 2003/07/05 22:38:28 $
+# Revision: $Revision: 1.2.10.6 $
+# Date: $Date: 2005/01/07 13:26:04 $
 # Copyright: This module has been placed in the public domain.
 
 """
@@ -25,6 +25,7 @@
 import re
 import sys
 from docutils import DataError
+from docutils.parsers.rst import adjusttable
 
 
 class TableMarkupError(DataError): pass
@@ -463,6 +464,7 @@
         else:
             columns = self.columns[:]
             span_offset = start
+        lines.data = adjusttable.adjustsimpletable(lines, columns)
         self.check_columns(lines, start, columns)
         row = self.init_row(columns, start)
         for i in range(len(columns)):
diff -urP /usr/lib/site-python/docutils/utils.py docutils/utils.py
--- /usr/lib/site-python/docutils/utils.py      2004-11-07 03:52:35.000000000 
+0900
+++ docutils/utils.py   2005-03-28 16:25:20.278758816 +0900
@@ -1,7 +1,7 @@
 # Author: David Goodger
 # Contact: [EMAIL PROTECTED]
-# Revision: $Revision: 1.40 $
-# Date: $Date: 2004/09/30 13:47:58 $
+# Revision: $Revision: 1.2.10.7 $
+# Date: $Date: 2005/01/07 13:26:02 $
 # Copyright: This module has been placed in the public domain.
 
 """
@@ -494,12 +494,12 @@
     parts = []
     start = 0
     while 1:
-        found = text.find('\\', start)
+        found = text.find(u'\\', start)
         if found == -1:
             parts.append(text[start:])
             return ''.join(parts)
         parts.append(text[start:found])
-        parts.append('\x00' + text[found+1:found+2])
+        parts.append(u'\x00' + text[found+1:found+2])
         start = found + 2               # skip character after escape
 
 def unescape(text, restore_backslashes=0):
@@ -508,9 +508,9 @@
     Backslash-escaped spaces are also removed.
     """
     if restore_backslashes:
-        return text.replace('\x00', '\\')
+        return text.replace(u'\x00', u'\\')
     else:
-        for sep in ['\x00 ', '\x00\n', '\x00']:
+        for sep in [u'\x00 ', u'\x00\n', u'\x00']:
             text = ''.join(text.split(sep))
         return text

Bug#301924: python-docutils: rst parser cannot treat CJK characters nicely with table

Reply via email to