[PATCH] uniname/uniname: support character alias

Daiki Ueno Thu, 05 Feb 2015 01:14:36 -0800

I need this for gettext to support Python 3.3's change to its "\N{...}"
notation in string literals, which supports aliased character names:
https://docs.python.org/3/reference/lexical_analysis.html#literals


As usual, the patch does not include generated/copied files.
To generate, do the following from the top directory of gnulib:

  $ sed -e '/^$/d' -e '/^#/d' < $UCD/NameAliases.txt > \
    tests/uniname/NameAliases.txt
  $ (cd lib/uniname \
     && clisp gen-uninames.lisp $UCD/UnicodeData.txt uninames.h \
     tests/uniname/NameAliases.txt)

Regards,
--
Daiki Ueno

From bf9ae34b0c3803785b7d49f05edd6c6f6de2bbb3 Mon Sep 17 00:00:00 2001
From: Daiki Ueno <u...@gnu.org>
Date: Thu, 5 Feb 2015 16:56:41 +0900
Subject: [PATCH] uniname/uniname: support character alias

* lib/uniname/gen-uninames.lisp (main): New argument ALIASFILE.
Register one-way mapping from aliases to codepoints in the
generated tables.
* lib/uniname/uninames.h: Regenerate.
* tests/uniname/NameAliases.txt: New file, taken from UCD 7.0.0.
* modules/uniname/uniname-tests (Files): Add
tests/uniname/NameAliases.txt.
* tests/uniname/test-uninames.c (unicode_names): Mark as static.
(ALIASLEN): Define.
(struct unicode_alias): New struct.
(unicode_aliases): New variable.
(fill_aliases): New function.
(test_alias_lookup): New test function.
(main): Run the 'test_alias_lookup' test if the second argument is
given.
* tests/uniname/test-uninames.sh: Supply NameAliases.txt as the
second argument.
---
 lib/uniname/gen-uninames.lisp  |  49 ++++++++++++++-----
 modules/uniname/uniname-tests  |   1 +
 tests/uniname/test-uninames.c  | 108 ++++++++++++++++++++++++++++++++++++++++-
 tests/uniname/test-uninames.sh |   2 +-
 4 files changed, 145 insertions(+), 15 deletions(-)

diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index e7de0a1..060dda1 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -25,10 +25,13 @@
   length                        ; number of words
 )
 
-(defun main (inputfile outputfile)
-  (declare (type string inputfile outputfile))
+(defun main (inputfile outputfile aliasfile)
+  (declare (type string inputfile outputfile aliasfile))
   #+UNICODE (setq *default-file-encoding* charset:utf-8)
   (let ((all-chars '())
+        (all-chars-hashed (make-hash-table :test #'equal))
+        (all-aliases '())
+        all-chars-and-aliases
         (all-ranges '())
         (name-index 0)
         range)
@@ -53,6 +56,7 @@
                   (push (make-unicode-char :index name-index
                                            :name name-string)
                         all-chars)
+                  (setf (gethash code all-chars-hashed) (car all-chars))
                   ;; Update the contiguous range, or start a new range.
                   (if (and range (= (1+ (range-end-code range)) code))
                       (setf (range-end-code range) code)
@@ -70,9 +74,28 @@
     (if range
         (push range all-ranges))
     (setq all-ranges (nreverse all-ranges))
+    (when aliasfile
+      ;; Read all characters and names from the alias file.
+      (with-open-file (istream aliasfile :direction :input)
+        (loop
+         (let ((line (read-line istream nil nil)))
+           (unless line (return))
+           (let* ((i1 (position #\; line))
+                  (i2 (position #\; line :start (1+ i1)))
+                  (code-string (subseq line 0 i1))
+                  (code (parse-integer code-string :radix 16))
+                  (name-string (subseq line (1+ i1) i2))
+                  (uc (gethash code all-chars-hashed)))
+             (when uc
+               (push (make-unicode-char :index (unicode-char-index uc)
+                                        :name name-string)
+                     all-aliases)
+             ) ) ) ) ) )
+    (setq all-aliases (nreverse all-aliases)
+          all-chars-and-aliases (append all-chars all-aliases))
     ;; Split into words.
     (let ((words-by-length (make-array 0 :adjustable t)))
-      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars-and-aliases)))
         (let ((i1 0))
           (loop
             (when (>= i1 (length name)) (return))
@@ -195,7 +218,7 @@
                           (gethash word (word-list-hashed (aref words-by-length (length word))))
         ) )
         ;; Compute the word-indices for every unicode-char.
-        (dolist (uc all-chars)
+        (dolist (uc all-chars-and-aliases)
           (let ((name (unicode-char-name uc))
                 (indices '()))
             (let ((i1 0))
@@ -215,8 +238,8 @@
             )
         ) )
         ;; Sort the list of unicode-chars by word-indices.
-        (setq all-chars
-              (sort all-chars
+        (setq all-chars-and-aliases
+              (sort all-chars-and-aliases
                     (lambda (vec1 vec2)
                       (let ((len1 (length vec1))
                             (len2 (length vec2)))
@@ -235,10 +258,10 @@
         )     )
         ;; Output the word-indices.
         (format ostream "static const uint16_t unicode_names[~D] = {~%"
-                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
         )
         (let ((i 0))
-          (dolist (uc all-chars)
+          (dolist (uc all-chars-and-aliases)
             (format ostream " ~{ ~D,~}"
                             (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
                                      (coerce (unicode-char-word-indices uc) 'list)
@@ -255,9 +278,9 @@
         (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
         (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
         (format ostream "unicode_name_to_index[~D] = {~%"
-                        (length all-chars)
+                        (length all-chars-and-aliases)
         )
-        (dolist (uc all-chars)
+        (dolist (uc all-chars-and-aliases)
           (format ostream "  { 0x~4,'0X, ~D },"
                           (unicode-char-index uc)
                           (unicode-char-word-indices-index uc)
@@ -285,10 +308,10 @@
         )
         (format ostream "};~%")
         (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
-                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
         )
         (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
-                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
         )
         (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
                         (length all-ranges))
@@ -302,4 +325,4 @@
       )
 ) ) )
 
-(main (first *args*) (second *args*))
+(main (first *args*) (second *args*) (third *args*))
diff --git a/modules/uniname/uniname-tests b/modules/uniname/uniname-tests
index 305e6a1..512b035 100644
--- a/modules/uniname/uniname-tests
+++ b/modules/uniname/uniname-tests
@@ -2,6 +2,7 @@ Files:
 tests/uniname/test-uninames.sh
 tests/uniname/test-uninames.c
 tests/uniname/UnicodeDataNames.txt
+tests/uniname/NameAliases.txt
 
 Depends-on:
 xalloc
diff --git a/tests/uniname/test-uninames.c b/tests/uniname/test-uninames.c
index f8fb077..eccf2f4 100644
--- a/tests/uniname/test-uninames.c
+++ b/tests/uniname/test-uninames.c
@@ -27,7 +27,19 @@
 
 /* The names according to the UnicodeData.txt file, modified to contain the
    Hangul syllable names, as described in the Unicode 3.0 book.  */
-const char * unicode_names [0x110000];
+static const char * unicode_names [0x110000];
+
+/* Maximum entries in unicode_aliases.  */
+#define ALIASLEN 0x200
+
+/* One alias entry read from the NameAliases.txt file.  */
+struct unicode_alias
+{
+  const char *name;   /* alias name: a copy of the second field of a line */
+  unsigned int uc;    /* code point; UNINAME_INVALID marks an unused slot */
+};
+
+static struct unicode_alias unicode_aliases [ALIASLEN];
 
 /* Maximum length of a field in the UnicodeData.txt file.  */
 #define FIELDLEN 120
@@ -113,6 +125,62 @@ fill_names (const char *unicodedata_filename)
     }
 }
 
+/* Stores in unicode_aliases[] the code point and alias name of each line of
+   the NameAliases.txt file; prints a message and exits on any error.  */
+static void
+fill_aliases (const char *namealiases_filename)
+{
+  int i;
+  FILE *stream;
+  char field0[FIELDLEN];
+  char field1[FIELDLEN];
+  int lineno = 0;
+
+  for (i = 0; i < ALIASLEN; i++)
+    unicode_aliases[i].uc = UNINAME_INVALID;  /* mark every slot unused */
+
+  stream = fopen (namealiases_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", namealiases_filename);
+      exit (EXIT_FAILURE);
+    }
+
+  for (i = 0; i < ALIASLEN; i++)  /* at most ALIASLEN entries are stored */
+    {
+      int n;
+      int c;
+      unsigned int uc;
+
+      lineno++;
+      n = getfield (stream, field0, ';');  /* code point, in hexadecimal */
+      n += getfield (stream, field1, ';'); /* alias name */
+      if (n == 0)  /* neither field could be read (presumably EOF) */
+        break;
+      if (n != 2)
+        {
+          fprintf (stderr, "short line in '%s':%d\n",
+                   namealiases_filename, lineno);
+          exit (EXIT_FAILURE);
+        }
+      for (; (c = getc (stream)), (c != EOF && c != '\n'); )
+        ;  /* discard the rest of the line */
+      uc = strtoul (field0, NULL, 16);
+      if (uc >= 0x110000)
+        {
+          fprintf (stderr, "index too large\n");
+          exit (EXIT_FAILURE);
+        }
+      unicode_aliases[i].name = xstrdup (field1);
+      unicode_aliases[i].uc = uc;
+    }
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", namealiases_filename);
+      exit (EXIT_FAILURE);
+    }
+}
+
 /* Perform an exhaustive test of the unicode_character_name function.  */
 static int
 test_name_lookup ()
@@ -246,6 +314,38 @@ test_inverse_lookup ()
   return error;
 }
 
+/* Test unicode_name_character on aliases; returns 1 on any mismatch, else 0.  */
+static int
+test_alias_lookup ()
+{
+  int error = 0;
+  unsigned int i;
+  char buf[UNINAME_MAX];
+
+  /* Verify that every alias stored in unicode_aliases[] is recognized.  */
+  for (i = 0; i < ALIASLEN; i++)
+    if (unicode_aliases[i].uc != UNINAME_INVALID
+        /* Skip if the character has no canonical name (e.g. control
+           characters).  */
+        && unicode_character_name (unicode_aliases[i].uc, buf))
+      {
+        unsigned int result = unicode_name_character (unicode_aliases[i].name);
+        if (result != unicode_aliases[i].uc)
+          {
+            if (result == UNINAME_INVALID)
+              fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
+                       unicode_aliases[i].name);
+            else
+              fprintf (stderr,
+                       "inverse name lookup of \"%s\" returned 0x%04X\n",
+                       unicode_aliases[i].name, result);
+            error = 1;
+          }
+      }
+
+  return error;
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -258,5 +358,11 @@ main (int argc, char *argv[])
   error |= test_name_lookup ();
   error |= test_inverse_lookup ();
 
+  if (argc > 2)
+    {
+      fill_aliases (argv[2]);
+      error |= test_alias_lookup ();
+    }
+
   return error;
 }
diff --git a/tests/uniname/test-uninames.sh b/tests/uniname/test-uninames.sh
index f26c275..0e6a018 100755
--- a/tests/uniname/test-uninames.sh
+++ b/tests/uniname/test-uninames.sh
@@ -1,2 +1,2 @@
 #!/bin/sh
-exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt"
+exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt" "$srcdir/uniname/NameAliases.txt"
-- 
2.1.0

[PATCH] uniname/uniname: support character alias

Reply via email to