I need this so that gettext can support the change Python 3.3 made to the "\N{...}" notation in string literals, which now also accepts character name aliases: https://docs.python.org/3/reference/lexical_analysis.html#literals
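As usual, the patch does not include generated/copied files. To generate them, run the following from the top directory of gnulib, with $UCD set to the absolute path of the directory that holds the Unicode Character Database files (absolute, because the second command changes directory):

  $ sed -e '/^$/d' -e '/^#/d' < $UCD/NameAliases.txt > \
      tests/uniname/NameAliases.txt
  $ (cd lib/uniname \
      && clisp gen-uninames.lisp $UCD/UnicodeData.txt uninames.h \
           ../../tests/uniname/NameAliases.txt)

Note that the alias file is given as ../../tests/uniname/NameAliases.txt in the second command, since it runs inside lib/uniname.

Regards,
--
Daiki Ueno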
>From bf9ae34b0c3803785b7d49f05edd6c6f6de2bbb3 Mon Sep 17 00:00:00 2001 From: Daiki Ueno <u...@gnu.org> Date: Thu, 5 Feb 2015 16:56:41 +0900 Subject: [PATCH] uniname/uniname: support character alias * lib/uniname/gen-uninames.lisp (main): New argument ALIASFILE. Register one-way mapping from aliases to codepoints in the generated tables. * lib/uniname/uninames.h: Regenerate. * tests/uniname/NameAliases.txt: New file, taken from UCD 7.0.0. * modules/uniname/uniname-tests (Files): Add tests/uniname/NameAliases.txt. * tests/uniname/test-uninames.c: Mark as static. (ALIASLEN): Define. (struct unicode_alias): New struct. (unicode_aliases): New variable. (fill_aliases): New function. (test_alias_lookup): New test function. (main): Run the 'test_alias_lookup' test if the second argument is given. * tests/uniname/test-uninames.sh: Supply NameAliases.txt as the second argument. --- lib/uniname/gen-uninames.lisp | 49 ++++++++++++++----- modules/uniname/uniname-tests | 1 + tests/uniname/test-uninames.c | 108 ++++++++++++++++++++++++++++++++++++++++- tests/uniname/test-uninames.sh | 2 +- 4 files changed, 145 insertions(+), 15 deletions(-) diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp index e7de0a1..060dda1 100755 --- a/lib/uniname/gen-uninames.lisp +++ b/lib/uniname/gen-uninames.lisp @@ -25,10 +25,13 @@ length ; number of words ) -(defun main (inputfile outputfile) - (declare (type string inputfile outputfile)) +(defun main (inputfile outputfile aliasfile) + (declare (type string inputfile outputfile aliasfile)) #+UNICODE (setq *default-file-encoding* charset:utf-8) (let ((all-chars '()) + (all-chars-hashed (make-hash-table :test #'equal)) + (all-aliases '()) + all-chars-and-aliases (all-ranges '()) (name-index 0) range) @@ -53,6 +56,7 @@ (push (make-unicode-char :index name-index :name name-string) all-chars) + (setf (gethash code all-chars-hashed) (car all-chars)) ;; Update the contiguous range, or start a new range. 
(if (and range (= (1+ (range-end-code range)) code)) (setf (range-end-code range) code) @@ -70,9 +74,28 @@ (if range (push range all-ranges)) (setq all-ranges (nreverse all-ranges)) + (when aliasfile + ;; Read all characters and names from the alias file. + (with-open-file (istream aliasfile :direction :input) + (loop + (let ((line (read-line istream nil nil))) + (unless line (return)) + (let* ((i1 (position #\; line)) + (i2 (position #\; line :start (1+ i1))) + (code-string (subseq line 0 i1)) + (code (parse-integer code-string :radix 16)) + (name-string (subseq line (1+ i1) i2)) + (uc (gethash code all-chars-hashed))) + (when uc + (push (make-unicode-char :index (unicode-char-index uc) + :name name-string) + all-aliases) + ) ) ) ) ) ) + (setq all-aliases (nreverse all-aliases) + all-chars-and-aliases (append all-chars all-aliases)) ;; Split into words. (let ((words-by-length (make-array 0 :adjustable t))) - (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) + (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars-and-aliases))) (let ((i1 0)) (loop (when (>= i1 (length name)) (return)) @@ -195,7 +218,7 @@ (gethash word (word-list-hashed (aref words-by-length (length word)))) ) ) ;; Compute the word-indices for every unicode-char. - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (let ((name (unicode-char-name uc)) (indices '())) (let ((i1 0)) @@ -215,8 +238,8 @@ ) ) ) ;; Sort the list of unicode-chars by word-indices. - (setq all-chars - (sort all-chars + (setq all-chars-and-aliases + (sort all-chars-and-aliases (lambda (vec1 vec2) (let ((len1 (length vec1)) (len2 (length vec2))) @@ -235,10 +258,10 @@ ) ) ;; Output the word-indices. 
(format ostream "static const uint16_t unicode_names[~D] = {~%" - (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) + (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) ) (let ((i 0)) - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (format ostream " ~{ ~D,~}" (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) (coerce (unicode-char-word-indices uc) 'list) @@ -255,9 +278,9 @@ (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") (format ostream "unicode_name_to_index[~D] = {~%" - (length all-chars) + (length all-chars-and-aliases) ) - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (format ostream " { 0x~4,'0X, ~D }," (unicode-char-index uc) (unicode-char-word-indices-index uc) @@ -285,10 +308,10 @@ ) (format ostream "};~%") (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" - (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars)) + (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases)) ) (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" - (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) + (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) ) (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%" (length all-ranges)) @@ -302,4 +325,4 @@ ) ) ) ) -(main (first *args*) (second *args*)) +(main (first *args*) (second *args*) (third *args*)) diff --git a/modules/uniname/uniname-tests b/modules/uniname/uniname-tests index 305e6a1..512b035 100644 --- a/modules/uniname/uniname-tests +++ b/modules/uniname/uniname-tests @@ -2,6 +2,7 @@ Files: tests/uniname/test-uninames.sh tests/uniname/test-uninames.c 
tests/uniname/UnicodeDataNames.txt +tests/uniname/NameAliases.txt Depends-on: xalloc diff --git a/tests/uniname/test-uninames.c b/tests/uniname/test-uninames.c index f8fb077..eccf2f4 100644 --- a/tests/uniname/test-uninames.c +++ b/tests/uniname/test-uninames.c @@ -27,7 +27,19 @@ /* The names according to the UnicodeData.txt file, modified to contain the Hangul syllable names, as described in the Unicode 3.0 book. */ -const char * unicode_names [0x110000]; +static const char * unicode_names [0x110000]; + +/* Maximum entries in unicode_aliases. */ +#define ALIASLEN 0x200 + +/* The aliases according to the NameAliases.txt file. */ +struct unicode_alias +{ + const char *name; + unsigned int uc; +}; + +static struct unicode_alias unicode_aliases [ALIASLEN]; /* Maximum length of a field in the UnicodeData.txt file. */ #define FIELDLEN 120 @@ -113,6 +125,62 @@ fill_names (const char *unicodedata_filename) } } +/* Stores in unicode_aliases[] the relevant contents of the NameAliases.txt + file.
*/ +static void +fill_aliases (const char *namealiases_filename) +{ + int i; + FILE *stream; + char field0[FIELDLEN]; + char field1[FIELDLEN]; + int lineno = 0; + + for (i = 0; i < ALIASLEN; i++) + unicode_aliases[i].uc = UNINAME_INVALID; + + stream = fopen (namealiases_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", namealiases_filename); + exit (EXIT_FAILURE); + } + + for (i = 0; i < ALIASLEN; i++) + { + int n; + int c; + unsigned int uc; + + lineno++; + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ';'); + if (n == 0) + break; + if (n != 2) + { + fprintf (stderr, "short line in '%s':%d\n", + namealiases_filename, lineno); + exit (EXIT_FAILURE); + } + for (; (c = getc (stream)), (c != EOF && c != '\n'); ) + ; + uc = strtoul (field0, NULL, 16); + if (uc >= 0x110000) + { + fprintf (stderr, "index too large\n"); + exit (EXIT_FAILURE); + } + unicode_aliases[i].name = xstrdup (field1); + unicode_aliases[i].uc = uc; + } + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", namealiases_filename); + exit (EXIT_FAILURE); + } +} + /* Perform an exhaustive test of the unicode_character_name function. */ static int test_name_lookup () @@ -246,6 +314,38 @@ test_inverse_lookup () return error; } +/* Perform a test of the unicode_name_character function for aliases. */ +static int +test_alias_lookup () +{ + int error = 0; + unsigned int i; + char buf[UNINAME_MAX]; + + /* Verify all valid character aliases are recognized. */ + for (i = 0; i < ALIASLEN; i++) + if (unicode_aliases[i].uc != UNINAME_INVALID + /* Skip if the character has no canonical name (e.g. control + characters).
*/ + && unicode_character_name (unicode_aliases[i].uc, buf)) + { + unsigned int result = unicode_name_character (unicode_aliases[i].name); + if (result != unicode_aliases[i].uc) + { + if (result == UNINAME_INVALID) + fprintf (stderr, "inverse name lookup of \"%s\" failed\n", + unicode_aliases[i].name); + else + fprintf (stderr, + "inverse name lookup of \"%s\" returned 0x%04X\n", + unicode_aliases[i].name, result); + error = 1; + } + } + + return error; +} + int main (int argc, char *argv[]) { @@ -258,5 +358,11 @@ main (int argc, char *argv[]) error |= test_name_lookup (); error |= test_inverse_lookup (); + if (argc > 2) + { + fill_aliases (argv[2]); + error |= test_alias_lookup (); + } + return error; } diff --git a/tests/uniname/test-uninames.sh b/tests/uniname/test-uninames.sh index f26c275..0e6a018 100755 --- a/tests/uniname/test-uninames.sh +++ b/tests/uniname/test-uninames.sh @@ -1,2 +1,2 @@ #!/bin/sh -exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt" +exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt" "$srcdir/uniname/NameAliases.txt" -- 2.1.0