Hello Ben, Wow! New modules of such quality, 99% perfect: this is rare.
Please squash the attached patch into your commit ("git commit --amend"); then you can push. The patch contains: - Reordering: In my opinion, the grapheme break facilities should come after line break and word break, because they are used less often. - In lib/gen-uni-tables.c, I renamed the function output_gbp to output_gbp_table (to make it clear that it is a function that produces a file). - lib/unigbrk.in.h does not need to include <stddef.h>. - In the module descriptions, use 0.9.4, not 0.9, as the minimum version of libunistring that is expected to contain the code. After this, you will likely also want to write documentation for the new modules. This means it is time to start filing a copyright assignment for libunistring with the FSF. You can find information on how to request the copyright assignment form from the FSF at <http://www.gnu.org/prep/maintain/html_node/Copyright-Papers.html>. The next modules will have a higher-level API, I imagine. You are welcome to discuss the new API with me before you implement it. Bruno
--- ChangeLog.orig Mon Dec 27 16:31:25 2010 +++ ChangeLog Mon Dec 27 15:40:53 2010 @@ -10,7 +10,7 @@ * lib/unigbrk/gbrkprop.h: New file. * lib/unigbrk/uc-gbrk-prop.c: New file. - New module unigbrk/uc-is-grapheme-break'. + New module 'unigbrk/uc-is-grapheme-break'. * modules/unigbrk/uc-is-grapheme-break: New file. * modules/unigbrk/uc-is-grapheme-break-tests: New file. * lib/unigbrk/uc-is-grapheme-break.c: New file. --- lib/gen-uni-tables.c.orig Mon Dec 27 16:31:26 2010 +++ lib/gen-uni-tables.c Mon Dec 27 16:04:28 2010 @@ -25,9 +25,9 @@ /usr/local/share/Unidata/Blocks.txt \ /usr/local/share/Unidata/PropList-3.0.1.txt \ /usr/local/share/Unidata/EastAsianWidth.txt \ - /usr/local/share/Unidata/GraphemeBreakProperty.txt \ /usr/local/share/Unidata/LineBreak.txt \ /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/GraphemeBreakProperty.txt \ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ @@ -5144,232 +5144,6 @@ /* ========================================================================= */ -/* Grapheme break property. */ - -/* Possible values of the Grapheme_Cluster_Break property. */ -enum -{ - GBP_OTHER = 0, - GBP_CR = 1, - GBP_LF = 2, - GBP_CONTROL = 3, - GBP_EXTEND = 4, - GBP_PREPEND = 5, - GBP_SPACINGMARK = 6, - GBP_L = 7, - GBP_V = 8, - GBP_T = 9, - GBP_LV = 10, - GBP_LVT = 11 -}; - -/* Construction of sparse 3-level tables. */ -#define TABLE gbp_table -#define ELEMENT unsigned char -#define DEFAULT GBP_OTHER -#define xmalloc malloc -#define xrealloc realloc -#include "3level.h" - -/* The grapheme break property from the GraphemeBreakProperty.txt file. */ -int unicode_org_gbp[0x110000]; - -/* Output the per-character grapheme break property table. 
*/ -static void -output_gbp (const char *filename, const char *version) -{ - FILE *stream; - unsigned int ch, i; - struct gbp_table t; - unsigned int level1_offset, level2_offset, level3_offset; - - stream = fopen (filename, "w"); - if (stream == NULL) - { - fprintf (stderr, "cannot open '%s' for writing\n", filename); - exit (1); - } - - fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); - fprintf (stream, "/* Grapheme break property of Unicode characters. */\n"); - fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", - version); - - t.p = 7; - t.q = 9; - gbp_table_init (&t); - - for (ch = 0; ch < 0x110000; ch++) - gbp_table_add (&t, ch, unicode_org_gbp[ch]); - - gbp_table_finalize (&t); - - /* Offsets in t.result, in memory of this process. */ - level1_offset = - 5 * sizeof (uint32_t); - level2_offset = - 5 * sizeof (uint32_t) - + t.level1_size * sizeof (uint32_t); - level3_offset = - 5 * sizeof (uint32_t) - + t.level1_size * sizeof (uint32_t) - + (t.level2_size << t.q) * sizeof (uint32_t); - - for (i = 0; i < 5; i++) - fprintf (stream, "#define gbrkprop_header_%d %d\n", i, - ((uint32_t *) t.result)[i]); - fprintf (stream, "static const\n"); - fprintf (stream, "struct\n"); - fprintf (stream, " {\n"); - fprintf (stream, " int level1[%zu];\n", t.level1_size); - fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); - fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n", - t.level3_size, t.p); - fprintf (stream, " }\n"); - fprintf (stream, "unigbrkprop =\n"); - fprintf (stream, "{\n"); - fprintf (stream, " {"); - if (t.level1_size > 8) - fprintf (stream, "\n "); - for (i = 0; i < t.level1_size; i++) - { - uint32_t offset; - if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); - offset = ((uint32_t *) (t.result + level1_offset))[i]; - if (offset == 0) - fprintf (stream, " %5d", -1); - else - fprintf (stream, " %5zu", - (offset - level2_offset) / sizeof (uint32_t)); - if (i+1 < t.level1_size) - 
fprintf (stream, ","); - } - if (t.level1_size > 8) - fprintf (stream, "\n "); - fprintf (stream, " },\n"); - fprintf (stream, " {"); - if (t.level2_size << t.q > 8) - fprintf (stream, "\n "); - for (i = 0; i < t.level2_size << t.q; i++) - { - uint32_t offset; - if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); - offset = ((uint32_t *) (t.result + level2_offset))[i]; - if (offset == 0) - fprintf (stream, " %5d", -1); - else - fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t) / 2); - if (i+1 < t.level2_size << t.q) - fprintf (stream, ","); - } - if (t.level2_size << t.q > 8) - fprintf (stream, "\n "); - fprintf (stream, " },\n"); - fprintf (stream, " {"); - if (t.level3_size << t.p > 8) - fprintf (stream, "\n "); - for (i = 0; i < (t.level3_size << t.p) / 2; i++) - { - unsigned char *p = (unsigned char *) (t.result + level3_offset); - unsigned char value0 = p[i * 2]; - unsigned char value1 = p[i * 2 + 1]; - if (i > 0 && (i % 8) == 0) - fprintf (stream, "\n "); - fprintf (stream, " 0x%02x%s", (value1 << 4) + value0, - (i+1 < (t.level3_size << t.p) / 2 ? "," : "")); - } - if (t.level3_size << t.p > 8) - fprintf (stream, "\n "); - fprintf (stream, " }\n"); - fprintf (stream, "};\n"); - - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error writing to '%s'\n", filename); - exit (1); - } -} - -/* Stores in unicode_org_gbp[] the grapheme breaking property from the - GraphemeBreakProperty.txt file. 
*/ -static void -fill_org_gbp (const char *graphemebreakproperty_filename) -{ - unsigned int i; - FILE *stream; - int lineno = 0; - - for (i = 0; i < 0x110000; i++) - unicode_org_gbp[i] = GBP_OTHER; - - stream = fopen (graphemebreakproperty_filename, "r"); - if (stream == NULL) - { - fprintf (stderr, "error during fopen of '%s'\n", - graphemebreakproperty_filename); - exit (1); - } - - for (;;) - { - char buf[200+1]; - unsigned int i1, i2; - char padding[200+1]; - char propname[200+1]; - int propvalue; - - lineno++; - if (fscanf (stream, "%200[^\n]\n", buf) < 1) - break; - - if (buf[0] == '\0' || buf[0] == '#') - continue; - - if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) - { - if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) - { - fprintf (stderr, "parse error in '%s'\n", - graphemebreakproperty_filename); - exit (1); - } - i2 = i1; - } -#define PROP(name,value) \ - if (strcmp (propname, name) == 0) propvalue = value; else - PROP ("CR", GBP_CR) - PROP ("LF", GBP_LF) - PROP ("Control", GBP_CONTROL) - PROP ("Extend", GBP_EXTEND) - PROP ("Prepend", GBP_PREPEND) - PROP ("SpacingMark", GBP_SPACINGMARK) - PROP ("L", GBP_L) - PROP ("V", GBP_V) - PROP ("T", GBP_T) - PROP ("LV", GBP_LV) - PROP ("LVT", GBP_LVT) -#undef PROP - { - fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, - graphemebreakproperty_filename, lineno); - exit (1); - } - if (!(i1 <= i2 && i2 < 0x110000)) - abort (); - - for (i = i1; i <= i2; i++) - unicode_org_gbp[i] = propvalue; - } - if (ferror (stream) || fclose (stream)) - { - fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); - exit (1); - } -} - /* Line breaking classification. */ enum @@ -6972,6 +6746,234 @@ /* ========================================================================= */ +/* Grapheme break property. */ + +/* Possible values of the Grapheme_Cluster_Break property. 
*/ +enum +{ + GBP_OTHER = 0, + GBP_CR = 1, + GBP_LF = 2, + GBP_CONTROL = 3, + GBP_EXTEND = 4, + GBP_PREPEND = 5, + GBP_SPACINGMARK = 6, + GBP_L = 7, + GBP_V = 8, + GBP_T = 9, + GBP_LV = 10, + GBP_LVT = 11 +}; + +/* Construction of sparse 3-level tables. */ +#define TABLE gbp_table +#define ELEMENT unsigned char +#define DEFAULT GBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* The grapheme break property from the GraphemeBreakProperty.txt file. */ +int unicode_org_gbp[0x110000]; + +/* Output the per-character grapheme break property table. */ +static void +output_gbp_table (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct gbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Grapheme break property of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + gbp_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + gbp_table_add (&t, ch, unicode_org_gbp[ch]); + + gbp_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. 
*/ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define gbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n", + t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "unigbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t) / 2); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) / 2; i++) + { + 
unsigned char *p = (unsigned char *) (t.result + level3_offset); + unsigned char value0 = p[i * 2]; + unsigned char value1 = p[i * 2 + 1]; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x%s", (value1 << 4) + value0, + (i+1 < (t.level3_size << t.p) / 2 ? "," : "")); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Stores in unicode_org_gbp[] the grapheme breaking property from the + GraphemeBreakProperty.txt file. */ +static void +fill_org_gbp (const char *graphemebreakproperty_filename) +{ + unsigned int i; + FILE *stream; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_org_gbp[i] = GBP_OTHER; + + stream = fopen (graphemebreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + lineno++; + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", GBP_CR) + PROP ("LF", GBP_LF) + PROP ("Control", GBP_CONTROL) + PROP ("Extend", GBP_EXTEND) + PROP ("Prepend", GBP_PREPEND) + PROP ("SpacingMark", GBP_SPACINGMARK) + PROP ("L", GBP_L) + PROP ("V", GBP_V) + PROP ("T", GBP_T) + PROP ("LV", GBP_LV) + PROP ("LVT", GBP_LVT) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in %s:%d\n", 
propname, + graphemebreakproperty_filename, lineno); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_gbp[i] = propvalue; + } + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); + exit (1); + } +} + +/* ========================================================================= */ + /* Maximum number of characters into which a single Unicode character can be decomposed. */ #define MAX_DECOMP_LENGTH 18 @@ -8506,9 +8508,9 @@ const char *blocks_filename; const char *proplist30_filename; const char *eastasianwidth_filename; - const char *graphemebreakproperty_filename; const char *linebreak_filename; const char *wordbreakproperty_filename; + const char *graphemebreakproperty_filename; const char *compositionexclusions_filename; const char *specialcasing_filename; const char *casefolding_filename; @@ -8516,7 +8518,7 @@ if (argc != 15) { - fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt GraphemeBreakProperty.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", argv[0]); exit (1); } @@ -8528,9 +8530,9 @@ blocks_filename = argv[5]; proplist30_filename = argv[6]; eastasianwidth_filename = argv[7]; - graphemebreakproperty_filename = argv[8]; - linebreak_filename = argv[9]; - wordbreakproperty_filename = argv[10]; + linebreak_filename = argv[8]; + wordbreakproperty_filename = argv[9]; + graphemebreakproperty_filename = argv[10]; compositionexclusions_filename = argv[11]; specialcasing_filename = argv[12]; 
casefolding_filename = argv[13]; @@ -8544,9 +8546,9 @@ fill_scripts (scripts_filename); fill_blocks (blocks_filename); fill_width (eastasianwidth_filename); - fill_org_gbp (graphemebreakproperty_filename); fill_org_lbp (linebreak_filename); fill_org_wbp (wordbreakproperty_filename); + fill_org_gbp (graphemebreakproperty_filename); fill_composition_exclusions (compositionexclusions_filename); fill_casing_rules (specialcasing_filename); fill_casefolding_rules (casefolding_filename); @@ -8571,8 +8573,6 @@ output_ident_properties (version); output_old_ctype (version); - output_gbp ("unigbrk/gbrkprop.h", version); - debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); @@ -8581,6 +8581,8 @@ debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + output_gbp_table ("unigbrk/gbrkprop.h", version); + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); debug_output_composition_tables ("uninorm/composition.txt"); output_composition_tables ("uninorm/composition-table.gperf", version); @@ -8611,9 +8613,9 @@ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \ - /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \ 
/gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \ /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \ --- lib/unigbrk.in.h.orig Mon Dec 27 16:31:26 2010 +++ lib/unigbrk.in.h Mon Dec 27 16:06:29 2010 @@ -1,6 +1,6 @@ /* Grapheme cluster breaks in Unicode strings. Copyright (C) 2010 Free Software Foundation, Inc. - Written by Ben Pfaff <br...@clisp.org>, 2010. + Written by Ben Pfaff <b...@cs.stanford.edu>, 2010. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published @@ -18,9 +18,6 @@ #ifndef _UNIGBRK_H #define _UNIGBRK_H -/* Get size_t. */ -#include <stddef.h> - /* Get bool. 
*/ #include <stdbool.h> --- modules/unigbrk/base.orig Mon Dec 27 16:31:26 2010 +++ modules/unigbrk/base Mon Dec 27 16:21:59 2010 @@ -10,7 +10,7 @@ stdbool configure.ac: -gl_LIBUNISTRING_LIBHEADER([0.9], [unigbrk.h]) +gl_LIBUNISTRING_LIBHEADER([0.9.4], [unigbrk.h]) Makefile.am: BUILT_SOURCES += $(LIBUNISTRING_UNIGBRK_H) --- modules/unigbrk/uc-gbrk-prop.orig Mon Dec 27 16:31:26 2010 +++ modules/unigbrk/uc-gbrk-prop Mon Dec 27 16:22:04 2010 @@ -9,7 +9,7 @@ unigbrk/base configure.ac: -gl_LIBUNISTRING_MODULE([0.9], [unigbrk/uc-gbrk-prop]) +gl_LIBUNISTRING_MODULE([0.9.4], [unigbrk/uc-gbrk-prop]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_UC_GBRK_PROP --- modules/unigbrk/uc-is-grapheme-break.orig Mon Dec 27 16:31:26 2010 +++ modules/unigbrk/uc-is-grapheme-break Mon Dec 27 16:22:09 2010 @@ -9,7 +9,7 @@ unigbrk/uc-gbrk-prop configure.ac: -gl_LIBUNISTRING_MODULE([0.9], [unigbrk/uc-is-grapheme-break]) +gl_LIBUNISTRING_MODULE([0.9.4], [unigbrk/uc-is-grapheme-break]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_UC_IS_GRAPHEME_BREAK Changing permissions from 100644 to 100755