With clang 19, I see a warning: ../../gllib/unictype/scripts.h:2637:26: warning: implicit conversion from 'int' to 'short' changes value from 32768 to -32768 [-Wconstant-conversion]
The cause is that with the Unicode 16 upgrade, the level3 array in the generated scripts.h grew from 248 blocks to 257 blocks. Each block holds 1 << 7 = 128 entries, so the largest level3 offset stored in level2 is now 256 * 128 = 32768, which no longer fits in a signed 'short' (maximum 32767). This patch fixes it. 2024-09-20 Bruno Haible <br...@clisp.org> unictype/scripts: Fix integer overflow in generated table. Reported by clang through a -Wconstant-conversion warning. * lib/gen-uni-tables.c (output_scripts): Generate a level2 array of 'unsigned short', not 'short', values. * lib/unictype/scripts.h: Regenerated. * lib/unictype/scripts.c (uc_script): Update accordingly. diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 003d4d7b99..617af649f1 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -5356,7 +5356,7 @@ output_scripts (const char *version) fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); - fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "u_script =\n"); @@ -5390,11 +5390,13 @@ output_scripts (const char *version) if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; + /* To make the level2 values fit in 16 bits, we use 'unsigned short' + instead of 'short' and add 1 to each value. 
*/ if (offset == 0) - fprintf (stream, " %5d", -1); + fprintf (stream, " %5d", -1 + 1); else fprintf (stream, " %5zu", - (offset - level3_offset) / sizeof (uint8_t)); + (offset - level3_offset) / sizeof (uint8_t) + 1); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } diff --git a/lib/unictype/scripts.c b/lib/unictype/scripts.c index a226d8da28..1d9d13aaad 100644 --- a/lib/unictype/scripts.c +++ b/lib/unictype/scripts.c @@ -35,11 +35,11 @@ uc_script (ucs4_t uc) if (lookup1 >= 0) { unsigned int index2 = (uc >> script_header_2) & script_header_3; - int lookup2 = u_script.level2[lookup1 + index2]; - if (lookup2 >= 0) + unsigned int lookup2 = u_script.level2[lookup1 + index2]; + if (lookup2 > 0) { unsigned int index3 = (uc & script_header_4); - unsigned char lookup3 = u_script.level3[lookup2 + index3]; + unsigned char lookup3 = u_script.level3[(lookup2 - 1) + index3]; if (lookup3 != 0xff) return &scripts[lookup3]; diff --git a/lib/unictype/scripts.h b/lib/unictype/scripts.h index 0052dbaa70..0dbb7594f1 100644 --- a/lib/unictype/scripts.h +++ b/lib/unictype/scripts.h @@ -2368,7 +2368,7 @@ static const struct { int level1[15]; - short level2[5 << 9]; + unsigned short level2[5 << 9]; unsigned char level3[257 << 7]; } u_script = @@ -2378,326 +2378,326 @@ u_script = -1, -1, -1, -1, -1, -1, 2048 }, { - 0, 128, 256, 256, 256, 384, 512, 640, - 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, ... + 1, 129, 257, 257, 257, 385, 513, 641, + 769, 897, 1025, 1153, 1281, 1409, 1537, 1665, ... }, { 0, 0, 0, 0, 0, 0, 0, 0,