Package: luit Version: 2.0.20221028-1 Severity: normal Tags: patch X-Debbugs-Cc: bugs.debian....@wongs.net
Dear Maintainer, It appears that luit does not handle UTF-8 encoded characters beyond U+FFFF. For example: printf "Nabla (U+2207): \U2207\nBold Nabla (U+1D6C1): \U1D6C1\n" \ | luit -encoding UTF-8 -c The output expected is: Nabla (U+2207): ∇ Bold Nabla (U+1D6C1): 𝛁 The output actually produced by luit is: Nabla (U+2207): ∇ Bold Nabla (U+1D6C1): 훁 Note that luit generates U+D6C1 (훁) instead of U+1D6C1 (𝛁). I believe the bug is in iso2022.c:outbufUTF8(), which looks like this: if (c <= 0x7F) { OUTBUF_MAKE_FREE(is, fd, 1); is->outbuf[is->outbuf_count++] = UChar(c); } else if (c <= 0x7FF) { OUTBUF_MAKE_FREE(is, fd, 2); is->outbuf[is->outbuf_count++] = UChar(0xC0 | ((c >> 6) & 0x1F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); } else { OUTBUF_MAKE_FREE(is, fd, 3); is->outbuf[is->outbuf_count++] = UChar(0xE0 | ((c >> 12) & 0x0F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); } As you can see, it only handles UTF-8 sequences of up to three bytes, covering 0x000000 to 0x00FFFF. A fourth byte is needed to cover the supplementary planes up to 0x10FFFF (the limit of Unicode). * * * I created a patch for the above problem and, in testing it, found another bug. Certain valid Unicode characters were not being read if they were in the final plane (0x100000 to 0x10FFFF). I tracked it down to other.c:stack_utf8(): u = ((s->utf8.buf[0] & 0x03) << 18) | ((s->utf8.buf[1] & 0x3F) << 12) | ((s->utf8.buf[2] & 0x3F) << 6) | ((s->utf8.buf[3] & 0x3F)); The first byte of a four-byte UTF-8 sequence gets ANDed with 0x03, keeping just the low two bits. However, it should keep three bits: with only two, bit 20 of the code point, which is set only for characters in the final plane, is lost. Changing 0x03 to 0x07 fixes the problem. I have attached patches for both issues. 
-- System Information: Debian Release: bookworm/sid APT prefers testing APT policy: (500, 'testing') Architecture: amd64 (x86_64) Kernel: Linux 6.0.0-6-amd64 (SMP w/8 CPU threads; PREEMPT) Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE not set Shell: /bin/sh linked to /usr/bin/dash Init: systemd (via /run/systemd/system) LSM: AppArmor: enabled Versions of packages luit depends on: ii libc6 2.36-6 luit recommends no packages. luit suggests no packages. -- no debconf information
--- iso2022.c.orig 2018-06-27 15:46:34.000000000 -0700 +++ iso2022.c 2022-12-30 19:22:53.774355814 -0800 @@ -134,11 +134,35 @@ OUTBUF_MAKE_FREE(is, fd, 2); is->outbuf[is->outbuf_count++] = UChar(0xC0 | ((c >> 6) & 0x1F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); - } else { + } else if (c <= 0xFFFF) { OUTBUF_MAKE_FREE(is, fd, 3); is->outbuf[is->outbuf_count++] = UChar(0xE0 | ((c >> 12) & 0x0F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F)); is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); + } else if (c <= 0x1FFFFF) { + OUTBUF_MAKE_FREE(is, fd, 4); + is->outbuf[is->outbuf_count++] = UChar(0xF0 | ((c >> 18) & 0x07)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); + } else if (c <= 0x03FFFFFF) { + OUTBUF_MAKE_FREE(is, fd, 5); + is->outbuf[is->outbuf_count++] = UChar(0xF8 | ((c >> 24) & 0x03)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 18) & 0x3f)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); + } else if (c <= 0x7FFFFFFF) { + OUTBUF_MAKE_FREE(is, fd, 6); + is->outbuf[is->outbuf_count++] = UChar(0xFC | ((c >> 30) & 0x01)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 24) & 0x3f)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 18) & 0x3f)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F)); + is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F)); + } else { + /* "21 bits ought to be enough for anybody!" -- The Unicode Consortium */ + Warning("ignoring character beyond UTF-8's 31-bit range: %'X.\n", c); } }
--- other.c.orig 2013-02-02 13:50:30.000000000 -0800 +++ other.c 2022-12-30 20:11:22.391737595 -0800 @@ -122,26 +122,26 @@ return (int) c; } if (s->utf8.buf_ptr == 0) { - if ((c & 0x40) == 0) + if ((c & 0x40) == 0) /* Skip continuation bytes 10xx xxxx */ return -1; s->utf8.buf[s->utf8.buf_ptr++] = UChar(c); - if ((c & 0x60) == 0x40) + if ((c & 0x60) == 0x40) /* Starts with 110x xxxx */ s->utf8.len = 2; - else if ((c & 0x70) == 0x60) + else if ((c & 0x70) == 0x60) /* Starts with 1110 xxxx */ s->utf8.len = 3; - else if ((c & 0x78) == 0x70) + else if ((c & 0x78) == 0x70) /* Starts with 1111 0xxx */ s->utf8.len = 4; else s->utf8.buf_ptr = 0; return -1; } - if ((c & 0x40) != 0) { + if ((c & 0x40) != 0) { /* Resync if not a continuation 10xx xxxx */ s->utf8.buf_ptr = 0; return -1; } s->utf8.buf[s->utf8.buf_ptr++] = UChar(c); if (s->utf8.buf_ptr < s->utf8.len) - return -1; + return -1; /* Get the next continuation byte */ switch (s->utf8.len) { case 2: u = ((s->utf8.buf[0] & 0x1F) << 6) | (s->utf8.buf[1] & 0x3F); @@ -160,7 +160,7 @@ else return u; case 4: - u = ((s->utf8.buf[0] & 0x03) << 18) + u = ((s->utf8.buf[0] & 0x07) << 18) | ((s->utf8.buf[1] & 0x3F) << 12) | ((s->utf8.buf[2] & 0x3F) << 6) | ((s->utf8.buf[3] & 0x3F));