Bug#1027414: luit: Luit does not handle Unicode beyond BMP

Ben Wong Fri, 30 Dec 2022 20:36:16 -0800

Package: luit
Version: 2.0.20221028-1
Severity: normal
Tags: patch
X-Debbugs-Cc: bugs.debian....@wongs.net


Dear Maintainer,

It appears that luit does not handle UTF-8 beyond U+FFFF. For example:

    printf "Nabla (U+2207): \U2207\nBold Nabla (U+1D6C1): \U1D6C1\n" \
        | luit -encoding UTF-8 -c

The output expected is:

        Nabla (U+2207): ∇
        Bold Nabla (U+1D6C1): 𝛁

The output actually produced by luit is:

        Nabla (U+2207): ∇
        Bold Nabla (U+1D6C1): 훁

Note that luit generates U+D6C1 (훁) instead of U+1D6C1 (𝛁).

I believe the bug is in iso2022.c:outbufUTF8() which looks like this:

    if (c <= 0x7F) {
        OUTBUF_MAKE_FREE(is, fd, 1);
        is->outbuf[is->outbuf_count++] = UChar(c);
    } else if (c <= 0x7FF) {
        OUTBUF_MAKE_FREE(is, fd, 2);
        is->outbuf[is->outbuf_count++] = UChar(0xC0 | ((c >> 6) & 0x1F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
    } else {
        OUTBUF_MAKE_FREE(is, fd, 3);
        is->outbuf[is->outbuf_count++] = UChar(0xE0 | ((c >> 12) & 0x0F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
    }

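As you can see, it emits at most three-byte UTF-8 sequences, which
cover only 0x000000 to 0x00FFFF. Any larger value falls into the final
"else" branch, where the "& 0x0F" mask silently drops the bits above
bit 15; that is why U+1D6C1 comes out as U+D6C1. A fourth byte is
needed to cover the supplementary planes up to 0x10FFFF (the limit of
Unicode).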

        *   *   *   


I created a patch for the above problem and, in testing it, found
another bug. Certain valid Unicode characters were not being read if
they were in the final plane (0x100000 to 0x10FFFF). I tracked it down
to other.c:stack_utf8():

        u = ((s->utf8.buf[0] & 0x03) << 18)
            | ((s->utf8.buf[1] & 0x3F) << 12)
            | ((s->utf8.buf[2] & 0x3F) << 6)
            | ((s->utf8.buf[3] & 0x3F));

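The first byte of a four-byte UTF-8 sequence (1111 0xxx) gets ANDed
with 0x03, keeping just the low two bits. However, that byte carries
three payload bits, so it should keep three; dropping the top one
loses bit 20 of the code point, which is set only in the final plane.
Changing 0x03 to 0x07 fixes the problem.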

I have attached patches for both issues. 


-- System Information:
Debian Release: bookworm/sid
  APT prefers testing
  APT policy: (500, 'testing')
Architecture: amd64 (x86_64)

Kernel: Linux 6.0.0-6-amd64 (SMP w/8 CPU threads; PREEMPT)
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE not set
Shell: /bin/sh linked to /usr/bin/dash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled

Versions of packages luit depends on:
ii  libc6  2.36-6

luit recommends no packages.

luit suggests no packages.

-- no debconf information

--- iso2022.c.orig      2018-06-27 15:46:34.000000000 -0700
+++ iso2022.c   2022-12-30 19:22:53.774355814 -0800
@@ -134,11 +134,35 @@
        OUTBUF_MAKE_FREE(is, fd, 2);
        is->outbuf[is->outbuf_count++] = UChar(0xC0 | ((c >> 6) & 0x1F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
-    } else {
+    } else if (c <= 0xFFFF) {
        OUTBUF_MAKE_FREE(is, fd, 3);
        is->outbuf[is->outbuf_count++] = UChar(0xE0 | ((c >> 12) & 0x0F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F));
        is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
+    } else if (c <= 0x1FFFFF) {
+       OUTBUF_MAKE_FREE(is, fd, 4);
+       is->outbuf[is->outbuf_count++] = UChar(0xF0 | ((c >> 18) & 0x07));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
+    } else if (c <= 0x03FFFFFF) {
+       OUTBUF_MAKE_FREE(is, fd, 5);
+       is->outbuf[is->outbuf_count++] = UChar(0xF8 | ((c >> 24) & 0x03));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 18) & 0x3f));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
+    } else if (c <= 0x7FFFFFFF) {
+       OUTBUF_MAKE_FREE(is, fd, 6);
+       is->outbuf[is->outbuf_count++] = UChar(0xFC | ((c >> 30) & 0x01));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 24) & 0x3f));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 18) & 0x3f));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 12) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | ((c >> 6) & 0x3F));
+       is->outbuf[is->outbuf_count++] = UChar(0x80 | (c & 0x3F));
+    } else {
+      /* "21 bits ought to be enough for anybody!" -- The Unicode Consortium */
+      Warning("ignoring character beyond UTF-8's 31-bit range: %'X.\n", c);
     }
 }

--- other.c.orig        2013-02-02 13:50:30.000000000 -0800
+++ other.c     2022-12-30 20:11:22.391737595 -0800
@@ -122,26 +122,26 @@
        return (int) c;
     }
     if (s->utf8.buf_ptr == 0) {
-       if ((c & 0x40) == 0)
+       if ((c & 0x40) == 0)    /* Skip continuation bytes 10xx xxxx */
            return -1;
        s->utf8.buf[s->utf8.buf_ptr++] = UChar(c);
-       if ((c & 0x60) == 0x40)
+       if ((c & 0x60) == 0x40)                 /* Starts with 110x xxxx */
            s->utf8.len = 2;
-       else if ((c & 0x70) == 0x60)
+       else if ((c & 0x70) == 0x60)            /* Starts with 1110 xxxx */
            s->utf8.len = 3;
-       else if ((c & 0x78) == 0x70)
+       else if ((c & 0x78) == 0x70)            /* Starts with 1111 0xxx */
            s->utf8.len = 4;
        else
            s->utf8.buf_ptr = 0;
        return -1;
     }
-    if ((c & 0x40) != 0) {
+    if ((c & 0x40) != 0) {     /* Resync if not a continuation 10xx xxxx */
        s->utf8.buf_ptr = 0;
        return -1;
     }
     s->utf8.buf[s->utf8.buf_ptr++] = UChar(c);
     if (s->utf8.buf_ptr < s->utf8.len)
-       return -1;
+       return -1;              /* Get the next continuation byte */
     switch (s->utf8.len) {
     case 2:
        u = ((s->utf8.buf[0] & 0x1F) << 6) | (s->utf8.buf[1] & 0x3F);
@@ -160,7 +160,7 @@
        else
            return u;
     case 4:
-       u = ((s->utf8.buf[0] & 0x03) << 18)
+       u = ((s->utf8.buf[0] & 0x07) << 18)
            | ((s->utf8.buf[1] & 0x3F) << 12)
            | ((s->utf8.buf[2] & 0x3F) << 6)
            | ((s->utf8.buf[3] & 0x3F));

Bug#1027414: luit: Luit does not handle Unicode beyond BMP

Reply via email to