This is a follow up to a discussion from the dev list:
https://lists.suckless.org/dev/2407/35646.html

I've separated the patch into two halves.

1. Replaces the current utf8decoder with a shorter and more direct one.
   It reports back errors and how much to advance on error (needed for the
   next patch)
2. Detects invalid utf8 bytes and renders them as U+FFFD instead of
   passing invalid bytes to xft which cuts it off.

If keeping the current utf8 decoder is important, I can look into how to
change it so that it can return error information back to the caller.
But IMHO the current decoder is unnecessarily overcomplicated for
libsl's need.

The interaction between invalid utf8 sequence and ellipsis is a bit
finicky and could use some extra eyes. But so far it has worked well in
my testing.

- NRK
>From 7208684792d36efe262d2272f84e7d6e729fd3e7 Mon Sep 17 00:00:00 2001
From: NRK <[email protected]>
Date: Thu, 4 Jul 2024 21:25:37 +0000
Subject: [PATCH 1/2] overhaul utf8decode()

this changes the utf8decode function to:

* report when an error occurs
* report how many bytes to advance on error

these will be useful in the next commit to render invalid utf8
sequences.

the new implementation is also shorter and more direct.
---
 drw.c | 76 ++++++++++++++++++++++++-----------------------------------
 1 file changed, 31 insertions(+), 45 deletions(-)

diff --git a/drw.c b/drw.c
index 78a2b27..eb71da7 100644
--- a/drw.c
+++ b/drw.c
@@ -9,54 +9,40 @@
 #include "util.h"
 
 #define UTF_INVALID 0xFFFD
-#define UTF_SIZ     4
 
-static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 
0xF0};
-static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 
0xF8};
-static const long utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  
0x10000};
-static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 
0x10FFFF};
-
-static long
-utf8decodebyte(const char c, size_t *i)
-{
-       for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
-               if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
-                       return (unsigned char)c & ~utfmask[*i];
-       return 0;
-}
-
-static size_t
-utf8validate(long *u, size_t i)
+static int
+utf8decode(const char *s_in, long *u, int *err)
 {
-       if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
-               *u = UTF_INVALID;
-       for (i = 1; *u > utfmax[i]; ++i)
-               ;
-       return i;
-}
-
-static size_t
-utf8decode(const char *c, long *u, size_t clen)
-{
-       size_t i, j, len, type;
-       long udecoded;
-
+       static const unsigned char lens[] = {
+               /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+               /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0,  /* invalid */
+               /* 110XX */ 2, 2, 2, 2,
+               /* 1110X */ 3, 3,
+               /* 11110 */ 4,
+               /* 11111 */ 0,  /* invalid */
+       };
+       static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+       static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
+
+       const unsigned char *s = (const unsigned char *)s_in;
+       int len = lens[*s >> 3];
        *u = UTF_INVALID;
-       if (!clen)
-               return 0;
-       udecoded = utf8decodebyte(c[0], &len);
-       if (!BETWEEN(len, 1, UTF_SIZ))
+       *err = 1;
+       if (len == 0)
                return 1;
-       for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
-               udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
-               if (type)
-                       return j;
+
+       long cp = s[0] & leading_mask[len - 1];
+       for (int i = 1; i < len; ++i) {
+               if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
+                       return i;
+               cp = (cp << 6) | (s[i] & 0x3F);
        }
-       if (j < len)
-               return 0;
-       *u = udecoded;
-       utf8validate(u, len);
+       /* out of range, surrogate, overlong encoding */
+       if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
+               return len;
 
+       *err = 0;
+       *u = cp;
        return len;
 }
 
@@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
        unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
        XftDraw *d = NULL;
        Fnt *usedfont, *curfont, *nextfont;
-       int utf8strlen, utf8charlen, render = x || y || w || h;
+       int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
        long utf8codepoint = 0;
        const char *utf8str;
        FcCharSet *fccharset;
@@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
        if (!ellipsis_width && render)
                ellipsis_width = drw_fontset_getwidth(drw, "...");
        while (1) {
-               ew = ellipsis_len = utf8strlen = 0;
+               ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
                utf8str = text;
                nextfont = NULL;
                while (*text) {
-                       utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
+                       utf8charlen = utf8decode(text, &utf8codepoint, 
&utf8err);
                        for (curfont = drw->fonts; curfont; curfont = 
curfont->next) {
                                charexists = charexists || 
XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
                                if (charexists) {
-- 
2.42.0

>From 396a21976f08b3c66e521855084685ce93356756 Mon Sep 17 00:00:00 2001
From: NRK <[email protected]>
Date: Thu, 4 Jul 2024 21:27:47 +0000
Subject: [PATCH 2/2] render invalid utf8 sequences as U+FFFD

previously drw_text would do the width calculations as if
invalid utf8 sequences were replaced with U+FFFD but would pass
the invalid utf8 sequence to xft to render where xft would just
cut it off at the first invalid byte.

this change makes invalid utf8 render as U+FFFD and avoids
sending invalid sequences to xft. the following can be used to
check the behavior before and after the patch:

        $ printf "0\xef1234567\ntest" | dmenu

Ref: https://lists.suckless.org/dev/2407/35646.html
---
 drw.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drw.c b/drw.c
index eb71da7..f151ae5 100644
--- a/drw.c
+++ b/drw.c
@@ -237,7 +237,8 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
        XftResult result;
        int charexists = 0, overflow = 0;
        /* keep track of a couple codepoints for which we have no match. */
-       static unsigned int nomatches[128], ellipsis_width;
+       static unsigned int nomatches[128], ellipsis_width, invalid_width;
+       static const char invalid[] = "�";
 
        if (!drw || (render && (!drw->scheme || !w)) || !text || !drw->fonts)
                return 0;
@@ -257,6 +258,10 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
        usedfont = drw->fonts;
        if (!ellipsis_width && render)
                ellipsis_width = drw_fontset_getwidth(drw, "...");
+       if (!invalid_width) {
+               invalid_width = -1; /* stop infinite recursion */
+               invalid_width = drw_fontset_getwidth(drw, invalid);
+       }
        while (1) {
                ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
                utf8str = text;
@@ -284,9 +289,9 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
                                                else
                                                        utf8strlen = 
ellipsis_len;
                                        } else if (curfont == usedfont) {
-                                               utf8strlen += utf8charlen;
                                                text += utf8charlen;
-                                               ew += tmpw;
+                                               utf8strlen += utf8err ? 0 : 
utf8charlen;
+                                               ew += utf8err ? 0 : tmpw;
                                        } else {
                                                nextfont = curfont;
                                        }
@@ -294,7 +299,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
                                }
                        }
 
-                       if (overflow || !charexists || nextfont)
+                       if (overflow || !charexists || nextfont || utf8err)
                                break;
                        else
                                charexists = 0;
@@ -309,6 +314,12 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
int h, unsigned int lp
                        x += ew;
                        w -= ew;
                }
+               if (utf8err && (!render || invalid_width < w)) {
+                       if (render)
+                               drw_text(drw, x, y, w, h, 0, invalid, invert);
+                       x += invalid_width;
+                       w -= invalid_width;
+               }
                if (render && overflow)
                        drw_text(drw, ellipsis_x, y, ellipsis_w, h, 0, "...", 
invert);
 
-- 
2.42.0

Reply via email to