This is a follow up to a discussion from the dev list: https://lists.suckless.org/dev/2407/35646.html
I've separated the patch into two halves. 1. Replaces the current utf8decoder with a shorter and more direct one. It reports back errors and how much to advance on error (needed for the next patch) 2. Detects invalid utf8 bytes and renders them as U+FFFD instead of passing invalid bytes to xft which cuts it off. If keeping the current utf8 decoder is important, I can look into how to change it so that it can return error information back to the caller. But IMHO the current decoder is unnecessarily overcomplicated for libsl's need. The interaction between invalid utf8 sequence and ellipsis is a bit finicky and could use some extra eyes. But so far it has worked well in my testing. - NRK
>From 7208684792d36efe262d2272f84e7d6e729fd3e7 Mon Sep 17 00:00:00 2001 From: NRK <[email protected]> Date: Thu, 4 Jul 2024 21:25:37 +0000 Subject: [PATCH 1/2] overhaul utf8decode() this changes the utf8decode function to: * report when an error occurs * report how many bytes to advance on error these will be useful in the next commit to render invalid utf8 sequences. the new implementation is also shorter and more direct. --- drw.c | 76 ++++++++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 45 deletions(-) diff --git a/drw.c b/drw.c index 78a2b27..eb71da7 100644 --- a/drw.c +++ b/drw.c @@ -9,54 +9,40 @@ #include "util.h" #define UTF_INVALID 0xFFFD -#define UTF_SIZ 4 -static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; -static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; -static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; -static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; - -static long -utf8decodebyte(const char c, size_t *i) -{ - for (*i = 0; *i < (UTF_SIZ + 1); ++(*i)) - if (((unsigned char)c & utfmask[*i]) == utfbyte[*i]) - return (unsigned char)c & ~utfmask[*i]; - return 0; -} - -static size_t -utf8validate(long *u, size_t i) +static int +utf8decode(const char *s_in, long *u, int *err) { - if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) - *u = UTF_INVALID; - for (i = 1; *u > utfmax[i]; ++i) - ; - return i; -} - -static size_t -utf8decode(const char *c, long *u, size_t clen) -{ - size_t i, j, len, type; - long udecoded; - + static const unsigned char lens[] = { + /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */ + /* 110XX */ 2, 2, 2, 2, + /* 1110X */ 3, 3, + /* 11110 */ 4, + /* 11111 */ 0, /* invalid */ + }; + static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; + static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 }; + + const unsigned char *s = (const unsigned char *)s_in; + int len = lens[*s >> 3]; *u = UTF_INVALID; - if (!clen) - return 0; - udecoded = utf8decodebyte(c[0], &len); - if (!BETWEEN(len, 1, UTF_SIZ)) + *err = 1; + if (len == 0) return 1; - for (i = 1, j = 1; i < clen && j < len; ++i, ++j) { - udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); - if (type) - return j; + + long cp = s[0] & leading_mask[len - 1]; + for (int i = 1; i < len; ++i) { + if (s[i] == '\0' || (s[i] & 0xC0) != 0x80) + return i; + cp = (cp << 6) | (s[i] & 0x3F); } - if (j < len) - return 0; - *u = udecoded; - utf8validate(u, len); + /* out of range, surrogate, overlong encoding */ + if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1]) + return len; + *err = 0; + *u = cp; return len; } @@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1; XftDraw *d = NULL; Fnt *usedfont, *curfont, *nextfont; - int utf8strlen, utf8charlen, render = x || y || w || h; + int utf8strlen, utf8charlen, utf8err, render = x || y || w || h; long utf8codepoint = 0; const char *utf8str; FcCharSet *fccharset; @@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp if (!ellipsis_width && render) ellipsis_width = drw_fontset_getwidth(drw, "..."); while (1) { - ew = ellipsis_len = utf8strlen = 0; + ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0; utf8str = text; nextfont = NULL; while (*text) { - utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ); + utf8charlen = utf8decode(text, &utf8codepoint, &utf8err); for (curfont = drw->fonts; curfont; curfont = curfont->next) { charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint); if (charexists) { -- 2.42.0
>From 396a21976f08b3c66e521855084685ce93356756 Mon Sep 17 00:00:00 2001 From: NRK <[email protected]> Date: Thu, 4 Jul 2024 21:27:47 +0000 Subject: [PATCH 2/2] render invalid utf8 sequences as U+FFFD previously drw_text would do the width calculations as if invalid utf8 sequences were replaced with U+FFFD but would pass the invalid utf8 sequence to xft to render where xft would just cut it off at the first invalid byte. this change makes invalid utf8 render as U+FFFD and avoids sending invalid sequences to xft. the following can be used to check the behavior before and after the patch: $ printf "0\xef1234567\ntest" | dmenu Ref: https://lists.suckless.org/dev/2407/35646.html --- drw.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drw.c b/drw.c index eb71da7..f151ae5 100644 --- a/drw.c +++ b/drw.c @@ -237,7 +237,8 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp XftResult result; int charexists = 0, overflow = 0; /* keep track of a couple codepoints for which we have no match. */ - static unsigned int nomatches[128], ellipsis_width; + static unsigned int nomatches[128], ellipsis_width, invalid_width; + static const char invalid[] = "�"; if (!drw || (render && (!drw->scheme || !w)) || !text || !drw->fonts) return 0; @@ -257,6 +258,10 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp usedfont = drw->fonts; if (!ellipsis_width && render) ellipsis_width = drw_fontset_getwidth(drw, "..."); + if (!invalid_width) { + invalid_width = -1; /* stop infinite recursion */ + invalid_width = drw_fontset_getwidth(drw, invalid); + } while (1) { ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0; utf8str = text; @@ -284,9 +289,9 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp else utf8strlen = ellipsis_len; } else if (curfont == usedfont) { - utf8strlen += utf8charlen; text += utf8charlen; - ew += tmpw; + utf8strlen += utf8err ? 0 : utf8charlen; + ew += utf8err ? 0 : tmpw; } else { nextfont = curfont; } @@ -294,7 +299,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp } } - if (overflow || !charexists || nextfont) + if (overflow || !charexists || nextfont || utf8err) break; else charexists = 0; @@ -309,6 +314,12 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp x += ew; w -= ew; } + if (utf8err && (!render || invalid_width < w)) { + if (render) + drw_text(drw, x, y, w, h, 0, invalid, invert); + x += invalid_width; + w -= invalid_width; + } if (render && overflow) drw_text(drw, ellipsis_x, y, ellipsis_w, h, 0, "...", invert); -- 2.42.0
