It only gets deeper and thicker... This decodes characters and prints '?' for bytes it doesn't like, as well as for code points (128-159) it doesn't like.
(This is extracted from some old UTF-8 code I had lying around. It is a bit simpler than the stringprep stuff, but it seems to handle incorrect sequences correctly. Overlong encodings, surrogates and values above U+10FFFF are treated as invalid bytes, because an overlong NUL would otherwise cut a name short; substitution with '?' still only happens when f_nonprint is set.)

Index: ls.c
===================================================================
RCS file: /cvs/src/bin/ls/ls.c,v
retrieving revision 1.43
diff -u -p -r1.43 ls.c
--- ls.c	9 Oct 2015 01:37:06 -0000	1.43
+++ ls.c	23 Oct 2015 12:39:52 -0000
@@ -410,6 +410,8 @@ traverse(int argc, char *argv[], int opt
 	fts_close(ftsp);
 }
 
+extern size_t u8len(const char *);
+
 /*
  * Display() takes a linked list of FTSENT structures and passes the list
  * along with any other necessary information to the print function.  P
@@ -474,8 +476,8 @@ display(FTSENT *p, FTSENT *list)
 				continue;
 			}
 		}
-		if (cur->fts_namelen > maxlen)
-			maxlen = cur->fts_namelen;
+		if (u8len(cur->fts_name) > maxlen)
+			maxlen = u8len(cur->fts_name);
 		if (needstats) {
 			sp = cur->fts_statp;
 			if (sp->st_blocks > maxblock)
Index: util.c
===================================================================
RCS file: /cvs/src/bin/ls/util.c,v
retrieving revision 1.16
diff -u -p -r1.16 util.c
--- util.c	21 Nov 2013 15:54:45 -0000	1.16
+++ util.c	25 Oct 2015 22:54:05 -0000
@@ -45,14 +45,130 @@
 #include "ls.h"
 #include "extern.h"
 
+/* Returned by bigchar() for a byte that is not part of valid UTF-8. */
+#define U8_INVALID	0xffffffffU
+
+/*
+ * Decode the UTF-8 sequence at *sp and step *sp past it.  Returns the
+ * code point, or 0 at the terminating NUL (where *sp is left alone).
+ * Anything RFC 3629 does not allow - stray continuation bytes, truncated
+ * or overlong sequences, surrogates, values above U+10FFFF - consumes a
+ * single byte and yields U8_INVALID, so the caller can resynchronize.
+ */
+static uint32_t
+bigchar(const uint8_t **sp)
+{
+	const uint8_t *s;
+	uint32_t u, min;
+	uint8_t c;
+	int seq;
+
+	s = *sp;
+	c = *s;
+	if (c == 0)
+		return 0;
+	s++;
+	if (c < 0x80) {
+		*sp = s;
+		return c;
+	}
+
+	/* The lead byte fixes the length and the smallest legal value. */
+	if ((c & 0xe0) == 0xc0) {
+		u = c & 0x1f;
+		seq = 1;
+		min = 0x80;
+	} else if ((c & 0xf0) == 0xe0) {
+		u = c & 0x0f;
+		seq = 2;
+		min = 0x800;
+	} else if ((c & 0xf8) == 0xf0) {
+		u = c & 0x07;
+		seq = 3;
+		min = 0x10000;
+	} else
+		goto invalid;
+
+	for (; seq > 0; seq--) {
+		c = *s;
+		if ((c & 0xc0) != 0x80)
+			goto invalid;
+		u = (u << 6) | (c & 0x3f);
+		s++;
+	}
+
+	/* An overlong NUL would otherwise end the name early. */
+	if (u < min || u > 0x10ffff || (u >= 0xd800 && u <= 0xdfff))
+		goto invalid;
+
+	*sp = s;
+	return u;
+
+invalid:
+	*sp = *sp + 1;
+	return U8_INVALID;
+}
+
+/*
+ * Nonzero if c, as returned by bigchar(), is safe to show on a terminal.
+ */
+static int
+u8print(uint32_t c)
+{
+	if (c == U8_INVALID)
+		return 0;
+	if (c < 0x80)
+		return isprint((int)c);
+	/* 0x80-0x9f are the C1 control codes. */
+	return c > 0x9f;
+}
+
+/*
+ * Number of characters putname() reports for s: one per code point and
+ * one per invalid byte.  It is somewhat inefficient to decode just to get
+ * the length, but this avoids length mistakes when dealing with invalid
+ * sequences.  Code points are counted, not display columns.
+ */
+size_t
+u8len(const char *s)
+{
+	const uint8_t *p = (const uint8_t *)s;
+	size_t l;
+
+	l = 0;
+	while (bigchar(&p) != 0)
+		l++;
+	return l;
+}
+
+/*
+ * Print name and return the number of characters written.  When
+ * f_nonprint is set, whatever u8print() rejects is shown as '?';
+ * otherwise the bytes of the name are copied through untouched.
+ */
 int
 putname(char *name)
 {
-	int len;
+	const uint8_t *s = (const uint8_t *)name;
+	const uint8_t *start;
+	uint32_t c;
+	int slen;
 
-	for (len = 0; *name; len++, name++)
-		putchar((!isprint((unsigned char)*name) && f_nonprint) ? '?' : *name);
-	return len;
+	slen = 0;
+	for (;;) {
+		start = s;
+		if ((c = bigchar(&s)) == 0)
+			break;
+		if (f_nonprint && !u8print(c)) {
+			putchar('?');
+		} else {
+			/* Emit the bytes bigchar() just consumed, unchanged. */
+			while (start < s)
+				putchar(*start++);
+		}
+		slen++;
+	}
+	return slen;
 }
 
 void