Hi, here is a simple one. The uniq(1) utility only needs UTF-8 support to distinguish blank and non-blank characters with -f and to skip characters with -s. The former is easy to implement with mbtowc(3) and iswblank(3), the latter with mblen(3). There is no need for wrapper functions or a separate utf8.c file.
OK? Ingo Index: uniq.1 =================================================================== RCS file: /cvs/src/usr.bin/uniq/uniq.1,v retrieving revision 1.17 diff -u -p -r1.17 uniq.1 --- uniq.1 3 Sep 2010 11:09:29 -0000 1.17 +++ uniq.1 10 Dec 2015 15:37:02 -0000 @@ -114,6 +114,14 @@ A file name of .Ql - denotes the standard input or the standard output .Pq depending on its position on the command line . +.Sh ENVIRONMENT +.Bl -tag -width LC_CTYPE +.It Ev LC_CTYPE +The character set +.Xr locale 1 . +Determines which groups of bytes are treated as characters +and which characters are considered blank. +.El .Sh EXIT STATUS .Ex -std uniq .Sh SEE ALSO Index: uniq.c =================================================================== RCS file: /cvs/src/usr.bin/uniq/uniq.c,v retrieving revision 1.23 diff -u -p -r1.23 uniq.c --- uniq.c 2 Nov 2015 20:25:42 -0000 1.23 +++ uniq.c 10 Dec 2015 15:37:02 -0000 @@ -37,10 +37,13 @@ #include <err.h> #include <errno.h> #include <limits.h> +#include <locale.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <wchar.h> +#include <wctype.h> #define MAXLINELEN (8 * 1024) @@ -61,6 +64,8 @@ main(int argc, char *argv[]) int ch; char *prevline, *thisline; + setlocale(LC_CTYPE, ""); + if (pledge("stdio rpath wpath cpath", NULL) == -1) err(1, "pledge"); @@ -176,16 +181,38 @@ show(FILE *ofp, char *str) char * skip(char *str) { + wchar_t wc; int nchars, nfields; + int len; for (nfields = numfields; nfields && *str; nfields--) { - while (isblank((unsigned char)*str)) - str++; - while (*str && !isblank((unsigned char)*str)) - str++; + + /* Skip blanks before the next field. */ + do { + if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) { + (void)mbtowc(NULL, NULL, MB_CUR_MAX); + wc = L'?'; + len = 1; + } + str += len; + } while (*str != '\0' && iswblank(wc)); + + /* Skip one field. 
*/ + while (*str != '\0' && !iswblank(wc)) { + if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) { + (void)mbtowc(NULL, NULL, MB_CUR_MAX); + wc = L'?'; + len = 1; + } + str += len; + } } - for (nchars = numchars; nchars-- && *str && *str != '\n'; ++str) - ; + + /* Skip some additional characters. */ + for (nchars = numchars; nchars-- && *str != '\0'; str += len) + if ((len = mblen(str, MB_CUR_MAX)) == -1) + len = 1; + return (str); }