Hi,

here is a simple one.  The uniq(1) utility only needs UTF-8 support
to distinguish blank and non-blank characters with -f and to skip
characters with -s.  The former is easy to implement with mbtowc(3)
and iswblank(3), the latter with mblen(3).  There is no need for
wrapper functions or a separate utf8.c file.

OK?
  Ingo


Index: uniq.1
===================================================================
RCS file: /cvs/src/usr.bin/uniq/uniq.1,v
retrieving revision 1.17
diff -u -p -r1.17 uniq.1
--- uniq.1      3 Sep 2010 11:09:29 -0000       1.17
+++ uniq.1      10 Dec 2015 15:37:02 -0000
@@ -114,6 +114,14 @@ A file name of
 .Ql -
 denotes the standard input or the standard output
 .Pq depending on its position on the command line .
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 .
+Determines which groups of bytes are treated as characters
+and which characters are considered blank.
+.El
 .Sh EXIT STATUS
 .Ex -std uniq
 .Sh SEE ALSO
Index: uniq.c
===================================================================
RCS file: /cvs/src/usr.bin/uniq/uniq.c,v
retrieving revision 1.23
diff -u -p -r1.23 uniq.c
--- uniq.c      2 Nov 2015 20:25:42 -0000       1.23
+++ uniq.c      10 Dec 2015 15:37:02 -0000
@@ -37,10 +37,13 @@
 #include <err.h>
 #include <errno.h>
 #include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #define        MAXLINELEN      (8 * 1024)
 
@@ -61,6 +64,8 @@ main(int argc, char *argv[])
        int ch;
        char *prevline, *thisline;
 
+       setlocale(LC_CTYPE, "");
+
        if (pledge("stdio rpath wpath cpath", NULL) == -1)
                err(1, "pledge");
 
@@ -176,16 +181,38 @@ show(FILE *ofp, char *str)
 char *
 skip(char *str)
 {
+       wchar_t wc;
        int nchars, nfields;
+       int len;
 
        for (nfields = numfields; nfields && *str; nfields--) {
-               while (isblank((unsigned char)*str))
-                       str++;
-               while (*str && !isblank((unsigned char)*str))
-                       str++;
+
+               /* Skip blanks before the next field. */
+               do {
+                       if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) {
+                               (void)mbtowc(NULL, NULL, MB_CUR_MAX);
+                               wc = L'?';
+                               len = 1;
+                       }
+                       str += len;
+               } while (*str != '\0' && iswblank(wc));
+
+               /* Skip one field. */
+               while (*str != '\0' && !iswblank(wc)) {
+                       if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) {
+                               (void)mbtowc(NULL, NULL, MB_CUR_MAX);
+                               wc = L'?';
+                               len = 1;
+                       }
+                       str += len;
+               }
        }
-       for (nchars = numchars; nchars-- && *str && *str != '\n'; ++str)
-               ;
+
+       /* Skip some additional characters. */
+       for (nchars = numchars; nchars-- && *str != '\0'; str += len)
+               if ((len = mblen(str, MB_CUR_MAX)) == -1)
+                       len = 1;
+
        return (str);
 }
 

Reply via email to