Hi, our wc(1) utility currently violates POSIX in two ways:
1. The -m option counts bytes instead of characters. The patch given below fixes that. 2. Word counting with -w only treats ASCII whitespace as word boundaries and regards two words joined by non-ASCII whitespace as one single word. The second issue is not related to UTF-8, but a matter of full Unicode support. It would not be hard to fix that by using mbtowc(3) and iswblank(3) instead of mblen(3). However, i don't think we want to pollute our base system tools with functions requiring full Unicode support, not even to the extent available in our own C library. So i consider iswblank(3) taboo for now. A few notes about the patch: * As usual, reduce the ridiculous setlocale(LC_ALL, "") to what is actually needed, setlocale(LC_CTYPE, ""). * As usual, -m only differs from -c if LC_CTYPE is set to a multibyte encoding. * In the case /* Do it the hard way... */, we need to switch from read(2) to getline(3) because read(2) might chop multibyte characters to pieces. That doesn't affect memory consumption of "wc -l" or "wc -c", not even for huge binary files without newline characters. It does increase memory consumption for files with very long lines when -w or -m is requested - but that's not a problem because both only make sense with real text, and real text does not have lines of a length that getline(3) is unable to handle. OK? Ingo Index: wc.1 =================================================================== RCS file: /cvs/src/usr.bin/wc/wc.1,v retrieving revision 1.25 diff -u -p -r1.25 wc.1 --- wc.1 21 Apr 2015 10:46:48 -0000 1.25 +++ wc.1 29 Nov 2015 16:34:28 -0000 @@ -72,9 +72,7 @@ using powers of 2 for sizes (K=1024, M=1 The number of lines in each input file is written to the standard output. .It Fl m -Intended to count characters instead of bytes; -currently an alias for -.Fl c . +Count characters instead of bytes. .It Fl w The number of words in each input file is written to the standard output. 
@@ -111,7 +109,8 @@ The .Nm utility is compliant with the .St -p1003.1-2008 -specification, except that it ignores the locale. +specification, except that it recognizes word boundaries only at ASCII +whitespace. .Pp The flag .Op Fl h @@ -121,7 +120,16 @@ A .Nm utility appeared in .At v1 . -.Sh BUGS +.Sh CAVEATS The .Fl m -option counts bytes instead of characters. +option depends on the character set +.Xr locale 1 . +If +.Ev LC_CTYPE +is set to +.Qq C +or +.Qq POSIX , +it has the same effect as +.Fl c . Index: wc.c =================================================================== RCS file: /cvs/src/usr.bin/wc/wc.c,v retrieving revision 1.19 diff -u -p -r1.19 wc.c --- wc.c 9 Oct 2015 01:37:09 -0000 1.19 +++ wc.c 29 Nov 2015 16:34:28 -0000 @@ -42,7 +42,7 @@ #include <util.h> int64_t tlinect, twordct, tcharct; -int doline, doword, dochar, humanchar; +int doline, doword, dochar, humanchar, multibyte; int rval; extern char *__progname; @@ -55,7 +55,7 @@ main(int argc, char *argv[]) { int ch; - setlocale(LC_ALL, ""); + setlocale(LC_CTYPE, ""); if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); @@ -68,8 +68,11 @@ main(int argc, char *argv[]) case 'w': doword = 1; break; - case 'c': case 'm': + if (MB_CUR_MAX > 1) + multibyte = 1; + /* FALLTHROUGH */ + case 'c': dochar = 1; break; case 'h': @@ -112,15 +115,19 @@ main(int argc, char *argv[]) void cnt(char *file) { + static char *buf; + static ssize_t bufsz; + + FILE *stream; u_char *C; short gotsp; - int len; + ssize_t len; int64_t linect, wordct, charct; struct stat sbuf; int fd; - u_char buf[MAXBSIZE]; linect = wordct = charct = 0; + stream = NULL; if (file) { if ((fd = open(file, O_RDONLY, 0)) < 0) { warn("%s", file); @@ -131,7 +138,10 @@ cnt(char *file) fd = STDIN_FILENO; } - if (!doword) { + if (!doword && !multibyte) { + if (bufsz < MAXBSIZE && + (buf = realloc(buf, MAXBSIZE)) == NULL) + err(1, NULL); /* * Line counting is split out because it's a lot * faster to get lines than to get words, since @@ -178,16 
+188,25 @@ cnt(char *file) } } } else { + if (file == NULL) + stream = stdin; + else if ((stream = fdopen(fd, "r")) == NULL) { + warn("%s", file); + close(fd); + rval = 1; + return; + } + /* Do it the hard way... */ gotsp = 1; - while ((len = read(fd, buf, MAXBSIZE)) > 0) { - /* - * This loses in the presence of multi-byte characters. - * To do it right would require a function to return a - * character while knowing how many bytes it consumed. - */ - charct += len; - for (C = buf; len--; ++C) { + while ((len = getline(&buf, &bufsz, stream)) > 0) { + for (C = buf; *C != '\0'; ++C) { + ++charct; + /* + * XXX For now, we don't want full + * Unicode support. Only treat ASCII + * whitespace as whitespace. + */ if (isspace(*C)) { gotsp = 1; if (*C == '\n') @@ -205,10 +224,13 @@ cnt(char *file) gotsp = 0; ++wordct; } + if (multibyte && + (len = mblen(C, MB_CUR_MAX)) > 1) + C += len - 1; } } } - if (len == -1) { + if (ferror(stream)) { warn("%s", file); rval = 1; } @@ -224,7 +246,7 @@ cnt(char *file) twordct += wordct; tcharct += charct; - if (close(fd) != 0) { + if ((stream == NULL ? close(fd) : fclose(stream)) != 0) { warn("%s", file); rval = 1; }