Hi,

as noticed by Matthew Martin, with this patch, fold(1) hangs when
the input contains a NUL byte.  The correct solution is to test
the return value of mbtowc(3) for "< 1" rather than "== -1",
treating NUL just like other non-printable C1 control characters.

I'm also adding a regression test for that:

  test_fold "-w 3" "1\000034" "1\00003\n4"

No other changes to the patch, updated patch appended below.

Thanks to Matthew for spotting this,
  Ingo


Index: usr.bin/fold/fold.1
===================================================================
RCS file: /cvs/src/usr.bin/fold/fold.1,v
retrieving revision 1.17
diff -u -p -r1.17 fold.1
--- usr.bin/fold/fold.1 5 Jan 2016 12:44:55 -0000       1.17
+++ usr.bin/fold/fold.1 22 May 2016 10:52:57 -0000
@@ -48,7 +48,7 @@ or the standard input if no files are sp
 breaking the lines to have a maximum of 80 display columns.
 .Pp
 The options are as follows:
-.Bl -tag -width Ds
+.Bl -tag -width 8n
 .It Fl b
 Count
 .Ar width
@@ -62,10 +62,31 @@ possible.
 .It Fl w Ar width
 Specifies a line width to use instead of the default of 80.
 .El
+.Pp
+Unless
+.Fl b
+is specified, a backspace character decrements the column position
+by one, a carriage return resets the column position to zero, and
+a tab advances the column position to the next multiple of eight.
+.Sh ENVIRONMENT
+.Bl -tag -width 8n
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 .
+It is used to decide which byte sequences form characters and what
+their display width is.
+If it is unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value, each byte except backspace, tab, newline,
+and carriage return is assumed to represent a character of display
+width 1.
+.El
 .Sh EXIT STATUS
 .Ex -std fold
 .Sh SEE ALSO
-.Xr expand 1
+.Xr expand 1 ,
+.Xr fmt 1
 .Sh STANDARDS
 The
 .Nm
@@ -100,15 +121,17 @@ rewrote the command in 1990, and
 .An J. T. Conklin
 added the missing options in 1993.
 .Sh BUGS
-If underlining (see
-.Xr ul 1 )
-is present it may be messed up by folding.
-.Pp
-.Ar width
-should be a multiple of 8 if tabs are present, or the tabs should
-be expanded using
-.Xr expand 1
-before using
-.Nm fold .
-.Pp
-Multibyte character support is missing.
+Traditional
+.Xr roff 7
+output semantics, implemented both by GNU nroff and by
+.Xr mandoc 1 ,
+only uses a single backspace for backing up the previous character,
+even for double-width characters.
+The
+.Nm
+backspace semantics required by POSIX mishandles such backspace-encoded
+sequences, breaking lines early.
+The
+.Xr fmt 1
+utility provides similar functionality and does not suffer from that
+problem, but isn't standardized by POSIX.
Index: usr.bin/fold/fold.c
===================================================================
RCS file: /cvs/src/usr.bin/fold/fold.c,v
retrieving revision 1.17
diff -u -p -r1.17 fold.c
--- usr.bin/fold/fold.c 9 Oct 2015 01:37:07 -0000       1.17
+++ usr.bin/fold/fold.c 22 May 2016 10:52:57 -0000
@@ -33,19 +33,22 @@
  * SUCH DAMAGE.
  */
 
+#include <ctype.h>
+#include <err.h>
+#include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <ctype.h>
-#include <err.h>
-#include <limits.h>
+#include <wchar.h>
 
 #define        DEFLINEWIDTH    80
 
 static void fold(unsigned int);
-static unsigned int new_column_position(unsigned int, int);
+static int isu8cont(unsigned char);
 static __dead void usage(void);
+
 int count_bytes = 0;
 int split_words = 0;
 
@@ -56,6 +59,8 @@ main(int argc, char *argv[])
        unsigned int width;
        const char *errstr;
 
+       setlocale(LC_CTYPE, "");
+
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
 
@@ -110,12 +115,11 @@ main(int argc, char *argv[])
                for (; *argv; ++argv) {
                        if (!freopen(*argv, "r", stdin))
                                err(1, "%s", *argv);
-                               /* NOTREACHED */
                        else
                                fold(width);
                }
        }
-       exit(0);
+       return 0;
 }
 
 /*
@@ -130,100 +134,131 @@ main(int argc, char *argv[])
  * returns embedded in the input stream.
  */
 static void
-fold(unsigned int width)
+fold(unsigned int max_width)
 {
-       static char *buf = NULL;
-       static int   buf_max = 0;
-       int ch;
-       unsigned int col, indx;
-
-       col = indx = 0;
-       while ((ch = getchar()) != EOF) {
-               if (ch == '\n') {
-                       if (indx != 0)
-                               fwrite(buf, 1, indx, stdout);
-                       putchar('\n');
-                       col = indx = 0;
-                       continue;
-               }
+       static char     *buf = NULL;
+       static size_t    bufsz = 2048;
+       char            *cp;    /* Current mb character. */
+       char            *np;    /* Next mb character. */
+       char            *sp;    /* To search for the last space. */
+       wchar_t          wc;    /* Current wide character. */
+       int              ch;    /* Last byte read. */
+       int              len;   /* Bytes in the current mb character. */
+       unsigned int     col;   /* Current display position. */
+       int              width; /* Display width of wc. */
+
+       if (buf == NULL && (buf = malloc(bufsz)) == NULL)
+               err(1, NULL);
+
+       np = cp = buf;
+       ch = 0;
+       col = 0;
+
+       while (ch != EOF) {  /* Loop on input characters. */
+               while ((ch = getchar()) != EOF) {  /* Loop on input bytes. */
+                       if (np + 1 == buf + bufsz) {
+                               buf = reallocarray(buf, 2, bufsz);
+                               if (buf == NULL)
+                                       err(1, NULL);
+                               bufsz *= 2;
+                       }
+                       *np++ = ch;
 
-               col = new_column_position(col, ch);
-               if (col > width) {
-                       unsigned int i, last_space;
-
-                       if (split_words) {
-                               for (i = 0, last_space = -1; i < indx; i++)
-                                       if(buf[i] == ' ')
-                                               last_space = i;
+                       /*
+                        * Read up to and including the first byte of
+                        * the next character, such that we are sure
+                        * to have a complete character in the buffer.
+                        * There is no need to read more than five bytes
+                        * ahead, since UTF-8 characters are four bytes
+                        * long at most.
+                        */
+
+                       if (np - cp > 4 || (np - cp > 1 && !isu8cont(ch)))
+                               break;
+               }
+
+               while (cp < np) {  /* Loop on output characters. */
+
+                       /* Handle end of line and backspace. */
+
+                       if (*cp == '\n' || (*cp == '\r' && !count_bytes)) {
+                               fwrite(buf, 1, ++cp - buf, stdout);
+                               memmove(buf, cp, np - cp);
+                               np = buf + (np - cp);
+                               cp = buf;
+                               col = 0;
+                               continue;
+                       }
+                       if (*cp == '\b' && !count_bytes) {
+                               if (col)
+                                       col--;
+                               cp++;
+                               continue;
                        }
 
-                       if (split_words && last_space != -1) {
-                               last_space++;
+                       /*
+                        * Measure display width.
+                        * Process the last byte only if
+                        * end of file was reached.
+                        */
+
+                       if (np - cp > (ch != EOF)) {
+                               len = 1;
+                               width = 1;
+
+                               if (*cp == '\t') {
+                                       if (count_bytes == 0)
+                                               width = 8 - (col & 7);
+                               } else if ((len = mbtowc(&wc, cp,
+                                   np - cp)) < 1)
+                                       len = 1;
+                               else if (count_bytes)
+                                       width = len;
+                               else if ((width = wcwidth(wc)) < 0)
+                                       width = 1;
+
+                               col += width;
+                               if (col <= max_width || cp == buf) {
+                                       cp += len;
+                                       continue;
+                               }
+                       }
 
-                               fwrite(buf, 1, last_space, stdout);
-                               memmove(buf, buf+last_space, indx-last_space);
+                       /* Line break required. */
 
-                               indx -= last_space;
-                               col = 0;
-                               for (i = 0; i < indx; i++) {
-                                       col = new_column_position(col, buf[i]);
+                       if (col > max_width) {
+                               if (split_words) {
+                                       for (sp = cp; sp > buf; sp--) {
+                                               if (sp[-1] == ' ') {
+                                                       cp = sp;
+                                                       break;
+                                               }
+                                       }
                                }
-                       } else {
-                               fwrite(buf, 1, indx, stdout);
-                               col = indx = 0;
+                               fwrite(buf, 1, cp - buf, stdout);
+                               putchar('\n');
+                               memmove(buf, cp, np - cp);
+                               np = buf + (np - cp);
+                               cp = buf;
+                               col = 0;
+                               continue;
                        }
-                       putchar('\n');
 
-                       /* calculate the column position for the next line. */
-                       col = new_column_position(col, ch);
-               }
+                       /* Need more input. */
 
-               if (indx + 1 > buf_max) {
-                       int newmax = buf_max + 2048;
-                       char *newbuf;
-
-                       /* Allocate buffer in LINE_MAX increments */
-                       if ((newbuf = realloc(buf, newmax)) == NULL) {
-                               err(1, NULL);
-                               /* NOTREACHED */
-                       }
-                       buf = newbuf;
-                       buf_max = newmax;
+                       break;
                }
-               buf[indx++] = ch;
        }
+       fwrite(buf, 1, np - buf, stdout);
 
-       if (indx != 0)
-               fwrite(buf, 1, indx, stdout);
+       if (ferror(stdin))
+               err(1, NULL);
 }
 
-/*
- * calculate the column position 
- */
-static unsigned int
-new_column_position(unsigned int col, int ch)
+static int
+isu8cont(unsigned char c)
 {
-       if (!count_bytes) {
-               switch (ch) {
-               case '\b':
-                       if (col > 0)
-                               --col;
-                       break;
-               case '\r':
-                       col = 0;
-                       break;
-               case '\t':
-                       col = (col + 8) & ~7;
-                       break;
-               default:
-                       ++col;
-                       break;
-               }
-       } else {
-               ++col;
-       }
-
-       return col;
+       return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80;
 }
 
 static __dead void
Index: regress/usr.bin/fold/fold.sh
===================================================================
RCS file: /cvs/src/regress/usr.bin/fold/fold.sh,v
retrieving revision 1.1
diff -u -p -r1.1 fold.sh
--- regress/usr.bin/fold/fold.sh        3 May 2016 16:06:11 -0000       1.1
+++ regress/usr.bin/fold/fold.sh        22 May 2016 10:52:57 -0000
@@ -14,11 +14,18 @@
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
+FOLD=/usr/bin/fold
+
+# Arguments of the test function:
+# 1. command line arguments for fold(1)
+# 2. standard input for fold, backslash-encoded
+# 3. expected standard output, backslash-encoded
+# 4. expected standard output of "fold -b", backslash-encoded
+#    (optional, by default the same as argument 3.)
 test_fold()
 {
        expect=`echo -n "$3" ; echo .`
-    if [ $SKIPUTF8 -eq 0 ]; then
-       result=`echo -n "$2" | fold $1 2>&1 ; echo .`
+       result=`echo -n "$2" | $FOLD $1 2>&1 ; echo .`
        if [ "$result" != "$expect" ]; then
                echo "fold $1 \"$2\":"
                echo -n "$2" | hexdump -C
@@ -28,9 +35,8 @@ test_fold()
                echo -n "$result" | hexdump -C
                exit 1
        fi
-    fi
        [ -n "$4" ] && expect=`echo -n "$4" ; echo .`
-       result=`echo -n "$2" | fold -b $1 2>&1 ; echo .`
+       result=`echo -n "$2" | $FOLD -b $1 2>&1 ; echo .`
        if [ "$result" != "$expect" ]; then
                echo "fold -b $1 \"$2\":"
                echo -n "$2" | hexdump -C
@@ -44,17 +50,21 @@ test_fold()
 
 export LC_ALL=C
 
-SKIPUTF8=0
-
 test_fold "" "" ""
+
+# newline
 test_fold "" "\n" "\n"
 test_fold "" "\n\n" "\n\n"
 test_fold "-w 1" "\n\n" "\n\n"
+test_fold "-w 2" "1\n12\n123" "1\n12\n12\n3"
+test_fold "-w 2" "12345" "12\n34\n5"
+test_fold "-w 2" "12345\n" "12\n34\n5\n"
 
 # backspace
 test_fold "-w 2" "123" "12\n3" 
 test_fold "-w 2" "1\b234" "1\b23\n4" "1\b\n23\n4"
 test_fold "-w 2" "\b1234" "\b12\n34" "\b1\n23\n4"
+test_fold "-w 2" "12\b\b345" "12\b\b34\n5" "12\n\b\b\n34\n5"
 test_fold "-w 2" "12\r3" "12\r3" "12\n\r3"
 
 # tabulator
@@ -66,20 +76,35 @@ test_fold "-w 9" "1\t9\b\b89012" "1\t9\b
 test_fold "-sw 4" "1 23 45" "1 \n23 \n45"
 test_fold "-sw 3" "1234 56" "123\n4 \n56"
 
-export LC_ALL=en_US.UTF-8
-
 # invalid characters
 test_fold "-w 3" "1\037734" "1\03773\n4"
 test_fold "-w 3" "1\000734" "1\00073\n4"
+test_fold "-w 3" "1\000034" "1\00003\n4"
 
-SKIPUTF8=1
+export LC_ALL=en_US.UTF-8
 
 # double width characters
 test_fold "-w 4" "1\0343\0201\020145" "1\0343\0201\02014\n5" \
                "1\0343\0201\0201\n45"
+test_fold "-w 3" "\0343\0201\0201\0343\0201\020134" \
+               "\0343\0201\0201\n\0343\0201\02013\n4" \
+               "\0343\0201\0201\n\0343\0201\0201\n34"
+test_fold "-w 2" "\0343\0201\0201\b23" "\0343\0201\0201\b2\n3" \
+               "\0343\0201\0201\n\b2\n3"
+test_fold "-w 1" "1\0343\0201\02014" "1\n\0343\0201\0201\n4"
 
 # zero width characters
-test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\0314\n\020034"
+test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\n\0314\02003\n4"
 test_fold "-w 2" "1a\0314\02003" "1a\0314\0200\n3" "1a\n\0314\0200\n3"
+
+# four byte UTF-8 encoding
+test_fold "-w 3" "1\0360\0220\0200\020034" "1\0360\0220\0200\02003\n4" \
+               "1\n\0360\0220\0200\0200\n34"
+
+# invalid UTF-8
+test_fold "-w 3" "\0343\0201\0201\0201\0201\0201\0201\0201\n" \
+               "\0343\0201\0201\0201\n\0201\0201\0201\n\0201\n" \
+               "\0343\0201\0201\n\0201\0201\0201\n\0201\0201\n"
+test_fold "-w 2" "\0343\0343\0201\0201\n" "\0343\n\0343\0201\0201\n"
 
 exit 0

Reply via email to