Pádraig Brady wrote:
> I've also attached an alternative patch for df (in your name).

That still has problems, since it can generate improperly encoded strings in UTF-8 locales (if the inputs are improperly encoded), and can replace parts of multibyte characters with '?' in non-UTF-8 locales. Please try the attached patch instead, which attempts to address these issues. This is more along the lines of what Bruno suggested, except that it doesn't use mbsiter, as I figured it was simpler overall to use mbrtowc directly for this one thing.
From 17a1a37549344cdfd95cc84b1848dafa256be5a0 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 22 Jul 2018 08:09:01 -0700
Subject: [PATCH] df: avoid multibyte character corruption on macOS

Problem reported by Chih-Hsuan Yen (Bug#32236).
* NEWS: Mention the bug fix.
* src/df.c: Include wchar.h and wctype.h instead of mbswidth.h.
(hide_problematic_chars): Return number of screen columns.
All callers changed.  Use iswcntrl, not iscntrl.
(get_header, get_dev): Rely on hide_problematic_chars width,
not mbswidth.  Scan the cell once, instead of two or three times.
---
 NEWS     |  4 ++++
 src/df.c | 46 +++++++++++++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/NEWS b/NEWS
index af1a990..aa3b4f9 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Bug fixes
+
+  df no longer corrupts displayed multibyte characters on macOS.
+
 
 * Noteworthy changes in release 8.30 (2018-07-01) [stable]
 
diff --git a/src/df.c b/src/df.c
index 1178865..664b88b 100644
--- a/src/df.c
+++ b/src/df.c
@@ -23,6 +23,8 @@
 #include <sys/types.h>
 #include <getopt.h>
 #include <assert.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "system.h"
 #include "canonicalize.h"
@@ -31,7 +33,6 @@
 #include "fsusage.h"
 #include "human.h"
 #include "mbsalign.h"
-#include "mbswidth.h"
 #include "mountlist.h"
 #include "quote.h"
 #include "find-mount-point.h"
@@ -272,20 +273,36 @@ static struct option const long_options[] =
 };
 
 /* Replace problematic chars with '?'.
-   Since only control characters are currently considered,
-   this should work in all encodings.  */
+   Return the number of screen columns.  */
 
-static char*
+static size_t
 hide_problematic_chars (char *cell)
 {
-  char *p = cell;
-  while (*p)
+  char *srcend = cell + strlen (cell);
+  char *dst = cell;
+  mbstate_t mbstate = { 0, };
+  size_t n;
+  size_t width = 0;
+
+  for (char *src = cell; src != srcend; src += n)
     {
-      if (iscntrl (to_uchar (*p)))
-        *p = '?';
-      p++;
+      wchar_t wc;
+      n = mbrtowc (&wc, src, srcend - src, &mbstate);
+      if (n < (size_t) -2 && !iswcntrl (wc))
+        {
+          memcpy (dst, src, n);
+          dst += n;
+        }
+      else
+        {
+          *dst++ = '?';
+          memset (&mbstate, 0, sizeof mbstate);
+        }
+      width++;
     }
-  return cell;
+
+  *dst = '\0';
+  return width;
 }
 
 /* Dynamically allocate a row of pointers in TABLE, which
@@ -569,11 +586,10 @@ get_header (void)
       if (!cell)
         xalloc_die ();
 
-      hide_problematic_chars (cell);
-
       table[nrows - 1][col] = cell;
 
-      columns[col]->width = MAX (columns[col]->width, mbswidth (cell, 0));
+      size_t cell_width = hide_problematic_chars (cell);
+      columns[col]->width = MAX (columns[col]->width, cell_width);
     }
 }
 
@@ -1182,8 +1198,8 @@ get_dev (char const *disk, char const *mount_point, char const* file,
       if (!cell)
         assert (!"empty cell");
 
-      hide_problematic_chars (cell);
-      columns[col]->width = MAX (columns[col]->width, mbswidth (cell, 0));
+      size_t cell_width = hide_problematic_chars (cell);
+      columns[col]->width = MAX (columns[col]->width, cell_width);
       table[nrows - 1][col] = cell;
     }
   free (dev_name);
-- 
2.7.4

Reply via email to