[PATCH 4/4] dfa: do not match invalid UTF-8

Paul Eggert Tue, 17 Dec 2019 21:48:35 -0800

* lib/dfa.c (struct dfa): Grow utf8_anychar_classes member array
from 5 to 9 tokens; this is needed due to the changes to
add_utf8_anychar.
(charclass_index): 2nd arg is now pointer-to-const.
(add_utf8_anychar): Match only valid UTF-8 byte sequences
instead of allowing overlong encodings or surrogate halves.
---
 ChangeLog |   8 ++++
 lib/dfa.c | 138 ++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 105 insertions(+), 41 deletions(-)


diff --git a/ChangeLog b/ChangeLog
index 8d0595bbc..7988becbf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2019-12-17  Paul Eggert  <egg...@cs.ucla.edu>
 
+       dfa: do not match invalid UTF-8
+       * lib/dfa.c (struct dfa): Grow utf8_anychar_classes member array
+       from 5 to 9 tokens; this is needed due to the changes to
+       add_utf8_anychar.
+       (charclass_index): 2nd arg is now pointer-to-const.
+       (add_utf8_anychar): Match only valid UTF-8 byte sequences
+       instead of allowing overlong encodings or surrogate halves.
+
        dfa: simplify charclass by assuming C99
        * lib/dfa.c (CHARCLASS_WORD_BITS): Now always 64.
        (charclass_word): Now always uint_fast64_t.
diff --git a/lib/dfa.c b/lib/dfa.c
index 385125f52..8d3e01c2e 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -454,7 +454,7 @@ struct dfa
   idx_t nregexps;              /* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;                   /* The DFA is fast.  */
-  token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
+  token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;               /* Multibyte conversion state.  */
 
   /* The following are valid only if MB_CUR_MAX > 1.  */
@@ -838,7 +838,7 @@ maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static idx_t
-charclass_index (struct dfa *d, charclass *s)
+charclass_index (struct dfa *d, charclass const *s)
 {
   idx_t i;
 
@@ -1669,55 +1669,111 @@ addtok_wc (struct dfa *dfa, wint_t wc)
 static void
 add_utf8_anychar (struct dfa *dfa)
 {
-  static charclass const utf8_classes[5] = {
-    /* 80-bf: non-leading bytes.  */
-    CHARCLASS_INIT (0, 0, 0xffffffffffffffff, 0),
-
-    /* 00-7f: 1-byte sequence.  */
+  /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
+     UTF-8 byte sequence has been defined as follows:
+
+     ([\x00-\x7f]
+     |[\xc2-\xdf][\x80-\xbf]
+     |[\xe0][\xa0-\xbf][\x80-\xbf]
+     |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
+     |[\xed][\x80-\x9f][\x80-\xbf]
+     |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
+     |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
+     |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
+
+     which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
+     where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
+     D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
+     H = [\x80-\x9f], I = [\xf0],
+     J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
+
+     This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
+
+  /* Mnemonics for classes containing two or more bytes.  */
+  enum { A, B, C, E, F, H, J, K, M };
+
+  /* Mnemonics for single-byte tokens.  */
+  enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
+
+  static charclass const utf8_classes[] = {
+    /* A. 00-7f: 1-byte sequence.  */
     CHARCLASS_INIT (0xffffffffffffffff, 0xffffffffffffffff, 0, 0),
 
-    /* c2-df: 2-byte sequence.  */
+    /* B. c2-df: 1st byte of a 2-byte sequence.  */
     CHARCLASS_INIT (0, 0, 0, 0x00000000fffffffc),
 
-    /* e0-ef: 3-byte sequence.  */
-    CHARCLASS_INIT (0, 0, 0, 0x0000ffff00000000),
+    /* C. 80-bf: non-leading bytes.  */
+    CHARCLASS_INIT (0, 0, 0xffffffffffffffff, 0),
 
-    /* f0-f7: 4-byte sequence.  */
-    CHARCLASS_INIT (0, 0, 0, 0x00ff000000000000)
-  };
-  int n = sizeof utf8_classes / sizeof *utf8_classes;
+    /* D. e0 (just a token).  */
 
-  /* Define the five character classes that are needed below.  */
-  if (dfa->utf8_anychar_classes[0] == 0)
-    for (int i = 0; i < n; i++)
-      {
-        charclass c = utf8_classes[i];
-        if (i == 1)
-          {
-            if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
-              clrbit ('\n', &c);
-            if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
-              clrbit ('\0', &c);
-          }
-        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
-      }
+    /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0xffffffff00000000, 0),
+
+    /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0, 0x0000dffe00000000),
 
-  /* A valid UTF-8 character is
+    /* G. ed (just a token).  */
 
-     ([0x00-0x7f]
-     |[0xc2-0xdf][0x80-0xbf]
-     |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
-     |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
+    /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0x000000000000ffff, 0),
+
+    /* I. f0 (just a token).  */
+
+    /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0xffffffffffff0000, 0),
+
+    /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0, 0x000e000000000000),
+
+    /* L. f4 (just a token).  */
+
+    /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
+    CHARCLASS_INIT (0, 0, 0x00000000000000ff, 0),
+  };
+
+  /* Define the character classes that are needed below.  */
+  if (dfa->utf8_anychar_classes[0] == 0)
+    {
+      charclass c = utf8_classes[0];
+      if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+        clrbit ('\n', &c);
+      if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+        clrbit ('\0', &c);
+      dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
+
+      for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
+        dfa->utf8_anychar_classes[i]
+          = CSET + charclass_index (dfa, &utf8_classes[i]);
+    }
 
-     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
-     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
-     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
-  int i;
-  for (i = 1; i < n; i++)
-    addtok (dfa, dfa->utf8_anychar_classes[i]);
-  while (--i > 1)
+  /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
+     The token buffer is in reverse Polish order, so we get
+     "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
+      C CAT OR C CAT OR C CAT OR".  */
+  addtok (dfa, dfa->utf8_anychar_classes[A]);
+  addtok (dfa, dfa->utf8_anychar_classes[B]);
+  addtok (dfa, D_token);
+  addtok (dfa, dfa->utf8_anychar_classes[E]);
+  addtok (dfa, CAT);
+  addtok (dfa, OR);
+  addtok (dfa, G_token);
+  addtok (dfa, dfa->utf8_anychar_classes[H]);
+  addtok (dfa, CAT);
+  addtok (dfa, OR);
+  addtok (dfa, dfa->utf8_anychar_classes[F]);
+  addtok (dfa, I_token);
+  addtok (dfa, dfa->utf8_anychar_classes[J]);
+  addtok (dfa, CAT);
+  addtok (dfa, OR);
+  addtok (dfa, L_token);
+  addtok (dfa, dfa->utf8_anychar_classes[M]);
+  addtok (dfa, CAT);
+  addtok (dfa, OR);
+  addtok (dfa, dfa->utf8_anychar_classes[K]);
+  for (int i = 0; i < 3; i++)
     {
-      addtok (dfa, dfa->utf8_anychar_classes[0]);
+      addtok (dfa, dfa->utf8_anychar_classes[C]);
       addtok (dfa, CAT);
       addtok (dfa, OR);
     }
-- 
2.17.1

[PATCH 4/4] dfa: do not match invalid UTF-8

Reply via email to