Commit: patch 9.1.1258: regexp: max \U and \%U value is limited by INT_MAX

Christian Brabandt Sat, 29 Mar 2025 01:15:35 -0700

patch 9.1.1258: regexp: max \U and \%U value is limited by INT_MAX

Commit: 
https://github.com/vim/vim/commit/f2b16986a194ab839c5a23bd7fe904f9fae1526f
Author: Christian Brabandt <[email protected]>
Date:   Sat Mar 29 09:08:58 2025 +0100


    patch 9.1.1258: regexp: max \U and \%U value is limited by INT_MAX
    
    Problem:  regexp: max \U and \%U value is limited by INT_MAX but gives a
              confusing error message (related: v8.1.0985).
    Solution: give a better error message when the value reaches INT_MAX
    
    When searching, Vim allows up to 8 hex characters to be given using the
    /\U and /\%U regex atoms.  However, when using "/\UFFFFFFFF" the code
    point is already above what an integer variable can hold, which is
    2,147,483,647.
    
    Since patch v8.1.0985, Vim already limited the max codepoint to INT_MAX
    (otherwise it caused a crash in the nfa regex engine), but instead of
    erroring out it silently fell back to parsing the number as a backslash
    value and not as a codepoint value, and as such "/[\UFFFFFFFF]" will
    happily find a "\" or a literal "F".  And "/[\d127-\UFFFFFFFF]"
    will error out with "reverse range in character class".
    
    Interestingly, the max Unicode codepoint value is U+10FFFF, which still
    fits into an ordinary integer value.  This means that we don't even
    need to parse 8 hex characters; 6 would have been enough.
    
    However, let's not limit Vim to search for only max 6 hex characters
    (which would be a backward incompatible change), but instead allow all 8
    characters and only if the codepoint reaches INT_MAX, give a more
    precise error message (about what the max Unicode codepoint value is).
    This allows searching for "/[\U7FFFFFFE]" (which will likely return
    "E486: Pattern not found"), while "/[\U7FFFFFFF]" now errors with
    "E1541: Value too large, max Unicode codepoint is U+10FFFF".
    
    While this change is straightforward on architectures where long is 8
    bytes, this is not so simple on Windows or 32bit architectures where long
    is 4 bytes (and therefore the test fails there).  To account for that,
    let's make use of the vimlong_T number type and make a few corresponding
    changes in the regex engine code and cast the value to the expected data
    type. This however may not work correctly on systems that don't have
    the long long datatype (e.g. OpenVMS) and probably the test will fail
    there.
    
    fixes: #16949
    closes: #16994
    
    Signed-off-by: Christian Brabandt <[email protected]>

diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
index f11040c4e..857a3e648 100644
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -1,4 +1,4 @@
-*pattern.txt*   For Vim version 9.1.  Last change: 2025 Mar 21
+*pattern.txt*   For Vim version 9.1.  Last change: 2025 Mar 28
 
 
                  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -1222,7 +1222,8 @@ x A single character, with no special meaning, matches 
itself
                \o40    octal number of character up to 0o377
                \x20    hexadecimal number of character up to 0xff
                \u20AC  hex. number of multibyte character up to 0xffff
-               \U1234  hex. number of multibyte character up to 0xffffffff
+               \U1234  hex. number of multibyte character up to 8 characters
+                       0xffffffff |E1541|
          NOTE: The other backslash codes mentioned above do not work inside
          []!
        - Matching with a collection can be slow, because each character in
@@ -1263,7 +1264,8 @@ x A single character, with no special meaning, matches 
itself
 \%u20AC        Matches the character specified with up to four hexadecimal
        characters.
 \%U1234abcd    Matches the character specified with up to eight hexadecimal
-       characters, up to 0x7fffffff
+       characters, up to 0x7fffffff (the maximum allowed value is INT_MAX
+       |E1541|, but the maximum valid Unicode codepoint is U+10FFFF).
 
 ==============================================================================
 7. Ignoring case in a pattern                                  */ignorecase*
diff --git a/runtime/doc/tags b/runtime/doc/tags
index 75b00aae1..7d54ee9b8 100644
--- a/runtime/doc/tags
+++ b/runtime/doc/tags
@@ -4621,6 +4621,7 @@ E1538     eval.txt        /*E1538*
 E1539  vim9.txt        /*E1539*
 E154   helphelp.txt    /*E154*
 E1540  eval.txt        /*E1540*
+E1541  vi_diff.txt     /*E1541*
 E155   sign.txt        /*E155*
 E156   sign.txt        /*E156*
 E157   sign.txt        /*E157*
diff --git a/runtime/doc/vi_diff.txt b/runtime/doc/vi_diff.txt
index b96f77907..46db57a45 100644
--- a/runtime/doc/vi_diff.txt
+++ b/runtime/doc/vi_diff.txt
@@ -1,4 +1,4 @@
-*vi_diff.txt*   For Vim version 9.1.  Last change: 2024 Nov 10
+*vi_diff.txt*   For Vim version 9.1.  Last change: 2025 Mar 28
 
 
                  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -91,8 +91,11 @@ Maximum display width           Unix and Win32: 1024 
characters, otherwise 255
 Maximum lhs of a mapping   50 characters.
 Number of different highlighting types: over 30000
 Range of a Number variable:  -2147483648 to 2147483647 (might be more on 64
-                          bit systems)
+                          bit systems)  See also: |v:numbermax|,
+                          |v:numbermin| and |v:numbersize|
 Maximum length of a line in a tags file: 512 bytes.
+                                                       *E1541*
+Maximum value for |/\U| and |/\%U|: 2147483647 (for 32bit integer).
 
 Information for undo and text in registers is kept in memory, thus when making
 (big) changes the amount of (virtual) memory available limits the number of
diff --git a/src/errors.h b/src/errors.h
index 9331484ac..6e2782df5 100644
--- a/src/errors.h
+++ b/src/errors.h
@@ -3716,3 +3716,5 @@ EXTERN char e_variadic_tuple_must_end_with_list_type_str[]
 EXTERN char e_cannot_use_variadic_tuple_in_concatenation[]
        INIT(= N_("E1540: Cannot use a variadic tuple in concatenation"));
 #endif
+EXTERN char e_unicode_val_too_large[]
+       INIT(= N_("E1541: Value too large, max Unicode codepoint is U+10FFFF"));
diff --git a/src/regexp.c b/src/regexp.c
index ea6079b00..32a721f9f 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -427,9 +427,9 @@ static void skipchr_keepstart(void);
 static int     peekchr(void);
 static void    skipchr(void);
 static void    ungetchr(void);
-static long    gethexchrs(int maxinputlen);
+static vimlong_T       gethexchrs(int maxinputlen);
 static long    getoctchrs(void);
-static long    getdecchrs(void);
+static vimlong_T       getdecchrs(void);
 static int     coll_get_char(void);
 static int     prog_magic_wrong(void);
 static int     cstrncmp(char_u *s1, char_u *s2, int *n);
@@ -979,7 +979,7 @@ ungetchr(void)
  * The parameter controls the maximum number of input characters. This will be
  * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
  */
-    static long
+    static vimlong_T
 gethexchrs(int maxinputlen)
 {
     long_u     nr = 0;
@@ -998,14 +998,14 @@ gethexchrs(int maxinputlen)
 
     if (i == 0)
        return -1;
-    return (long)nr;
+    return nr;
 }
 
 /*
  * Get and return the value of the decimal string immediately after the
  * current position. Return -1 for invalid.  Consumes all digits.
  */
-    static long
+    static vimlong_T
 getdecchrs(void)
 {
     long_u     nr = 0;
@@ -1025,7 +1025,7 @@ getdecchrs(void)
 
     if (i == 0)
        return -1;
-    return (long)nr;
+    return nr;
 }
 
 /*
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 16dac730d..f4bd6c36d 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -1589,7 +1589,7 @@ regatom(int *flagp)
                case 'u':   // %uabcd hex 4
                case 'U':   // %U1234abcd hex 8
                          {
-                             long i;
+                             vimlong_T i;
 
                              switch (c)
                              {
@@ -1612,7 +1612,7 @@ regatom(int *flagp)
                              if (i == 0)
                                  regc(0x0a);
                              else
-                                 regmbc(i);
+                                 regmbc((int)i);
                              regc(NUL);
                              *flagp |= HASWIDTH;
                              break;
@@ -1831,6 +1831,10 @@ collection:
                                || *regparse == 'U')
                        {
                            startc = coll_get_char();
+                           // max UTF-8 Codepoint is U+10FFFF,
+                           // but allow values until INT_MAX
+                           if (startc == INT_MAX)
+                               EMSG_RET_NULL(_(e_unicode_val_too_large));
                            if (startc == 0)
                                regc(0x0a);
                            else
@@ -2131,7 +2135,7 @@ regpiece(int *flagp)
                int     lop = END;
                long    nr;
 
-               nr = getdecchrs();
+               nr = (long)getdecchrs();
                switch (no_Magic(getchr()))
                {
                    case '=': lop = MATCH; break;                 // \@=
@@ -2610,7 +2614,7 @@ vim_regcomp_had_eol(void)
     static int
 coll_get_char(void)
 {
-    long       nr = -1;
+    vimlong_T  nr = -1;
 
     switch (*regparse++)
     {
@@ -2620,13 +2624,15 @@ coll_get_char(void)
        case 'u': nr = gethexchrs(4); break;
        case 'U': nr = gethexchrs(8); break;
     }
-    if (nr < 0 || nr > INT_MAX)
+    if (nr < 0)
     {
        // If getting the number fails be backwards compatible: the character
        // is a backslash.
        --regparse;
        nr = '\';
     }
+    if (nr > INT_MAX)
+       nr = INT_MAX;
     return nr;
 }
 
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index 557d0e1aa..6ad682bcf 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1560,7 +1560,7 @@ nfa_regatom(void)
                case 'u':   // %uabcd hex 4
                case 'U':   // %U1234abcd hex 8
                    {
-                       long nr;
+                       vimlong_T nr;
 
                        switch (c)
                        {
@@ -1577,7 +1577,7 @@ nfa_regatom(void)
                                                       reg_magic == MAGIC_ALL);
                        // A NUL is stored in the text as NL
                        // TODO: what if a composing character follows?
-                       EMIT(nr == 0 ? 0x0a : nr);
+                       EMIT(nr == 0 ? 0x0a : (long)nr);
                    }
                    break;
 
@@ -1953,6 +1953,10 @@ collection:
                            {
                                // TODO(RE) This needs more testing
                                startc = coll_get_char();
+                               // max UTF-8 Codepoint is U+10FFFF,
+                               // but allow values until INT_MAX
+                               if (startc == INT_MAX)
+                                   EMSG_RET_FAIL(_(e_unicode_val_too_large));
                                got_coll_char = TRUE;
                                MB_PTR_BACK(old_regparse, regparse);
                            }
@@ -2218,7 +2222,7 @@ nfa_regpiece(void)
            break;
 
        case Magic('@'):
-           c2 = getdecchrs();
+           c2 = (long)getdecchrs();
            op = no_Magic(getchr());
            i = 0;
            switch(op)
diff --git a/src/testdir/test_search.vim b/src/testdir/test_search.vim
index 708aca2a8..75291750f 100644
--- a/src/testdir/test_search.vim
+++ b/src/testdir/test_search.vim
@@ -1541,17 +1541,46 @@ func Test_large_hex_chars2()
   try
     /[\Ufffffc1f]
   catch
-    call assert_match('E486:', v:exception)
+    call assert_match('E1541:', v:exception)
   endtry
   try
     set re=1
     /[\Ufffffc1f]
   catch
-    call assert_match('E486:', v:exception)
+    call assert_match('E1541:', v:exception)
   endtry
   set re&
 endfunc
 
+func Test_large_hex_chars3()
+  " Validate max number of Unicode char
+  try
+    /[\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /[\UFFFFFFF]
+  catch
+    call assert_match('E486:', v:exception)
+  endtry
+  try
+    /\%#=2[\d32-\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /\%#=1[\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /\%#=1[\d32-\UFFFFFFFF]
+  catch
+    call assert_match('E945:', v:exception)
+  endtry
+endfunc
+
 func Test_one_error_msg()
   " This was also giving an internal error
   call assert_fails('call search(" \((\v[[=P=]]){185}+             ")', 
'E871:')
diff --git a/src/version.c b/src/version.c
index 7d6c7e3f9..22357447f 100644
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    1258,
 /**/
     1257,
 /**/

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion visit 
https://groups.google.com/d/msgid/vim_dev/E1tyRLO-004yJR-3i%40256bit.org.

Commit: patch 9.1.1258: regexp: max \U and \%U value is limited by INT_MAX

Raspunde prin e-mail lui