https://gcc.gnu.org/g:f31d49d65412d03d2cf91dd3b8b7281815c6d03b

commit r15-7089-gf31d49d65412d03d2cf91dd3b8b7281815c6d03b
Author: Jakub Jelinek <ja...@redhat.com>
Date:   Tue Jan 21 09:15:53 2025 +0100

    c++: Speed up compilation of large char array initializers when not using #embed
    
    The following patch (again, on top of the #embed patchset)
    attempts to optimize compilation of large {{{,un}signed ,}char,std::byte}
    array initializers when not using #embed in the source.
    
    Unlike the C patch which is done during the parsing of initializers this
    is done when lexing tokens into an array, because C++ lexes all tokens
    upfront and so by the time we parse the initializers we already have 16
    bytes per token allocated (i.e. 32 extra compile time memory bytes per
    one byte in the array).
    
    The drawback is again that it can result in worse locations for diagnostics
    (-Wnarrowing, -Wconversion) when initializing signed char arrays with values
    128..255.  Not really sure what to do about this, though; unlike the C case,
    the locations would need to be preserved through reshape_init* and perhaps
    till template instantiation.
    For #embed, there is just a single location_t (could be range of the
    directive), for diagnostics perhaps we could extend it to say byte xyz of
    the file embedded here or something like that, but for the optimization done
    by this patch, either we'd need to bump the minimum limit at which to try it,
    or say temporarily allocate a location_t array for each byte and then clear
    it when we no longer need it or something.
    I've been using the same testcases as for C, with #embed of 100'000'000
    bytes:
    time ./cc1plus -quiet -O2 -o test4a.s2 test4a.c
    
    real    0m0.972s
    user    0m0.578s
    sys     0m0.195s
    with xxd -i alternative of the same data without this patch it consumed
    around 13.2GB of RAM and
    time ./cc1plus -quiet -O2 -o test4b.s4 test4b.c
    
    real    3m47.968s
    user    3m41.907s
    sys     0m5.015s
    and the same with this patch it consumed around 3.7GB of RAM and
    time ./cc1plus -quiet -O2 -o test4b.s3 test4b.c
    
    real    0m24.772s
    user    0m23.118s
    sys     0m1.495s
    
    2025-01-21  Jakub Jelinek  <ja...@redhat.com>
    
            * parser.cc (cp_lexer_new_main): Attempt to optimize large sequences
            of CPP_NUMBER with int type and values 0-255 separated by CPP_COMMA
            into CPP_EMBED with RAW_DATA_CST u.value.

Diff:
---
 gcc/cp/parser.cc | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index a8ac8af09550..37214dae5b11 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -735,6 +735,12 @@ cp_lexer_new_main (void)
   gcc_assert (!the_parser);
   the_parser = cp_parser_new (lexer);
 
+  unsigned raw_data_tokens = 0;
+  char *raw_data_buf = NULL;
+  const unsigned int raw_data_max_len
+    = 131072 - offsetof (struct tree_string, str) - 1;
+  const unsigned int raw_data_min_len = 128;
+
   /* Get the remaining tokens from the preprocessor.  */
   while (tok->type != CPP_EOF)
     {
@@ -743,6 +749,99 @@ cp_lexer_new_main (void)
        module_token_lang (tok->type, tok->keyword, tok->u.value,
                           tok->location, filter);
 
+      /* Attempt to optimize long lists of 0-255 integers
+        separated by commas into CPP_EMBED.
+        In particular, when we see
+        CPP_NUMBER CPP_COMMA ( CPP_NUMBER CPP_COMMA ){n} CPP_NUMBER
+        where n is in [raw_data_min_len, raw_data_max_len - 2]
+        and all CPP_NUMBER tokens have int type and value in [0, UCHAR_MAX]
+        it is changed into
+        CPP_NUMBER CPP_COMMA CPP_EMBED CPP_COMMA CPP_NUMBER.  */
+    recheck:
+      if (tok->type == CPP_NUMBER
+         && (raw_data_tokens & 1) == 0
+         && TREE_CODE (tok->u.value) == INTEGER_CST
+         && TREE_TYPE (tok->u.value) == integer_type_node
+         && !wi::neg_p (wi::to_wide (tok->u.value))
+         && wi::to_widest (tok->u.value) <= UCHAR_MAX
+         && raw_data_tokens < raw_data_max_len * 2)
+       {
+         raw_data_tokens++;
+         /* * 2 comes from each byte in the middle represented by 2 tokens,
+            CPP_NUMBER and CPP_COMMA, while + 3 stands for the
+            CPP_NUMBER CPP_COMMA at the start and CPP_NUMBER at the end.  */
+         if (raw_data_tokens >= raw_data_min_len * 2 + 3)
+           {
+             unsigned int len = lexer->buffer->length ();
+             unsigned int new_len;
+             if (raw_data_tokens == raw_data_min_len * 2 + 3)
+               {
+                 if (raw_data_buf == NULL)
+                   raw_data_buf = XNEWVEC (char, raw_data_max_len);
+                 for (unsigned i = len - raw_data_tokens, j = 0;
+                      i < len; i += 2, ++j)
+                   raw_data_buf[j]
+                     = (char) tree_to_uhwi ((*lexer->buffer)[i].u.value);
+                 /* + 5 stands for
+                    CPP_NUMBER CPP_COMMA CPP_EMBED CPP_COMMA CPP_NUMBER
+                    tokens that will replace the original raw_data_tokens
+                    tokens.  */
+                 new_len = len - raw_data_tokens + 5;
+               }
+             else
+               {
+                 raw_data_buf[raw_data_tokens / 2]
+                   = (char) tree_to_uhwi (tok->u.value);
+                 new_len = len - 2;
+               }
+             /* The last 2 tokens are always CPP_COMMA CPP_NUMBER.  */
+             (*lexer->buffer)[new_len - 2] = (*lexer->buffer)[len - 2];
+             (*lexer->buffer)[new_len - 1] = (*lexer->buffer)[len - 1];
+             lexer->buffer->truncate (new_len);
+           }
+       }
+      else if (tok->type == CPP_COMMA && (raw_data_tokens & 1) == 1)
+       raw_data_tokens++;
+      else if (raw_data_tokens >= raw_data_min_len * 2 + 3)
+       {
+         unsigned last_number = (raw_data_tokens & 1);
+         /* Index of the CPP_EMBED token.  From the above code, that
+            future CPP_EMBED slot is followed by at least CPP_COMMA CPP_NUMBER
+            tokens and if !last_number by another CPP_COMMA and then
+            by the current token which is either not a CPP_NUMBER (e.g.
+            often CPP_CLOSE_BRACE), or CPP_NUMBER with non-int type or with
+            value not in [0, UCHAR_MAX], or reaching the raw_data_max_len
+            limit.  So we need to subtract 1 (to get at the current token
+            index) plus 3 + !last_number to get at the CPP_EMBED index.  */
+         unsigned int idx = lexer->buffer->length () - 4 - !last_number;
+         if (!last_number)
+           --raw_data_tokens;
+         /* Number of bytes in the sequence, including the first and last
+            CPP_NUMBER.  Those two bytes are included in the underlying
+            STRING_CST but not in RAW_DATA_CST.  */
+         raw_data_tokens = raw_data_tokens / 2 + 1;
+         tree raw = make_node (RAW_DATA_CST);
+         TREE_TYPE (raw) = integer_type_node;
+         /* Minus the first and last bytes which have their own CPP_NUMBER
+            tokens.  */
+         RAW_DATA_LENGTH (raw) = raw_data_tokens - 2;
+         tree owner = build_string (raw_data_tokens, raw_data_buf);
+         TREE_TYPE (owner) = build_array_type_nelts (unsigned_char_type_node,
+                                                     raw_data_tokens);
+         RAW_DATA_OWNER (raw) = owner;
+         /* Skip over the first byte which has its own CPP_NUMBER token.  */
+         RAW_DATA_POINTER (raw) = TREE_STRING_POINTER (owner) + 1;
+         (*lexer->buffer)[idx].type = CPP_EMBED;
+         (*lexer->buffer)[idx].u.value = raw;
+         raw_data_tokens = 0;
+         goto recheck;
+       }
+      else if (raw_data_tokens)
+       {
+         raw_data_tokens = 0;
+         goto recheck;
+       }
+
       /* Check for early pragmas that need to be handled now.  */
       if (tok->type == CPP_PRAGMA_EOL)
        cp_lexer_handle_early_pragma (lexer);
@@ -751,6 +850,8 @@ cp_lexer_new_main (void)
       cp_lexer_get_preprocessor_token (C_LEX_STRING_NO_JOIN, tok);
     }
 
+  XDELETEVEC (raw_data_buf);
+
   lexer->next_token = lexer->buffer->address ();
   lexer->last_token = lexer->next_token
                       + lexer->buffer->length ()

Reply via email to