Hi!

Just to show what I'm working on on top of the already posted #embed
patches.
Working on the C FE only for now; the patch emits CPP_EMBED tokens
when preprocessing C (but still with the important simplification that
a CPP_EMBED token is always preceded by {CPP_NUMBER,CPP_EMBED} CPP_COMMA
and followed by CPP_COMMA {CPP_NUMBER,CPP_EMBED}; without that the FE code
would be significantly larger).

The patch introduces RAW_DATA_CST tree, which is shorthand for a (possibly huge)
sequence of INTEGER_CSTs in CONSTRUCTOR of ARRAY_TYPE.  TREE_TYPE of this
is the type of each element in the sequence (rather than say some array type
like in the STRING_CST case) and it intentionally doesn't own the underlying
data, but has (so far unused) RAW_DATA_OWNER subtree for the GC owner of
that data.  If RAW_DATA_OWNER is NULL, the data is owned directly by libcpp
buffers, but e.g. for #embed in headers compiled into PCH I think we'll need
to add a STRING_CST as the owner of the data.
This way, it is cheap to peel them off or split them apart (see e.g. the
testcases which create a possibly huge RAW_DATA_CST and then, using a
designated initializer, store some other element in the middle of it).
The new testcases already work with the patch; what is missing is handling
it where needed in the middle-end (surely needs to be handled e.g. in the
gimplifier if somebody uses #embed in automatic var initializers, in IPA-ICF
hashing, or when trying to extract a constant from some large array
initializer or LTO), plus decide what to do with CPP_EMBED in the FE (one
possibility is to keep it as CPP_EMBED only when peeking non-raw token
when some new parser flag is set and otherwise already during token peeking
split it off into individual CPP_NUMBER tokens, another possibility if
we manage to come up with all spots where CPP_EMBED can validly appear in
the grammar (my current understanding besides the initializers already
handled is in function arguments or attributes (both using
c_parser_expr_list, so that function could be taught to parse CPP_EMBED
into list of INTEGER_CSTs), in comma expressions (so teach
c_parser_expression about it, but in that case I wonder if we just don't
need to ensure correct warnings and that is it, because whether a comma
expression is something,somethingelse or
something,1,2,3,4,5,6,7,8,somethingelse
shouldn't be significantly different) and I think in OpenMP sizes clause
(but that could be changed to use c_parser_expr_list); for C++ I think
template arguments, or multi-dimensional array indices are other spots).

I don't have a bootstrapped compiler with this patch yet, but with an -O0
built cc1, compiling at -O0
unsigned char a[] = {
#embed "cc1plus"
};
where cc1plus is a 261M binary, takes under 20s for me (most time spent
in assemble_string).  If I preprocess the same with C++ (so that it
just expands it into a sequence of 261M numbers), I had to Ctrl-C the
compilation after a couple of minutes.  Obviously I'll do a proper
test with an optimized compiler later.

Anyway, thoughts on this before I spend too much time on it?

And while clang 19 now claims to have #embed implemented, most of the
testcases in the patch are miscompiled by it.

--- libcpp/files.cc.jj  2024-07-03 14:52:12.231817485 +0200
+++ libcpp/files.cc     2024-07-03 15:44:39.248913032 +0200
@@ -1241,7 +1241,10 @@ finish_embed (cpp_reader *pfile, _cpp_fi
     limit = params->limit;
 
   size_t embed_tokens = 0;
-  if (CPP_OPTION (pfile, directives_only) && limit >= 64)
+  if ((CPP_OPTION (pfile, directives_only)
+       || !CPP_OPTION (pfile, cplusplus))
+      && CPP_OPTION (pfile, lang) != CLK_ASM
+      && limit >= 64)
     embed_tokens = ((limit - 2) / INT_MAX) + (((limit - 2) % INT_MAX) != 0);
 
   size_t max = INTTYPE_MAXIMUM (size_t) / sizeof (cpp_token);
--- gcc/varasm.cc.jj    2024-05-07 18:10:10.674871087 +0200
+++ gcc/varasm.cc       2024-07-04 14:58:33.570465411 +0200
@@ -4875,6 +4875,7 @@ initializer_constant_valid_p_1 (tree val
     case FIXED_CST:
     case STRING_CST:
     case COMPLEX_CST:
+    case RAW_DATA_CST:
       return null_pointer_node;
 
     case ADDR_EXPR:
@@ -5468,6 +5469,9 @@ array_size_for_constructor (tree val)
     {
       if (TREE_CODE (index) == RANGE_EXPR)
        index = TREE_OPERAND (index, 1);
+      if (value && TREE_CODE (value) == RAW_DATA_CST)
+       index = size_binop (PLUS_EXPR, index,
+                           size_int (RAW_DATA_LENGTH (value) - 1));
       if (max_index == NULL_TREE || tree_int_cst_lt (max_index, index))
        max_index = index;
     }
@@ -5659,6 +5663,12 @@ output_constructor_regular_field (oc_loc
   /* Output the element's initial value.  */
   if (local->val == NULL_TREE)
     assemble_zeros (fieldsize);
+  else if (local->val && TREE_CODE (local->val) == RAW_DATA_CST)
+    {
+      fieldsize *= RAW_DATA_LENGTH (local->val);
+      assemble_string (RAW_DATA_POINTER (local->val),
+                      RAW_DATA_LENGTH (local->val));
+    }
   else
     fieldsize = output_constant (local->val, fieldsize, align2,
                                 local->reverse, false);
--- gcc/tree.h.jj       2024-06-05 19:09:54.046617006 +0200
+++ gcc/tree.h  2024-07-03 19:41:04.453201043 +0200
@@ -1165,6 +1165,14 @@ extern void omp_clause_range_check_faile
 #define TREE_STRING_POINTER(NODE) \
   ((const char *)(STRING_CST_CHECK (NODE)->string.str))
 
+/* In a RAW_DATA_CST */
+#define RAW_DATA_LENGTH(NODE) \
+  (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.length)
+#define RAW_DATA_POINTER(NODE) \
+  (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.str)
+#define RAW_DATA_OWNER(NODE) \
+  (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.owner)
+
 /* In a COMPLEX_CST node.  */
 #define TREE_REALPART(NODE) (COMPLEX_CST_CHECK (NODE)->complex.real)
 #define TREE_IMAGPART(NODE) (COMPLEX_CST_CHECK (NODE)->complex.imag)
--- gcc/expr.cc.jj      2024-07-01 11:28:22.704237981 +0200
+++ gcc/expr.cc 2024-07-04 14:58:33.576465331 +0200
@@ -7144,6 +7144,12 @@ categorize_ctor_elements_1 (const_tree c
          init_elts += mult * TREE_STRING_LENGTH (value);
          break;
 
+       case RAW_DATA_CST:
+         nz_elts += mult * RAW_DATA_LENGTH (value);
+         unique_nz_elts += RAW_DATA_LENGTH (value);
+         init_elts += mult * RAW_DATA_LENGTH (value);
+         break;
+
        case COMPLEX_CST:
          if (!initializer_zerop (TREE_REALPART (value)))
            {
--- gcc/tree-pretty-print.cc.jj 2024-06-14 19:45:09.446777591 +0200
+++ gcc/tree-pretty-print.cc    2024-07-04 14:58:33.571465397 +0200
@@ -2519,6 +2519,28 @@ dump_generic_node (pretty_printer *pp, t
       }
       break;
 
+    case RAW_DATA_CST:
+      for (unsigned i = 0; i < (unsigned) RAW_DATA_LENGTH (node); ++i)
+       {
+         if (TYPE_UNSIGNED (TREE_TYPE (node))
+             || TYPE_PRECISION (TREE_TYPE (node)) > CHAR_BIT)
+           pp_decimal_int (pp, ((const unsigned char *)
+                                RAW_DATA_POINTER (node))[i]);
+         else
+           pp_decimal_int (pp, ((const signed char *)
+                                RAW_DATA_POINTER (node))[i]);
+         if (i == RAW_DATA_LENGTH (node) - 1U)
+           break;
+         else if (i == 9 && RAW_DATA_LENGTH (node) > 20)
+           {
+             pp_string (pp, ", ..., ");
+             i = RAW_DATA_LENGTH (node) - 11;
+           }
+         else
+           pp_string (pp, ", ");
+       }
+      break;
+
     case FUNCTION_TYPE:
     case METHOD_TYPE:
       dump_generic_node (pp, TREE_TYPE (node), spc, flags, false);
--- gcc/tree.cc.jj      2024-07-01 11:28:23.495227837 +0200
+++ gcc/tree.cc 2024-07-04 14:58:33.563465503 +0200
@@ -513,6 +513,7 @@ tree_node_structure_for_code (enum tree_
     case STRING_CST:           return TS_STRING;
     case VECTOR_CST:           return TS_VECTOR;
     case VOID_CST:             return TS_TYPED;
+    case RAW_DATA_CST:         return TS_RAW_DATA_CST;
 
       /* tcc_exceptional cases.  */
     case BLOCK:                        return TS_BLOCK;
@@ -571,6 +572,7 @@ initialize_tree_contains_struct (void)
        case TS_FIXED_CST:
        case TS_VECTOR:
        case TS_STRING:
+       case TS_RAW_DATA_CST:
        case TS_COMPLEX:
        case TS_SSA_NAME:
        case TS_CONSTRUCTOR:
@@ -1026,6 +1028,7 @@ tree_code_size (enum tree_code code)
        case REAL_CST:          return sizeof (tree_real_cst);
        case FIXED_CST:         return sizeof (tree_fixed_cst);
        case COMPLEX_CST:       return sizeof (tree_complex);
+       case RAW_DATA_CST:      return sizeof (tree_raw_data);
        case VECTOR_CST:        gcc_unreachable ();
        case STRING_CST:        gcc_unreachable ();
        default:
@@ -10467,6 +10470,15 @@ initializer_zerop (const_tree init, bool
       *nonzero = true;
       return false;
 
+    case RAW_DATA_CST:
+      for (unsigned int i = 0; i < (unsigned int) RAW_DATA_LENGTH (init); ++i)
+       if (RAW_DATA_POINTER (init)[i])
+         {
+           *nonzero = true;
+           return false;
+         }
+      return true;
+
     case CONSTRUCTOR:
       {
        if (TREE_CLOBBER_P (init))
--- gcc/testsuite/c-c++-common/cpp/embed-19.c.jj        2024-07-05 11:30:09.333874817 +0200
+++ gcc/testsuite/c-c++-common/cpp/embed-19.c   2024-07-05 11:35:19.825724327 +0200
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "" } */
+/* { dg-additional-options "-std=c23" { target c } } */
+
+unsigned char a[] = {
+#embed __FILE__
+};
+struct S { unsigned char h[(sizeof (a) - 7) / 2]; short int i; unsigned char 
j[sizeof (a) - 7 - (sizeof (a) - 7) / 2]; };
+struct T { int a, b, c; struct S d; long long e; double f; long long g; };
+struct T b = {
+#embed __FILE__
+};
+
+int
+main ()
+{
+  if (b.a != a[0] || b.b != a[1] || b.c != a[2]
+      || __builtin_memcmp (b.d.h, a + 3, sizeof (b.d.h))
+      || b.d.i != a[3 + sizeof (b.d.h)]
+      || __builtin_memcmp (b.d.j, a + 4 + sizeof (b.d.h), sizeof (b.d.j))
+      || b.e != a[sizeof (a) - 3] || b.f != a[sizeof (a) - 2]
+      || b.g != a[sizeof (a) - 1])
+    __builtin_abort ();
+}
--- gcc/testsuite/gcc.dg/cpp/embed-8.c.jj       2024-07-05 13:37:25.289157048 +0200
+++ gcc/testsuite/gcc.dg/cpp/embed-8.c  2024-07-05 13:39:15.232694163 +0200
@@ -0,0 +1,7 @@
+/* This is a comment with some UTF-8 non-ASCII characters: áéíóú.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c23 -Wconversion" } */
+
+signed char a[] = {
+#embed __FILE__        /* { dg-warning "conversion from 'int' to 'signed char' 
changes value from '\[12]\[0-9]\[0-9]' to '-\[0-9]\[0-9]*'" } */
+};
--- gcc/testsuite/gcc.dg/cpp/embed-7.c.jj       2024-07-05 13:27:28.580097964 +0200
+++ gcc/testsuite/gcc.dg/cpp/embed-7.c  2024-07-05 13:36:04.728228965 +0200
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c23 -Woverride-init" } */
+
+unsigned char a[] = {
+#embed __FILE__
+};
+unsigned char b[] = {
+  [26] =
+#embed __FILE__
+};
+unsigned char c[] = {
+#embed __FILE__ suffix (,)
+  [sizeof (a) / 4] = 0,                /* { dg-warning "initialized field 
overwritten" } */
+  [sizeof (a) / 2] = 1,                /* { dg-warning "initialized field 
overwritten" } */
+  [1] = 2,                     /* { dg-warning "initialized field overwritten" 
} */
+  [sizeof (a) - 2] = 3         /* { dg-warning "initialized field overwritten" 
} */
+};
+unsigned char d[] = {
+  [1] = 4,
+  [26] = 5,
+  [sizeof (a) / 4] = 6,
+  [sizeof (a) / 2] = 7,
+  [sizeof (a) - 2] = 8,
+#embed __FILE__ prefix ([0] = )        /* { dg-warning "initialized field 
overwritten" } */
+};
+unsigned char e[] = {
+#embed __FILE__ suffix (,)
+  [2] = 9,                     /* { dg-warning "initialized field overwritten" 
} */
+  [sizeof (a) - 3] = 10                /* { dg-warning "initialized field 
overwritten" } */
+};
+unsigned char f[] = {
+  [23] = 11,
+  [sizeof (a) / 4 - 1] = 12,
+#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 - 1] = ) suffix (,)        
        /* { dg-warning "initialized field overwritten" } */
+#embed __FILE__ limit (130) prefix ([sizeof (a) / 4 - 2] = ) suffix (,)        
        /* { dg-warning "initialized field overwritten" } */
+#embed __FILE__ prefix ([sizeof (a) / 4 + 10] = ) suffix (,)                   
/* { dg-warning "initialized field overwritten" } */
+#embed __FILE__ limit (128) prefix ([sizeof (a) + sizeof (a) / 4 - 30] = ) 
suffix (,) /* { dg-warning "initialized field overwritten" } */
+#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 + 96] = ) suffix (,)       
/* { dg-warning "initialized field overwritten" } */
+};
--- gcc/testsuite/gcc.dg/cpp/embed-9.c.jj       2024-07-05 13:54:06.976828053 +0200
+++ gcc/testsuite/gcc.dg/cpp/embed-9.c  2024-07-05 13:53:54.994987508 +0200
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c23" } */
+
+struct __attribute__((designated_init)) S {
+  int a, b, c, d;
+  unsigned char e[128];
+};
+
+struct S s = { .a = 1, .b =
+#embed __FILE__ limit(128)     /* { dg-warning "positional initialization of 
field in 'struct' declared with 'designated_init' attribute" } */
+};                             /* { dg-message "near initialization" "" { 
target *-*-* } .-1 } */
--- gcc/testsuite/gcc.dg/cpp/embed-6.c.jj       2024-07-05 13:25:08.339965010 +0200
+++ gcc/testsuite/gcc.dg/cpp/embed-6.c  2024-07-05 13:24:03.036834399 +0200
@@ -0,0 +1,82 @@
+/* { dg-do run } */
+/* { dg-options "-std=c23" } */
+
+unsigned char a[] = {
+#embed __FILE__
+};
+unsigned char b[] = {
+  [26] =
+#embed __FILE__
+};
+unsigned char c[] = {
+#embed __FILE__ suffix (,)
+  [sizeof (a) / 4] = 0,
+  [sizeof (a) / 2] = 1,
+  [1] = 2,
+  [sizeof (a) - 2] = 3
+};
+unsigned char d[] = {
+  [1] = 4,
+  [26] = 5,
+  [sizeof (a) / 4] = 6,
+  [sizeof (a) / 2] = 7,
+  [sizeof (a) - 2] = 8,
+#embed __FILE__ prefix ([0] = )
+};
+unsigned char e[] = {
+#embed __FILE__ suffix (,)
+  [2] = 9,
+  [sizeof (a) - 3] = 10
+};
+unsigned char f[] = {
+  [23] = 11,
+  [sizeof (a) / 4 - 1] = 12,
+#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 - 1] = ) suffix (,)
+#embed __FILE__ limit (130) prefix ([sizeof (a) / 4 - 2] = ) suffix (,)
+#embed __FILE__ prefix ([sizeof (a) / 4 + 10] = ) suffix (,)
+#embed __FILE__ limit (128) prefix ([sizeof (a) + sizeof (a) / 4 - 30] = ) 
suffix (,)
+#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 + 96] = ) suffix (,)
+};
+unsigned char z[sizeof (a) / 4] = {
+};
+
+int
+main ()
+{
+  if (sizeof (b) != sizeof (a) + 26
+      || __builtin_memcmp (a, b + 26, sizeof (a)))
+    __builtin_abort ();
+  if (sizeof (c) != sizeof (a)
+      || a[0] != c[0]
+      || c[1] != 2
+      || __builtin_memcmp (a + 2, c + 2, sizeof (a) / 4 - 2)
+      || c[sizeof (a) / 4] != 0
+      || __builtin_memcmp (a + sizeof (a) / 4 + 1, c + sizeof (a) / 4 + 1, 
sizeof (a) / 2 - sizeof (a) / 4 - 1)
+      || c[sizeof (a) / 2] != 1
+      || __builtin_memcmp (a + sizeof (a) / 2 + 1, c + sizeof (a) / 2 + 1, 
sizeof (a) - sizeof (a) / 2 - 3)
+      || c[sizeof (a) - 2] != 3
+      || a[sizeof (a) - 1] != c[sizeof (a) - 1])
+    __builtin_abort ();
+  if (sizeof (d) != sizeof (a)
+      || __builtin_memcmp (a, d, sizeof (a)))
+    __builtin_abort ();
+  if (sizeof (e) != sizeof (a)
+      || a[0] != e[0]
+      || a[1] != e[1]
+      || e[2] != 9
+      || __builtin_memcmp (a + 3, e + 3, sizeof (a) - 6)
+      || e[sizeof (a) - 3] != 10
+      || a[sizeof (a) - 2] != e[sizeof (a) - 2]
+      || a[sizeof (a) - 1] != e[sizeof (a) - 1])
+    __builtin_abort ();
+  if (sizeof (f) != sizeof (a) + sizeof (a) / 4 - 30 + 128
+      || __builtin_memcmp (z, f, 23)
+      || f[23] != 11
+      || __builtin_memcmp (z, f + 24, sizeof (a) / 4 - 2 - 24)
+      || __builtin_memcmp (f + sizeof (a) / 4 - 2, a, 12)
+      || __builtin_memcmp (f + sizeof (a) / 4 + 10, a, 86)
+      || __builtin_memcmp (f + sizeof (a) / 4 + 96, a, 128)
+      || __builtin_memcmp (f + sizeof (a) / 4 + 96 + 128, a + 86 + 128, sizeof 
(a) - 86 - 128 - 40)
+      || __builtin_memcmp (f + sizeof (a) + sizeof (a) / 4 - 30, a, 128))
+    __builtin_abort ();
+}
--- gcc/c/c-parser.cc.jj        2024-07-01 11:28:21.840249061 +0200
+++ gcc/c/c-parser.cc   2024-07-04 14:58:33.568465437 +0200
@@ -6212,6 +6212,25 @@ c_parser_braced_init (c_parser *parser,
            {
              last_init_list_comma = c_parser_peek_token (parser)->location;
              c_parser_consume_token (parser);
+             /* CPP_EMBED should be always in between two CPP_COMMA
+                tokens.  */
+             while (c_parser_next_token_is (parser, CPP_EMBED))
+               {
+                 c_token *embed = c_parser_peek_token (parser);
+                 c_parser_consume_token (parser);
+                 c_expr embed_val;
+                 embed_val.value = embed->value;
+                 embed_val.original_code = RAW_DATA_CST;
+                 embed_val.original_type = integer_type_node;
+                 set_c_expr_source_range (&embed_val, embed->get_range ());
+                 embed_val.m_decimal = 0;
+                 process_init_element (embed->location, embed_val, false,
+                                       &braced_init_obstack);
+                 gcc_checking_assert (c_parser_next_token_is (parser,
+                                                              CPP_COMMA));
+                 last_init_list_comma = c_parser_peek_token (parser)->location;
+                 c_parser_consume_token (parser);
+               }
            }
          else
            break;
--- gcc/c/c-typeck.cc.jj        2024-06-14 19:45:07.455803708 +0200
+++ gcc/c/c-typeck.cc   2024-07-05 12:48:05.357558694 +0200
@@ -8747,12 +8747,13 @@ digest_init (location_t init_loc, tree t
   if (!maybe_const)
     arith_const_expr = false;
   else if (!INTEGRAL_TYPE_P (TREE_TYPE (inside_init))
-      && TREE_CODE (TREE_TYPE (inside_init)) != REAL_TYPE
-      && TREE_CODE (TREE_TYPE (inside_init)) != COMPLEX_TYPE)
+          && TREE_CODE (TREE_TYPE (inside_init)) != REAL_TYPE
+          && TREE_CODE (TREE_TYPE (inside_init)) != COMPLEX_TYPE)
     arith_const_expr = false;
   else if (TREE_CODE (inside_init) != INTEGER_CST
-      && TREE_CODE (inside_init) != REAL_CST
-      && TREE_CODE (inside_init) != COMPLEX_CST)
+          && TREE_CODE (inside_init) != REAL_CST
+          && TREE_CODE (inside_init) != COMPLEX_CST
+          && TREE_CODE (inside_init) != RAW_DATA_CST)
     arith_const_expr = false;
   else if (TREE_OVERFLOW (inside_init))
     arith_const_expr = false;
@@ -9013,6 +9014,22 @@ digest_init (location_t init_loc, tree t
                                               ? ic_init_const
                                               : ic_init), 
null_pointer_constant,
                                              NULL_TREE, NULL_TREE, 0);
+      if (TREE_CODE (inside_init) == RAW_DATA_CST
+         && c_inhibit_evaluation_warnings == 0
+         && warn_overflow
+         && !TYPE_UNSIGNED (type)
+         && TYPE_PRECISION (type) == CHAR_BIT)
+       for (unsigned int i = 0;
+            i < (unsigned) RAW_DATA_LENGTH (inside_init); ++i)
+         if (((const signed char *) RAW_DATA_POINTER (inside_init))[i] < 0)
+           warning_at (init_loc, OPT_Wconversion,
+                       "conversion from %qT to %qT changes value from "
+                       "%qd to %qd",
+                       integer_type_node, type,
+                       ((const unsigned char *)
+                        RAW_DATA_POINTER (inside_init))[i],
+                       ((const signed char *)
+                        RAW_DATA_POINTER (inside_init))[i]);
       return inside_init;
     }
 
@@ -10124,6 +10141,28 @@ set_init_label (location_t loc, tree fie
     while (field != NULL_TREE);
 }
 
+/* Helper function for add_pending_init.  Find inorder successor of P
+   in AVL tree.  */
+static struct init_node *
+init_node_successor (struct init_node *p)
+{
+  struct init_node *r;
+  if (p->right)
+    {
+      r = p->right;
+      while (r->left)
+       r = r->left;
+      return r;
+    }
+  r = p->parent;
+  while (r && p == r->right)
+    {
+      p = r;
+      r = r->parent;
+    }
+  return r;
+}
+
 /* Add a new initializer to the tree of pending initializers.  PURPOSE
    identifies the initializer, either array index or field in a structure.
    VALUE is the value of that index or field.  If ORIGTYPE is not
@@ -10151,9 +10190,179 @@ add_pending_init (location_t loc, tree p
          if (tree_int_cst_lt (purpose, p->purpose))
            q = &p->left;
          else if (tree_int_cst_lt (p->purpose, purpose))
-           q = &p->right;
+           {
+             if (TREE_CODE (p->value) != RAW_DATA_CST
+                 || (p->right
+                     && tree_int_cst_le (p->right->purpose, purpose)))
+               q = &p->right;
+             else
+               {
+                 widest_int pp = wi::to_widest (p->purpose);
+                 widest_int pw = wi::to_widest (purpose);
+                 if (pp + RAW_DATA_LENGTH (p->value) <= pw)
+                   q = &p->right;
+                 else
+                   {
+                     /* Override which should split the old RAW_DATA_CST
+                        into 2 or 3 pieces.  */
+                     if (!implicit && warn_override_init)
+                       warning_init (loc, OPT_Woverride_init,
+                                     "initialized field overwritten");
+                     unsigned HOST_WIDE_INT start = (pw - pp).to_uhwi ();
+                     unsigned HOST_WIDE_INT len = 1;
+                     if (TREE_CODE (value) == RAW_DATA_CST)
+                       len = RAW_DATA_LENGTH (value);
+                     unsigned HOST_WIDE_INT end = 0;
+                     unsigned plen = RAW_DATA_LENGTH (p->value);
+                     gcc_checking_assert (start < plen && start);
+                     if (plen - start > len)
+                       end = plen - start - len;
+                     tree v = p->value;
+                     tree origtype = p->origtype;
+                     if (start == 1)
+                       p->value = build_int_cst (TREE_TYPE (v),
+                                                 *(const unsigned char *)
+                                                 RAW_DATA_POINTER (v));
+                     else
+                       {
+                         p->value = v;
+                         if (end > 1)
+                           v = copy_node (v);
+                         RAW_DATA_LENGTH (p->value) = start;
+                       }
+                     if (end)
+                       {
+                         tree epurpose
+                           = size_binop (PLUS_EXPR, purpose,
+                                         bitsize_int (len));
+                         if (end > 1)
+                           {
+                             RAW_DATA_LENGTH (v) -= plen - end;
+                             RAW_DATA_POINTER (v) += plen - end;
+                           }
+                         else
+                           v = build_int_cst (TREE_TYPE (v),
+                                              ((const unsigned char *)
+                                               RAW_DATA_POINTER (v))[plen
+                                                                     - end]);
+                         add_pending_init (loc, epurpose, v, origtype,
+                                           implicit, braced_init_obstack);
+                       }
+                     q = &constructor_pending_elts;
+                     continue;
+                   }
+               }
+           }
          else
            {
+             if (TREE_CODE (p->value) == RAW_DATA_CST
+                 && (RAW_DATA_LENGTH (p->value)
+                     > (TREE_CODE (value) == RAW_DATA_CST
+                        ? RAW_DATA_LENGTH (value) : 1)))
+               {
+                 /* Override which should split the old RAW_DATA_CST
+                    into 2 pieces.  */
+                 if (!implicit && warn_override_init)
+                   warning_init (loc, OPT_Woverride_init,
+                                 "initialized field overwritten");
+                 unsigned HOST_WIDE_INT len = 1;
+                 if (TREE_CODE (value) == RAW_DATA_CST)
+                   len = RAW_DATA_LENGTH (value);
+                 if ((unsigned) RAW_DATA_LENGTH (p->value) > len + 1)
+                   {
+                     RAW_DATA_LENGTH (p->value) -= len;
+                     RAW_DATA_POINTER (p->value) += len;
+                   }
+                 else
+                   {
+                     unsigned int l = RAW_DATA_LENGTH (p->value) - 1;
+                     p->value
+                       = build_int_cst (TREE_TYPE (p->value),
+                                        ((const unsigned char *)
+                                         RAW_DATA_POINTER (p->value))[l]);
+                   }
+                 p->purpose = size_binop (PLUS_EXPR, p->purpose,
+                                          bitsize_int (len));
+                 continue;
+               }
+             if (TREE_CODE (value) == RAW_DATA_CST)
+               {
+               handle_raw_data:
+                 /* RAW_DATA_CST value might overlap various further
+                    prior initval entries.  Find out how many.  */
+                 unsigned cnt = 0;
+                 widest_int w
+                   = wi::to_widest (purpose) + RAW_DATA_LENGTH (value);
+                 struct init_node *r = p, *last = NULL;
+                 bool override_init = warn_override_init;
+                 while ((r = init_node_successor (r))
+                        && wi::to_widest (r->purpose) < w)
+                   {
+                     ++cnt;
+                     if (TREE_SIDE_EFFECTS (r->value))
+                       warning_init (loc, OPT_Woverride_init_side_effects,
+                                     "initialized field with side-effects "
+                                     "overwritten");
+                     else if (override_init)
+                       {
+                         warning_init (loc, OPT_Woverride_init,
+                                       "initialized field overwritten");
+                         override_init = false;
+                       }
+                     last = r;
+                   }
+                 if (cnt)
+                   {
+                     if (TREE_CODE (last->value) == RAW_DATA_CST
+                         && (wi::to_widest (last->purpose)
+                             + RAW_DATA_LENGTH (last->value) > w))
+                       {
+                         /* The last overlapping prior initval overlaps
+                            only partially.  Shrink it and decrease cnt.  */
+                         unsigned int l = (wi::to_widest (last->purpose)
+                                           + RAW_DATA_LENGTH (last->value)
+                                           - w).to_uhwi ();
+                         --cnt;
+                         RAW_DATA_LENGTH (last->value) -= l;
+                         RAW_DATA_POINTER (last->value) += l;
+                         if (RAW_DATA_LENGTH (last->value) == 1)
+                           {
+                             const unsigned char *s
+                               = ((const unsigned char *)
+                                  RAW_DATA_POINTER (last->value));
+                             last->value
+                               = build_int_cst (TREE_TYPE (last->value), *s);
+                           }
+                         last->purpose
+                           = size_binop (PLUS_EXPR, last->purpose,
+                                         bitsize_int (l));
+                       }
+                     /* Instead of deleting cnt nodes from the AVL tree
+                        and rebalancing, peel of last cnt bytes from the
+                        RAW_DATA_CST.  Overriding thousands of previously
+                        initialized array elements with #embed needs to work,
+                        but doesn't need to be super efficient.  */
+                     gcc_checking_assert ((unsigned) RAW_DATA_LENGTH (value)
+                                          > cnt);
+                     RAW_DATA_LENGTH (value) -= cnt;
+                     const unsigned char *s
+                       = ((const unsigned char *) RAW_DATA_POINTER (value)
+                          + RAW_DATA_LENGTH (value));
+                     unsigned int o = RAW_DATA_LENGTH (value);
+                     for (r = p; cnt--; ++o, ++s)
+                       {
+                         r = init_node_successor (r);
+                         r->purpose = size_binop (PLUS_EXPR, purpose,
+                                                  bitsize_int (o));
+                         r->value = build_int_cst (TREE_TYPE (value), *s);
+                         r->origtype = origtype;
+                       }
+                     if (RAW_DATA_LENGTH (value) == 1)
+                       value = build_int_cst (TREE_TYPE (value),
+                                              *((const unsigned char *)
+                                                RAW_DATA_POINTER (value)));
+                   }
+               }
              if (!implicit)
                {
                  if (TREE_SIDE_EFFECTS (p->value))
@@ -10169,6 +10378,23 @@ add_pending_init (location_t loc, tree p
              return;
            }
        }
+      if (TREE_CODE (value) == RAW_DATA_CST && p)
+       {
+         struct init_node *r;
+         if (q == &p->left)
+           r = p;
+         else
+           r = init_node_successor (p);
+         if (r && wi::to_widest (r->purpose) < (wi::to_widest (purpose)
+                                                + RAW_DATA_LENGTH (value)))
+           {
+             /* Overlap with at least one prior initval in the range but
+                not at the start.  */
+             p = r;
+             p->purpose = purpose;
+             goto handle_raw_data;
+           }
+       }
     }
   else
     {
@@ -10397,8 +10623,8 @@ set_nonincremental_init (struct obstack
     {
       if (TYPE_DOMAIN (constructor_type))
        constructor_unfilled_index
-           = convert (bitsizetype,
-                      TYPE_MIN_VALUE (TYPE_DOMAIN (constructor_type)));
+         = convert (bitsizetype,
+                    TYPE_MIN_VALUE (TYPE_DOMAIN (constructor_type)));
       else
        constructor_unfilled_index = bitsize_zero_node;
     }
@@ -10612,12 +10838,13 @@ output_init_element (location_t loc, tre
   if (!maybe_const)
     arith_const_expr = false;
   else if (!INTEGRAL_TYPE_P (TREE_TYPE (value))
-      && TREE_CODE (TREE_TYPE (value)) != REAL_TYPE
-      && TREE_CODE (TREE_TYPE (value)) != COMPLEX_TYPE)
+          && TREE_CODE (TREE_TYPE (value)) != REAL_TYPE
+          && TREE_CODE (TREE_TYPE (value)) != COMPLEX_TYPE)
     arith_const_expr = false;
   else if (TREE_CODE (value) != INTEGER_CST
-      && TREE_CODE (value) != REAL_CST
-      && TREE_CODE (value) != COMPLEX_CST)
+          && TREE_CODE (value) != REAL_CST
+          && TREE_CODE (value) != COMPLEX_CST
+          && TREE_CODE (value) != RAW_DATA_CST)
     arith_const_expr = false;
   else if (TREE_OVERFLOW (value))
     arith_const_expr = false;
@@ -10784,9 +11011,14 @@ output_init_element (location_t loc, tre
 
   /* Advance the variable that indicates sequential elements output.  */
   if (TREE_CODE (constructor_type) == ARRAY_TYPE)
-    constructor_unfilled_index
-      = size_binop_loc (input_location, PLUS_EXPR, constructor_unfilled_index,
-                       bitsize_one_node);
+    {
+      tree inc = bitsize_one_node;
+      if (value && TREE_CODE (value) == RAW_DATA_CST)
+       inc = bitsize_int (RAW_DATA_LENGTH (value));
+      constructor_unfilled_index
+       = size_binop_loc (input_location, PLUS_EXPR,
+                         constructor_unfilled_index, inc);
+    }
   else if (TREE_CODE (constructor_type) == RECORD_TYPE)
     {
       constructor_unfilled_fields
@@ -10795,8 +11027,8 @@ output_init_element (location_t loc, tre
       /* Skip any nameless bit fields.  */
       while (constructor_unfilled_fields != NULL_TREE
             && DECL_UNNAMED_BIT_FIELD (constructor_unfilled_fields))
-       constructor_unfilled_fields =
-         DECL_CHAIN (constructor_unfilled_fields);
+       constructor_unfilled_fields
+         = DECL_CHAIN (constructor_unfilled_fields);
     }
   else if (TREE_CODE (constructor_type) == UNION_TYPE)
     constructor_unfilled_fields = NULL_TREE;
@@ -11042,6 +11274,23 @@ initialize_elementwise_p (tree type, tre
   return false;
 }
 
+/* Helper for process_init_element.  If VALUE is a RAW_DATA_CST, peel off its
+   first byte as an int constant and leave the shortened rest in *RAW_DATA.  */
+
+static inline tree
+maybe_split_raw_data (tree value, tree *raw_data)
+{
+  if (value == NULL_TREE || TREE_CODE (value) != RAW_DATA_CST)
+    return value;  /* Anything else is passed through unchanged.  */
+  *raw_data = value;  /* The node itself is reused for the remainder.  */
+  value = build_int_cst (integer_type_node,
+                        *(const unsigned char *)
+                        RAW_DATA_POINTER (*raw_data));
+  ++RAW_DATA_POINTER (*raw_data);
+  --RAW_DATA_LENGTH (*raw_data);
+  return value;
+}
+
 /* Add one non-braced element to the current constructor level.
    This adjusts the current position within the constructor's type.
    This may also start or terminate implicit levels
@@ -11064,7 +11313,9 @@ process_init_element (location_t loc, st
     = (orig_value != NULL_TREE && TREE_CODE (orig_value) == STRING_CST);
   bool strict_string = value.original_code == STRING_CST;
   bool was_designated = designator_depth != 0;
+  tree raw_data = NULL_TREE;
 
+retry:
   designator_depth = 0;
   designator_erroneous = 0;
 
@@ -11232,6 +11483,7 @@ process_init_element (location_t loc, st
              continue;
            }
 
+         value.value = maybe_split_raw_data (value.value, &raw_data);
          if (value.value)
            {
              push_member_name (constructor_fields);
@@ -11320,6 +11572,7 @@ process_init_element (location_t loc, st
              continue;
            }
 
+         value.value = maybe_split_raw_data (value.value, &raw_data);
          if (value.value)
            {
              push_member_name (constructor_fields);
@@ -11368,26 +11621,66 @@ process_init_element (location_t loc, st
              break;
            }
 
-         /* Now output the actual element.  */
-         if (value.value)
+         if (value.value
+             && TREE_CODE (value.value) == RAW_DATA_CST
+             && RAW_DATA_LENGTH (value.value) > 1
+             && (TREE_CODE (elttype) == INTEGER_TYPE
+                 || TREE_CODE (elttype) == BITINT_TYPE)
+             && TYPE_PRECISION (elttype) == CHAR_BIT
+             && (constructor_max_index == NULL_TREE
+                 || tree_int_cst_lt (constructor_index,
+                                     constructor_max_index)))
            {
+             unsigned int len = RAW_DATA_LENGTH (value.value);
+             if (constructor_max_index)
+               {
+                 widest_int w = wi::to_widest (constructor_max_index);
+                 w -= wi::to_widest (constructor_index);
+                 w += 1;
+                 if (w < len)
+                   len = w.to_uhwi ();
+               }
+             if (len < (unsigned) RAW_DATA_LENGTH (value.value))
+               {
+                 raw_data = copy_node (value.value);
+                 RAW_DATA_LENGTH (raw_data) -= len;
+                 RAW_DATA_POINTER (raw_data) += len;
+                 RAW_DATA_LENGTH (value.value) = len;
+               }
+             TREE_TYPE (value.value) = elttype;
              push_array_bounds (tree_to_uhwi (constructor_index));
              output_init_element (loc, value.value, value.original_type,
-                                  strict_string, elttype,
-                                  constructor_index, true, implicit,
-                                  braced_init_obstack);
+                                  false, elttype, constructor_index, true,
+                                  implicit, braced_init_obstack);
              RESTORE_SPELLING_DEPTH (constructor_depth);
+             constructor_index
+               = size_binop_loc (input_location, PLUS_EXPR,
+                                 constructor_index, bitsize_int (len));
            }
+         else
+           {
+             value.value = maybe_split_raw_data (value.value, &raw_data);
+             /* Now output the actual element.  */
+             if (value.value)
+               {
+                 push_array_bounds (tree_to_uhwi (constructor_index));
+                 output_init_element (loc, value.value, value.original_type,
+                                      strict_string, elttype,
+                                      constructor_index, true, implicit,
+                                      braced_init_obstack);
+                 RESTORE_SPELLING_DEPTH (constructor_depth);
+               }
 
-         constructor_index
-           = size_binop_loc (input_location, PLUS_EXPR,
-                             constructor_index, bitsize_one_node);
-
-         if (!value.value)
-           /* If we are doing the bookkeeping for an element that was
-              directly output as a constructor, we must update
-              constructor_unfilled_index.  */
-           constructor_unfilled_index = constructor_index;
+             constructor_index
+               = size_binop_loc (input_location, PLUS_EXPR,
+                                 constructor_index, bitsize_one_node);
+
+             if (!value.value)
+               /* If we are doing the bookkeeping for an element that was
+                  directly output as a constructor, we must update
+                  constructor_unfilled_index.  */
+               constructor_unfilled_index = constructor_index;
+           }
        }
       else if (gnu_vector_type_p (constructor_type))
        {
@@ -11402,6 +11695,7 @@ process_init_element (location_t loc, st
              break;
            }
 
+         value.value = maybe_split_raw_data (value.value, &raw_data);
          /* Now output the actual element.  */
          if (value.value)
            {
@@ -11435,6 +11729,7 @@ process_init_element (location_t loc, st
        }
       else
        {
+         value.value = maybe_split_raw_data (value.value, &raw_data);
          if (value.value)
            output_init_element (loc, value.value, value.original_type,
                                 strict_string, constructor_type,
@@ -11506,6 +11801,14 @@ process_init_element (location_t loc, st
     }
 
   constructor_range_stack = 0;
+
+  if (raw_data && RAW_DATA_LENGTH (raw_data))
+    {
+      gcc_assert (!string_flag && !was_designated);
+      value.value = raw_data;
+      raw_data = NULL_TREE;
+      goto retry;
+    }
 }
 
 /* Build a complete asm-statement, whose components are a CV_QUALIFIER
--- gcc/tree.def.jj     2024-06-05 19:09:54.045617019 +0200
+++ gcc/tree.def        2024-07-05 10:10:48.372613006 +0200
@@ -309,6 +309,12 @@ DEFTREECODE (VECTOR_CST, "vector_cst", t
 /* Contents are TREE_STRING_LENGTH and the actual contents of the string.  */
 DEFTREECODE (STRING_CST, "string_cst", tcc_constant, 0)
 
+/* Contents are RAW_DATA_LENGTH and a pointer to the raw data itself,
+   plus RAW_DATA_OWNER, which if non-NULL is the owner of the data
+   (e.g. a STRING_CST); if it is NULL, the data is owned by libcpp.
+   TREE_TYPE is the type of each of the RAW_DATA_LENGTH elements.  */
+DEFTREECODE (RAW_DATA_CST, "raw_data_cst", tcc_constant, 0)
+
 /* Declarations.  All references to names are represented as ..._DECL
    nodes.  The decls in one binding context are chained through the
    TREE_CHAIN field.  Each DECL has a DECL_NAME field which contains
--- gcc/c-family/c-lex.cc.jj    2024-02-22 19:29:51.226074838 +0100
+++ gcc/c-family/c-lex.cc       2024-07-04 14:58:33.568465437 +0200
@@ -781,6 +781,13 @@ c_lex_with_flags (tree *value, location_
       *value = build_string (tok->val.str.len, (const char *)tok->val.str.text);
       break;
 
+    case CPP_EMBED:
+      *value = make_node (RAW_DATA_CST);
+      TREE_TYPE (*value) = integer_type_node;
+      RAW_DATA_LENGTH (*value) = tok->val.str.len;
+      RAW_DATA_POINTER (*value) = (const char *) tok->val.str.text;
+      break;
+
       /* This token should not be visible outside cpplib.  */
     case CPP_MACRO_ARG:
       gcc_unreachable ();
@@ -800,7 +807,7 @@ c_lex_with_flags (tree *value, location_
          add_flags |= PREV_FALLTHROUGH;
          goto retry_after_at;
        }
-       goto retry;
+      goto retry;
 
     default:
       *value = NULL_TREE;
--- gcc/tree-core.h.jj  2024-07-01 11:28:23.408228952 +0200
+++ gcc/tree-core.h     2024-07-03 19:41:28.821880055 +0200
@@ -1516,6 +1516,13 @@ struct GTY(()) tree_string {
   char str[1];
 };
 
+struct GTY(()) tree_raw_data {  /* Storage for TS_RAW_DATA_CST nodes.  */
+  struct tree_typed typed;
+  tree owner;  /* Presumably RAW_DATA_OWNER: owner of STR, or NULL.  */
+  const char *GTY ((skip(""))) str;  /* Skipped by GC marking.  */
+  int length;  /* Presumably RAW_DATA_LENGTH: element count at STR.  */
+};
+
 struct GTY(()) tree_complex {
   struct tree_typed typed;
   tree real;
@@ -2106,6 +2113,7 @@ union GTY ((ptr_alias (union lang_tree_n
   struct tree_fixed_cst GTY ((tag ("TS_FIXED_CST"))) fixed_cst;
   struct tree_vector GTY ((tag ("TS_VECTOR"))) vector;
   struct tree_string GTY ((tag ("TS_STRING"))) string;
+  struct tree_raw_data GTY ((tag ("TS_RAW_DATA_CST"))) raw_data_cst;
   struct tree_complex GTY ((tag ("TS_COMPLEX"))) complex;
   struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier;
   struct tree_decl_minimal GTY((tag ("TS_DECL_MINIMAL"))) decl_minimal;
--- gcc/treestruct.def.jj       2024-01-03 11:51:38.761630845 +0100
+++ gcc/treestruct.def  2024-07-03 17:06:57.539794162 +0200
@@ -39,6 +39,7 @@ DEFTREESTRUCT(TS_REAL_CST, "real cst")
 DEFTREESTRUCT(TS_FIXED_CST, "fixed cst")
 DEFTREESTRUCT(TS_VECTOR, "vector")
 DEFTREESTRUCT(TS_STRING, "string")
+DEFTREESTRUCT(TS_RAW_DATA_CST, "raw data cst")
 DEFTREESTRUCT(TS_COMPLEX, "complex")
 DEFTREESTRUCT(TS_IDENTIFIER, "identifier")
 DEFTREESTRUCT(TS_DECL_MINIMAL, "decl minimal")

        Jakub

Reply via email to