Hi! Just to show what I'm working on, on top of the already posted #embed patches. I'm working on the C FE only for now; the patch emits CPP_EMBED tokens when preprocessing C (but still with the important simplification that a CPP_EMBED token is always preceded by {CPP_NUMBER,CPP_EMBED} CPP_COMMA and followed by CPP_COMMA {CPP_NUMBER,CPP_EMBED}; without that, the FE code would be significantly larger).
The patch introduces a RAW_DATA_CST tree, which is shorthand for a (possibly huge) sequence of INTEGER_CSTs in a CONSTRUCTOR of ARRAY_TYPE. Its TREE_TYPE is the type of each element in the sequence (rather than, say, some array type as in the STRING_CST case). It intentionally doesn't own the underlying data, but has a (so far unused) RAW_DATA_OWNER subtree for the GC owner of that data. If RAW_DATA_OWNER is NULL, the data is owned directly by libcpp buffers, but e.g. for #embed in headers compiled into PCH I think we'll need to add a STRING_CST as the owner of the data. This way, it is cheap to peel elements off RAW_DATA_CSTs or split them apart (see e.g. the testcases which create a possibly huge RAW_DATA_CST and then, using a designated initializer, store some other element in the middle of it). The new testcases already work with the patch. What is missing is handling RAW_DATA_CST where needed in the middle-end (it surely needs to be handled e.g. in the gimplifier if somebody uses #embed in automatic var initializers, in IPA-ICF hashing, when trying to extract a constant from some large array initializer, or in LTO), plus deciding what to do with CPP_EMBED in the FE. One possibility is to keep it as CPP_EMBED only when peeking a non-raw token when some new parser flag is set, and otherwise to split it into individual CPP_NUMBER tokens already during token peeking. Another possibility, if we manage to come up with all the spots where CPP_EMBED can validly appear in the grammar, is to handle it directly in each of those spots (my current understanding is that, besides the initializers already handled, it can appear in function arguments or attributes (both using c_parser_expr_list, so that function could be taught to parse CPP_EMBED into a list of INTEGER_CSTs), in comma expressions (so teach c_parser_expression about it, but in that case I wonder whether we just need to ensure correct warnings and that is it, because whether a comma expression is something,somethingelse or something,1,2,3,4,5,6,7,8,somethingelse shouldn't be significantly different) and I think in the OpenMP sizes
clause (but that could be changed to use c_parser_expr_list); for C++ I think template arguments and multi-dimensional array indices are other spots). I don't have a bootstrapped compiler with this patch yet, but an -O0 built cc1 compiles, at -O0, unsigned char a[] = { #embed "cc1plus" }; where cc1plus is a 261M binary, in under 20s for me (most of the time is spent in assemble_string). If I preprocess the same with C++ (so that it just expands into a sequence of 261M numbers), I Ctrl-C'd the compilation after a couple of minutes. Obviously I'll do a proper test with an optimized compiler later. Anyway, any thoughts on this before I spend too much time on it? And while clang 19 now claims to have #embed implemented, most of the testcases in the patch are miscompiled by it. --- libcpp/files.cc.jj 2024-07-03 14:52:12.231817485 +0200 +++ libcpp/files.cc 2024-07-03 15:44:39.248913032 +0200 @@ -1241,7 +1241,10 @@ finish_embed (cpp_reader *pfile, _cpp_fi limit = params->limit; size_t embed_tokens = 0; - if (CPP_OPTION (pfile, directives_only) && limit >= 64) + if ((CPP_OPTION (pfile, directives_only) + || !CPP_OPTION (pfile, cplusplus)) + && CPP_OPTION (pfile, lang) != CLK_ASM + && limit >= 64) embed_tokens = ((limit - 2) / INT_MAX) + (((limit - 2) % INT_MAX) != 0); size_t max = INTTYPE_MAXIMUM (size_t) / sizeof (cpp_token); --- gcc/varasm.cc.jj 2024-05-07 18:10:10.674871087 +0200 +++ gcc/varasm.cc 2024-07-04 14:58:33.570465411 +0200 @@ -4875,6 +4875,7 @@ initializer_constant_valid_p_1 (tree val case FIXED_CST: case STRING_CST: case COMPLEX_CST: + case RAW_DATA_CST: return null_pointer_node; case ADDR_EXPR: @@ -5468,6 +5469,9 @@ array_size_for_constructor (tree val) { if (TREE_CODE (index) == RANGE_EXPR) index = TREE_OPERAND (index, 1); + if (value && TREE_CODE (value) == RAW_DATA_CST) + index = size_binop (PLUS_EXPR, index, + size_int (RAW_DATA_LENGTH (value) - 1)); if (max_index == NULL_TREE || tree_int_cst_lt (max_index, index)) max_index = index; } @@ -5659,6 +5663,12 @@ 
output_constructor_regular_field (oc_loc /* Output the element's initial value. */ if (local->val == NULL_TREE) assemble_zeros (fieldsize); + else if (local->val && TREE_CODE (local->val) == RAW_DATA_CST) + { + fieldsize *= RAW_DATA_LENGTH (local->val); + assemble_string (RAW_DATA_POINTER (local->val), + RAW_DATA_LENGTH (local->val)); + } else fieldsize = output_constant (local->val, fieldsize, align2, local->reverse, false); --- gcc/tree.h.jj 2024-06-05 19:09:54.046617006 +0200 +++ gcc/tree.h 2024-07-03 19:41:04.453201043 +0200 @@ -1165,6 +1165,14 @@ extern void omp_clause_range_check_faile #define TREE_STRING_POINTER(NODE) \ ((const char *)(STRING_CST_CHECK (NODE)->string.str)) +/* In a RAW_DATA_CST */ +#define RAW_DATA_LENGTH(NODE) \ + (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.length) +#define RAW_DATA_POINTER(NODE) \ + (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.str) +#define RAW_DATA_OWNER(NODE) \ + (RAW_DATA_CST_CHECK (NODE)->raw_data_cst.owner) + /* In a COMPLEX_CST node. */ #define TREE_REALPART(NODE) (COMPLEX_CST_CHECK (NODE)->complex.real) #define TREE_IMAGPART(NODE) (COMPLEX_CST_CHECK (NODE)->complex.imag) --- gcc/expr.cc.jj 2024-07-01 11:28:22.704237981 +0200 +++ gcc/expr.cc 2024-07-04 14:58:33.576465331 +0200 @@ -7144,6 +7144,12 @@ categorize_ctor_elements_1 (const_tree c init_elts += mult * TREE_STRING_LENGTH (value); break; + case RAW_DATA_CST: + nz_elts += mult * RAW_DATA_LENGTH (value); + unique_nz_elts += RAW_DATA_LENGTH (value); + init_elts += mult * RAW_DATA_LENGTH (value); + break; + case COMPLEX_CST: if (!initializer_zerop (TREE_REALPART (value))) { --- gcc/tree-pretty-print.cc.jj 2024-06-14 19:45:09.446777591 +0200 +++ gcc/tree-pretty-print.cc 2024-07-04 14:58:33.571465397 +0200 @@ -2519,6 +2519,28 @@ dump_generic_node (pretty_printer *pp, t } break; + case RAW_DATA_CST: + for (unsigned i = 0; i < (unsigned) RAW_DATA_LENGTH (node); ++i) + { + if (TYPE_UNSIGNED (TREE_TYPE (node)) + || TYPE_PRECISION (TREE_TYPE (node)) > CHAR_BIT) + pp_decimal_int 
(pp, ((const unsigned char *) + RAW_DATA_POINTER (node))[i]); + else + pp_decimal_int (pp, ((const signed char *) + RAW_DATA_POINTER (node))[i]); + if (i == RAW_DATA_LENGTH (node) - 1U) + break; + else if (i == 9 && RAW_DATA_LENGTH (node) > 20) + { + pp_string (pp, ", ..., "); + i = RAW_DATA_LENGTH (node) - 11; + } + else + pp_string (pp, ", "); + } + break; + case FUNCTION_TYPE: case METHOD_TYPE: dump_generic_node (pp, TREE_TYPE (node), spc, flags, false); --- gcc/tree.cc.jj 2024-07-01 11:28:23.495227837 +0200 +++ gcc/tree.cc 2024-07-04 14:58:33.563465503 +0200 @@ -513,6 +513,7 @@ tree_node_structure_for_code (enum tree_ case STRING_CST: return TS_STRING; case VECTOR_CST: return TS_VECTOR; case VOID_CST: return TS_TYPED; + case RAW_DATA_CST: return TS_RAW_DATA_CST; /* tcc_exceptional cases. */ case BLOCK: return TS_BLOCK; @@ -571,6 +572,7 @@ initialize_tree_contains_struct (void) case TS_FIXED_CST: case TS_VECTOR: case TS_STRING: + case TS_RAW_DATA_CST: case TS_COMPLEX: case TS_SSA_NAME: case TS_CONSTRUCTOR: @@ -1026,6 +1028,7 @@ tree_code_size (enum tree_code code) case REAL_CST: return sizeof (tree_real_cst); case FIXED_CST: return sizeof (tree_fixed_cst); case COMPLEX_CST: return sizeof (tree_complex); + case RAW_DATA_CST: return sizeof (tree_raw_data); case VECTOR_CST: gcc_unreachable (); case STRING_CST: gcc_unreachable (); default: @@ -10467,6 +10470,15 @@ initializer_zerop (const_tree init, bool *nonzero = true; return false; + case RAW_DATA_CST: + for (unsigned int i = 0; i < (unsigned int) RAW_DATA_LENGTH (init); ++i) + if (RAW_DATA_POINTER (init)[i]) + { + *nonzero = true; + return false; + } + return true; + case CONSTRUCTOR: { if (TREE_CLOBBER_P (init)) --- gcc/testsuite/c-c++-common/cpp/embed-19.c.jj 2024-07-05 11:30:09.333874817 +0200 +++ gcc/testsuite/c-c++-common/cpp/embed-19.c 2024-07-05 11:35:19.825724327 +0200 @@ -0,0 +1,24 @@ +/* { dg-do run } */ +/* { dg-options "" } */ +/* { dg-additional-options "-std=c23" { target c } } */ + +unsigned char 
a[] = { +#embed __FILE__ +}; +struct S { unsigned char h[(sizeof (a) - 7) / 2]; short int i; unsigned char j[sizeof (a) - 7 - (sizeof (a) - 7) / 2]; }; +struct T { int a, b, c; struct S d; long long e; double f; long long g; }; +struct T b = { +#embed __FILE__ +}; + +int +main () +{ + if (b.a != a[0] || b.b != a[1] || b.c != a[2] + || __builtin_memcmp (b.d.h, a + 3, sizeof (b.d.h)) + || b.d.i != a[3 + sizeof (b.d.h)] + || __builtin_memcmp (b.d.j, a + 4 + sizeof (b.d.h), sizeof (b.d.j)) + || b.e != a[sizeof (a) - 3] || b.f != a[sizeof (a) - 2] + || b.g != a[sizeof (a) - 1]) + __builtin_abort (); +} --- gcc/testsuite/gcc.dg/cpp/embed-8.c.jj 2024-07-05 13:37:25.289157048 +0200 +++ gcc/testsuite/gcc.dg/cpp/embed-8.c 2024-07-05 13:39:15.232694163 +0200 @@ -0,0 +1,7 @@ +/* This is a comment with some UTF-8 non-ASCII characters: áéíóú. */ +/* { dg-do compile } */ +/* { dg-options "-std=c23 -Wconversion" } */ + +signed char a[] = { +#embed __FILE__ /* { dg-warning "conversion from 'int' to 'signed char' changes value from '\[12]\[0-9]\[0-9]' to '-\[0-9]\[0-9]*'" } */ +}; --- gcc/testsuite/gcc.dg/cpp/embed-7.c.jj 2024-07-05 13:27:28.580097964 +0200 +++ gcc/testsuite/gcc.dg/cpp/embed-7.c 2024-07-05 13:36:04.728228965 +0200 @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c23 -Woverride-init" } */ + +unsigned char a[] = { +#embed __FILE__ +}; +unsigned char b[] = { + [26] = +#embed __FILE__ +}; +unsigned char c[] = { +#embed __FILE__ suffix (,) + [sizeof (a) / 4] = 0, /* { dg-warning "initialized field overwritten" } */ + [sizeof (a) / 2] = 1, /* { dg-warning "initialized field overwritten" } */ + [1] = 2, /* { dg-warning "initialized field overwritten" } */ + [sizeof (a) - 2] = 3 /* { dg-warning "initialized field overwritten" } */ +}; +unsigned char d[] = { + [1] = 4, + [26] = 5, + [sizeof (a) / 4] = 6, + [sizeof (a) / 2] = 7, + [sizeof (a) - 2] = 8, +#embed __FILE__ prefix ([0] = ) /* { dg-warning "initialized field overwritten" } */ +}; +unsigned char e[] 
= { +#embed __FILE__ suffix (,) + [2] = 9, /* { dg-warning "initialized field overwritten" } */ + [sizeof (a) - 3] = 10 /* { dg-warning "initialized field overwritten" } */ +}; +unsigned char f[] = { + [23] = 11, + [sizeof (a) / 4 - 1] = 12, +#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 - 1] = ) suffix (,) /* { dg-warning "initialized field overwritten" } */ +#embed __FILE__ limit (130) prefix ([sizeof (a) / 4 - 2] = ) suffix (,) /* { dg-warning "initialized field overwritten" } */ +#embed __FILE__ prefix ([sizeof (a) / 4 + 10] = ) suffix (,) /* { dg-warning "initialized field overwritten" } */ +#embed __FILE__ limit (128) prefix ([sizeof (a) + sizeof (a) / 4 - 30] = ) suffix (,) /* { dg-warning "initialized field overwritten" } */ +#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 + 96] = ) suffix (,) /* { dg-warning "initialized field overwritten" } */ +}; --- gcc/testsuite/gcc.dg/cpp/embed-9.c.jj 2024-07-05 13:54:06.976828053 +0200 +++ gcc/testsuite/gcc.dg/cpp/embed-9.c 2024-07-05 13:53:54.994987508 +0200 @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c23" } */ + +struct __attribute__((designated_init)) S { + int a, b, c, d; + unsigned char e[128]; +}; + +struct S s = { .a = 1, .b = +#embed __FILE__ limit(128) /* { dg-warning "positional initialization of field in 'struct' declared with 'designated_init' attribute" } */ +}; /* { dg-message "near initialization" "" { target *-*-* } .-1 } */ --- gcc/testsuite/gcc.dg/cpp/embed-6.c.jj 2024-07-05 13:25:08.339965010 +0200 +++ gcc/testsuite/gcc.dg/cpp/embed-6.c 2024-07-05 13:24:03.036834399 +0200 @@ -0,0 +1,82 @@ +/* { dg-do run } */ +/* { dg-options "-std=c23" } */ + +unsigned char a[] = { +#embed __FILE__ +}; +unsigned char b[] = { + [26] = +#embed __FILE__ +}; +unsigned char c[] = { +#embed __FILE__ suffix (,) + [sizeof (a) / 4] = 0, + [sizeof (a) / 2] = 1, + [1] = 2, + [sizeof (a) - 2] = 3 +}; +unsigned char d[] = { + [1] = 4, + [26] = 5, + [sizeof (a) / 4] = 6, + [sizeof (a) / 2] = 7, + 
[sizeof (a) - 2] = 8, +#embed __FILE__ prefix ([0] = ) +}; +unsigned char e[] = { +#embed __FILE__ suffix (,) + [2] = 9, + [sizeof (a) - 3] = 10 +}; +unsigned char f[] = { + [23] = 11, + [sizeof (a) / 4 - 1] = 12, +#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 - 1] = ) suffix (,) +#embed __FILE__ limit (130) prefix ([sizeof (a) / 4 - 2] = ) suffix (,) +#embed __FILE__ prefix ([sizeof (a) / 4 + 10] = ) suffix (,) +#embed __FILE__ limit (128) prefix ([sizeof (a) + sizeof (a) / 4 - 30] = ) suffix (,) +#embed __FILE__ limit (128) prefix ([sizeof (a) / 4 + 96] = ) suffix (,) +}; +unsigned char z[sizeof (a) / 4] = { +}; + +int +main () +{ + if (sizeof (b) != sizeof (a) + 26 + || __builtin_memcmp (a, b + 26, sizeof (a))) + __builtin_abort (); + if (sizeof (c) != sizeof (a) + || a[0] != c[0] + || c[1] != 2 + || __builtin_memcmp (a + 2, c + 2, sizeof (a) / 4 - 2) + || c[sizeof (a) / 4] != 0 + || __builtin_memcmp (a + sizeof (a) / 4 + 1, c + sizeof (a) / 4 + 1, sizeof (a) / 2 - sizeof (a) / 4 - 1) + || c[sizeof (a) / 2] != 1 + || __builtin_memcmp (a + sizeof (a) / 2 + 1, c + sizeof (a) / 2 + 1, sizeof (a) - sizeof (a) / 2 - 3) + || c[sizeof (a) - 2] != 3 + || a[sizeof (a) - 1] != c[sizeof (a) - 1]) + __builtin_abort (); + if (sizeof (d) != sizeof (a) + || __builtin_memcmp (a, d, sizeof (a))) + __builtin_abort (); + if (sizeof (e) != sizeof (a) + || a[0] != e[0] + || a[1] != e[1] + || e[2] != 9 + || __builtin_memcmp (a + 3, e + 3, sizeof (a) - 6) + || e[sizeof (a) - 3] != 10 + || a[sizeof (a) - 2] != e[sizeof (a) - 2] + || a[sizeof (a) - 1] != e[sizeof (a) - 1]) + __builtin_abort (); + if (sizeof (f) != sizeof (a) + sizeof (a) / 4 - 30 + 128 + || __builtin_memcmp (z, f, 23) + || f[23] != 11 + || __builtin_memcmp (z, f + 24, sizeof (a) / 4 - 2 - 24) + || __builtin_memcmp (f + sizeof (a) / 4 - 2, a, 12) + || __builtin_memcmp (f + sizeof (a) / 4 + 10, a, 86) + || __builtin_memcmp (f + sizeof (a) / 4 + 96, a, 128) + || __builtin_memcmp (f + sizeof (a) / 4 + 96 + 128, a + 86 
+ 128, sizeof (a) - 86 - 128 - 40) + || __builtin_memcmp (f + sizeof (a) + sizeof (a) / 4 - 30, a, 128)) + __builtin_abort (); +} --- gcc/c/c-parser.cc.jj 2024-07-01 11:28:21.840249061 +0200 +++ gcc/c/c-parser.cc 2024-07-04 14:58:33.568465437 +0200 @@ -6212,6 +6212,25 @@ c_parser_braced_init (c_parser *parser, { last_init_list_comma = c_parser_peek_token (parser)->location; c_parser_consume_token (parser); + /* CPP_EMBED should be always in between two CPP_COMMA + tokens. */ + while (c_parser_next_token_is (parser, CPP_EMBED)) + { + c_token *embed = c_parser_peek_token (parser); + c_parser_consume_token (parser); + c_expr embed_val; + embed_val.value = embed->value; + embed_val.original_code = RAW_DATA_CST; + embed_val.original_type = integer_type_node; + set_c_expr_source_range (&embed_val, embed->get_range ()); + embed_val.m_decimal = 0; + process_init_element (embed->location, embed_val, false, + &braced_init_obstack); + gcc_checking_assert (c_parser_next_token_is (parser, + CPP_COMMA)); + last_init_list_comma = c_parser_peek_token (parser)->location; + c_parser_consume_token (parser); + } } else break; --- gcc/c/c-typeck.cc.jj 2024-06-14 19:45:07.455803708 +0200 +++ gcc/c/c-typeck.cc 2024-07-05 12:48:05.357558694 +0200 @@ -8747,12 +8747,13 @@ digest_init (location_t init_loc, tree t if (!maybe_const) arith_const_expr = false; else if (!INTEGRAL_TYPE_P (TREE_TYPE (inside_init)) - && TREE_CODE (TREE_TYPE (inside_init)) != REAL_TYPE - && TREE_CODE (TREE_TYPE (inside_init)) != COMPLEX_TYPE) + && TREE_CODE (TREE_TYPE (inside_init)) != REAL_TYPE + && TREE_CODE (TREE_TYPE (inside_init)) != COMPLEX_TYPE) arith_const_expr = false; else if (TREE_CODE (inside_init) != INTEGER_CST - && TREE_CODE (inside_init) != REAL_CST - && TREE_CODE (inside_init) != COMPLEX_CST) + && TREE_CODE (inside_init) != REAL_CST + && TREE_CODE (inside_init) != COMPLEX_CST + && TREE_CODE (inside_init) != RAW_DATA_CST) arith_const_expr = false; else if (TREE_OVERFLOW (inside_init)) arith_const_expr 
= false; @@ -9013,6 +9014,22 @@ digest_init (location_t init_loc, tree t ? ic_init_const : ic_init), null_pointer_constant, NULL_TREE, NULL_TREE, 0); + if (TREE_CODE (inside_init) == RAW_DATA_CST + && c_inhibit_evaluation_warnings == 0 + && warn_overflow + && !TYPE_UNSIGNED (type) + && TYPE_PRECISION (type) == CHAR_BIT) + for (unsigned int i = 0; + i < (unsigned) RAW_DATA_LENGTH (inside_init); ++i) + if (((const signed char *) RAW_DATA_POINTER (inside_init))[i] < 0) + warning_at (init_loc, OPT_Wconversion, + "conversion from %qT to %qT changes value from " + "%qd to %qd", + integer_type_node, type, + ((const unsigned char *) + RAW_DATA_POINTER (inside_init))[i], + ((const signed char *) + RAW_DATA_POINTER (inside_init))[i]); return inside_init; } @@ -10124,6 +10141,28 @@ set_init_label (location_t loc, tree fie while (field != NULL_TREE); } +/* Helper function for add_pending_init. Find inorder successor of P + in AVL tree. */ +static struct init_node * +init_node_successor (struct init_node *p) +{ + struct init_node *r; + if (p->right) + { + r = p->right; + while (r->left) + r = r->left; + return r; + } + r = p->parent; + while (r && p == r->right) + { + p = r; + r = r->parent; + } + return r; +} + /* Add a new initializer to the tree of pending initializers. PURPOSE identifies the initializer, either array index or field in a structure. VALUE is the value of that index or field. If ORIGTYPE is not @@ -10151,9 +10190,179 @@ add_pending_init (location_t loc, tree p if (tree_int_cst_lt (purpose, p->purpose)) q = &p->left; else if (tree_int_cst_lt (p->purpose, purpose)) - q = &p->right; + { + if (TREE_CODE (p->value) != RAW_DATA_CST + || (p->right + && tree_int_cst_le (p->right->purpose, purpose))) + q = &p->right; + else + { + widest_int pp = wi::to_widest (p->purpose); + widest_int pw = wi::to_widest (purpose); + if (pp + RAW_DATA_LENGTH (p->value) <= pw) + q = &p->right; + else + { + /* Override which should split the old RAW_DATA_CST + into 2 or 3 pieces. 
*/ + if (!implicit && warn_override_init) + warning_init (loc, OPT_Woverride_init, + "initialized field overwritten"); + unsigned HOST_WIDE_INT start = (pw - pp).to_uhwi (); + unsigned HOST_WIDE_INT len = 1; + if (TREE_CODE (value) == RAW_DATA_CST) + len = RAW_DATA_LENGTH (value); + unsigned HOST_WIDE_INT end = 0; + unsigned plen = RAW_DATA_LENGTH (p->value); + gcc_checking_assert (start < plen && start); + if (plen - start > len) + end = plen - start - len; + tree v = p->value; + tree origtype = p->origtype; + if (start == 1) + p->value = build_int_cst (TREE_TYPE (v), + *(const unsigned char *) + RAW_DATA_POINTER (v)); + else + { + p->value = v; + if (end > 1) + v = copy_node (v); + RAW_DATA_LENGTH (p->value) = start; + } + if (end) + { + tree epurpose + = size_binop (PLUS_EXPR, purpose, + bitsize_int (len)); + if (end > 1) + { + RAW_DATA_LENGTH (v) -= plen - end; + RAW_DATA_POINTER (v) += plen - end; + } + else + v = build_int_cst (TREE_TYPE (v), + ((const unsigned char *) + RAW_DATA_POINTER (v))[plen + - end]); + add_pending_init (loc, epurpose, v, origtype, + implicit, braced_init_obstack); + } + q = &constructor_pending_elts; + continue; + } + } + } else { + if (TREE_CODE (p->value) == RAW_DATA_CST + && (RAW_DATA_LENGTH (p->value) + > (TREE_CODE (value) == RAW_DATA_CST + ? RAW_DATA_LENGTH (value) : 1))) + { + /* Override which should split the old RAW_DATA_CST + into 2 pieces. 
*/ + if (!implicit && warn_override_init) + warning_init (loc, OPT_Woverride_init, + "initialized field overwritten"); + unsigned HOST_WIDE_INT len = 1; + if (TREE_CODE (value) == RAW_DATA_CST) + len = RAW_DATA_LENGTH (value); + if ((unsigned) RAW_DATA_LENGTH (p->value) > len + 1) + { + RAW_DATA_LENGTH (p->value) -= len; + RAW_DATA_POINTER (p->value) += len; + } + else + { + unsigned int l = RAW_DATA_LENGTH (p->value) - 1; + p->value + = build_int_cst (TREE_TYPE (p->value), + ((const unsigned char *) + RAW_DATA_POINTER (p->value))[l]); + } + p->purpose = size_binop (PLUS_EXPR, p->purpose, + bitsize_int (len)); + continue; + } + if (TREE_CODE (value) == RAW_DATA_CST) + { + handle_raw_data: + /* RAW_DATA_CST value might overlap various further + prior initval entries. Find out how many. */ + unsigned cnt = 0; + widest_int w + = wi::to_widest (purpose) + RAW_DATA_LENGTH (value); + struct init_node *r = p, *last = NULL; + bool override_init = warn_override_init; + while ((r = init_node_successor (r)) + && wi::to_widest (r->purpose) < w) + { + ++cnt; + if (TREE_SIDE_EFFECTS (r->value)) + warning_init (loc, OPT_Woverride_init_side_effects, + "initialized field with side-effects " + "overwritten"); + else if (override_init) + { + warning_init (loc, OPT_Woverride_init, + "initialized field overwritten"); + override_init = false; + } + last = r; + } + if (cnt) + { + if (TREE_CODE (last->value) == RAW_DATA_CST + && (wi::to_widest (last->purpose) + + RAW_DATA_LENGTH (last->value) > w)) + { + /* The last overlapping prior initval overlaps + only partially. Shrink it and decrease cnt. 
*/ + unsigned int l = (wi::to_widest (last->purpose) + + RAW_DATA_LENGTH (last->value) + - w).to_uhwi (); + --cnt; + RAW_DATA_LENGTH (last->value) -= l; + RAW_DATA_POINTER (last->value) += l; + if (RAW_DATA_LENGTH (last->value) == 1) + { + const unsigned char *s + = ((const unsigned char *) + RAW_DATA_POINTER (last->value)); + last->value + = build_int_cst (TREE_TYPE (last->value), *s); + } + last->purpose + = size_binop (PLUS_EXPR, last->purpose, + bitsize_int (l)); + } + /* Instead of deleting cnt nodes from the AVL tree + and rebalancing, peel of last cnt bytes from the + RAW_DATA_CST. Overriding thousands of previously + initialized array elements with #embed needs to work, + but doesn't need to be super efficient. */ + gcc_checking_assert ((unsigned) RAW_DATA_LENGTH (value) + > cnt); + RAW_DATA_LENGTH (value) -= cnt; + const unsigned char *s + = ((const unsigned char *) RAW_DATA_POINTER (value) + + RAW_DATA_LENGTH (value)); + unsigned int o = RAW_DATA_LENGTH (value); + for (r = p; cnt--; ++o, ++s) + { + r = init_node_successor (r); + r->purpose = size_binop (PLUS_EXPR, purpose, + bitsize_int (o)); + r->value = build_int_cst (TREE_TYPE (value), *s); + r->origtype = origtype; + } + if (RAW_DATA_LENGTH (value) == 1) + value = build_int_cst (TREE_TYPE (value), + *((const unsigned char *) + RAW_DATA_POINTER (value))); + } + } if (!implicit) { if (TREE_SIDE_EFFECTS (p->value)) @@ -10169,6 +10378,23 @@ add_pending_init (location_t loc, tree p return; } } + if (TREE_CODE (value) == RAW_DATA_CST && p) + { + struct init_node *r; + if (q == &p->left) + r = p; + else + r = init_node_successor (p); + if (r && wi::to_widest (r->purpose) < (wi::to_widest (purpose) + + RAW_DATA_LENGTH (value))) + { + /* Overlap with at least one prior initval in the range but + not at the start. 
*/ + p = r; + p->purpose = purpose; + goto handle_raw_data; + } + } } else { @@ -10397,8 +10623,8 @@ set_nonincremental_init (struct obstack { if (TYPE_DOMAIN (constructor_type)) constructor_unfilled_index - = convert (bitsizetype, - TYPE_MIN_VALUE (TYPE_DOMAIN (constructor_type))); + = convert (bitsizetype, + TYPE_MIN_VALUE (TYPE_DOMAIN (constructor_type))); else constructor_unfilled_index = bitsize_zero_node; } @@ -10612,12 +10838,13 @@ output_init_element (location_t loc, tre if (!maybe_const) arith_const_expr = false; else if (!INTEGRAL_TYPE_P (TREE_TYPE (value)) - && TREE_CODE (TREE_TYPE (value)) != REAL_TYPE - && TREE_CODE (TREE_TYPE (value)) != COMPLEX_TYPE) + && TREE_CODE (TREE_TYPE (value)) != REAL_TYPE + && TREE_CODE (TREE_TYPE (value)) != COMPLEX_TYPE) arith_const_expr = false; else if (TREE_CODE (value) != INTEGER_CST - && TREE_CODE (value) != REAL_CST - && TREE_CODE (value) != COMPLEX_CST) + && TREE_CODE (value) != REAL_CST + && TREE_CODE (value) != COMPLEX_CST + && TREE_CODE (value) != RAW_DATA_CST) arith_const_expr = false; else if (TREE_OVERFLOW (value)) arith_const_expr = false; @@ -10784,9 +11011,14 @@ output_init_element (location_t loc, tre /* Advance the variable that indicates sequential elements output. */ if (TREE_CODE (constructor_type) == ARRAY_TYPE) - constructor_unfilled_index - = size_binop_loc (input_location, PLUS_EXPR, constructor_unfilled_index, - bitsize_one_node); + { + tree inc = bitsize_one_node; + if (value && TREE_CODE (value) == RAW_DATA_CST) + inc = bitsize_int (RAW_DATA_LENGTH (value)); + constructor_unfilled_index + = size_binop_loc (input_location, PLUS_EXPR, + constructor_unfilled_index, inc); + } else if (TREE_CODE (constructor_type) == RECORD_TYPE) { constructor_unfilled_fields @@ -10795,8 +11027,8 @@ output_init_element (location_t loc, tre /* Skip any nameless bit fields. 
*/ while (constructor_unfilled_fields != NULL_TREE && DECL_UNNAMED_BIT_FIELD (constructor_unfilled_fields)) - constructor_unfilled_fields = - DECL_CHAIN (constructor_unfilled_fields); + constructor_unfilled_fields + = DECL_CHAIN (constructor_unfilled_fields); } else if (TREE_CODE (constructor_type) == UNION_TYPE) constructor_unfilled_fields = NULL_TREE; @@ -11042,6 +11274,23 @@ initialize_elementwise_p (tree type, tre return false; } +/* Helper function for process_init_element. Split first element of + RAW_DATA_CST and save the rest to *RAW_DATA. */ + +static inline tree +maybe_split_raw_data (tree value, tree *raw_data) +{ + if (value == NULL_TREE || TREE_CODE (value) != RAW_DATA_CST) + return value; + *raw_data = value; + value = build_int_cst (integer_type_node, + *(const unsigned char *) + RAW_DATA_POINTER (*raw_data)); + ++RAW_DATA_POINTER (*raw_data); + --RAW_DATA_LENGTH (*raw_data); + return value; +} + /* Add one non-braced element to the current constructor level. This adjusts the current position within the constructor's type. This may also start or terminate implicit levels @@ -11064,7 +11313,9 @@ process_init_element (location_t loc, st = (orig_value != NULL_TREE && TREE_CODE (orig_value) == STRING_CST); bool strict_string = value.original_code == STRING_CST; bool was_designated = designator_depth != 0; + tree raw_data = NULL_TREE; +retry: designator_depth = 0; designator_erroneous = 0; @@ -11232,6 +11483,7 @@ process_init_element (location_t loc, st continue; } + value.value = maybe_split_raw_data (value.value, &raw_data); if (value.value) { push_member_name (constructor_fields); @@ -11320,6 +11572,7 @@ process_init_element (location_t loc, st continue; } + value.value = maybe_split_raw_data (value.value, &raw_data); if (value.value) { push_member_name (constructor_fields); @@ -11368,26 +11621,66 @@ process_init_element (location_t loc, st break; } - /* Now output the actual element. 
*/ - if (value.value) + if (value.value + && TREE_CODE (value.value) == RAW_DATA_CST + && RAW_DATA_LENGTH (value.value) > 1 + && (TREE_CODE (elttype) == INTEGER_TYPE + || TREE_CODE (elttype) == BITINT_TYPE) + && TYPE_PRECISION (elttype) == CHAR_BIT + && (constructor_max_index == NULL_TREE + || tree_int_cst_lt (constructor_index, + constructor_max_index))) { + unsigned int len = RAW_DATA_LENGTH (value.value); + if (constructor_max_index) + { + widest_int w = wi::to_widest (constructor_max_index); + w -= wi::to_widest (constructor_index); + w += 1; + if (w < len) + len = w.to_uhwi (); + } + if (len < (unsigned) RAW_DATA_LENGTH (value.value)) + { + raw_data = copy_node (value.value); + RAW_DATA_LENGTH (raw_data) -= len; + RAW_DATA_POINTER (raw_data) += len; + RAW_DATA_LENGTH (value.value) = len; + } + TREE_TYPE (value.value) = elttype; push_array_bounds (tree_to_uhwi (constructor_index)); output_init_element (loc, value.value, value.original_type, - strict_string, elttype, - constructor_index, true, implicit, - braced_init_obstack); + false, elttype, constructor_index, true, + implicit, braced_init_obstack); RESTORE_SPELLING_DEPTH (constructor_depth); + constructor_index + = size_binop_loc (input_location, PLUS_EXPR, + constructor_index, bitsize_int (len)); } + else + { + value.value = maybe_split_raw_data (value.value, &raw_data); + /* Now output the actual element. */ + if (value.value) + { + push_array_bounds (tree_to_uhwi (constructor_index)); + output_init_element (loc, value.value, value.original_type, + strict_string, elttype, + constructor_index, true, implicit, + braced_init_obstack); + RESTORE_SPELLING_DEPTH (constructor_depth); + } - constructor_index - = size_binop_loc (input_location, PLUS_EXPR, - constructor_index, bitsize_one_node); - - if (!value.value) - /* If we are doing the bookkeeping for an element that was - directly output as a constructor, we must update - constructor_unfilled_index. 
*/ - constructor_unfilled_index = constructor_index; + constructor_index + = size_binop_loc (input_location, PLUS_EXPR, + constructor_index, bitsize_one_node); + + if (!value.value) + /* If we are doing the bookkeeping for an element that was + directly output as a constructor, we must update + constructor_unfilled_index. */ + constructor_unfilled_index = constructor_index; + } } else if (gnu_vector_type_p (constructor_type)) { @@ -11402,6 +11695,7 @@ process_init_element (location_t loc, st break; } + value.value = maybe_split_raw_data (value.value, &raw_data); /* Now output the actual element. */ if (value.value) { @@ -11435,6 +11729,7 @@ process_init_element (location_t loc, st } else { + value.value = maybe_split_raw_data (value.value, &raw_data); if (value.value) output_init_element (loc, value.value, value.original_type, strict_string, constructor_type, @@ -11506,6 +11801,14 @@ process_init_element (location_t loc, st } constructor_range_stack = 0; + + if (raw_data && RAW_DATA_LENGTH (raw_data)) + { + gcc_assert (!string_flag && !was_designated); + value.value = raw_data; + raw_data = NULL_TREE; + goto retry; + } } /* Build a complete asm-statement, whose components are a CV_QUALIFIER --- gcc/tree.def.jj 2024-06-05 19:09:54.045617019 +0200 +++ gcc/tree.def 2024-07-05 10:10:48.372613006 +0200 @@ -309,6 +309,12 @@ DEFTREECODE (VECTOR_CST, "vector_cst", t /* Contents are TREE_STRING_LENGTH and the actual contents of the string. */ DEFTREECODE (STRING_CST, "string_cst", tcc_constant, 0) +/* Contents are RAW_DATA_LENGTH and the actual content + of the raw data, plus RAW_DATA_OWNER if non-NULL for owner of the + data (e.g. STRING_CST), if it is NULL, the data is owned by libcpp. + TREE_TYPE is the type of each of the RAW_DATA_LENGTH elements. */ +DEFTREECODE (RAW_DATA_CST, "raw_data_cst", tcc_constant, 0) + /* Declarations. All references to names are represented as ..._DECL nodes. The decls in one binding context are chained through the TREE_CHAIN field. 
Each DECL has a DECL_NAME field which contains --- gcc/c-family/c-lex.cc.jj 2024-02-22 19:29:51.226074838 +0100 +++ gcc/c-family/c-lex.cc 2024-07-04 14:58:33.568465437 +0200 @@ -781,6 +781,13 @@ c_lex_with_flags (tree *value, location_ *value = build_string (tok->val.str.len, (const char *)tok->val.str.text); break; + case CPP_EMBED: + *value = make_node (RAW_DATA_CST); + TREE_TYPE (*value) = integer_type_node; + RAW_DATA_LENGTH (*value) = tok->val.str.len; + RAW_DATA_POINTER (*value) = (const char *) tok->val.str.text; + break; + /* This token should not be visible outside cpplib. */ case CPP_MACRO_ARG: gcc_unreachable (); @@ -800,7 +807,7 @@ c_lex_with_flags (tree *value, location_ add_flags |= PREV_FALLTHROUGH; goto retry_after_at; } - goto retry; + goto retry; default: *value = NULL_TREE; --- gcc/tree-core.h.jj 2024-07-01 11:28:23.408228952 +0200 +++ gcc/tree-core.h 2024-07-03 19:41:28.821880055 +0200 @@ -1516,6 +1516,13 @@ struct GTY(()) tree_string { char str[1]; }; +struct GTY(()) tree_raw_data { + struct tree_typed typed; + tree owner; + const char *GTY ((skip(""))) str; + int length; +}; + struct GTY(()) tree_complex { struct tree_typed typed; tree real; @@ -2106,6 +2113,7 @@ union GTY ((ptr_alias (union lang_tree_n struct tree_fixed_cst GTY ((tag ("TS_FIXED_CST"))) fixed_cst; struct tree_vector GTY ((tag ("TS_VECTOR"))) vector; struct tree_string GTY ((tag ("TS_STRING"))) string; + struct tree_raw_data GTY ((tag ("TS_RAW_DATA_CST"))) raw_data_cst; struct tree_complex GTY ((tag ("TS_COMPLEX"))) complex; struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier; struct tree_decl_minimal GTY((tag ("TS_DECL_MINIMAL"))) decl_minimal; --- gcc/treestruct.def.jj 2024-01-03 11:51:38.761630845 +0100 +++ gcc/treestruct.def 2024-07-03 17:06:57.539794162 +0200 @@ -39,6 +39,7 @@ DEFTREESTRUCT(TS_REAL_CST, "real cst") DEFTREESTRUCT(TS_FIXED_CST, "fixed cst") DEFTREESTRUCT(TS_VECTOR, "vector") DEFTREESTRUCT(TS_STRING, "string") +DEFTREESTRUCT(TS_RAW_DATA_CST, "raw 
data cst") DEFTREESTRUCT(TS_COMPLEX, "complex") DEFTREESTRUCT(TS_IDENTIFIER, "identifier") DEFTREESTRUCT(TS_DECL_MINIMAL, "decl minimal") Jakub