Hi! The following patch attempts to implement the current wording of the C23 #embed expansion rules on top of the https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661901.html patch (I haven't yet adjusted the rest of the series, but I expect only minor tweaks). After parsing #embed, it first checks whether the tokens read with prevent_expansion = 1 match the <h-char-sequence> embed-parameter-sequence[opt] new-line or "q-char-sequence" embed-parameter-sequence[opt] new-line grammar. If they do not, it handles the directive like the previous patch did, i.e. everything after #embed is macro expanded. A mismatch can happen for tons of reasons: the first token being a CPP_NAME (rather than CPP_HEADER_NAME or CPP_STRING), an unbalanced token sequence in some parameter clause, or (not currently tested in the patch; that would need to wait for at least two gnu namespace parameters other than gnu::base64) e.g. identifier::identifier2::identifier3 () syntax, where "#define identifier gnu", "#define identifier2 offset (16) gnu" and "#define identifier3 whatever", etc. If they do match, the parameter names aren't macro expanded; only the limit (and later on gnu::offset) argument is macro expanded (and a diagnostic is emitted if the closing ) comes from a macro, so that one can't actually bypass the no-expansion rule as in embed-29.c), and so are the prefix/suffix/if_empty arguments. The latter are expanded only when actually emitted into the #embed replacement, so they can contain unbalanced parens after macro expansion; for a non-empty resource the if_empty tokens aren't macro expanded, and for an empty resource the prefix/suffix tokens aren't.
I've done this for #embed only and not for __has_embed because, as I wrote in my earlier mail, it is unclear whether that is always macro expanded or not given the conflicting wording (or should it be expanded twice in some cases?). Given those open questions on __has_embed, I haven't added further testsuite coverage for macro expansion of __has_embed. 2024-09-03 Jakub Jelinek <ja...@redhat.com> libcpp/ * internal.h (struct cpp_embed_params): Add no_expand member. * directives.cc (skip_balanced_token_seq): Don't set NO_EXPAND flags on the tokens here. (check_balanced_token_seq): New function. (do_embed): Check whether non-expanded tokens match <h-char-sequence> embed-parameter-sequence[opt] new-line or "q-char-sequence" embed-parameter-sequence[opt] new-line grammar, if yes, set params.no_expand and don't macro expand most of the tokens. * expr.cc (_cpp_parse_expr): Enable macro expansion if disabled in #embed argument and diagnose if closing paren comes from a macro. * files.cc (maybe_expand_embed_params_tokens): New function. (_cpp_stack_embed): Call maybe_expand_embed_params_tokens if needed, set NO_EXPAND flags on the tokens coming from prefix/suffix/if_empty. gcc/testsuite/ * c-c++-common/cpp/embed-28.c: New test. * c-c++-common/cpp/embed-29.c: New test. 
--- libcpp/internal.h.jj 2024-09-02 17:09:22.739723226 +0200 +++ libcpp/internal.h 2024-09-02 17:24:23.290579871 +0200 @@ -636,7 +636,7 @@ struct cpp_embed_params_tokens struct cpp_embed_params { location_t loc; - bool has_embed; + bool has_embed, no_expand; cpp_num_part limit; cpp_embed_params_tokens prefix, suffix, if_empty; }; --- libcpp/directives.cc.jj 2024-09-02 17:09:22.757723007 +0200 +++ libcpp/directives.cc 2024-09-03 15:46:09.891236633 +0200 @@ -977,7 +977,6 @@ skip_balanced_token_seq (cpp_reader *pfi save->cur_token = save->cur_run->base; } *save->cur_token = *token; - save->cur_token->flags |= NO_EXPAND; save->cur_token++; save->count++; } @@ -1187,6 +1186,49 @@ _cpp_parse_embed_params (cpp_reader *pfi while (1); } +/* Skip over balanced token sequence, stopping at END token. Return + true if it is valid, false if invalid. Update *CNT by the number of + consumed tokens. */ + +static bool +check_balanced_token_seq (cpp_reader *pfile, cpp_ttype end, unsigned *cnt) +{ + do + { + const cpp_token *token = cpp_peek_token (pfile, 0); + if (token->type == CPP_EOF) + return false; + token = cpp_get_token (pfile); + ++*cnt; + if (token->type == end) + return true; + switch (token->type) + { + case CPP_OPEN_PAREN: + if (!check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, cnt)) + return false; + break; + case CPP_OPEN_SQUARE: + if (!check_balanced_token_seq (pfile, CPP_CLOSE_SQUARE, cnt)) + return false; + break; + case CPP_OPEN_BRACE: + if (!check_balanced_token_seq (pfile, CPP_CLOSE_BRACE, cnt)) + return false; + break; + case CPP_CLOSE_PAREN: + case CPP_CLOSE_SQUARE: + case CPP_CLOSE_BRACE: + return false; + default: + break; + } + } + while (1); +} + + + /* Handle #embed directive. 
*/ static void @@ -1196,9 +1238,13 @@ do_embed (cpp_reader *pfile) struct cpp_embed_params params = {}; bool ok; const char *fname = NULL; + unsigned int cnt, state; + void (*line_change) (cpp_reader *, const cpp_token *, int); + unsigned char prevent_expansion; /* Tell the lexer this is an embed directive. */ pfile->state.in_directive = 3; + prevent_expansion = pfile->state.prevent_expansion; if (CPP_OPTION (pfile, traditional)) { @@ -1218,6 +1264,113 @@ do_embed (cpp_reader *pfile) "#%s before C23 is a GCC extension", "embed"); } + /* Determine if the #embed directive should be macro expanded or not. */ + pfile->state.prevent_expansion = 1; + pfile->keep_tokens++; + params.no_expand = true; + state = 0; + cnt = 0; + /* For peeked tokens temporarily disable line_change reporting, + until the tokens are parsed for real. */ + line_change = pfile->cb.line_change; + pfile->cb.line_change = NULL; + while (true) + { + const cpp_token *tok = cpp_peek_token (pfile, 0); + if (tok->type == CPP_EOF) + { + switch (state) + { + case 0: + case 3: + case 4: + params.no_expand = false; + break; + } + break; + } + tok = cpp_get_token (pfile); + ++cnt; + if (tok->type == CPP_PADDING) + continue; + switch (state) + { + case 0: + if ((tok->type == CPP_STRING && tok->val.str.text[0] != 'R') + || tok->type == CPP_HEADER_NAME) + { + pfile->state.angled_headers = false; + state = 1; + continue; + } + break; + case 1: + if (tok->type == CPP_NAME) + { + state = 2; + continue; + } + break; + case 2: + if (tok->type == CPP_NAME) + continue; + else if (tok->type == CPP_SCOPE) + { + state = 4; + continue; + } + else if (tok->type == CPP_COLON && (tok->flags & COLON_SCOPE) != 0) + { + state = 3; + continue; + } + else if (tok->type == CPP_OPEN_PAREN + && check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, &cnt)) + { + state = 1; + continue; + } + break; + case 3: + if (tok->type == CPP_COLON) + { + state = 4; + continue; + } + break; + case 4: + if (tok->type == CPP_NAME) + { + state = 5; + 
continue; + } + break; + case 5: + if (tok->type == CPP_NAME) + { + state = 2; + continue; + } + else if (tok->type == CPP_OPEN_PAREN + && check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, &cnt)) + { + state = 1; + continue; + } + break; + default: + break; + } + params.no_expand = false; + break; + } + + _cpp_backup_tokens_direct (pfile, cnt); + pfile->keep_tokens--; + pfile->cb.line_change = line_change; + pfile->state.angled_headers = true; + pfile->state.prevent_expansion = params.no_expand; + fname = parse_include (pfile, &angle_brackets, NULL, &params.loc); if (!fname) { @@ -1266,6 +1419,7 @@ do_embed (cpp_reader *pfile) } done: + pfile->state.prevent_expansion = prevent_expansion; XDELETEVEC (fname); } --- libcpp/expr.cc.jj 2024-09-02 17:09:22.775722787 +0200 +++ libcpp/expr.cc 2024-09-03 14:14:40.039487661 +0200 @@ -1379,6 +1379,7 @@ _cpp_parse_expr (cpp_reader *pfile, cons unsigned int lex_count; bool saw_leading_not, want_value = true; location_t virtual_location = 0; + unsigned char no_expand = 0; pfile->state.skip_eval = 0; @@ -1396,6 +1397,8 @@ _cpp_parse_expr (cpp_reader *pfile, cons top->op = CPP_OPEN_PAREN; top->token = open_paren; top->loc = open_paren->src_loc; + no_expand = pfile->state.prevent_expansion; + pfile->state.prevent_expansion = 0; } for (;;) @@ -1493,7 +1496,12 @@ _cpp_parse_expr (cpp_reader *pfile, cons { case CPP_CLOSE_PAREN: if (pfile->state.in_directive == 3 && top == pfile->op_stack) - goto embed_done; + { + if (pfile->context->prev && no_expand) + cpp_error_with_line (pfile, CPP_DL_ERROR, op.loc, 0, + "closing ')' comes from macro expansion"); + goto embed_done; + } continue; case CPP_OR_OR: if (!num_zerop (top->value)) @@ -1538,12 +1546,16 @@ _cpp_parse_expr (cpp_reader *pfile, cons cpp_error_with_line (pfile, CPP_DL_ICE, top->loc, 0, "unbalanced stack in %s", dir); syntax_error: + if (no_expand) + pfile->state.prevent_expansion = no_expand; return false; /* Return false on syntax error. 
*/ } if (pfile->state.in_directive == 3) { embed_done: + if (no_expand) + pfile->state.prevent_expansion = 1; if (num_zerop (top->value)) return 0; if (!top->value.unsignedp --- libcpp/files.cc.jj 2024-09-02 17:09:22.782722701 +0200 +++ libcpp/files.cc 2024-09-03 15:25:27.126296537 +0200 @@ -1217,6 +1217,76 @@ cpp_probe_header_unit (cpp_reader *pfile return nullptr; } +/* Macro expand the TOKENS. */ + +void +maybe_expand_embed_params_tokens (cpp_reader *pfile, + cpp_embed_params_tokens *tokens) +{ + if (tokens->count == 0) + return; + + _cpp_buff *tok_buff + = _cpp_get_buff (pfile, (tokens->count + 1) * sizeof (cpp_token)); + cpp_token *toks = (cpp_token *) tok_buff->base; + cpp_token *tok = toks; + tokenrun *cur_run = &tokens->base_run; + while (cur_run) + { + size_t cnt = (cur_run->next ? cur_run->limit + : tokens->cur_token) - cur_run->base; + cpp_token *t = cur_run->base; + memcpy (tok, t, cnt * sizeof (cpp_token)); + tok += cnt; + cur_run = cur_run->next; + } + tok->type = CPP_EOF; + tok->src_loc = pfile->line_table->highest_line; + tok->flags = BOL; + ++tok; + tokenrun *n; + for (tokenrun *t = &tokens->base_run; t; t = n) + { + n = t->next; + XDELETEVEC (t->base); + if (t != &tokens->base_run) + XDELETE (t); + } + _cpp_push_token_context (pfile, NULL, toks, tok - toks); + pfile->context->buff = tok_buff; + tokens->count = 0; + _cpp_init_tokenrun (&tokens->base_run, 4); + tokens->cur_run = &tokens->base_run; + tokens->cur_token = tokens->base_run.base; + pfile->state.prevent_expansion = 0; + pfile->state.in_directive = 0; + do + { + const cpp_token *token = cpp_peek_token (pfile, 0); + if (token->type == CPP_EOF) + break; + token = cpp_get_token (pfile); + if (token->type == CPP_PADDING && tokens->count == 0) + continue; + if (tokens->cur_token == tokens->cur_run->limit) + { + tokens->cur_run->next = XNEW (tokenrun); + tokens->cur_run->next->prev = tokens->cur_run; + _cpp_init_tokenrun (tokens->cur_run->next, 4); + tokens->cur_run = tokens->cur_run->next; + 
tokens->cur_token = tokens->cur_run->base; + } + *tokens->cur_token = *token; + tokens->cur_token++; + tokens->count++; + } + while (1); + while (pfile->context->prev) + _cpp_pop_context (pfile); + pfile->state.prevent_expansion = 1; + pfile->state.in_directive = 3; +} + /* Try to load FNAME with #embed/__has_embed parameters PARAMS. If !PARAMS->has_embed, return new token in pfile->directive_result (first token) and rest in a pushed non-macro context. @@ -1392,6 +1462,17 @@ _cpp_stack_embed (cpp_reader *pfile, con if (params->limit < limit) limit = params->limit; + if (params->no_expand) + { + if (limit) + { + maybe_expand_embed_params_tokens (pfile, &params->prefix); + maybe_expand_embed_params_tokens (pfile, &params->suffix); + } + else + maybe_expand_embed_params_tokens (pfile, &params->if_empty); + } + /* For sizes larger than say 64 bytes, this is just a temporary solution, we should emit a single new token which the FEs will handle as an optimization. */ @@ -1470,6 +1551,9 @@ _cpp_stack_embed (cpp_reader *pfile, con tok += cnt; cur_run = cur_run->next; } + pfile->directive_result.flags |= NO_EXPAND; + for (cpp_token *t = toks; t < tok; ++t) + t->flags |= NO_EXPAND; } for (size_t i = 0; i < limit; ++i) { @@ -1507,6 +1591,8 @@ _cpp_stack_embed (cpp_reader *pfile, con cur_run = cur_run->next; } orig_tok->flags |= PREV_WHITE; + for (cpp_token *t = orig_tok; t < tok; ++t) + t->flags |= NO_EXPAND; } pfile->directive_result.flags |= PREV_WHITE; if (count) --- gcc/testsuite/c-c++-common/cpp/embed-28.c.jj 2024-09-03 14:50:30.710448414 +0200 +++ gcc/testsuite/c-c++-common/cpp/embed-28.c 2024-09-03 15:49:05.877117035 +0200 @@ -0,0 +1,66 @@ +/* { dg-do run } */ +/* { dg-options "--embed-dir=${srcdir}/c-c++-common/cpp/embed-dir" } */ +/* { dg-additional-options "-std=c23" { target c } } */ + +const unsigned char a[] = { +#embed "magna-carta.txt" prefix (1, ) suffix (, 2) limit (128) +}; +#define embed ! +#define limit ! +#define prefix ! +#define suffix ! +#define if_empty ! 
+#define prefix_arg unsigned char b[] = { 1, +#define suffix_arg , 2 }; +#define limit_arg 128 +#define concat(x,y) x##y +#embed "magna-carta.txt" prefix (const prefix_arg) suffix (suffix_arg) limit (limit_arg) if_empty (concat (.,.) concat (<,>) concat (({[]}),(([[{{}}]])))) +#define magna_carta "magna-carta.txt" +#undef limit +#undef prefix +#undef suffix +#undef if_empty +#define limit __prefix__ +#define prefix __suffix__ +#define suffix __limit__ +#define empty +const unsigned char c[] = { +#embed empty "magna-carta.txt" limit (1, ) suffix (limit_arg) prefix (, 2) +}; +const unsigned char d[] = { +#embed magna_carta limit (1, ) prefix (, 2) suffix (128) +}; +#define ignore(x) +const unsigned char e[] = { +#embed "magna-carta.txt" limit (1, ) prefix (, 2) suffix (ignore ({[) 128) +}; +const unsigned char f[] = { +#embed "magna-carta.txt" __limit__ (0) __if_empty__ (1, 2) __prefix__ (concat (<,>)) __suffix__ (concat (<,>)) +}; +#undef limit +#define limit __limit__ (128) __prefix__(1, +const unsigned char g[] = { +#embed "magna-carta.txt" limit ) __suffix__(, 2) +}; + +int +main () +{ + if (sizeof (a) != 130 + || a[0] != 1 + || a[129] != 2 + || sizeof (b) != 130 + || __builtin_memcmp (a, b, 130) != 0 + || sizeof (c) != 130 + || __builtin_memcmp (a, c, 130) != 0 + || sizeof (d) != 130 + || __builtin_memcmp (a, d, 130) != 0 + || sizeof (e) != 130 + || __builtin_memcmp (a, e, 130) != 0 + || sizeof (f) != 2 + || f[0] != 1 + || f[1] != 2 + || sizeof (g) != 130 + || __builtin_memcmp (a, g, 130) != 0) + __builtin_abort (); +} --- gcc/testsuite/c-c++-common/cpp/embed-29.c.jj 2024-09-03 15:51:55.921255480 +0200 +++ gcc/testsuite/c-c++-common/cpp/embed-29.c 2024-09-03 16:00:06.171004183 +0200 @@ -0,0 +1,11 @@ +/* { dg-do preprocess } */ +/* { dg-options "--embed-dir=${srcdir}/c-c++-common/cpp/embed-dir" } */ +/* { dg-additional-options "-std=c23" { target c } } */ + +#define embed ! +#define limit ! +#define prefix ! +#define suffix ! +#define if_empty ! 
+#define limit_arg 1 ) __prefix__ (1, /* { dg-error "closing '\\\)' comes from macro expansion" } */ +#embed "magna-carta.txt" limit (limit_arg) /* { dg-message "in expansion of macro 'limit_arg'" } */ Jakub