Testing my solution for bug 77671 (missing -Wformat-overflow sprintf with "%s") caused a regression in the charset/builtin2.c test for bug 25120 (builtin *printf handlers are confused by -fexec-charset). That led me to realize that like -Wformat itself, the whole gimple-ssa-sprintf pass is oblivious to potential differences between the source character set on the host and the execution character set on the target. As a result, when the host and target sets are different, the pass misinterprets ordinary format characters as special (e.g., parts of directives) and vice versa.
The attached patch implements a simple solution to this problem by introducing a mapping between the two sets. Martin
PR tree-optimization/80523 - -Wformat-overflow doesn't consider -fexec-charset gcc/ChangeLog: PR tree-optimization/80523 * gimple-ssa-sprintf.c (target_to_host_charmap): New global variable. (init_target_to_host_charmap, target_to_host, target_strtol10): New functions. (maybe_warn, format_directive, parse_directive): Use new functions. (pass_sprintf_length::execute): Call init_target_to_host_charmap. gcc/testsuite/ChangeLog: PR tree-optimization/80523 * gcc.dg/tree-ssa/builtin-sprintf-warn-18.c: New test. Index: gcc/gimple-ssa-sprintf.c =================================================================== --- gcc/gimple-ssa-sprintf.c (revision 247263) +++ gcc/gimple-ssa-sprintf.c (working copy) @@ -66,6 +66,7 @@ along with GCC; see the file COPYING3. If not see #include "calls.h" #include "cfgloop.h" #include "intl.h" +#include "langhooks.h" #include "builtins.h" #include "stor-layout.h" @@ -273,6 +274,118 @@ target_size_max () return tree_to_uhwi (TYPE_MAX_VALUE (size_type_node)); } +/* A straightforward mapping from the execution character set to the host + character set indexed by execution character. */ + +static char target_to_host_charmap[256]; + +/* Initialize a mapping from the execution character set to the host + character set. */ + +static bool +init_target_to_host_charmap () +{ + if (!init_target_chars ()) + return false; + + /* The subset of the source character set used by printf conversion + specifications (strictly speaking, not all letters are used but + they are included here for the sake of simplicity). The dollar + sign must be included even though it's not in the basic source + character set. */ + const char srcset[] = " ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz0123456789#%'*+-.$"; + + /* Set the mapping for all characters to some ordinary value (i,e., + not one used in printf conversion specifications) and overwrite + those that are used by conversion specifications with their + corresponding values. */ + memset (target_to_host_charmap + 1, '?', sizeof target_to_host_charmap - 1); + + for (const char *pc = srcset; *pc; ++pc) + { + /* Slice off the high end bits in case target characters are + signed. All values are expected to be non-nul, otherwise + there's a problem. */ + if (unsigned char tc = lang_hooks.to_target_charset (*pc)) + target_to_host_charmap[tc] = *pc; + else + return false; + } + + return true; +} + +/* Return the host source character corresponding to the character + CH in the execution character set if one exists, or some innocuous + (non-special, non-nul) source character otherwise. */ + +static inline int +target_to_host (unsigned char ch) +{ + return target_to_host_charmap[ch]; +} + +/* Return a string consisting of characters in the host source character + set corresponding to the string TARGSTR consisting of characters in + the execution character set. */ + +static const char* +target_to_host (const char *targstr) +{ + if (target_to_host ('%') == '%') + return targstr; + + static char hostr[32]; + for (char *ph = hostr; *targstr; ++targstr) + { + *ph++ = target_to_host (*targstr); + if (!*targstr) + break; + + if (ph - hostr == sizeof hostr - 4) + { + *ph = '\0'; + strcat (ph, "..."); + break; + } + } + + return hostr; +} + +/* Convert the sequence of decimal digits in the execution character + starting at S to a long, just like strtol does. Return the result + and set *END to one past the last converted character. */ + +static inline long +target_strtol10 (const char *s, char** end) +{ + /* As an optimization use the host strtol if the digit zero is the same + in the host and target character sets. */ + if ('0' == target_to_host ('0')) + return strtol (s, end, 10); + + long val = 0; + for ( ; ; ) + { + unsigned char c = target_to_host (*s); + if (ISDIGIT (c)) + { + val *= 10; + val += c - '0'; + } + else + break; + + ++s; + } + + *end = const_cast<char*>(s); + + return val; +} + /* Return the constant initial value of DECL if available or DECL otherwise. Same as the synonymous function in c/c-typeck.c. */ @@ -2289,7 +2402,7 @@ maybe_warn (substring_loc &dirloc, source_range *p /* The size of the destination region is exact. */ unsigned HOST_WIDE_INT navail = avail_range.max; - if (*dir.beg != '%') + if (target_to_host (*dir.beg) != '%') { /* For plain character directives (i.e., the format string itself) but not others, point the caret at the first character that's @@ -2340,7 +2453,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size %wu"))); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, res.min, + dir.len, target_to_host (dir.beg), res.min, navail); } @@ -2357,7 +2470,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.max, navail); } @@ -2377,7 +2490,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.likely, navail); } @@ -2394,7 +2507,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "%wu bytes into a region of size %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.min, res.max, navail); } @@ -2410,13 +2523,13 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.min, navail); } /* The size of the destination region is a range. */ - if (*dir.beg != '%') + if (target_to_host (*dir.beg) != '%') { unsigned HOST_WIDE_INT navail = avail_range.max; @@ -2469,7 +2582,7 @@ maybe_warn (substring_loc &dirloc, source_range *p return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, res.min, + dir.len, target_to_host (dir.beg), res.min, avail_range.min, avail_range.max); } @@ -2488,7 +2601,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size between %wu and %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, res.max, + dir.len, target_to_host (dir.beg), res.max, avail_range.min, avail_range.max); } @@ -2510,7 +2623,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size between %wu and %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, res.likely, + dir.len, target_to_host (dir.beg), res.likely, avail_range.min, avail_range.max); } @@ -2529,7 +2642,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "%wu bytes into a region of size between %wu and %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.min, res.max, avail_range.min, avail_range.max); } @@ -2547,7 +2660,7 @@ maybe_warn (substring_loc &dirloc, source_range *p "into a region of size between %wu and %wu")); return fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dir.len, dir.beg, + dir.len, target_to_host (dir.beg), res.min, avail_range.min, avail_range.max); } @@ -2636,7 +2749,7 @@ format_directive (const pass_sprintf_length::call_ { fmtwarn (dirloc, pargrange, NULL, info.warnopt (), "%<%.*s%> directive argument is null", - dirlen, dir.beg); + dirlen, target_to_host (dir.beg)); /* Don't bother processing the rest of the format string. */ res->warned = true; @@ -2703,7 +2816,7 @@ format_directive (const pass_sprintf_length::call_ info.warnopt (), "%<%.*s%> directive output of %wu bytes exceeds " "minimum required size of 4095", - dirlen, dir.beg, fmtres.range.min); + dirlen, target_to_host (dir.beg), fmtres.range.min); else { const char *fmtstr @@ -2715,7 +2828,7 @@ format_directive (const pass_sprintf_length::call_ warned = fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dirlen, dir.beg, + dirlen, target_to_host (dir.beg), fmtres.range.min, fmtres.range.max); } } @@ -2744,7 +2857,7 @@ format_directive (const pass_sprintf_length::call_ warned = fmtwarn (dirloc, pargrange, NULL, info.warnopt (), "%<%.*s%> directive output of %wu bytes causes " "result to exceed %<INT_MAX%>", - dirlen, dir.beg, fmtres.range.min); + dirlen, target_to_host (dir.beg), fmtres.range.min); else { const char *fmtstr @@ -2755,7 +2868,7 @@ format_directive (const pass_sprintf_length::call_ "bytes may cause result to exceed %<INT_MAX%>")); warned = fmtwarn (dirloc, pargrange, NULL, info.warnopt (), fmtstr, - dirlen, dir.beg, + dirlen, target_to_host (dir.beg), fmtres.range.min, fmtres.range.max); } } @@ -2847,7 +2960,7 @@ parse_directive (pass_sprintf_length::call_info &i directive &dir, format_result *res, const char *str, unsigned *argno) { - const char *pcnt = strchr (str, '%'); + const char *pcnt = strchr (str, target_percent); dir.beg = str; if (size_t len = pcnt ? pcnt - str : *str ? strlen (str) : 1) @@ -2889,7 +3002,7 @@ parse_directive (pass_sprintf_length::call_info &i For vararg functions set to void_node. */ tree star_precision = NULL_TREE; - if (ISDIGIT (*pf)) + if (ISDIGIT (target_to_host (*pf))) { /* This could be either a POSIX positional argument, the '0' flag, or a width, depending on what follows. Store it as @@ -2896,10 +3009,10 @@ parse_directive (pass_sprintf_length::call_info &i width and sort it out later after the next character has been seen. */ char *end; - width = strtol (pf, &end, 10); + width = target_strtol10 (pf, &end); pf = end; } - else if ('*' == *pf) + else if (target_to_host (*pf) == '*') { /* Similarly to the block above, this could be either a POSIX positional argument or a width, depending on what follows. */ @@ -2910,7 +3023,7 @@ parse_directive (pass_sprintf_length::call_info &i ++pf; } - if (*pf == '$') + if (target_to_host (*pf) == '$') { /* Handle the POSIX dollar sign which references the 1-based positional argument number. */ @@ -2959,7 +3072,7 @@ parse_directive (pass_sprintf_length::call_info &i the next field is the optional flags followed by an optional width. */ for ( ; ; ) { - switch (*pf) + switch (target_to_host (*pf)) { case ' ': case '0': @@ -2966,7 +3079,7 @@ parse_directive (pass_sprintf_length::call_info &i case '+': case '-': case '#': - dir.set_flag (*pf++); + dir.set_flag (target_to_host (*pf++)); break; default: @@ -2975,13 +3088,13 @@ parse_directive (pass_sprintf_length::call_info &i } start_width: - if (ISDIGIT (*pf)) + if (ISDIGIT (target_to_host (*pf))) { char *end; - width = strtol (pf, &end, 10); + width = target_strtol10 (pf, &end); pf = end; } - else if ('*' == *pf) + else if (target_to_host (*pf) == '*') { if (*argno < gimple_call_num_args (info.callstmt)) star_width = gimple_call_arg (info.callstmt, (*argno)++); @@ -2993,7 +3106,7 @@ parse_directive (pass_sprintf_length::call_info &i } ++pf; } - else if ('\'' == *pf) + else if (target_to_host (*pf) == '\'') { /* The POSIX apostrophe indicating a numeric grouping in the current locale. Even though it's possible to @@ -3005,17 +3118,17 @@ parse_directive (pass_sprintf_length::call_info &i } start_precision: - if ('.' == *pf) + if (target_to_host (*pf) == '.') { ++pf; - if (ISDIGIT (*pf)) + if (ISDIGIT (target_to_host (*pf))) { char *end; - precision = strtol (pf, &end, 10); + precision = target_strtol10 (pf, &end); pf = end; } - else if ('*' == *pf) + else if (target_to_host (*pf) == '*') { if (*argno < gimple_call_num_args (info.callstmt)) star_precision = gimple_call_arg (info.callstmt, (*argno)++); @@ -3035,10 +3148,10 @@ parse_directive (pass_sprintf_length::call_info &i } } - switch (*pf) + switch (target_to_host (*pf)) { case 'h': - if (pf[1] == 'h') + if (target_to_host (pf[1]) == 'h') { ++pf; dir.modifier = FMT_LEN_hh; @@ -3059,7 +3172,7 @@ parse_directive (pass_sprintf_length::call_info &i break; case 'l': - if (pf[1] == 'l') + if (target_to_host (pf[1]) == 'l') { ++pf; dir.modifier = FMT_LEN_ll; @@ -3080,7 +3193,7 @@ parse_directive (pass_sprintf_length::call_info &i break; } - switch (*pf) + switch (target_to_host (*pf)) { /* Handle a sole '%' character the same as "%%" but since it's undefined prevent the result from being folded. */ @@ -3141,7 +3254,7 @@ parse_directive (pass_sprintf_length::call_info &i return 0; } - dir.specifier = *pf++; + dir.specifier = target_to_host (*pf++); if (star_width) { @@ -3708,6 +3821,9 @@ pass_sprintf_length::handle_gimple_call (gimple_st unsigned int pass_sprintf_length::execute (function *fun) { + if (!target_to_host_charmap ['%']) + init_target_to_host_charmap (); + basic_block bb; FOR_EACH_BB_FN (bb, fun) { Index: gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c =================================================================== --- gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c (nonexistent) +++ gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c (working copy) @@ -0,0 +1,110 @@ +/* PR tree-optimization/80523 - -Wformat-overflow doesn't consider + -fexec-charset + { dg-do compile } + { dg-require-iconv "IBM1047" } + { dg-options "-O2 -Wall -Wno-format -Wformat-overflow -fexec-charset=IBM1047 -ftrack-macro-expansion=0" } */ + +char buf[1]; +void sink (void*); + +#define T(...) (__builtin_sprintf (buf + 1, __VA_ARGS__), sink (buf)) + +/* Exercise all special C and POSIX characters. */ + +void test_characters () +{ + T ("%%"); /* { dg-warning ".%%. directive writing 1 byte" } */ + + T ("%A", 0.0); /* { dg-warning ".%A. directive writing between 6 and 20 " } */ + T ("%a", 0.0); /* { dg-warning ".%a. directive writing between 6 and 20 " } */ + + T ("%C", 'a'); /* { dg-warning ".%C. directive writing 1 byte" "bug 80537" { xfail *-*-* } } */ + T ("%c", 'a'); /* { dg-warning ".%c. directive writing 1 byte" } */ + + T ("%d", 12); /* { dg-warning ".%d. directive writing 2 bytes" } */ + T ("% d", 12); /* { dg-warning ".% d. directive writing 3 bytes" } */ + T ("%-d", 123); /* { dg-warning ".%-d. directive writing 3 bytes" } */ + T ("%+d", 1234); /* { dg-warning ".%\\+d. directive writing 5 bytes" } */ + T ("%'d", 1234); /* { dg-warning ".%'d. directive writing 5 bytes" "bug 80535" { xfail *-*-* } } */ + T ("%1$d", 2345); /* { dg-warning ".%1\\\$d. directive writing 4 bytes" } */ + + /* Verify that digits are correctly interpreted as width and precision. */ + T ("%0d", 12345); /* { dg-warning ".%0d. directive writing 5 bytes" } */ + T ("%1d", 12345); /* { dg-warning ".%1d. directive writing 5 bytes" } */ + T ("%2d", 12345); /* { dg-warning ".%2d. directive writing 5 bytes" } */ + T ("%3d", 12345); /* { dg-warning ".%3d. directive writing 5 bytes" } */ + T ("%4d", 12345); /* { dg-warning ".%4d. directive writing 5 bytes" } */ + T ("%5d", 12345); /* { dg-warning ".%5d. directive writing 5 bytes" } */ + T ("%6d", 12345); /* { dg-warning ".%6d. directive writing 6 bytes" } */ + T ("%7d", 12345); /* { dg-warning ".%7d. directive writing 7 bytes" } */ + T ("%8d", 12345); /* { dg-warning ".%8d. directive writing 8 bytes" } */ + T ("%9d", 12345); /* { dg-warning ".%9d. directive writing 9 bytes" } */ + + T ("%.0d", 12345); /* { dg-warning ".%.0d. directive writing 5 bytes" } */ + T ("%.1d", 12345); /* { dg-warning ".%.1d. directive writing 5 bytes" } */ + T ("%.2d", 12345); /* { dg-warning ".%.2d. directive writing 5 bytes" } */ + T ("%.3d", 12345); /* { dg-warning ".%.3d. directive writing 5 bytes" } */ + T ("%.4d", 12345); /* { dg-warning ".%.4d. directive writing 5 bytes" } */ + T ("%.5d", 12345); /* { dg-warning ".%.5d. directive writing 5 bytes" } */ + T ("%.6d", 12345); /* { dg-warning ".%.6d. directive writing 6 bytes" } */ + T ("%.7d", 12345); /* { dg-warning ".%.7d. directive writing 7 bytes" } */ + T ("%.8d", 12345); /* { dg-warning ".%.8d. directive writing 8 bytes" } */ + T ("%.9d", 12345); /* { dg-warning ".%.9d. directive writing 9 bytes" } */ + + T ("%hhd", 12); /* { dg-warning ".%hhd. directive writing 2 bytes" } */ + T ("%hd", 234); /* { dg-warning ".%hd. directive writing 3 bytes" } */ + + { + const __PTRDIFF_TYPE__ i = 3456; + T ("%jd", i); /* { dg-warning ".%jd. directive writing 4 bytes" } */ + } + + T ("%ld", 45678L); /* { dg-warning ".%ld. directive writing 5 bytes" } */ + + { + const __PTRDIFF_TYPE__ i = 56789; + T ("%td", i); /* { dg-warning ".%td. directive writing 5 bytes" } */ + } + + { + const __SIZE_TYPE__ i = 67890; + T ("%zd", i); /* { dg-warning ".%zd. directive writing 5 bytes" } */ + } + + T ("%E", 0.0); /* { dg-warning ".%E. directive writing 12 bytes" } */ + T ("%e", 0.0); /* { dg-warning ".%e. directive writing 12 bytes" } */ + T ("%F", 0.0); /* { dg-warning ".%F. directive writing 8 bytes" } */ + T ("%f", 0.0); /* { dg-warning ".%f. directive writing 8 bytes" } */ + T ("%G", 0.0); /* { dg-warning ".%G. directive writing 1 byte" } */ + T ("%g", 0.0); /* { dg-warning ".%g. directive writing 1 byte" } */ + + T ("%i", 123); /* { dg-warning ".%i. directive writing 3 bytes" } */ + + { + int n; + + T ("%n", &n); /* { dg-warning "writing a terminating nul" } */ + T ("%nH", &n); /* { dg-warning ".H. directive writing 1 byte" } */ + } + + T ("%o", 999); /* { dg-warning ".%o. directive writing 4 bytes" } */ + T ("%#o", 999); /* { dg-warning ".%#o. directive writing 5 bytes" } */ + + T ("%x", 1234); /* { dg-warning ".%x. directive writing 3 bytes" } */ + T ("%#X", 1235); /* { dg-warning ".%#X. directive writing 5 bytes" } */ + + T ("%S", L"1"); /* { dg-warning ".%S. directive writing 1 byte" } */ + T ("%-s", "1"); /* { dg-warning ".%-s. directive writing 1 byte" } */ + + /* Verify that characters in the source character set appear in + the text of the warning unchanged (i.e., not as their equivalents + in the execution character set on the target). The trailing %% + disables sprintf->strcpy optimization. */ + T ("ABCDEFGHIJ%%"); /* { dg-warning ".ABCDEFGHIJ. directive writing 10 bytes" } */ + T ("KLMNOPQRST%%"); /* { dg-warning ".KLMNOPQRST. directive writing 10 bytes" } */ + T ("UVWXYZ%%"); /* { dg-warning ".UVWXYZ. directive writing 6 bytes" } */ + + T ("abcdefghij%%"); /* { dg-warning ".abcdefghij. directive writing 10 bytes" } */ + T ("klmnopqrst%%"); /* { dg-warning ".klmnopqrst. directive writing 10 bytes" } */ + T ("uvwxyz%%"); /* { dg-warning ".uvwxyz. directive writing 6 bytes" } */ +}