gcc/ChangeLog 2015-07-09 Bernhard Reutner-Fischer <al...@gcc.gnu.org>
* builtins.c (fold_builtin_tolower, fold_builtin_toupper): New static functions. (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER. Signed-off-by: Bernhard Reutner-Fischer <rep.dot....@gmail.com> --- gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) Using the three testcases attached to PR66741 where the -1.c one is using builtins $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done pristine (trunk@225368): # tolower_strcpy-0 real 0m6.068s user 0m3.204s sys 0m2.840s # tolower_strcpy-1 real 0m8.097s user 0m5.548s sys 0m2.528s # tolower_strcpy-2 real 0m3.568s user 0m0.804s sys 0m2.748s trunk@225368 + fold tolower/toupper below # tolower_strcpy-0 real 0m6.055s user 0m3.212s sys 0m2.832s # tolower_strcpy-1 real 0m5.383s user 0m2.464s sys 0m2.900s # tolower_strcpy-2 real 0m3.605s user 0m0.668s sys 0m2.924s The tolower loop now ends up as .L5: movsbl (%rbx), %edx leal 32(%rdx), %ecx movl %edx, %eax subl $65, %edx cmpl $25, %edx cmovbe %ecx, %eax addq $1, %rbx movb %al, -1(%rbx) cmpq %rsi, %rbx jne .L5 instead of the former call .L5: movsbl (%rbx), %edi addq $1, %rbx call tolower movb %al, -1(%rbx) cmpq %rbp, %rbx jne .L5 Would something like the attached be OK for trunk after proper testing? Any advice on the inline questions about caching the lang_hooks intermediate results? Hints on further steps towards fixing the PR? I think the next step would be to try to teach graphite to fuse the two loops in tolower_strcpy-0.c. I need to look at graphite. Then see how to classify builtins that could be expanded early and what breaks if doing so. This sounds like a potential disaster, fun. Next, see why the vectorizer (or something else) does not pave the way to use SSE instructions as tolower_strcpy-2.c does. 
thanks, diff --git a/gcc/builtins.c b/gcc/builtins.c index 5f53342..421c908 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree); static tree fold_builtin_strspn (location_t, tree, tree); static tree fold_builtin_strcspn (location_t, tree, tree); +static tree fold_builtin_tolower (location_t, tree); +static tree fold_builtin_toupper (location_t, tree); + static rtx expand_builtin_object_size (tree); static rtx expand_builtin_memory_chk (tree, rtx, machine_mode, enum built_in_function); @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0) case BUILT_IN_ISDIGIT: return fold_builtin_isdigit (loc, arg0); + case BUILT_IN_TOLOWER: + return fold_builtin_tolower (loc, arg0); + + case BUILT_IN_TOUPPER: + return fold_builtin_toupper (loc, arg0); + CASE_FLT_FN (BUILT_IN_FINITE): case BUILT_IN_FINITED32: case BUILT_IN_FINITED64: @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2) } } + +/* Simplify a call to the tolower builtin. ARG is the argument to the call. + + Return NULL_TREE if ARG is not of INTEGER_TYPE or if the to_target_charset hook yields 0 for 'A', 'Z' or 'a'; otherwise return the + simplified form of the call as a tree, converted to integer_type_node. */ + +static tree +fold_builtin_tolower (location_t loc, tree arg) +{ + if (!validate_arg (arg, INTEGER_TYPE)) + return NULL_TREE; + + /* Transform tolower(c) into an unsigned range check plus a conditional add (not an unconditional OR with 0x20). + + More specifically: + unsigned tem = arg - 'A'; + if (tem <= ('Z' - 'A')) + arg += 'a' - 'A'; + return arg; + NOTE(review): ARG is referenced three times in the tree built below; confirm whether it must be wrapped (e.g. in a SAVE_EXPR) so that side effects are evaluated only once. NOTE(review): presumably this relies on 'A'..'Z' being contiguous in the target charset and on the C locale being in effect at run time -- confirm. */ + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); + unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z'); + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); + if (target_A == 0 + || target_Z == 0 + || target_a == 0) + return NULL_TREE; + + arg = fold_convert_loc (loc, unsigned_type_node, arg); + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, target_A)); + /* ??? 
x19 and x20 would better live in static storage (x19 is the upper bound of the range check, 'Z' - 'A'; x20 is the amount added to convert case, 'a' - 'A'; both in the target charset); Think: + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; + */ + unsigned HOST_WIDE_INT x19 = target_Z - target_A; + unsigned HOST_WIDE_INT x20 = target_a - target_A; + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, + build_int_cst (unsigned_type_node, x19)); + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, + fold_build2 (PLUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, x20)), + arg); + return fold_convert_loc (loc, integer_type_node, tem); +} + +/* Simplify a call to the toupper builtin. ARG is the argument to the call. + + Return NULL_TREE if ARG is not of INTEGER_TYPE or if the to_target_charset hook yields 0 for 'A', 'z' or 'a'; otherwise return the + simplified form of the call as a tree, converted to integer_type_node. */ + +static tree +fold_builtin_toupper (location_t loc, tree arg) +{ + if (!validate_arg (arg, INTEGER_TYPE)) + return NULL_TREE; + + /* Transform toupper(c) into an unsigned range check plus a conditional subtract (not an unconditional XOR with 0x20). + + More specifically: + unsigned tem = arg - 'a'; + if (tem <= ('z' - 'a')) + arg -= 'a' - 'A'; + return arg; + NOTE(review): ARG is referenced three times in the tree built below; confirm whether it must be wrapped (e.g. in a SAVE_EXPR) so that side effects are evaluated only once. NOTE(review): presumably this relies on 'a'..'z' being contiguous in the target charset and on the C locale being in effect at run time -- confirm. */ + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); + unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z'); + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); + if (target_A == 0 + || target_z == 0 + || target_a == 0) + return NULL_TREE; + + arg = fold_convert_loc (loc, unsigned_type_node, arg); + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, target_a)); + /* ??? 
x19 and x20 would better live in static storage (x19 is the upper bound of the range check, 'z' - 'a'; x20 is the amount subtracted to convert case, 'a' - 'A'; both in the target charset); Think: + * static struct static_fold_toupper {uHWI x19, x20; unsigned probe_done}; + */ + unsigned HOST_WIDE_INT x19 = target_z - target_a; + unsigned HOST_WIDE_INT x20 = target_a - target_A; + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, + build_int_cst (unsigned_type_node, x19)); + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, + fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, x20)), + arg); + return fold_convert_loc (loc, integer_type_node, tem); +} + + /* Fold the next_arg or va_start call EXP. Returns true if there was an error produced. False otherwise. This is done so that we don't output the error or warning twice or three times. */ -- 2.1.4