On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:
> gcc/ChangeLog
>
> 2015-07-09 Bernhard Reutner-Fischer <[email protected]>
>
> * builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
> static functions.
> (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.
As I read it you fold tolower (X) to (X) >= target_char_set ('A')
&& (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') +
target_char_set ('a') : (X);
I don't think this can be correct for all locales: a locale need not
have a lower-case character for every upper-case one, nor do all
letters that have one need to be in the range 'A' to 'Z'.
Joseph will surely correct me if I am wrong.
What might work instead is constant folding.
Richard.
> Signed-off-by: Bernhard Reutner-Fischer <[email protected]>
> ---
> gcc/builtins.c | 99
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 99 insertions(+)
>
> Using the three testcases attached to PR66741 where the -1.c one is using
> builtins
> $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W
> -Wall -Wextra -pedantic -DMAIN -msse4.2;done
>
> pristine (trunk@225368):
> # tolower_strcpy-0
>
> real 0m6.068s
> user 0m3.204s
> sys 0m2.840s
> # tolower_strcpy-1
>
> real 0m8.097s
> user 0m5.548s
> sys 0m2.528s
> # tolower_strcpy-2
>
> real 0m3.568s
> user 0m0.804s
> sys 0m2.748s
>
> trunk@225368 + fold tolower/toupper below
>
> # tolower_strcpy-0
>
> real 0m6.055s
> user 0m3.212s
> sys 0m2.832s
> # tolower_strcpy-1
>
> real 0m5.383s
> user 0m2.464s
> sys 0m2.900s
> # tolower_strcpy-2
>
> real 0m3.605s
> user 0m0.668s
> sys 0m2.924s
>
> The tolower loop now ends up as
> .L5:
> movsbl (%rbx), %edx
> leal 32(%rdx), %ecx
> movl %edx, %eax
> subl $65, %edx
> cmpl $25, %edx
> cmovbe %ecx, %eax
> addq $1, %rbx
> movb %al, -1(%rbx)
> cmpq %rsi, %rbx
> jne .L5
>
> instead of the former call
>
> .L5:
> movsbl (%rbx), %edi
> addq $1, %rbx
> call tolower
> movb %al, -1(%rbx)
> cmpq %rbp, %rbx
> jne .L5
>
> Would something like attached be ok for trunk after proper testing?
> Advice on the questions inline WRT caching lang_hooks intermediate
> results?
> Hints on further steps towards fixing the PR?
>
> I think the next step would be to try to teach graphite to fuse the two
> loops in tolower_strcpy-0.c. Need to look at graphite..
> Then see how to classify builtins that could be expanded early and what
> breaks if doing so. This sounds like a potential disaster, fun.
> Next, see why the vectorizer (or something else) does not pave the way
> for using SSE instructions as tolower_strcpy-2.c does.
>
> thanks,
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 5f53342..421c908 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree,
> tree);
> static tree fold_builtin_strspn (location_t, tree, tree);
> static tree fold_builtin_strcspn (location_t, tree, tree);
>
> +static tree fold_builtin_tolower (location_t, tree);
> +static tree fold_builtin_toupper (location_t, tree);
> +
> static rtx expand_builtin_object_size (tree);
> static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
> enum built_in_function);
> @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree
> arg0)
> case BUILT_IN_ISDIGIT:
> return fold_builtin_isdigit (loc, arg0);
>
> + case BUILT_IN_TOLOWER:
> + return fold_builtin_tolower (loc, arg0);
> +
> + case BUILT_IN_TOUPPER:
> + return fold_builtin_toupper (loc, arg0);
> +
> CASE_FLT_FN (BUILT_IN_FINITE):
> case BUILT_IN_FINITED32:
> case BUILT_IN_FINITED64:
> @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree
> s2)
> }
> }
>
> +
> +/* Simplify a call to the tolower builtin. ARG is the argument to the call.
> + LOC is the location passed to the *_loc tree builders used below.
> + Return NULL_TREE if no simplification was possible, otherwise return the
> + simplified form of the call as a tree. */
> +
> +static tree
> +fold_builtin_tolower (location_t loc, tree arg)
> +{
> + if (!validate_arg (arg, INTEGER_TYPE))
> + return NULL_TREE;
> +
> + /* Transform tolower(c) into an open-coded range check plus offset
> + (equivalent to c | 0x20 for 'A'..'Z' in ASCII).
> + More specifically:
> + unsigned tem = arg - 'A';
> + if (tem <= ('Z' - 'A'))
> + arg += 'a' - 'A';
> + return arg;
> + */
> + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> + unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
> + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> + if (target_A == 0
> + || target_Z == 0
> + || target_a == 0)
> + return NULL_TREE;
> +
> + arg = fold_convert_loc (loc, unsigned_type_node, arg);
> + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> + build_int_cst (unsigned_type_node, target_A));
> + /* ??? x19 ('Z' - 'A') and x20 ('a' - 'A') would better live in static storage:
> + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
> + */
> + unsigned HOST_WIDE_INT x19 = target_Z - target_A;
> + unsigned HOST_WIDE_INT x20 = target_a - target_A;
> + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> + build_int_cst (unsigned_type_node, x19));
> + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> + fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
> + build_int_cst (unsigned_type_node, x20)),
> + arg);
> + return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +/* Simplify a call to the toupper builtin. ARG is the argument to the call.
> + LOC is the location passed to the *_loc tree builders used below.
> + Return NULL_TREE if no simplification was possible, otherwise return the
> + simplified form of the call as a tree. */
> +
> +static tree
> +fold_builtin_toupper (location_t loc, tree arg)
> +{
> + if (!validate_arg (arg, INTEGER_TYPE))
> + return NULL_TREE;
> +
> + /* Transform toupper(c) into an open-coded range check plus offset
> + (equivalent to c ^ 0x20 for 'a'..'z' in ASCII).
> + More specifically:
> + unsigned tem = arg - 'a';
> + if (tem <= ('z' - 'a'))
> + arg -= 'a' - 'A';
> + return arg;
> + */
> + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> + unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
> + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> + if (target_A == 0
> + || target_z == 0
> + || target_a == 0)
> + return NULL_TREE;
> +
> + arg = fold_convert_loc (loc, unsigned_type_node, arg);
> + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> + build_int_cst (unsigned_type_node, target_a));
> + /* ??? x19 ('z' - 'a') and x20 ('a' - 'A') would better live in static storage:
> + * static struct static_fold_toupper {uHWI x19, x20; unsigned probe_done};
> + */
> + unsigned HOST_WIDE_INT x19 = target_z - target_a;
> + unsigned HOST_WIDE_INT x20 = target_a - target_A;
> + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> + build_int_cst (unsigned_type_node, x19));
> + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> + fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> + build_int_cst (unsigned_type_node, x20)),
> + arg);
> + return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +
> /* Fold the next_arg or va_start call EXP. Returns true if there was an error
> produced. False otherwise. This is done so that we don't output the
> error
> or warning twice or three times. */
>
--
Richard Biener <[email protected]>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham
Norton, HRB 21284 (AG Nuernberg)