On Wed, May 5, 2021 at 4:50 PM H.J. Lu <[email protected]> wrote:
>
> On Wed, May 05, 2021 at 09:36:16AM +0200, Richard Biener wrote:
> > On Mon, May 3, 2021 at 11:31 AM Ivan Sorokin via Gcc-patches
> > <[email protected]> wrote:
> > >
> > > Prior to this commit GCC -O2 generated quite bad code for this
> > > function:
> > >
> > > bool f()
> > > {
> > > return __builtin_cpu_supports("popcnt")
> > > && __builtin_cpu_supports("ssse3");
> > > }
> > >
> > > f:
> > > movl __cpu_model+12(%rip), %eax
> > > xorl %r8d, %r8d
> > > testb $4, %al
> > > je .L1
> > > shrl $6, %eax
> > > movl %eax, %r8d
> > > andl $1, %r8d
> > > .L1:
> > > movl %r8d, %eax
> > > ret
> > >
> > > The problem was caused by the fact that internally every invocation
> > > of __builtin_cpu_supports built a new variable __cpu_model and a new
> > > type __processor_model. Because of this GIMPLE level optimizers
> > > weren't able to CSE the loads of __cpu_model and optimize
> > > bit-operations properly.
> > >
> > > This commit fixes the problem by caching created __cpu_model
> > > variable and __processor_model type. Now the GCC -O2 generates:
> > >
> > > f:
> > > movl __cpu_model+12(%rip), %eax
> > > andl $68, %eax
> > > cmpl $68, %eax
> > > sete %al
> > > ret
> >
> > The patch looks good, the function could need a comment
> > and the global variables better names, not starting with __
> >
> > Up to the x86 maintainers - HJ, can you pick up this work?
> >
>
> Here is the updated patch to also handle and__cpu_features2.
> OK for master?
LGTM (Richi effectively approved tree parts).
Thanks,
Uros.
>
> Thanks.
>
> H.J.
> ---
> GCC -O2 generated quite bad code for this function:
>
> bool
> f (void)
> {
> return __builtin_cpu_supports("popcnt")
> && __builtin_cpu_supports("ssse3");
> }
>
> f:
> movl __cpu_model+12(%rip), %edx
> movl %edx, %eax
> shrl $6, %eax
> andl $1, %eax
> andl $4, %edx
> movl $0, %edx
> cmove %edx, %eax
> ret
>
> The problem was caused by the fact that internally every invocation of
> __builtin_cpu_supports built a new variable __cpu_model and a new type
> __processor_model. Because of this, GIMPLE level optimizers weren't able
> to CSE the loads of __cpu_model and optimize bit-operations properly.
>
> Improve GCC -O2 code generation by caching __cpu_model and__cpu_features2
> variables as well as their types:
>
> f:
> movl __cpu_model+12(%rip), %eax
> andl $68, %eax
> cmpl $68, %eax
> sete %al
> ret
>
> gcc/ChangeLog:
>
> 2021-05-05 Ivan Sorokin <[email protected]>
> H.J. Lu <[email protected]>
>
> PR target/91400
> * config/i386/i386-builtins.c (ix86_cpu_model_type_node): New.
> (ix86_cpu_model_var): Likewise.
> (ix86_cpu_features2_type_node): Likewise.
> (ix86_cpu_features2_var): Likewise.
> (fold_builtin_cpu): Cache __cpu_model and __cpu_features2 with
> their types.
>
> gcc/testsuite/Changelog:
>
> 2021-05-05 Ivan Sorokin <[email protected]>
> H.J. Lu <[email protected]>
>
> PR target/91400
> * gcc.target/i386/pr91400-1.c: New test.
> * gcc.target/i386/pr91400-2.c: Likewise.
> ---
> gcc/config/i386/i386-builtins.c | 52 +++++++++++++++--------
> gcc/testsuite/gcc.target/i386/pr91400-1.c | 14 ++++++
> gcc/testsuite/gcc.target/i386/pr91400-2.c | 14 ++++++
> 3 files changed, 63 insertions(+), 17 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr91400-2.c
>
> diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
> index b66911082ab..8036aedebac 100644
> --- a/gcc/config/i386/i386-builtins.c
> +++ b/gcc/config/i386/i386-builtins.c
> @@ -2103,6 +2103,11 @@ make_var_decl (tree type, const char *name)
> return new_decl;
> }
>
> +static GTY(()) tree ix86_cpu_model_type_node;
> +static GTY(()) tree ix86_cpu_model_var;
> +static GTY(()) tree ix86_cpu_features2_type_node;
> +static GTY(()) tree ix86_cpu_features2_var;
> +
> /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
> folded
> into an integer defined in libgcc/config/i386/cpuinfo.c */
>
> @@ -2114,12 +2119,16 @@ fold_builtin_cpu (tree fndecl, tree *args)
> = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl);
> tree param_string_cst = NULL;
>
> - tree __processor_model_type = build_processor_model_struct ();
> - tree __cpu_model_var = make_var_decl (__processor_model_type,
> - "__cpu_model");
> -
> -
> - varpool_node::add (__cpu_model_var);
> + if (ix86_cpu_model_var == nullptr)
> + {
> + /* Build a single __cpu_model variable for all references to
> + __cpu_model so that GIMPLE level optimizers can CSE the loads
> + of __cpu_model and optimize bit-operations properly. */
> + ix86_cpu_model_type_node = build_processor_model_struct ();
> + ix86_cpu_model_var = make_var_decl (ix86_cpu_model_type_node,
> + "__cpu_model");
> + varpool_node::add (ix86_cpu_model_var);
> + }
>
> gcc_assert ((args != NULL) && (*args != NULL));
>
> @@ -2160,7 +2169,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
> return integer_zero_node;
> }
>
> - field = TYPE_FIELDS (__processor_model_type);
> + field = TYPE_FIELDS (ix86_cpu_model_type_node);
> field_val = processor_alias_table[i].model;
>
> /* CPU types are stored in the next field. */
> @@ -2179,7 +2188,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
> }
>
> /* Get the appropriate field in __cpu_model. */
> - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
> + ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
> field, NULL_TREE);
>
> /* Check the value. */
> @@ -2212,13 +2221,22 @@ fold_builtin_cpu (tree fndecl, tree *args)
>
> if (isa_names_table[i].feature >= 32)
> {
> - tree index_type
> - = build_index_type (size_int (SIZE_OF_CPU_FEATURES));
> - tree type = build_array_type (unsigned_type_node, index_type);
> - tree __cpu_features2_var = make_var_decl (type,
> - "__cpu_features2");
> + if (ix86_cpu_features2_var == nullptr)
> + {
> + /* Build a single __cpu_features2 variable for all
> + references to __cpu_features2 so that GIMPLE level
> + optimizers can CSE the loads of __cpu_features2 and
> + optimize bit-operations properly. */
> + tree index_type
> + = build_index_type (size_int (SIZE_OF_CPU_FEATURES));
> + ix86_cpu_features2_type_node
> + = build_array_type (unsigned_type_node, index_type);
> + ix86_cpu_features2_var
> + = make_var_decl (ix86_cpu_features2_type_node,
> + "__cpu_features2");
> + varpool_node::add (ix86_cpu_features2_var);
> + }
>
> - varpool_node::add (__cpu_features2_var);
> for (unsigned int j = 0; j < SIZE_OF_CPU_FEATURES; j++)
> if (isa_names_table[i].feature < (32 + 32 + j * 32))
> {
> @@ -2226,7 +2244,7 @@ fold_builtin_cpu (tree fndecl, tree *args)
> - (32 + j * 32)));
> tree index = size_int (j);
> array_elt = build4 (ARRAY_REF, unsigned_type_node,
> - __cpu_features2_var,
> + ix86_cpu_features2_var,
> index, NULL_TREE, NULL_TREE);
> /* Return __cpu_features2[index] & field_val */
> final = build2 (BIT_AND_EXPR, unsigned_type_node,
> @@ -2237,13 +2255,13 @@ fold_builtin_cpu (tree fndecl, tree *args)
> }
> }
>
> - field = TYPE_FIELDS (__processor_model_type);
> + field = TYPE_FIELDS (ix86_cpu_model_type_node);
> /* Get the last field, which is __cpu_features. */
> while (DECL_CHAIN (field))
> field = DECL_CHAIN (field);
>
> /* Get the appropriate field: __cpu_model.__cpu_features */
> - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
> + ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var,
> field, NULL_TREE);
>
> /* Access the 0th element of __cpu_features array. */
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400-1.c
> b/gcc/testsuite/gcc.target/i386/pr91400-1.c
> new file mode 100644
> index 00000000000..6124058dc1e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400-1.c
> @@ -0,0 +1,14 @@
> +/* PR target/91400 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "cmpl" 1 } } */
> +/* { dg-final { scan-assembler-times "sete" 1 } } */
> +/* { dg-final { scan-assembler-not "cmove" } } */
> +
> +_Bool
> +f (void)
> +{
> + return __builtin_cpu_supports("popcnt")
> + && __builtin_cpu_supports("ssse3");
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr91400-2.c
> b/gcc/testsuite/gcc.target/i386/pr91400-2.c
> new file mode 100644
> index 00000000000..1af5a2f41cd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr91400-2.c
> @@ -0,0 +1,14 @@
> +/* PR target/91400 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-times "andl" 1 } } */
> +/* { dg-final { scan-assembler-times "cmpl" 1 } } */
> +/* { dg-final { scan-assembler-times "sete" 1 } } */
> +/* { dg-final { scan-assembler-not "cmove" } } */
> +
> +_Bool
> +f (void)
> +{
> + return __builtin_cpu_supports("avx512vnni")
> + && __builtin_cpu_supports("3dnow");
> +}
> --
> 2.31.1
>