On Thu, Aug 26, 2021 at 12:57 PM liuhongt <hongtao....@intel.com> wrote:
>
>   This patch is a follow-up to [1]; it folds all shufps/shufpd builtins into
> gimple.
Of course, this applies only to the non-masked versions or to masked versions whose mask is all ones.
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2019-May/521983.html
>
> gcc/
>         PR target/98167
>         PR target/43147
>         * config/i386/i386.c (ix86_gimple_fold_builtin): Fold
>         IX86_BUILTIN_SHUFPD512, IX86_BUILTIN_SHUFPS512,
>         IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS,
>         IX86_BUILTIN_SHUFPS256.
>         (ix86_masked_all_ones): New function.
>
> gcc/testsuite/
>         * gcc.target/i386/avx512f-vshufpd-1.c: Adjust testcase.
>         * gcc.target/i386/avx512f-vshufps-1.c: Adjust testcase.
>         * gcc.target/i386/pr43147.c: New test.
> ---
>  gcc/config/i386/i386.c                        | 90 ++++++++++++++-----
>  .../gcc.target/i386/avx512f-vshufpd-1.c       |  3 +-
>  .../gcc.target/i386/avx512f-vshufps-1.c       |  3 +-
>  gcc/testsuite/gcc.target/i386/pr43147.c       | 15 ++++
>  4 files changed, 87 insertions(+), 24 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ebec8668758..f3eed9f2426 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -17541,6 +17541,20 @@ ix86_vector_shift_count (tree arg1)
>    return NULL_TREE;
>  }
>
> +/* Return true if ARG_MASK is a constant whose low ELEMS bits are all ones.  */
> +static bool
> +ix86_masked_all_ones (unsigned HOST_WIDE_INT elems, tree arg_mask)
> +{
> +  if (TREE_CODE (arg_mask) != INTEGER_CST)
> +    return false;
> +
> +  unsigned HOST_WIDE_INT mask = TREE_INT_CST_LOW (arg_mask);
> +  if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
> +    return false;
> +
> +  return true;
> +}
> +
>  static tree
>  ix86_fold_builtin (tree fndecl, int n_args,
>                    tree *args, bool ignore ATTRIBUTE_UNUSED)
> @@ -18026,6 +18040,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>    enum tree_code tcode;
>    unsigned HOST_WIDE_INT count;
>    bool is_vshift;
> +  unsigned HOST_WIDE_INT elems;
>
>    switch (fn_code)
>      {
> @@ -18349,17 +18364,11 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>        gcc_assert (n_args >= 2);
>        arg0 = gimple_call_arg (stmt, 0);
>        arg1 = gimple_call_arg (stmt, 1);
> -      if (n_args > 2)
> -       {
> -         /* This is masked shift.  Only optimize if the mask is all ones.  */
> -         tree argl = gimple_call_arg (stmt, n_args - 1);
> -         if (!tree_fits_uhwi_p (argl))
> -           break;
> -         unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
> -         unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> -         if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
> -           break;
> -       }
> +      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +      /* For masked shift, only optimize if the mask is all ones.  */
> +      if (n_args > 2
> +         && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, n_args - 1)))
> +       break;
>        if (is_vshift)
>         {
>           if (TREE_CODE (arg1) != VECTOR_CST)
> @@ -18408,25 +18417,62 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>         }
>        break;
>
> +    case IX86_BUILTIN_SHUFPD512:
> +    case IX86_BUILTIN_SHUFPS512:
>      case IX86_BUILTIN_SHUFPD:
> +    case IX86_BUILTIN_SHUFPD256:
> +    case IX86_BUILTIN_SHUFPS:
> +    case IX86_BUILTIN_SHUFPS256:
> +      arg0 = gimple_call_arg (stmt, 0);
> +      elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +      /* This is masked shuffle.  Only optimize if the mask is all ones.  */
> +      if (n_args > 3
> +         && !ix86_masked_all_ones (elems,
> +                                   gimple_call_arg (stmt, n_args - 1)))
> +       break;
>        arg2 = gimple_call_arg (stmt, 2);
>        if (TREE_CODE (arg2) == INTEGER_CST)
>         {
> +         unsigned HOST_WIDE_INT shuffle_mask = TREE_INT_CST_LOW (arg2);
> +         /* Check valid imm, refer to gcc.target/i386/testimm-10.c.  */
> +         if (shuffle_mask > 255)
> +           return false;
> +
> +         machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0)));
>           location_t loc = gimple_location (stmt);
> -         unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2);
> -         arg0 = gimple_call_arg (stmt, 0);
> +         tree itype = (imode == E_DFmode
> +                       ? long_long_integer_type_node : integer_type_node);
> +         tree vtype = build_vector_type (itype, elems);
> +         tree_vector_builder elts (vtype, elems, 1);
> +
> +
> +         /* Transform integer shuffle_mask to vector perm_mask which
> +            is used by vec_perm_expr, refer to shuflp[sd]256/512 in sse.md.  */
> +         for (unsigned i = 0; i != elems; i++)
> +           {
> +             unsigned sel_idx;
> +             /* Imm[1:0](if VL > 128, then use Imm[3:2],Imm[5:4],Imm[7:6])
> +                provide 2 select controls for each element of the
> +                destination.  */
> +             if (imode == E_DFmode)
> +               sel_idx = (i & 1) * elems + (i & ~1)
> +                         + ((shuffle_mask >> i) & 1);
> +             else
> +               {
> +                 /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select
> +                    controls for each element of the destination.  */
> +                 unsigned j = i % 4;
> +                 sel_idx = ((i >> 1) & 1) * elems + (i & ~3)
> +                           + ((shuffle_mask >> 2 * j) & 3);
> +               }
> +             elts.quick_push (build_int_cst (itype, sel_idx));
> +           }
> +
> +         tree perm_mask = elts.build ();
>           arg1 = gimple_call_arg (stmt, 1);
> -         tree itype = long_long_integer_type_node;
> -         tree vtype = build_vector_type (itype, 2); /* V2DI */
> -         tree_vector_builder elts (vtype, 2, 1);
> -         /* Ignore bits other than the lowest 2.  */
> -         elts.quick_push (build_int_cst (itype, imask & 1));
> -         imask >>= 1;
> -         elts.quick_push (build_int_cst (itype, 2 + (imask & 1)));
> -         tree omask = elts.build ();
>           gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
>                                            VEC_PERM_EXPR,
> -                                          arg0, arg1, omask);
> +                                          arg0, arg1, perm_mask);
>           gimple_set_location (g, loc);
>           gsi_replace (gsi, g, false);
>           return true;
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> index d1ac01e1c88..8df5b9d4441 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufpd-1.c
> @@ -7,11 +7,12 @@
>  #include <immintrin.h>
>
>  __m512d x;
> +__m512d y;
>
>  void extern
>  avx512f_test (void)
>  {
> -  x = _mm512_shuffle_pd (x, x, 56);
> +  x = _mm512_shuffle_pd (x, y, 56);
>    x = _mm512_mask_shuffle_pd (x, 2, x, x, 56);
>    x = _mm512_maskz_shuffle_pd (2, x, x, 56);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> index 07a63fca3ff..378ae4b7101 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-vshufps-1.c
> @@ -7,11 +7,12 @@
>  #include <immintrin.h>
>
>  __m512 x;
> +__m512 y;
>
>  void extern
>  avx512f_test (void)
>  {
> -  x = _mm512_shuffle_ps (x, x, 56);
> +  x = _mm512_shuffle_ps (x, y, 56);
>    x = _mm512_mask_shuffle_ps (x, 2, x, x, 56);
>    x = _mm512_maskz_shuffle_ps (2, x, x, 56);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c
> new file mode 100644
> index 00000000000..3c30f917c06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr43147.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2" } */
> +/* { dg-final { scan-assembler "movaps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +#include <x86intrin.h>
> +
> +__m128
> +foo (void)
> +{
> +  __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
> +  m = _mm_shuffle_ps(m, m, 0xC9);
> +  m = _mm_shuffle_ps(m, m, 0x2D);
> +  return m;
> +}
> --
> 2.18.1
>


-- 
BR,
Hongtao

Reply via email to