On Sat, Apr 26, 2025 at 2:42 AM Pengxuan Zheng <quic_pzh...@quicinc.com> wrote:
>
> Certain permute that blends a vector with zero can be interpreted as an AND 
> of a
> mask. This idea was suggested by Richard Sandiford when he was reviewing my
> patch which tries to optimize certain vector permute with the FMOV 
> instruction
> for the aarch64 target. Canonicalizing this class of vector permute as AND can
> be more general and potentially benefit more targets.
>
> For example, for the aarch64 target, at present:
>
> v4hi
> f_v4hi (v4hi x)
> {
>   return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> }
>
> generates:
>
> f_v4hi:
>         uzp1    v0.2d, v0.2d, v0.2d
>         adrp    x0, .LC0
>         ldr     d31, [x0, #:lo12:.LC0]
>         tbl     v0.8b, {v0.16b}, v31.8b
>         ret
> .LC0:
>         .byte   -1
>         .byte   -1
>         .byte   2
>         .byte   3
>         .byte   -1
>         .byte   -1
>         .byte   6
>         .byte   7
>
> With this patch, it generates:
>
> f_v4hi:
>         mvni    v31.2s, 0xff, msl 8
>         and     v0.8b, v0.8b, v31.8b
>         ret
>
> However, we do have to xfail a few i386 tests due to the new canonicalization
> this patch introduces and PR119922 has been filed to track these regressions.

That you need to XFAIL x86 tests suggests you want to implement this in
the backends vec_perm_const expander instead.

Richard.

>         PR target/100165
>
> gcc/ChangeLog:
>
>         * optabs.cc (vec_perm_and_mask): New function.
>         (expand_vec_perm_const): Add new AND canonicalization.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx-pr94680.c: XFAIL.
>         * gcc.target/i386/avx10_2-vmovd-1.c: Likewise.
>         * gcc.target/i386/avx10_2-vmovw-1.c: Likewise.
>         * gcc.target/i386/avx512f-pr94680.c: Likewise.
>         * gcc.target/i386/avx512fp16-pr94680.c: Likewise.
>         * gcc.target/i386/sse2-pr94680.c: Likewise.
>         * gcc.target/aarch64/and-be.c: New test.
>         * gcc.target/aarch64/and.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/optabs.cc                                 |  69 +++++++++-
>  gcc/testsuite/gcc.target/aarch64/and-be.c     | 125 ++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/and.c        | 125 ++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/avx-pr94680.c   |   3 +-
>  .../gcc.target/i386/avx10_2-vmovd-1.c         |   3 +-
>  .../gcc.target/i386/avx10_2-vmovw-1.c         |   3 +-
>  .../gcc.target/i386/avx512f-pr94680.c         |   3 +-
>  .../gcc.target/i386/avx512fp16-pr94680.c      |   3 +-
>  gcc/testsuite/gcc.target/i386/sse2-pr94680.c  |   3 +-
>  9 files changed, 330 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/and-be.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/and.c
>
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 0a14b1eef8a..dca9df42673 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
>    return NULL_RTX;
>  }
>
> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
> +   the non-zero vec_perm operand with some mask consisting of 0xffs and 
> 0x00s,
> +   assuming the other vec_perm operand is a constant vector of zeros.  Return
> +   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
> +   not be modeled as an and.  MODE is the mode of the value being anded.
> +   ZERO_OP0_P is true if the first operand of the vec_perm is a constant 
> vector
> +   of zeros or false if the second operand of the vec_perm is a constant 
> vector
> +   of zeros.  */
> +static rtx
> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
> +                  bool zero_op0_p)
> +{
> +  unsigned int nelt;
> +  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
> +    return NULL_RTX;
> +
> +  rtx_vector_builder builder (mode, nelt, 1);
> +  machine_mode emode = GET_MODE_INNER (mode);
> +
> +  for (unsigned int i = 0; i < nelt; i++)
> +    {
> +      if (!zero_op0_p)
> +       {
> +         if (known_eq (sel[i], i))
> +           builder.quick_push (CONSTM1_RTX (emode));
> +         else if (known_ge (sel[i], nelt))
> +           builder.quick_push (CONST0_RTX (emode));
> +         else
> +           return NULL_RTX;
> +       }
> +      else
> +       {
> +         if (known_eq (sel[i], nelt + i))
> +           builder.quick_push (CONSTM1_RTX (emode));
> +         else if (known_lt (sel[i], nelt))
> +           builder.quick_push (CONST0_RTX (emode));
> +         else
> +           return NULL_RTX;
> +       }
> +    }
> +
> +  return builder.build ();
> +}
> +
>  /* Implement a permutation of vectors v0 and v1 using the permutation
>     vector in SEL and return the result.  Use TARGET to hold the result
>     if nonnull and convenient.
> @@ -6422,12 +6466,18 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx 
> v1,
>    insn_code shift_code_qi = CODE_FOR_nothing;
>    optab shift_optab = unknown_optab;
>    rtx v2 = v0;
> +  bool zero_op0_p = false;
> +  bool zero_op1_p = false;
>    if (v1 == CONST0_RTX (GET_MODE (v1)))
> -    shift_optab = vec_shr_optab;
> +    {
> +      shift_optab = vec_shr_optab;
> +      zero_op1_p = true;
> +    }
>    else if (v0 == CONST0_RTX (GET_MODE (v0)))
>      {
>        shift_optab = vec_shl_optab;
>        v2 = v1;
> +      zero_op0_p = true;
>      }
>    if (shift_optab != unknown_optab)
>      {
> @@ -6463,6 +6513,23 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx 
> v1,
>             }
>         }
>      }
> +  /* See if the vec_perm can be interpreted as an and operation.  We only do
> +     this if one of the operands is all zeros.  */
> +  if (sel_mode != BLKmode && (zero_op0_p || zero_op1_p))
> +    {
> +      insn_code and_code = optab_handler (and_optab, sel_mode);
> +      rtx and_mask = vec_perm_and_mask (sel_mode, indices, zero_op0_p);
> +      if (and_code != CODE_FOR_nothing && and_mask)
> +       {
> +         class expand_operand ops[3];
> +         rtx tmp = gen_reg_rtx (sel_mode);
> +         create_output_operand (&ops[0], tmp, sel_mode);
> +         create_input_operand (&ops[1], gen_lowpart (sel_mode, v2), 
> sel_mode);
> +         create_input_operand (&ops[2], and_mask, sel_mode);
> +         if (maybe_expand_insn (and_code, 3, ops))
> +           return gen_lowpart (mode, ops[0].value);
> +       }
> +    }
>
>    if (targetm.vectorize.vec_perm_const != NULL)
>      {
> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c 
> b/gcc/testsuite/gcc.target/aarch64/and-be.c
> new file mode 100644
> index 00000000000..8ed87949f0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**     movi    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**     mvni    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                           (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
> +}
> +
> +/*
> +** g_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +g_v4si (v4si x)
> +{
> +  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
> +}
> +
> +/*
> +** h_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
> +}
> +
> +/*
> +** f_v4sf:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v\1.16b, v0.16b
> +**     ret
> +*/
> +v4sf
> +f_v4sf (v4sf x)
> +{
> +  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
> +}
> +
> +/*
> +** f_v8qi:
> +**     movi    d([0-9]+), 0xff00ff00ff000000
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                           (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v16qi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return __builtin_shuffle (
> +      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
> +      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/and.c 
> b/gcc/testsuite/gcc.target/aarch64/and.c
> new file mode 100644
> index 00000000000..56586612b6e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**     mvni    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**     movi    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                           (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
> +}
> +
> +/*
> +** g_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +g_v4si (v4si x)
> +{
> +  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
> +}
> +
> +/*
> +** h_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
> +}
> +
> +/*
> +** f_v4sf:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v\1.16b, v0.16b
> +**     ret
> +*/
> +v4sf
> +f_v4sf (v4sf x)
> +{
> +  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
> +}
> +
> +/*
> +** f_v8qi:
> +**     movi    d([0-9]+), 0xff00ff00ff
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                           (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v16qi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return __builtin_shuffle (
> +      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
> +      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c 
> b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> index cb5041b6af3..4dc5315265a 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx -mno-avx512f -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } 
> } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 { 
> xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>  typedef float v8sf __attribute__((vector_size(32)));
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c 
> b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> index 21bd1a1ef0a..593906bf36e 100644
> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> @@ -4,7 +4,8 @@
>  /* { dg-final { scan-assembler-times "vmovss\t\[0-9\]+\\(%e\[bs\]p\\), 
> %xmm0" 1 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 3 { target ia32 } 
> } } */
>  /* { dg-final { scan-assembler-times "vmovd\t%edi, %xmm0" 1 { target { ! 
> ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! 
> ia32 } } } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! 
> ia32 } xfail *-*-* } } } */
>
>
>  #include<immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c 
> b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> index 49fa51dc2ec..cb30a682260 100644
> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> @@ -3,7 +3,8 @@
>  /* { dg-final { scan-assembler-times "vmovw\t\[0-9\]+\\(%e\[bs\]p\\), %xmm0" 
> 4 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 4 { target ia32 } 
> } } */
>  /* { dg-final { scan-assembler-times "vmovw\t%edi, %xmm0" 1 { target { ! 
> ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! 
> ia32 } } } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! 
> ia32 } xfail *-*-* } } } */
>
>  #include<immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> index c27431aae72..af41b14ed7c 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} 
> } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12 { 
> xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> index bfe11236eef..631f26be9b5 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> @@ -1,7 +1,8 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
>  /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
> -/* { dg-final { scan-assembler-times "vmovq" 2 } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovq" 2 { xfail *-*-* } } } */
>
>  typedef _Float16 v32hf __attribute__((vector_size (64)));
>  typedef _Float16 v16hf __attribute__((vector_size (32)));
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c 
> b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> index 7e0ff9f6bc7..84692410534 100644
> --- a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-msse2 -mno-sse4.1 -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } 
> */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 { 
> xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>  typedef float v4sf __attribute__((vector_size(16)));
> --
> 2.17.1
>

Reply via email to