On Sat, Apr 26, 2025 at 2:42 AM Pengxuan Zheng <quic_pzh...@quicinc.com> wrote:
>
> Certain permutes that blend a vector with zero can be interpreted as an
> AND with a mask.  This idea was suggested by Richard Sandiford when he was
> reviewing my patch which tries to optimize certain vector permutes with
> the FMOV instruction for the aarch64 target.  Canonicalizing this class of
> vector permute as AND can be more general and potentially benefit more
> targets.
>
> For example, for the aarch64 target, at present:
>
> v4hi
> f_v4hi (v4hi x)
> {
>   return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> }
>
> generates:
>
> f_v4hi:
>         uzp1    v0.2d, v0.2d, v0.2d
>         adrp    x0, .LC0
>         ldr     d31, [x0, #:lo12:.LC0]
>         tbl     v0.8b, {v0.16b}, v31.8b
>         ret
> .LC0:
>         .byte   -1
>         .byte   -1
>         .byte   2
>         .byte   3
>         .byte   -1
>         .byte   -1
>         .byte   6
>         .byte   7
>
> With this patch, it generates:
>
> f_v4hi:
>         mvni    v31.2s, 0xff, msl 8
>         and     v0.8b, v0.8b, v31.8b
>         ret
>
> However, we do have to XFAIL a few i386 tests due to the new
> canonicalization this patch introduces, and PR119922 has been filed to
> track these regressions.
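
For reference, the selector-to-mask mapping the new vec_perm_and_mask
helper implements can be sketched in plain C for the case where the second
shuffle operand is the zero vector (the function below is illustrative
only, not GCC internals):

#include <stdbool.h>
#include <stddef.h>

/* Given a shuffle of (x, {0,...,0}) with selector SEL of length NELT,
   fill MASK with the per-lane AND mask, or return false if the shuffle
   cannot be modeled as an AND.  Illustrative sketch only.  */
static bool
shuffle_to_and_mask (const unsigned *sel, size_t nelt, long long *mask)
{
  for (size_t i = 0; i < nelt; i++)
    {
      if (sel[i] == i)
        mask[i] = -1;       /* lane i of x is kept: AND with all ones.  */
      else if (sel[i] >= nelt)
        mask[i] = 0;        /* lane comes from the zero vector: AND with 0.  */
      else
        return false;       /* genuine data movement, not an AND.  */
    }
  return true;
}

For the f_v4hi example above, sel = { 4, 1, 6, 3 } with nelt = 4 gives
mask = { 0, -1, 0, -1 }, which is exactly the constant the mvni + and
sequence materializes.
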
That you need to XFAIL x86 tests suggests you want to implement this in
the backend's vec_perm_const expander instead.

Richard.

>         PR target/100165
>
> gcc/ChangeLog:
>
>         * optabs.cc (vec_perm_and_mask): New function.
>         (expand_vec_perm_const): Add new AND canonicalization.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx-pr94680.c: XFAIL.
>         * gcc.target/i386/avx10_2-vmovd-1.c: Likewise.
>         * gcc.target/i386/avx10_2-vmovw-1.c: Likewise.
>         * gcc.target/i386/avx512f-pr94680.c: Likewise.
>         * gcc.target/i386/avx512fp16-pr94680.c: Likewise.
>         * gcc.target/i386/sse2-pr94680.c: Likewise.
>         * gcc.target/aarch64/and-be.c: New test.
>         * gcc.target/aarch64/and.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/optabs.cc                                 |  69 +++++++++-
>  gcc/testsuite/gcc.target/aarch64/and-be.c     | 125 ++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/and.c        | 125 ++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/avx-pr94680.c   |   3 +-
>  .../gcc.target/i386/avx10_2-vmovd-1.c         |   3 +-
>  .../gcc.target/i386/avx10_2-vmovw-1.c         |   3 +-
>  .../gcc.target/i386/avx512f-pr94680.c         |   3 +-
>  .../gcc.target/i386/avx512fp16-pr94680.c      |   3 +-
>  gcc/testsuite/gcc.target/i386/sse2-pr94680.c  |   3 +-
>  9 files changed, 330 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/and-be.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/and.c
>
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 0a14b1eef8a..dca9df42673 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
>      return NULL_RTX;
>  }
>
> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
> +   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
> +   assuming the other vec_perm operand is a constant vector of zeros.  Return
> +   the mask for the equivalent and operation, or NULL_RTX if the vec_perm
> +   cannot be modeled as an and.  MODE is the mode of the value being anded.
> +   ZERO_OP0_P is true if the first operand of the vec_perm is a constant
> +   vector of zeros or false if the second operand of the vec_perm is a
> +   constant vector of zeros.  */
> +static rtx
> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
> +                   bool zero_op0_p)
> +{
> +  unsigned int nelt;
> +  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
> +    return NULL_RTX;
> +
> +  rtx_vector_builder builder (mode, nelt, 1);
> +  machine_mode emode = GET_MODE_INNER (mode);
> +
> +  for (unsigned int i = 0; i < nelt; i++)
> +    {
> +      if (!zero_op0_p)
> +        {
> +          if (known_eq (sel[i], i))
> +            builder.quick_push (CONSTM1_RTX (emode));
> +          else if (known_ge (sel[i], nelt))
> +            builder.quick_push (CONST0_RTX (emode));
> +          else
> +            return NULL_RTX;
> +        }
> +      else
> +        {
> +          if (known_eq (sel[i], nelt + i))
> +            builder.quick_push (CONSTM1_RTX (emode));
> +          else if (known_lt (sel[i], nelt))
> +            builder.quick_push (CONST0_RTX (emode));
> +          else
> +            return NULL_RTX;
> +        }
> +    }
> +
> +  return builder.build ();
> +}
> +
>  /* Implement a permutation of vectors v0 and v1 using the permutation
>     vector in SEL and return the result.  Use TARGET to hold the result
>     if nonnull and convenient.
> @@ -6422,12 +6466,18 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>    insn_code shift_code_qi = CODE_FOR_nothing;
>    optab shift_optab = unknown_optab;
>    rtx v2 = v0;
> +  bool zero_op0_p = false;
> +  bool zero_op1_p = false;
>    if (v1 == CONST0_RTX (GET_MODE (v1)))
> -    shift_optab = vec_shr_optab;
> +    {
> +      shift_optab = vec_shr_optab;
> +      zero_op1_p = true;
> +    }
>    else if (v0 == CONST0_RTX (GET_MODE (v0)))
>      {
>        shift_optab = vec_shl_optab;
>        v2 = v1;
> +      zero_op0_p = true;
>      }
>    if (shift_optab != unknown_optab)
>      {
> @@ -6463,6 +6513,23 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
>             }
>         }
>      }
> +  /* See if the vec_perm can be interpreted as an and operation.  We only do
> +     this if one of the operands is all zeros.  */
> +  if (sel_mode != BLKmode && (zero_op0_p || zero_op1_p))
> +    {
> +      insn_code and_code = optab_handler (and_optab, sel_mode);
> +      rtx and_mask = vec_perm_and_mask (sel_mode, indices, zero_op0_p);
> +      if (and_code != CODE_FOR_nothing && and_mask)
> +        {
> +          class expand_operand ops[3];
> +          rtx tmp = gen_reg_rtx (sel_mode);
> +          create_output_operand (&ops[0], tmp, sel_mode);
> +          create_input_operand (&ops[1], gen_lowpart (sel_mode, v2), sel_mode);
> +          create_input_operand (&ops[2], and_mask, sel_mode);
> +          if (maybe_expand_insn (and_code, 3, ops))
> +            return gen_lowpart (mode, ops[0].value);
> +        }
> +    }
>
>    if (targetm.vectorize.vec_perm_const != NULL)
>      {
> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
> new file mode 100644
> index 00000000000..8ed87949f0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**     movi    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**     mvni    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                            (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
> +}
> +
> +/*
> +** g_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +g_v4si (v4si x)
> +{
> +  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
> +}
> +
> +/*
> +** h_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 },
> +                            (v4si){ 7, 1, 6, 3 });
> +}
> +
> +/*
> +** f_v4sf:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v\1.16b, v0.16b
> +**     ret
> +*/
> +v4sf
> +f_v4sf (v4sf x)
> +{
> +  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
> +}
> +
> +/*
> +** f_v8qi:
> +**     movi    d([0-9]+), 0xff00ff00ff000000
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                            (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v16qi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return __builtin_shuffle (
> +    x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
> +    (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/and.c b/gcc/testsuite/gcc.target/aarch64/and.c
> new file mode 100644
> index 00000000000..56586612b6e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**     mvni    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**     movi    v([0-9]+).2s, 0xff, msl 8
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                            (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
> +}
> +
> +/*
> +** g_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +g_v4si (v4si x)
> +{
> +  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
> +}
> +
> +/*
> +** h_v4si:
> +**     movi    v([0-9]+).2d, 0xffffffff00000000
> +**     and     v0.16b, v0.16b, v\1.16b
> +**     ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
> +}
> +
> +/*
> +** f_v4sf:
> +**     movi    v([0-9]+).2d, 0xffffffff
> +**     and     v0.16b, v\1.16b, v0.16b
> +**     ret
> +*/
> +v4sf
> +f_v4sf (v4sf x)
> +{
> +  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
> +}
> +
> +/*
> +** f_v8qi:
> +**     movi    d([0-9]+), 0xff00ff00ff
> +**     and     v0.8b, v0.8b, v\1.8b
> +**     ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
> +                            (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
> +}
> +
> +/*
> +** f_v16qi:
> +**     adrp    x([0-9]+), .LC([0-9]+)
> +**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
> +**     and     v0.16b, v0.16b, v\3.16b
> +**     ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return __builtin_shuffle (
> +    x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
> +    (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> index cb5041b6af3..4dc5315265a 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx -mno-avx512f -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 { xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>  typedef float v8sf __attribute__((vector_size(32)));
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> index 21bd1a1ef0a..593906bf36e 100644
> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
> @@ -4,7 +4,8 @@
>  /* { dg-final { scan-assembler-times "vmovss\t\[0-9\]+\\(%e\[bs\]p\\), %xmm0" 1 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 3 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovd\t%edi, %xmm0" 1 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! ia32 } } } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! ia32 } xfail *-*-* } } } */
>
>
>  #include<immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> index 49fa51dc2ec..cb30a682260 100644
> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
> @@ -3,7 +3,8 @@
>  /* { dg-final { scan-assembler-times "vmovw\t\[0-9\]+\\(%e\[bs\]p\\), %xmm0" 4 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 4 { target ia32 } } } */
>  /* { dg-final { scan-assembler-times "vmovw\t%edi, %xmm0" 1 { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! ia32 } } } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! ia32 } xfail *-*-* } } } */
>
>  #include<immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> index c27431aae72..af41b14ed7c 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12 { xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> index bfe11236eef..631f26be9b5 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> @@ -1,7 +1,8 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
>  /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
> -/* { dg-final { scan-assembler-times "vmovq" 2 } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times "vmovq" 2 { xfail *-*-* } } } */
>
>  typedef _Float16 v32hf __attribute__((vector_size (64)));
>  typedef _Float16 v16hf __attribute__((vector_size (32)));
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> index 7e0ff9f6bc7..84692410534 100644
> --- a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> +++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-msse2 -mno-sse4.1 -O2" } */
> -/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } */
> +/* xfailed due to PR target/119922 */
> +/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 { xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-not "pxor" } } */
>
>  typedef float v4sf __attribute__((vector_size(16)));
> --
> 2.17.1
>
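
As a quick sanity check of the equivalence the description relies on, here
is a self-contained harness (illustrative only, using GCC's generic vector
extensions; not part of the patch) comparing the shuffle with its AND form
for the f_v4hi case:

#include <stdio.h>
#include <string.h>

typedef short v4hi __attribute__ ((vector_size (8)));

int
main (void)
{
  v4hi x = { 1, 2, 3, 4 };
  /* The shuffle from the description...  */
  v4hi a = __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
  /* ...and the AND it canonicalizes to: lanes selected from the zero
     vector get a zero mask, lanes with sel[i] == i keep their value.  */
  v4hi b = x & (v4hi){ 0, -1, 0, -1 };
  puts (memcmp (&a, &b, sizeof a) == 0 ? "equivalent" : "DIFFER");
  return 0;
}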