Richard Biener <richard.guent...@gmail.com> writes: > On Sat, Apr 26, 2025 at 2:42 AM Pengxuan Zheng <quic_pzh...@quicinc.com> > wrote: >> >> Certain permutes that blend a vector with zero can be interpreted as an AND >> of a >> mask. This idea was suggested by Richard Sandiford when he was reviewing my >> patch which tries to optimize certain vector permutes with the FMOV >> instruction >> for the aarch64 target. Canonicalizing this class of vector permute as AND >> can >> be more general and potentially benefit more targets. >> >> For example, for the aarch64 target, at present: >> >> v4hi >> f_v4hi (v4hi x) >> { >> return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 }); >> } >> >> generates: >> >> f_v4hi: >> uzp1 v0.2d, v0.2d, v0.2d >> adrp x0, .LC0 >> ldr d31, [x0, #:lo12:.LC0] >> tbl v0.8b, {v0.16b}, v31.8b >> ret >> .LC0: >> .byte -1 >> .byte -1 >> .byte 2 >> .byte 3 >> .byte -1 >> .byte -1 >> .byte 6 >> .byte 7 >> >> With this patch, it generates: >> >> f_v4hi: >> mvni v31.2s, 0xff, msl 8 >> and v0.8b, v0.8b, v31.8b >> ret >> >> However, we do have to xfail a few i386 tests due to the new canonicalization >> this patch introduces and PR119922 has been filed to track these regressions. > > That you need to XFAIL x86 tests suggests you want to implement this in > the backend's vec_perm_const expander instead.
I'd suggested that we provide a target-independent routine/helper for detecting this case, since the concept isn't target specific. But I agree that it should be opt-in, called from target evpc routines. The vec_perm_and_mask function looks good to me. I think it would be possible to handle the VLA case too, but that can be future work. Thanks, Richard >> PR target/100165 >> >> gcc/ChangeLog: >> >> * optabs.cc (vec_perm_and_mask): New function. >> (expand_vec_perm_const): Add new AND canonicalization. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/i386/avx-pr94680.c: XFAIL. >> * gcc.target/i386/avx10_2-vmovd-1.c: Likewise. >> * gcc.target/i386/avx10_2-vmovw-1.c: Likewise. >> * gcc.target/i386/avx512f-pr94680.c: Likewise. >> * gcc.target/i386/avx512fp16-pr94680.c: Likewise. >> * gcc.target/i386/sse2-pr94680.c: Likewise. >> * gcc.target/aarch64/and-be.c: New test. >> * gcc.target/aarch64/and.c: New test. >> >> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com> >> --- >> gcc/optabs.cc | 69 +++++++++- >> gcc/testsuite/gcc.target/aarch64/and-be.c | 125 ++++++++++++++++++ >> gcc/testsuite/gcc.target/aarch64/and.c | 125 ++++++++++++++++++ >> gcc/testsuite/gcc.target/i386/avx-pr94680.c | 3 +- >> .../gcc.target/i386/avx10_2-vmovd-1.c | 3 +- >> .../gcc.target/i386/avx10_2-vmovw-1.c | 3 +- >> .../gcc.target/i386/avx512f-pr94680.c | 3 +- >> .../gcc.target/i386/avx512fp16-pr94680.c | 3 +- >> gcc/testsuite/gcc.target/i386/sse2-pr94680.c | 3 +- >> 9 files changed, 330 insertions(+), 7 deletions(-) >> create mode 100644 gcc/testsuite/gcc.target/aarch64/and-be.c >> create mode 100644 gcc/testsuite/gcc.target/aarch64/and.c >> >> diff --git a/gcc/optabs.cc b/gcc/optabs.cc >> index 0a14b1eef8a..dca9df42673 100644 >> --- a/gcc/optabs.cc >> +++ b/gcc/optabs.cc >> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target, >> return NULL_RTX; >> } >> >> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation >> of >> + the non-zero vec_perm 
operand with some mask consisting of 0xffs and >> 0x00s, >> + assuming the other vec_perm operand is a constant vector of zeros. >> Return >> + the mask for the equivalent and operation, or NULL_RTX if the vec_perm >> can >> + not be modeled as an and. MODE is the mode of the value being anded. >> + ZERO_OP0_P is true if the first operand of the vec_perm is a constant >> vector >> + of zeros or false if the second operand of the vec_perm is a constant >> vector >> + of zeros. */ >> +static rtx >> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel, >> + bool zero_op0_p) >> +{ >> + unsigned int nelt; >> + if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) >> + return NULL_RTX; >> + >> + rtx_vector_builder builder (mode, nelt, 1); >> + machine_mode emode = GET_MODE_INNER (mode); >> + >> + for (unsigned int i = 0; i < nelt; i++) >> + { >> + if (!zero_op0_p) >> + { >> + if (known_eq (sel[i], i)) >> + builder.quick_push (CONSTM1_RTX (emode)); >> + else if (known_ge (sel[i], nelt)) >> + builder.quick_push (CONST0_RTX (emode)); >> + else >> + return NULL_RTX; >> + } >> + else >> + { >> + if (known_eq (sel[i], nelt + i)) >> + builder.quick_push (CONSTM1_RTX (emode)); >> + else if (known_lt (sel[i], nelt)) >> + builder.quick_push (CONST0_RTX (emode)); >> + else >> + return NULL_RTX; >> + } >> + } >> + >> + return builder.build (); >> +} >> + >> /* Implement a permutation of vectors v0 and v1 using the permutation >> vector in SEL and return the result. Use TARGET to hold the result >> if nonnull and convenient. 
>> @@ -6422,12 +6466,18 @@ expand_vec_perm_const (machine_mode mode, rtx v0, >> rtx v1, >> insn_code shift_code_qi = CODE_FOR_nothing; >> optab shift_optab = unknown_optab; >> rtx v2 = v0; >> + bool zero_op0_p = false; >> + bool zero_op1_p = false; >> if (v1 == CONST0_RTX (GET_MODE (v1))) >> - shift_optab = vec_shr_optab; >> + { >> + shift_optab = vec_shr_optab; >> + zero_op1_p = true; >> + } >> else if (v0 == CONST0_RTX (GET_MODE (v0))) >> { >> shift_optab = vec_shl_optab; >> v2 = v1; >> + zero_op0_p = true; >> } >> if (shift_optab != unknown_optab) >> { >> @@ -6463,6 +6513,23 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx >> v1, >> } >> } >> } >> + /* See if the vec_perm can be interpreted as an and operation. We only do >> + this if one of the operands is all zeros. */ >> + if (sel_mode != BLKmode && (zero_op0_p || zero_op1_p)) >> + { >> + insn_code and_code = optab_handler (and_optab, sel_mode); >> + rtx and_mask = vec_perm_and_mask (sel_mode, indices, zero_op0_p); >> + if (and_code != CODE_FOR_nothing && and_mask) >> + { >> + class expand_operand ops[3]; >> + rtx tmp = gen_reg_rtx (sel_mode); >> + create_output_operand (&ops[0], tmp, sel_mode); >> + create_input_operand (&ops[1], gen_lowpart (sel_mode, v2), >> sel_mode); >> + create_input_operand (&ops[2], and_mask, sel_mode); >> + if (maybe_expand_insn (and_code, 3, ops)) >> + return gen_lowpart (mode, ops[0].value); >> + } >> + } >> >> if (targetm.vectorize.vec_perm_const != NULL) >> { >> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c >> b/gcc/testsuite/gcc.target/aarch64/and-be.c >> new file mode 100644 >> index 00000000000..8ed87949f0b >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c >> @@ -0,0 +1,125 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -mbig-endian" } */ >> +/* { dg-final { check-function-bodies "**" "" "" } } */ >> + >> +typedef short v4hi __attribute__ ((vector_size (8))); >> +typedef char v8qi __attribute__ ((vector_size (8))); >> +typedef int 
v4si __attribute__ ((vector_size (16))); >> +typedef float v4sf __attribute__ ((vector_size (16))); >> +typedef short v8hi __attribute__ ((vector_size (16))); >> +typedef char v16qi __attribute__ ((vector_size (16))); >> + >> + >> +/* >> +** f_v4hi: >> +** movi v([0-9]+).2s, 0xff, msl 8 >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v4hi >> +f_v4hi (v4hi x) >> +{ >> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 }); >> +} >> + >> +/* >> +** g_v4hi: >> +** mvni v([0-9]+).2s, 0xff, msl 8 >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v4hi >> +g_v4hi (v4hi x) >> +{ >> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 }); >> +} >> + >> +/* >> +** f_v8hi: >> +** adrp x([0-9]+), .LC([0-9]+) >> +** ldr q([0-9]+), \[x\1, #:lo12:.LC\2\] >> +** and v0.16b, v0.16b, v\3.16b >> +** ret >> +*/ >> +v8hi >> +f_v8hi (v8hi x) >> +{ >> + return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 }); >> +} >> + >> +/* >> +** f_v4si: >> +** movi v([0-9]+).2d, 0xffffffff00000000 >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +f_v4si (v4si x) >> +{ >> + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 }); >> +} >> + >> +/* >> +** g_v4si: >> +** movi v([0-9]+).2d, 0xffffffff >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +g_v4si (v4si x) >> +{ >> + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 }); >> +} >> + >> +/* >> +** h_v4si: >> +** movi v([0-9]+).2d, 0xffffffff >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +h_v4si (v4si x) >> +{ >> + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 }); >> +} >> + >> +/* >> +** f_v4sf: >> +** movi v([0-9]+).2d, 0xffffffff00000000 >> +** and v0.16b, v\1.16b, v0.16b >> +** ret >> +*/ >> +v4sf >> +f_v4sf (v4sf x) >> +{ >> + return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 }); >> +} >> + >> +/* >> +** f_v8qi: >> 
+** movi d([0-9]+), 0xff00ff00ff000000 >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v8qi >> +f_v8qi (v8qi x) >> +{ >> + return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 }); >> +} >> + >> +/* >> +** f_v16qi: >> +** adrp x([0-9]+), .LC([0-9]+) >> +** ldr q([0-9]+), \[x\1, #:lo12:.LC\2\] >> +** and v0.16b, v0.16b, v\3.16b >> +** ret >> +*/ >> +v16qi >> +f_v16qi (v16qi x) >> +{ >> + return __builtin_shuffle ( >> + x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 }); >> +} >> diff --git a/gcc/testsuite/gcc.target/aarch64/and.c >> b/gcc/testsuite/gcc.target/aarch64/and.c >> new file mode 100644 >> index 00000000000..56586612b6e >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/and.c >> @@ -0,0 +1,125 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2" } */ >> +/* { dg-final { check-function-bodies "**" "" "" } } */ >> + >> +typedef short v4hi __attribute__ ((vector_size (8))); >> +typedef char v8qi __attribute__ ((vector_size (8))); >> +typedef int v4si __attribute__ ((vector_size (16))); >> +typedef float v4sf __attribute__ ((vector_size (16))); >> +typedef short v8hi __attribute__ ((vector_size (16))); >> +typedef char v16qi __attribute__ ((vector_size (16))); >> + >> + >> +/* >> +** f_v4hi: >> +** mvni v([0-9]+).2s, 0xff, msl 8 >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v4hi >> +f_v4hi (v4hi x) >> +{ >> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 }); >> +} >> + >> +/* >> +** g_v4hi: >> +** movi v([0-9]+).2s, 0xff, msl 8 >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v4hi >> +g_v4hi (v4hi x) >> +{ >> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 }); >> +} >> + >> +/* >> +** f_v8hi: >> +** adrp x([0-9]+), .LC([0-9]+) >> +** ldr q([0-9]+), \[x\1, #:lo12:.LC\2\] >> +** and v0.16b, v0.16b, v\3.16b >> +** ret >> +*/ >> +v8hi >> +f_v8hi (v8hi x) 
>> +{ >> + return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 }); >> +} >> + >> +/* >> +** f_v4si: >> +** movi v([0-9]+).2d, 0xffffffff >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +f_v4si (v4si x) >> +{ >> + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 }); >> +} >> + >> +/* >> +** g_v4si: >> +** movi v([0-9]+).2d, 0xffffffff00000000 >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +g_v4si (v4si x) >> +{ >> + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 }); >> +} >> + >> +/* >> +** h_v4si: >> +** movi v([0-9]+).2d, 0xffffffff00000000 >> +** and v0.16b, v0.16b, v\1.16b >> +** ret >> +*/ >> +v4si >> +h_v4si (v4si x) >> +{ >> + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 }); >> +} >> + >> +/* >> +** f_v4sf: >> +** movi v([0-9]+).2d, 0xffffffff >> +** and v0.16b, v\1.16b, v0.16b >> +** ret >> +*/ >> +v4sf >> +f_v4sf (v4sf x) >> +{ >> + return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 }); >> +} >> + >> +/* >> +** f_v8qi: >> +** movi d([0-9]+), 0xff00ff00ff >> +** and v0.8b, v0.8b, v\1.8b >> +** ret >> +*/ >> +v8qi >> +f_v8qi (v8qi x) >> +{ >> + return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 }); >> +} >> + >> +/* >> +** f_v16qi: >> +** adrp x([0-9]+), .LC([0-9]+) >> +** ldr q([0-9]+), \[x\1, #:lo12:.LC\2\] >> +** and v0.16b, v0.16b, v\3.16b >> +** ret >> +*/ >> +v16qi >> +f_v16qi (v16qi x) >> +{ >> + return __builtin_shuffle ( >> + x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, >> + (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 }); >> +} >> diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c >> b/gcc/testsuite/gcc.target/i386/avx-pr94680.c >> index cb5041b6af3..4dc5315265a 100644 >> --- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c >> +++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c >> @@ -1,6 
+1,7 @@ >> /* { dg-do compile } */ >> /* { dg-options "-mavx -mno-avx512f -O2" } */ >> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 >> } } */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 >> { xfail *-*-* } } } */ >> /* { dg-final { scan-assembler-not "pxor" } } */ >> >> typedef float v8sf __attribute__((vector_size(32))); >> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c >> b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c >> index 21bd1a1ef0a..593906bf36e 100644 >> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c >> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c >> @@ -4,7 +4,8 @@ >> /* { dg-final { scan-assembler-times "vmovss\t\[0-9\]+\\(%e\[bs\]p\\), >> %xmm0" 1 { target ia32 } } } */ >> /* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 3 { target ia32 >> } } } */ >> /* { dg-final { scan-assembler-times "vmovd\t%edi, %xmm0" 1 { target { ! >> ia32 } } } } */ >> -/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! >> ia32 } } } } */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! >> ia32 } xfail *-*-* } } } */ >> >> >> #include<immintrin.h> >> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c >> b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c >> index 49fa51dc2ec..cb30a682260 100644 >> --- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c >> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c >> @@ -3,7 +3,8 @@ >> /* { dg-final { scan-assembler-times "vmovw\t\[0-9\]+\\(%e\[bs\]p\\), >> %xmm0" 4 { target ia32 } } } */ >> /* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 4 { target ia32 >> } } } */ >> /* { dg-final { scan-assembler-times "vmovw\t%edi, %xmm0" 1 { target { ! >> ia32 } } } } */ >> -/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! 
>> ia32 } } } } */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! >> ia32 } xfail *-*-* } } } */ >> >> #include<immintrin.h> >> >> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c >> b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c >> index c27431aae72..af41b14ed7c 100644 >> --- a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c >> +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c >> @@ -1,6 +1,7 @@ >> /* { dg-do compile } */ >> /* { dg-options "-mavx512bw -mavx512vbmi -O2" } */ >> -/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} >> } */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12 >> { xfail *-*-* } } } */ >> /* { dg-final { scan-assembler-not "pxor" } } */ >> >> >> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c >> b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c >> index bfe11236eef..631f26be9b5 100644 >> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c >> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c >> @@ -1,7 +1,8 @@ >> /* { dg-do compile } */ >> /* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ >> /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */ >> -/* { dg-final { scan-assembler-times "vmovq" 2 } } */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times "vmovq" 2 { xfail *-*-* } } } */ >> >> typedef _Float16 v32hf __attribute__((vector_size (64))); >> typedef _Float16 v16hf __attribute__((vector_size (32))); >> diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c >> b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c >> index 7e0ff9f6bc7..84692410534 100644 >> --- a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c >> +++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c >> @@ -1,6 +1,7 @@ >> /* { dg-do compile } */ >> /* { dg-options "-msse2 -mno-sse4.1 -O2" } */ >> -/* { dg-final { 
scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } >> */ >> +/* xfailed due to PR target/119922 */ >> +/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 { >> xfail *-*-* } } } */ >> /* { dg-final { scan-assembler-not "pxor" } } */ >> >> typedef float v4sf __attribute__((vector_size(16))); >> -- >> 2.17.1 >>