https://gcc.gnu.org/g:dc501cb0dc857663f7fa762f3dbf0ae60973d2c3
commit r16-702-gdc501cb0dc857663f7fa762f3dbf0ae60973d2c3
Author: Pengxuan Zheng <quic_pzh...@quicinc.com>
Date:   Wed May 7 10:47:37 2025 -0700

    aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165]

    Certain permutes that blend a vector with zero can be interpreted as an
    AND with a mask.  This idea was suggested by Richard Sandiford when he
    was reviewing my patch which tries to optimize certain vector permutes
    with the FMOV instruction for the aarch64 target.

    For example, for the aarch64 target, at present:

    v4hi
    f_v4hi (v4hi x)
    {
      return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
    }

    generates:

    f_v4hi:
            uzp1    v0.2d, v0.2d, v0.2d
            adrp    x0, .LC0
            ldr     d31, [x0, #:lo12:.LC0]
            tbl     v0.8b, {v0.16b}, v31.8b
            ret
    .LC0:
            .byte   -1
            .byte   -1
            .byte   2
            .byte   3
            .byte   -1
            .byte   -1
            .byte   6
            .byte   7

    With this patch, it generates:

    f_v4hi:
            mvni    v31.2s, 0xff, msl 8
            and     v0.8b, v0.8b, v31.8b
            ret

    This patch also provides a target-independent routine for detecting
    vector permute patterns which can be interpreted as AND.

    Changes since v1:
    * v2: Rework the patch to only perform the optimization for aarch64 by
      calling the target-independent routine vec_perm_and_mask.

            PR target/100165

    gcc/ChangeLog:

            * config/aarch64/aarch64.cc (aarch64_evpc_and): New.
            (aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and.
            * optabs.cc (vec_perm_and_mask): New.
            * optabs.h (vec_perm_and_mask): New prototype.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/and-be.c: New test.
            * gcc.target/aarch64/and-le.c: New test.

    Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
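As a standalone illustration (not part of the patch), the equivalence the
patch exploits can be checked directly in GNU C: every lane the selector
takes from the zero operand corresponds to an all-zeros mask element, and
every lane kept in place from x corresponds to an all-ones mask element.
The demo below reuses the f_v4hi selector from the commit message.

/* Standalone demo, not part of the patch: a permute blending X with a zero
   vector equals an AND of X with a 0xffff/0x0000 lane mask.  */
#include <assert.h>

typedef short v4hi __attribute__ ((vector_size (8)));

int
main (void)
{
  v4hi x = { 1, 2, 3, 4 };
  /* Selector indices 0-3 pick lanes of X; 4-7 pick lanes of the zero
     vector.  { 4, 1, 6, 3 } therefore zeros lanes 0 and 2.  */
  v4hi perm = __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 },
                                 (v4hi){ 4, 1, 6, 3 });
  v4hi mask = { 0, -1, 0, -1 };
  v4hi anded = x & mask;
  for (int i = 0; i < 4; i++)
    assert (perm[i] == anded[i]);
  return 0;
}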
Diff:
---
 gcc/config/aarch64/aarch64.cc             |  36 +++++++++
 gcc/optabs.cc                             |  44 +++++++++++
 gcc/optabs.h                              |   4 +
 gcc/testsuite/gcc.target/aarch64/and-be.c | 123 ++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/and-le.c | 123 ++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f5552e4b86ce..34f9725485d2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26886,6 +26886,40 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the AND instructions.  */
+static bool
+aarch64_evpc_and (struct expand_vec_perm_d *d)
+{
+  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
+  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+    return false;
+
+  machine_mode mode = d->vmode;
+  machine_mode sel_mode;
+  if (!related_int_vector_mode (mode).exists (&sel_mode))
+    return false;
+
+  insn_code and_code = optab_handler (and_optab, sel_mode);
+  rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
+  if (and_code == CODE_FOR_nothing || !and_mask)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  class expand_operand ops[3];
+  rtx in = d->zero_op0_p ? d->op1 : d->op0;
+  create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
+  create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
+  create_input_operand (&ops[2], and_mask, sel_mode);
+  expand_insn (and_code, 3, ops);
+  rtx result = gen_lowpart (mode, ops[0].value);
+  if (!rtx_equal_p (d->target, result))
+    emit_move_insn (d->target, result);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26921,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_uzp (d))
 	return true;
+      else if (aarch64_evpc_and (d))
+	return true;
      else if (aarch64_evpc_trn (d))
 	return true;
      else if (aarch64_evpc_sel (d))
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 92d6d50d55a0..5c9450f61450 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -6362,6 +6362,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
   return NULL_RTX;
 }
 
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+   assuming the other vec_perm operand is a constant vector of zeros.  Return
+   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
+   not be modeled as an and.  MODE is the mode of the value being anded.
+   ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
+   of zeros or false if the second operand of the vec_perm is a constant vector
+   of zeros.  */
+rtx
+vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+		   bool zero_op0_p)
+{
+  unsigned int nelt;
+  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+    return NULL_RTX;
+
+  rtx_vector_builder builder (mode, nelt, 1);
+  machine_mode emode = GET_MODE_INNER (mode);
+
+  for (unsigned int i = 0; i < nelt; i++)
+    {
+      if (zero_op0_p)
+	{
+	  if (known_eq (sel[i], nelt + i))
+	    builder.quick_push (CONSTM1_RTX (emode));
+	  else if (known_lt (sel[i], nelt))
+	    builder.quick_push (CONST0_RTX (emode));
+	  else
+	    return NULL_RTX;
+	}
+      else
+	{
+	  if (known_eq (sel[i], i))
+	    builder.quick_push (CONSTM1_RTX (emode));
+	  else if (known_ge (sel[i], nelt))
+	    builder.quick_push (CONST0_RTX (emode));
+	  else
+	    return NULL_RTX;
+	}
+    }
+
+  return builder.build ();
+}
+
 /* Implement a permutation of vectors v0 and v1 using the permutation
    vector in SEL and return the result.  Use TARGET to hold the result if
    nonnull and convenient.
diff --git a/gcc/optabs.h b/gcc/optabs.h
index ae525c848d32..a8b0e93d60bc 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -334,6 +334,10 @@ extern bool have_insn_for (enum rtx_code, machine_mode);
 /* Generate a conditional trap instruction.  */
 extern rtx_insn *gen_cond_trap (enum rtx_code, rtx, rtx, rtx);
 
+/* Check whether the vec_perm can be interpreted as an and operation.  */
+extern rtx vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+			      bool zero_op0_p);
+
 /* Generate code for VEC_PERM_EXPR.  */
 extern rtx expand_vec_perm_var (machine_mode, rtx, rtx, rtx, rtx);
 extern rtx expand_vec_perm_const (machine_mode, rtx, rtx,
diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
new file mode 100644
index 000000000000..7457dd5e35cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**	movi	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**	mvni	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**	movi	d([0-9]+), 0xff00ff00ff000000
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/and-le.c b/gcc/testsuite/gcc.target/aarch64/and-le.c
new file mode 100644
index 000000000000..398813bd7148
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and-le.c
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**	mvni	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**	movi	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**	movi	d([0-9]+), 0xff00ff00ff
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
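For reference, the selector classification done by vec_perm_and_mask boils
down to a simple rule: a lane maps to an all-ones mask element only if it
comes from the *same position* of the nonzero operand (an index into the
nonzero operand at a different position would be a genuine shuffle), and to
an all-zeros mask element if it comes from anywhere in the zero operand;
anything else makes the routine give up.  Below is an illustrative plain-C
sketch of that rule, not part of the patch; classify_sel and the int arrays
are hypothetical stand-ins for the vec_perm_indices/rtx machinery.

#include <stdio.h>

/* Sketch of vec_perm_and_mask's lane classification.  SEL is the permute
   selector for two NELT-lane operands; ZERO_OP0_P says which operand is the
   zero vector.  Fills MASK with -1 (keep lane) or 0 (zero lane) and returns
   1 if the permute is AND-equivalent, 0 otherwise.  */
static int
classify_sel (const int *sel, int nelt, int zero_op0_p, int *mask)
{
  for (int i = 0; i < nelt; i++)
    {
      if (zero_op0_p)
	{
	  if (sel[i] == nelt + i)	/* Same lane of the nonzero op1.  */
	    mask[i] = -1;
	  else if (sel[i] < nelt)	/* Any lane of the zero op0.  */
	    mask[i] = 0;
	  else				/* A real shuffle: give up.  */
	    return 0;
	}
      else
	{
	  if (sel[i] == i)		/* Same lane of the nonzero op0.  */
	    mask[i] = -1;
	  else if (sel[i] >= nelt)	/* Any lane of the zero op1.  */
	    mask[i] = 0;
	  else
	    return 0;
	}
    }
  return 1;
}

int
main (void)
{
  /* The f_v4hi selector from the tests; op1 is the zero vector.  */
  int sel[4] = { 4, 1, 6, 3 }, mask[4];
  if (classify_sel (sel, 4, /*zero_op0_p=*/0, mask))
    printf ("mask: %d %d %d %d\n", mask[0], mask[1], mask[2], mask[3]);
  return 0;
}

On the f_v4hi selector this prints "mask: 0 -1 0 -1", which is exactly the
halfword mask the patch materializes with mvni v31.2s, 0xff, msl 8 on
little-endian.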