Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> +/* Recognize patterns suitable for the AND instructions.  */
> +static bool
> +aarch64_evpc_and (struct expand_vec_perm_d *d)
> +{
> +  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
> +  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
> +    return false;
> +
> +  machine_mode mode = d->vmode;
> +  machine_mode sel_mode;
> +  if (!related_int_vector_mode (mode).exists (&sel_mode))
> +    return false;
> +
> +  insn_code and_code = optab_handler (and_optab, sel_mode);
> +  rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
> +  if (and_code == CODE_FOR_nothing || !and_mask)
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  class expand_operand ops[3];
> +  rtx in = d->zero_op0_p ? d->op1 : d->op0;
> +  create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
> +  create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
> +  create_input_operand (&ops[2], and_mask, sel_mode);
> +  expand_insn (and_code, 3, ops);
This might in some cases force the target into a fresh register, so we
should add something like:

  rtx result = gen_lowpart (mode, ops[0].value);
  if (!rtx_equal_p (d->target, result))
    emit_move_insn (d->target, result);

(untested!)

> +
> +  return true;
> +}
> +
>  static bool
>  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>  {
> @@ -26924,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>         return true;
>        else if (aarch64_evpc_uzp (d))
>         return true;
> +      else if (aarch64_evpc_and (d))
> +       return true;
>        else if (aarch64_evpc_trn (d))
>         return true;
>        else if (aarch64_evpc_sel (d))
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 0a14b1eef8a..860f25bc490 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
>    return NULL_RTX;
>  }
>
> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
> +   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
> +   assuming the other vec_perm operand is a constant vector of zeros.  Return
> +   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
> +   not be modeled as an and.  MODE is the mode of the value being anded.
> +   ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
> +   of zeros or false if the second operand of the vec_perm is a constant vector
> +   of zeros.  */
> +rtx
> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
> +                   bool zero_op0_p)
> +{
> +  unsigned int nelt;
> +  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
> +    return NULL_RTX;
> +
> +  rtx_vector_builder builder (mode, nelt, 1);
> +  machine_mode emode = GET_MODE_INNER (mode);
> +
> +  for (unsigned int i = 0; i < nelt; i++)
> +    {
> +      if (!zero_op0_p)
> +        {
> +          if (known_eq (sel[i], i))
> +            builder.quick_push (CONSTM1_RTX (emode));
> +          else if (known_ge (sel[i], nelt))
> +            builder.quick_push (CONST0_RTX (emode));
> +          else
> +            return NULL_RTX;
> +        }
> +      else
> +        {
> +          if (known_eq (sel[i], nelt + i))
> +            builder.quick_push (CONSTM1_RTX (emode));
> +          else if (known_lt (sel[i], nelt))
> +            builder.quick_push (CONST0_RTX (emode));
> +          else
> +            return NULL_RTX;
> +        }

Very minor (didn't notice last time), but IMO it would be easier to
follow as "if (zero_op0_p)", with the then and else swapped.  That
avoids double negation when reading the else block.  (A sketch of the
swapped loop follows the quoted hunk below.)

> +    }
> +
> +  return builder.build ();
> +}
> +
>  /* Implement a permutation of vectors v0 and v1 using the permutation
>     vector in SEL and return the result.  Use TARGET to hold the result
>     if nonnull and convenient.
> [...]
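For reference, with the then and else arms swapped as suggested, the
selection loop would read something like this (an untested sketch that
only reorders the code quoted above, reusing the patch's names):

  for (unsigned int i = 0; i < nelt; i++)
    {
      if (zero_op0_p)
	{
	  /* op0 is the zero vector: keep lanes taken in place from op1,
	     zero out lanes that select from op0.  */
	  if (known_eq (sel[i], nelt + i))
	    builder.quick_push (CONSTM1_RTX (emode));
	  else if (known_lt (sel[i], nelt))
	    builder.quick_push (CONST0_RTX (emode));
	  else
	    return NULL_RTX;
	}
      else
	{
	  /* op1 is the zero vector: keep lanes taken in place from op0,
	     zero out lanes that select from op1.  */
	  if (known_eq (sel[i], i))
	    builder.quick_push (CONSTM1_RTX (emode));
	  else if (known_ge (sel[i], nelt))
	    builder.quick_push (CONST0_RTX (emode));
	  else
	    return NULL_RTX;
	}
    }

The behaviour is unchanged; only the order of the arms differs, so the
condition being tested matches the name being asked about.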
> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
> new file mode 100644
> index 00000000000..8ed87949f0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**   movi v([0-9]+).2s, 0xff, msl 8
> +**   and v0.8b, v0.8b, v\1.8b

For extra future-proofing it would be better to make this:

> +**   and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)

Same for the others.

> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**   mvni v([0-9]+).2s, 0xff, msl 8
> +**   and v0.8b, v0.8b, v\1.8b
> +**   ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**   adrp x([0-9]+), .LC([0-9]+)
> +**   ldr q([0-9]+), \[x\1, #:lo12:.LC\2\]

This doesn't really add anything to the test, so it'd probably be worth
stubbing it out with ...

> +**   and v0.16b, v0.16b, v\3.16b
> +**   ret
> +*/

to give:

**   ...
**   and v0.8b, (?:v0.8b, v[0-9]+.8b|v[0-9]+.8b, v0.8b)
**   ret

Same for other tests that can't use a move immediate.

Please leave 24 hours for others to comment on the target-independent
part, but otherwise the patch is ok with the changes above.

Thanks again for doing this: it's a really nice improvement.

Richard