Pengxuan Zheng <[email protected]> writes:
> +/* Recognize patterns suitable for the AND instructions. */
> +static bool
> +aarch64_evpc_and (struct expand_vec_perm_d *d)
> +{
> + /* Either d->op0 or d->op1 should be a vector of all zeros. */
> + if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
> + return false;
> +
> + machine_mode mode = d->vmode;
> + machine_mode sel_mode;
> + if (!related_int_vector_mode (mode).exists (&sel_mode))
> + return false;
> +
> + insn_code and_code = optab_handler (and_optab, sel_mode);
> + rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
> + if (and_code == CODE_FOR_nothing || !and_mask)
> + return false;
> +
> + if (d->testing_p)
> + return true;
> +
> + class expand_operand ops[3];
> + rtx in = d->zero_op0_p ? d->op1 : d->op0;
> + create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target),
> +                        sel_mode);
> + create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
> + create_input_operand (&ops[2], and_mask, sel_mode);
> + expand_insn (and_code, 3, ops);
expand_insn might in some cases force the result into a fresh register
rather than writing directly to d->target, so we should add something like:

  rtx result = gen_lowpart (mode, ops[0].value);
  if (!rtx_equal_p (d->target, result))
    emit_move_insn (d->target, result);
(untested!)
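i.e. the tail of the function would end up looking roughly like this
(again untested, just to show where the fix-up slots in):

  /* expand_insn is free to pick a fresh pseudo for the output, so copy
     the real result back into d->target if they differ.  */
  expand_insn (and_code, 3, ops);

  rtx result = gen_lowpart (mode, ops[0].value);
  if (!rtx_equal_p (d->target, result))
    emit_move_insn (d->target, result);

  return true;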
> +
> + return true;
> +}
> +
> static bool
> aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> {
> @@ -26924,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> return true;
> else if (aarch64_evpc_uzp (d))
> return true;
> + else if (aarch64_evpc_and (d))
> + return true;
> else if (aarch64_evpc_trn (d))
> return true;
> else if (aarch64_evpc_sel (d))
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 0a14b1eef8a..860f25bc490 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
> return NULL_RTX;
> }
>
> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
> + the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
> + assuming the other vec_perm operand is a constant vector of zeros. Return
> + the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
> + not be modeled as an and. MODE is the mode of the value being anded.
> + ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
> + of zeros or false if the second operand of the vec_perm is a constant vector
> + of zeros. */
> +rtx
> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
> + bool zero_op0_p)
> +{
> + unsigned int nelt;
> + if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
> + return NULL_RTX;
> +
> + rtx_vector_builder builder (mode, nelt, 1);
> + machine_mode emode = GET_MODE_INNER (mode);
> +
> + for (unsigned int i = 0; i < nelt; i++)
> + {
> + if (!zero_op0_p)
> + {
> + if (known_eq (sel[i], i))
> + builder.quick_push (CONSTM1_RTX (emode));
> + else if (known_ge (sel[i], nelt))
> + builder.quick_push (CONST0_RTX (emode));
> + else
> + return NULL_RTX;
> + }
> + else
> + {
> + if (known_eq (sel[i], nelt + i))
> + builder.quick_push (CONSTM1_RTX (emode));
> + else if (known_lt (sel[i], nelt))
> + builder.quick_push (CONST0_RTX (emode));
> + else
> + return NULL_RTX;
> + }
Very minor (didn't notice last time), but IMO it would be easier
to follow as "if (zero_op0_p)", with the then and else swapped.
That avoids double negation when reading the else block.
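i.e. something like this, with the two arms simply swapped (only
illustrative, untested):

  for (unsigned int i = 0; i < nelt; i++)
    {
      if (zero_op0_p)
        {
          /* op0 is the zero vector, so live lanes come from op1.  */
          if (known_eq (sel[i], nelt + i))
            builder.quick_push (CONSTM1_RTX (emode));
          else if (known_lt (sel[i], nelt))
            builder.quick_push (CONST0_RTX (emode));
          else
            return NULL_RTX;
        }
      else
        {
          /* op1 is the zero vector, so live lanes come from op0.  */
          if (known_eq (sel[i], i))
            builder.quick_push (CONSTM1_RTX (emode));
          else if (known_ge (sel[i], nelt))
            builder.quick_push (CONST0_RTX (emode));
          else
            return NULL_RTX;
        }
    }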
> + }
> +
> + return builder.build ();
> +}
> +
> /* Implement a permutation of vectors v0 and v1 using the permutation
> vector in SEL and return the result. Use TARGET to hold the result
> if nonnull and convenient.
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
> new file mode 100644
> index 00000000000..8ed87949f0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +** movi v([0-9]+).2s, 0xff, msl 8
> +** and v0.8b, v0.8b, v\1.8b
For extra future-proofing it would be better to make this:
** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
Same for the others.
> +** ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +** mvni v([0-9]+).2s, 0xff, msl 8
> +** and v0.8b, v0.8b, v\1.8b
> +** ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +** adrp x([0-9]+), .LC([0-9]+)
> +** ldr q([0-9]+), \[x\1, #:lo12:.LC\2\]
This doesn't really add anything to the test, so it'd probably be worth
stubbing it out with ...
> +** and v0.16b, v0.16b, v\3.16b
> +** ret
> +*/
to give:
** ...
** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
** ret
Same for other tests that can't use a move immediate.
Please leave 24 hours for others to comment on the target-independent
part, but otherwise the patch is ok with the changes above. Thanks again
for doing this: it's a really nice improvement.
Richard