Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> +/* Recognize patterns suitable for the AND instructions.  */
> +static bool
> +aarch64_evpc_and (struct expand_vec_perm_d *d)
> +{
> +  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
> +  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
> +    return false;
> +
> +  machine_mode mode = d->vmode;
> +  machine_mode sel_mode;
> +  if (!related_int_vector_mode (mode).exists (&sel_mode))
> +    return false;
> +
> +  insn_code and_code = optab_handler (and_optab, sel_mode);
> +  rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
> +  if (and_code == CODE_FOR_nothing || !and_mask)
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  class expand_operand ops[3];
> +  rtx in = d->zero_op0_p ? d->op1 : d->op0;
> +  create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target),
> +                         sel_mode);
> +  create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
> +  create_input_operand (&ops[2], and_mask, sel_mode);
> +  expand_insn (and_code, 3, ops);

This might in some cases force the target into a fresh register,
so we should add something like:

  rtx result = gen_lowpart (mode, ops[0].value);
  if (!rtx_equal_p (d->target, result))
    emit_move_insn (d->target, result);

(untested!)

> +
> +  return true;
> +}
> +
>  static bool
>  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>  {
> @@ -26924,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>           return true;
>         else if (aarch64_evpc_uzp (d))
>           return true;
> +       else if (aarch64_evpc_and (d))
> +         return true;
>         else if (aarch64_evpc_trn (d))
>           return true;
>         else if (aarch64_evpc_sel (d))
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 0a14b1eef8a..860f25bc490 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
>    return NULL_RTX;
>  }
>  
> +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
> +   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
> +   assuming the other vec_perm operand is a constant vector of zeros.  Return
> +   the mask for the equivalent and operation, or NULL_RTX if the vec_perm
> +   cannot be modeled as an and.  MODE is the mode of the value being anded.
> +   ZERO_OP0_P is true if the first operand of the vec_perm is a constant
> +   vector of zeros, or false if the second operand of the vec_perm is a
> +   constant vector of zeros.  */
> +rtx
> +vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
> +                bool zero_op0_p)
> +{
> +  unsigned int nelt;
> +  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
> +    return NULL_RTX;
> +
> +  rtx_vector_builder builder (mode, nelt, 1);
> +  machine_mode emode = GET_MODE_INNER (mode);
> +
> +  for (unsigned int i = 0; i < nelt; i++)
> +    {
> +      if (!zero_op0_p)
> +     {
> +       if (known_eq (sel[i], i))
> +         builder.quick_push (CONSTM1_RTX (emode));
> +       else if (known_ge (sel[i], nelt))
> +         builder.quick_push (CONST0_RTX (emode));
> +       else
> +         return NULL_RTX;
> +     }
> +      else
> +     {
> +       if (known_eq (sel[i], nelt + i))
> +         builder.quick_push (CONSTM1_RTX (emode));
> +       else if (known_lt (sel[i], nelt))
> +         builder.quick_push (CONST0_RTX (emode));
> +       else
> +         return NULL_RTX;
> +     }

Very minor (didn't notice last time), but IMO it would be easier
to follow as "if (zero_op0_p)", with the then and else swapped.
That avoids double negation when reading the else block.
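
Something like (just the quoted hunk with the arms swapped; untested):

      if (zero_op0_p)
	{
	  if (known_eq (sel[i], nelt + i))
	    builder.quick_push (CONSTM1_RTX (emode));
	  else if (known_lt (sel[i], nelt))
	    builder.quick_push (CONST0_RTX (emode));
	  else
	    return NULL_RTX;
	}
      else
	{
	  if (known_eq (sel[i], i))
	    builder.quick_push (CONSTM1_RTX (emode));
	  else if (known_ge (sel[i], nelt))
	    builder.quick_push (CONST0_RTX (emode));
	  else
	    return NULL_RTX;
	}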

> +    }
> +
> +  return builder.build ();
> +}
> +
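FWIW, a concrete reading of the new function, if I've followed it
correctly: for f_v4hi below (V4HI, zero_op0_p == false,
sel = { 4, 1, 6, 3 }), elements 0 and 2 select the zero vector while
elements 1 and 3 select lanes 1 and 3 of op0, so the mask built is
{ 0, -1, 0, -1 }.
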
>  /* Implement a permutation of vectors v0 and v1 using the permutation
>     vector in SEL and return the result.  Use TARGET to hold the result
>     if nonnull and convenient.
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
> new file mode 100644
> index 00000000000..8ed87949f0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
> @@ -0,0 +1,125 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +
> +/*
> +** f_v4hi:
> +**   movi    v([0-9]+).2s, 0xff, msl 8
> +**   and     v0.8b, v0.8b, v\1.8b

For extra future-proofing it would be better to make this:

**   and     v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)

Same for the others
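
e.g. g_v4hi would become:

**   and     v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)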

> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
> +}
> +
> +/*
> +** g_v4hi:
> +**   mvni    v([0-9]+).2s, 0xff, msl 8
> +**   and     v0.8b, v0.8b, v\1.8b
> +**   ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
> +}
> +
> +/*
> +** f_v8hi:
> +**   adrp    x([0-9]+), .LC([0-9]+)
> +**   ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]

This doesn't really add anything to the test, so it'd probably be worth
stubbing it out with ...

> +**   and     v0.16b, v0.16b, v\3.16b
> +**   ret
> +*/

to give:

**      ...
**      and     v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
**      ret

Same for other tests that can't use a move immediate.

Please leave 24 hours for others to comment on the target-independent
part, but otherwise the patch is ok with the changes above.  Thanks again
for doing this: it's a really nice improvement.

Richard
