Re: [PATCH 2/2]AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

Richard Sandiford Fri, 05 Jul 2024 01:36:38 -0700

Tamar Christina <[email protected]> writes:
>> > +v16qi f3b (v16qi a)
>> > +{
>> > +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
>> > +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 
>> > 5, 10, 6, 11,
>> 7, 12);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, 
>> > v[0-
>> 9]+.16b} 5 } } */
>> 
>> It'd be good to test with zeros as the first argument too.
>> 
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p
>       and zero_op1_p.
>       (aarch64_evpc_tbl): Implement register value remapping.
>       (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
>       before it's forced to a reg.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/tbl_with_zero_1.c: New test.
>       * gcc.target/aarch64/tbl_with_zero_2.c: New test.
>
> -- inline copy of patch --
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25413,6 +25413,7 @@ struct expand_vec_perm_d
>    unsigned int vec_flags;
>    unsigned int op_vec_flags;
>    bool one_vector_p;
> +  bool zero_op0_p, zero_op1_p;
>    bool testing_p;
>  };
>  
> @@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
>    /* to_constant is safe since this routine is specific to Advanced SIMD
>       vectors.  */
>    unsigned int nelt = d->perm.length ().to_constant ();
> +
> +  /* If one register is the constant vector of 0 then we only need
> +     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
> +     do this earlier since vec_perm_indices clamps elements to within range so
> +     we can only do it during codegen.  */
> +  if (d->zero_op0_p)
> +    d->op0 = d->op1;
> +  else if (d->zero_op1_p)
> +    d->op1 = d->op0;
> +
>    for (unsigned int i = 0; i < nelt; ++i)
> -    /* If big-endian and two vectors we end up with a weird mixed-endian
> -       mode on NEON.  Reverse the index within each word but not the word
> -       itself.  to_constant is safe because we checked is_constant above.  */
> -    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
> -                     ? d->perm[i].to_constant () ^ (nelt - 1)
> -                     : d->perm[i].to_constant ());
> +    {
> +      auto val = d->perm[i].to_constant ();
> +
> +      /* If we're selecting from a 0 vector, we can just use an out of range
> +      index instead.  */
> +      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
> +     rperm[i] = constm1_rtx;
> +      else
> +     {
> +       /* If we are remapping a zero register as the first parameter we need
> +          to adjust the indices of the non-zero register.  */
> +       if (d->zero_op0_p)
> +         val = val % nelt;
> +
> +       /* If big-endian and two vectors we end up with a weird mixed-endian
> +          mode on NEON.  Reverse the index within each word but not the word
> +          itself.  to_constant is safe because we checked is_constant
> +          above.  */
> +       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
> +     }
> +    }
>  
>    sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
>    sel = force_reg (vmode, sel);
> @@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, 
> machine_mode op_mode,
>                                 const vec_perm_indices &sel)
>  {
>    struct expand_vec_perm_d d;
> +  d.zero_op0_p = d.zero_op1_p = false;


This is redundant with the assignments below.  OK with that line removed.

Not sure whether the test will work on Darwin & mingw, but we can restrict
to ELF targets later if necessary.

Thanks,
Richard

>  
>    /* Check whether the mask can be applied to a single vector.  */
>    if (sel.ninputs () == 1
> @@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, 
> machine_mode op_mode,
>    else
>      d.one_vector_p = false;
>  
> +  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
> +  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
>    d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
>                    sel.nelts_per_input ());
>    d.vmode = vmode;
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c 
> b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +typedef unsigned short v8hi __attribute__ ((vector_size (16)));
> +
> +v8hi f2a (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
> +}
> +
> +v8hi f2b (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
> +}
> +
> +typedef unsigned char v16qi __attribute__ ((vector_size (16)));
> +
> +v16qi f3a (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 
> 21, 5, 22, 6, 23, 7, 24);
> +}
> +
> +v16qi f3b (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 
> 10, 6, 11, 7, 12);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, 
> v[0-9]+.16b} 5 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c 
> b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target le } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (zeros, a, 0, 5, 1, 6);
> +}
> +
> +v4si f2 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */
> +/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */

Re: [PATCH 2/2]AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

Reply via email to