Tamar Christina <[email protected]> writes:
> Hi All,
>
> When a two reg TBL is performed with one operand being a zero vector we can
> instead use a single reg TBL and map the indices for accessing the zero vector
> to an out of range constant.
>
> On AArch64, out of range indices into a TBL have defined semantics of setting
> the element to zero. Many uArches have a slower 2-reg TBL than 1-reg TBL.
>
> Before this change we had:
>
> typedef unsigned int v4si __attribute__ ((vector_size (16)));
>
> v4si f1 (v4si a)
> {
> v4si zeros = {0,0,0,0};
> return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> }
>
> which generates:
>
> f1:
> mov v30.16b, v0.16b
> movi v31.4s, 0
> adrp x0, .LC0
> ldr q0, [x0, #:lo12:.LC0]
> tbl v0.16b, {v30.16b - v31.16b}, v0.16b
> ret
>
> .LC0:
> .byte 0
> .byte 1
> .byte 2
> .byte 3
> .byte 20
> .byte 21
> .byte 22
> .byte 23
> .byte 4
> .byte 5
> .byte 6
> .byte 7
> .byte 24
> .byte 25
> .byte 26
> .byte 27
>
> and with the patch:
>
> f1:
> adrp x0, .LC0
> ldr q31, [x0, #:lo12:.LC0]
> tbl v0.16b, {v0.16b}, v31.16b
> ret
Nice!
>
> .LC0:
> .byte 0
> .byte 1
> .byte 2
> .byte 3
> .byte -1
> .byte -1
> .byte -1
> .byte -1
> .byte 4
> .byte 5
> .byte 6
> .byte 7
> .byte -1
> .byte -1
> .byte -1
> .byte -1
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> This sequence is often generated by OpenMP and aside from the
> strict performance impact of this change, it also gives better
> register allocation as we no longer have the consecutive
> register limitation.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0 and
> zero_op1.
> (aarch64_evpc_tbl): Implement register value remapping.
> (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
> before it's forced to a reg.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/tbl_with_zero.c: New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 6f49d1482042efabedbe723aa59ecf129b84f4ad..655b93e65a7b686a2b82e8ada64cac084ca397d4
> 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25384,6 +25384,7 @@ struct expand_vec_perm_d
> unsigned int vec_flags;
> unsigned int op_vec_flags;
> bool one_vector_p;
> + bool zero_op0, zero_op1;
Nit: might as well add _p suffixes for consistency with the existing fields.
> bool testing_p;
> };
>
> @@ -25880,13 +25881,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
> /* to_constant is safe since this routine is specific to Advanced SIMD
> vectors. */
> unsigned int nelt = d->perm.length ().to_constant ();
> +
> + /* If one register is the constant vector of 0 then we only need
> + a one reg TBL and we map any accesses to the vector of 0 to -1. We
> can't
> + do this earlier since vec_perm_indices clamps elements to within range
> so
> + we can only do it during codegen. */
> + if (d->zero_op0)
> + d->op0 = d->op1;
> + else if (d->zero_op1)
> + d->op1 = d->op0;
> +
> for (unsigned int i = 0; i < nelt; ++i)
> - /* If big-endian and two vectors we end up with a weird mixed-endian
> - mode on NEON. Reverse the index within each word but not the word
> - itself. to_constant is safe because we checked is_constant above. */
> - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
> - ? d->perm[i].to_constant () ^ (nelt - 1)
> - : d->perm[i].to_constant ());
> + {
> + auto val = d->perm[i].to_constant ();
> +
> + /* If we're selecting from a 0 vector, we can just use an out of range
> + index instead. */
> + if ((d->zero_op0 && val < nelt) || (d->zero_op1 && val >= nelt))
> + rperm[i] = constm1_rtx;
> + else
> + {
> + /* If we are remapping a zero register as the first parameter we need
> + to adjust the indices of the non-zero register. */
> + if (d->zero_op0)
> + val = val % nelt;
> +
> + /* If big-endian and two vectors we end up with a weird mixed-endian
> + mode on NEON. Reverse the index within each word but not the word
> + itself. to_constant is safe because we checked is_constant
> + above. */
> + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
> + }
> + }
>
> sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
> sel = force_reg (vmode, sel);
> @@ -26132,6 +26158,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode,
> machine_mode op_mode,
> const vec_perm_indices &sel)
> {
> struct expand_vec_perm_d d;
> + d.zero_op0 = d.zero_op1 = false;
> + rtx e;
>
> /* Check whether the mask can be applied to a single vector. */
> if (sel.ninputs () == 1
> @@ -26147,6 +26175,12 @@ aarch64_vectorize_vec_perm_const (machine_mode
> vmode, machine_mode op_mode,
> d.one_vector_p = true;
> op0 = op1;
> }
> + else if (op0 && ((d.zero_op0 = const_vec_duplicate_p (op0, &e))
> + && const0_rtx == e))
> + d.one_vector_p = false;
> + else if (op1 && ((d.zero_op1 = const_vec_duplicate_p (op1, &e))
> + && const0_rtx == e))
> + d.one_vector_p = false;
> else
> d.one_vector_p = false;
Given the bracketing, this sets d.zero_op0/1 if the operands are a
duplicate of any value, not just zero.
IMO it'd be simpler to drop the two hunks above and instead add:
d.zero_op0_p = op0 == CONST0_RTX (op_mode);
d.zero_op1_p = op1 == CONST0_RTX (op_mode);
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
> b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> + v4si zeros = {0,0,0,0};
> + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +typedef unsigned short v8hi __attribute__ ((vector_size (16)));
> +
> +v8hi f2a (v8hi a)
> +{
> + v8hi zeros = {0,0,0,0,0,0,0,0};
> + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
> +}
> +
> +v8hi f2b (v8hi a)
> +{
> + v8hi zeros = {0,0,0,0,0,0,0,0};
> + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
> +}
> +
> +typedef unsigned char v16qi __attribute__ ((vector_size (16)));
> +
> +v16qi f3a (v16qi a)
> +{
> + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4,
> 21, 5, 22, 6, 23, 7, 24);
> +}
> +
> +v16qi f3b (v16qi a)
> +{
> + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5,
> 10, 6, 11, 7, 12);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\},
> v[0-9]+.16b} 5 } } */
It'd be good to test with zeros as the first argument too.
Thanks,
Richard