RE: [PATCH 2/2]AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

Tamar Christina Fri, 05 Jul 2024 01:17:06 -0700

> > +v16qi f3b (v16qi a)
> > +{
> > +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> > +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 
> > 5, 10, 6, 11,
> 7, 12);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, 
> > v[0-
> 9]+.16b} 5 } } */
> 
> It'd be good to test with zeros as the first argument too.
>


Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p
        and zero_op1_p.
        (aarch64_evpc_tbl): Implement register value remapping.
        (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
        before it's forced to a reg.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/tbl_with_zero_1.c: New test.
        * gcc.target/aarch64/tbl_with_zero_2.c: New test.

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25413,6 +25413,7 @@ struct expand_vec_perm_d
   unsigned int vec_flags;
   unsigned int op_vec_flags;
   bool one_vector_p;
+  bool zero_op0_p, zero_op1_p;
   bool testing_p;
 };
 
@@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
   /* to_constant is safe since this routine is specific to Advanced SIMD
      vectors.  */
   unsigned int nelt = d->perm.length ().to_constant ();
+
+  /* If one register is the constant vector of 0 then we only need
+     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
+     do this earlier since vec_perm_indices clamps elements to within range so
+     we can only do it during codegen.  */
+  if (d->zero_op0_p)
+    d->op0 = d->op1;
+  else if (d->zero_op1_p)
+    d->op1 = d->op0;
+
   for (unsigned int i = 0; i < nelt; ++i)
-    /* If big-endian and two vectors we end up with a weird mixed-endian
-       mode on NEON.  Reverse the index within each word but not the word
-       itself.  to_constant is safe because we checked is_constant above.  */
-    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
-                       ? d->perm[i].to_constant () ^ (nelt - 1)
-                       : d->perm[i].to_constant ());
+    {
+      auto val = d->perm[i].to_constant ();
+
+      /* If we're selecting from a 0 vector, we can just use an out of range
+        index instead.  */
+      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
+       rperm[i] = constm1_rtx;
+      else
+       {
+         /* If we are remapping a zero register as the first parameter we need
+            to adjust the indices of the non-zero register.  */
+         if (d->zero_op0_p)
+           val = val % nelt;
+
+         /* If big-endian and two vectors we end up with a weird mixed-endian
+            mode on NEON.  Reverse the index within each word but not the word
+            itself.  to_constant is safe because we checked is_constant
+            above.  */
+         rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
+       }
+    }
 
   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
   sel = force_reg (vmode, sel);
@@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
                                  const vec_perm_indices &sel)
 {
   struct expand_vec_perm_d d;
+  d.zero_op0_p = d.zero_op1_p = false;
 
   /* Check whether the mask can be applied to a single vector.  */
   if (sel.ninputs () == 1
@@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, 
machine_mode op_mode,
   else
     d.one_vector_p = false;
 
+  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
+  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
                     sel.nelts_per_input ());
   d.vmode = vmode;
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c 
b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+typedef unsigned short v8hi __attribute__ ((vector_size (16)));
+
+v8hi f2a (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
+}
+
+v8hi f2b (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
+}
+
+typedef unsigned char v16qi __attribute__ ((vector_size (16)));
+
+v16qi f3a (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 
5, 22, 6, 23, 7, 24);
+}
+
+v16qi f3b (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 
10, 6, 11, 7, 12);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, 
v[0-9]+.16b} 5 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c 
b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target le } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (zeros, a, 0, 5, 1, 6);
+}
+
+v4si f2 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, 
v[0-9]+.16b} 2 } } */
+/* { dg-final { scan-assembler-times 
{(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}}
 1 } } */

rb18606.patch
Description: rb18606.patch

RE: [PATCH 2/2]AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

Reply via email to