On LoongArch, the xvshuf.{b/h/w/d} instructions can handle the case where the
low 128-bit half of the target vector is shuffled from the concatenation of
the low 128-bit halves of the two input vectors, and the high 128-bit half of
the target vector is likewise shuffled from the concatenation of the high
128-bit halves.  We therefore add recognition of this situation and use the
xvshuf instructions to optimize it.
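
For illustration, here is a small standalone sketch (not part of the patch;
NELT, match_xvshuf and the driver below are invented for this example) that
mirrors the new selector check and remapping on the E_V8SImode selector used
in the comments, where indices 0-7 select from op0 and 8-15 from op1:

/* Standalone sketch: mirrors loongarch_if_match_xvshuffle and the
   selector remapping for nelt = 8 (E_V8SImode).  */
#include <stdbool.h>
#include <stdio.h>

#define NELT 8

/* True if every low-half result element comes from a low 128-bit half
   of op0/op1 and every high-half element from a high 128-bit half.  */
static bool
match_xvshuf (const unsigned char *perm)
{
  for (int i = 0; i < NELT; i++)
    {
      unsigned char buf = perm[i];
      if (i < NELT / 2)
	{
	  if ((buf >= NELT / 2 && buf < NELT) || buf >= NELT + NELT / 2)
	    return false;
	}
      else if ((buf >= NELT && buf < NELT + NELT / 2) || buf < NELT / 2)
	return false;
    }
  return true;
}

int
main (void)
{
  unsigned char perm[NELT] = { 0, 9, 2, 11, 4, 13, 6, 15 };
  unsigned char remapped[NELT];

  if (!match_xvshuf (perm))
    return 1;

  for (int i = 0; i < NELT; i++)
    if (i < NELT / 2)
      remapped[i] = perm[i] >= NELT ? perm[i] - NELT / 2 : perm[i];
    else
      remapped[i] = perm[i] < NELT ? perm[i] - NELT / 2 : perm[i] % NELT;

  /* Prints 0 5 2 7 0 5 2 7, the remapped selector fed to xvshuf.w.  */
  for (int i = 0; i < NELT; i++)
    printf ("%d ", remapped[i]);
  printf ("\n");
  return 0;
}

As in the comments below, { 0, 9, 2, 11, 4, 13, 6, 15 } remaps to
{ 0, 5, 2, 7, 0, 5, 2, 7 }.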
gcc/ChangeLog:

	* config/loongarch/loongarch.cc (loongarch_if_match_xvshuffle):
	New function.
	(loongarch_expand_vec_perm_const): Add new condition.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vec_perm-verify-xvshuf.c: New test.
	* gcc.target/loongarch/vec_perm-xvshuf.c: New test.
---
 gcc/config/loongarch/loongarch.cc             |  69 ++++++++++++
 .../loongarch/vec_perm-verify-xvshuf.c        | 106 ++++++++++++++++++
 .../gcc.target/loongarch/vec_perm-xvshuf.c    |  17 +++
 3 files changed, 192 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index e9978370e8c..3ac6a74f15b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9318,6 +9318,34 @@ loongarch_is_elem_duplicate (struct expand_vec_perm_d *d)
   return result;
 }
 
+/* If the low 128-bit half of the target vector comes only from the low
+   128-bit halves of op0 and op1, and the high 128-bit half comes only
+   from the high 128-bit halves of op0 and op1, the corresponding
+   xvshuf.{h/w/d} instruction can be matched.  */
+static bool
+loongarch_if_match_xvshuffle (struct expand_vec_perm_d *d)
+{
+  for (int i = 0; i < d->nelt; i++)
+    {
+      unsigned char buf = d->perm[i];
+
+      if (i < d->nelt / 2)
+	{
+	  if ((buf >= d->nelt / 2 && buf < d->nelt)
+	      || buf >= (d->nelt + d->nelt / 2))
+	    return false;
+	}
+      else
+	{
+	  if ((buf >= d->nelt && buf < (d->nelt + d->nelt / 2))
+	      || buf < d->nelt / 2)
+	    return false;
+	}
+    }
+
+  return true;
+}
+
 /* In LASX, some permutation insn does not have the behavior that gcc expects
    when compiler wants to emit a vector permutation.
@@ -9570,6 +9598,47 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
 	  return true;
 	}
 
+      if (loongarch_if_match_xvshuffle (d))
+	{
+	  if (d->testing_p)
+	    return true;
+
+	  /* Selector example: E_V8SImode, { 0, 9, 2, 11, 4, 13, 6, 15 }.  */
+	  /* If the target low 128 bits take op1's low 128-bit elements
+	     {9, 11}, we need to subtract half of d->nelt (giving indices
+	     in [4, 7]) to form the 256-bit intermediate vector vec0.
+	     Similarly, if the target high 128 bits take op0's high
+	     128-bit elements {4, 6}, we need to subtract half of d->nelt
+	     (giving indices in [0, 3]) to form the 256-bit intermediate
+	     vector vec1.  In particular, if the target high 128 bits take
+	     op1's high 128-bit elements {13, 15}, we take them modulo
+	     d->nelt (giving indices in [4, 7]) to form vec1.  */
+	  for (i = 0; i < d->nelt; i += 1)
+	    {
+	      if (i < d->nelt / 2)
+		{
+		  if (d->perm[i] >= d->nelt)
+		    remapped[i] = d->perm[i] - d->nelt / 2;
+		  else
+		    remapped[i] = d->perm[i];
+		}
+	      else
+		{
+		  if (d->perm[i] < d->nelt)
+		    remapped[i] = d->perm[i] - d->nelt / 2;
+		  else
+		    remapped[i] = d->perm[i] % d->nelt;
+		}
+	    }
+
+	  /* Selector after: { 0, 5, 2, 7, 0, 5, 2, 7 }.  */
+	  for (i = 0; i < d->nelt; i += 1)
+	    rperm[i] = GEN_INT (remapped[i]);
+
+	  flag = true;
+	  goto expand_perm_const_end;
+	}
+
 expand_perm_const_end:
   if (flag)
     {
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
new file mode 100644
index 00000000000..658dacfe340
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-verify-xvshuf.c
@@ -0,0 +1,106 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mlasx -w -fno-strict-aliasing" } */
+
+#include "./vector/simd_correctness_check.h"
+#include <lasxintrin.h>
+#define N 16
+
+typedef int TYPE;
+
+void
+foo (TYPE a[], TYPE b[], TYPE c[])
+{
+  for (int i = 0; i < N; i += 2)
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 1] - b[i + 1];
+    }
+}
+
+__m256i
+change_to_256vec (TYPE c[], int offset)
+{
+  __m256i __m256i_op;
+  int type_bit_len = sizeof (TYPE) * 8;
+  long int tmp;
+
+  for (int i = offset; i < 256 / type_bit_len + offset; i += 2)
+    {
+      __m256i_op[(i - offset) / 2] = 0x0;
+      __m256i_op[(i - offset) / 2] |= c[i];
+      tmp = ((long int)c[i + 1] << type_bit_len);
+      __m256i_op[(i - offset) / 2] |= tmp;
+    }
+
+  return __m256i_op;
+}
+
+int
+main ()
+{
+  TYPE a[N], b[N], c[N];
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = i;
+      b[i] = N + i;
+    }
+
+  // c = {16,-16,20,-16,24,-16,28,-16,32,-16,36,-16,40,-16,44,-16};
+  foo (a, b, c);
+
+  __m256i ans1 = change_to_256vec (c, 0);
+  __m256i ans2 = change_to_256vec (c, N / 2);
+
+  __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+  __m256i __m256i_op3, __m256i_op4, __m256i_op5, __m256i_out2, __m256i_result2;
+
+  *((unsigned long *)&__m256i_op0[3]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op0[2]) = 0x0000000500000000;
+  *((unsigned long *)&__m256i_op0[1]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op0[0]) = 0x0000000500000000;
+
+  *((unsigned long *)&__m256i_op1[3]) = 0x0000001e0000001c;
+  *((unsigned long *)&__m256i_op1[2]) = 0x0000001a00000018;
+  *((unsigned long *)&__m256i_op1[1]) = 0x0000001600000014;
+  *((unsigned long *)&__m256i_op1[0]) = 0x0000001200000010;
+
+  *((unsigned long *)&__m256i_op2[3]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[2]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[1]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op2[0]) = 0xfffffff0fffffff0;
+
+  // __m256i_result = {16,-16,20,-16,24,-16,28,-16};
+  *((unsigned long *)&__m256i_result[3]) = 0xfffffff00000001c;
+  *((unsigned long *)&__m256i_result[2]) = 0xfffffff000000018;
+  *((unsigned long *)&__m256i_result[1]) = 0xfffffff000000014;
+  *((unsigned long *)&__m256i_result[0]) = 0xfffffff000000010;
+
+  __m256i_out = __lasx_xvshuf_w (__m256i_op0, __m256i_op2, __m256i_op1);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+  ASSERTEQ_64 (__LINE__, ans1, __m256i_out);
+
+  *((unsigned long *)&__m256i_op3[3]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op3[2]) = 0x0000000500000000;
+  *((unsigned long *)&__m256i_op3[1]) = 0x0000000700000002;
+  *((unsigned long *)&__m256i_op3[0]) = 0x0000000500000000;
+
+  *((unsigned long *)&__m256i_op4[3]) = 0x0000002e0000002c;
+  *((unsigned long *)&__m256i_op4[2]) = 0x0000002a00000028;
+  *((unsigned long *)&__m256i_op4[1]) = 0x0000002600000024;
+  *((unsigned long *)&__m256i_op4[0]) = 0x0000002200000020;
+
+  *((unsigned long *)&__m256i_op5[3]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[2]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[1]) = 0xfffffff0fffffff0;
+  *((unsigned long *)&__m256i_op5[0]) = 0xfffffff0fffffff0;
+
+  // __m256i_result2 = {32,-16,36,-16,40,-16,44,-16};
+  *((unsigned long *)&__m256i_result2[3]) = 0xfffffff00000002c;
+  *((unsigned long *)&__m256i_result2[2]) = 0xfffffff000000028;
+  *((unsigned long *)&__m256i_result2[1]) = 0xfffffff000000024;
+  *((unsigned long *)&__m256i_result2[0]) = 0xfffffff000000020;
+
+  __m256i_out2 = __lasx_xvshuf_w (__m256i_op3, __m256i_op5, __m256i_op4);
+  ASSERTEQ_64 (__LINE__, __m256i_result2, __m256i_out2);
+  ASSERTEQ_64 (__LINE__, ans2, __m256i_out2);
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
new file mode 100644
index 00000000000..6b19c2c2fd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvshuf.w" } } */
+/* { dg-final { scan-assembler-not "xvperm.w" } } */
+/* { dg-final { scan-assembler-not "xvbitsel.v" } } */
+
+void
+foo (int a[], int b[], int c[])
+{
+  for (int i = 0; i < 100; i += 4)
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 1] - b[i + 1];
+      c[i + 2] = a[i + 2] - b[i + 2];
+      c[i + 3] = a[i + 3] + b[i + 3];
+    }
+}
-- 
2.38.1