Pushed to r16-4957.

Thanks!

在 2025/11/2 上午11:31, Guo Jie 写道:
1. When the selector is 0x0, 0x1, 0x10, or 0x11, the result of
xvpermi.q does not depend on the output operand, thus eliminating
the dependency chain of the output operand as input, which can
reduce the number of instructions.

2. When the selector is 0x22, 0x23, 0x32, or 0x33, the result of
xvpermi.q does not depend on the second input operand, thus
eliminating the dependency chain of the second input operand,
which can also reduce the number of instructions.

gcc/ChangeLog:

        * config/loongarch/lasx.md (lasx_xvpermi_q_<LASX:mode>):
        Add new splitter for optimization.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/vec_pack_unpack_256.c: Adjust to changed
        lasx_xvpermi_q_<LASX:mode> template.
        * gcc.target/loongarch/vector/lasx/lasx-builtin.c: Ditto.
        * gcc.target/loongarch/lasx-xvpermi_q-opt.c: New test.

---
  gcc/config/loongarch/lasx.md                  | 33 +++++++++++++-
  .../gcc.target/loongarch/lasx-xvpermi_q-opt.c | 44 +++++++++++++++++++
  .../loongarch/vec_pack_unpack_256.c           | 18 +++++---
  .../loongarch/vector/lasx/lasx-builtin.c      |  2 +-
  4 files changed, 89 insertions(+), 8 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/loongarch/lasx-xvpermi_q-opt.c

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index c4186b0a779..e66b5b69bdd 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -517,7 +517,7 @@
     (set_attr "mode" "<MODE>")])
;; xvpermi.q
-(define_insn "lasx_xvpermi_q_<LASX:mode>"
+(define_insn_and_split "lasx_xvpermi_q_<LASX:mode>"
    [(set (match_operand:LASX 0 "register_operand" "=f")
        (unspec:LASX
          [(match_operand:LASX 1 "register_operand" "0")
@@ -527,6 +527,37 @@
    "ISA_HAS_LASX"
  {
    return "xvpermi.q\t%u0,%u2,%3";
+}
+  "&& ((INTVAL (operands[3]) & 0xee) == 0x0
+       || (INTVAL (operands[3]) & 0xee) == 0x22)"
+  [(const_int 0)]
+{
+  HOST_WIDE_INT selector = INTVAL (operands[3]);
+  /* Reduce the dependency caused by using output operands[0] as input.  */
+  switch (INTVAL (operands[3]))
+    {
+    case 0x22:
+    case 0x23:
+    case 0x33:
+      selector -= 0x22;
+      operands[2] = operands[1];
+    /* FALLTHRU.  */
+    case 0x0:
+    case 0x1:
+    case 0x11:
+      emit_insn (gen_lasx_xvpermi_d_<mode> (operands[0], operands[2],
+                                           GEN_INT (selector * 0xa + 0x44)));
+      break;
+    case 0x10:
+      emit_move_insn (operands[0], operands[2]);
+      break;
+    case 0x32:
+      emit_move_insn (operands[0], operands[1]);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  DONE;
  }
    [(set_attr "type" "simd_splat")
     (set_attr "mode" "<MODE>")])
diff --git a/gcc/testsuite/gcc.target/loongarch/lasx-xvpermi_q-opt.c b/gcc/testsuite/gcc.target/loongarch/lasx-xvpermi_q-opt.c
new file mode 100644
index 00000000000..16fb9dfecdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/lasx-xvpermi_q-opt.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx -ftree-vectorize" } */
+
+#include <lasxintrin.h>
+
+#define TEST_FUNC(imm)                                                        \
+  __m256i                                                                     \
+  test_##imm (__m256i op0, __m256i op1)                                       \
+  {                                                                           \
+    return __lasx_xvpermi_q (op0, op1, imm);                                  \
+  }
+
+TEST_FUNC (0x00)
+/* { dg-final { scan-assembler-not "test_0x00:.*\txvld.*xvld.*-test_0x00"} } */
+/* { dg-final { scan-assembler-times "test_0x00:.*\txvpermi\\.d.*-test_0x00" 1 } } */
+
+TEST_FUNC (0x01)
+/* { dg-final { scan-assembler-not "test_0x01:.*\txvld.*xvld.*-test_0x01"} } */
+/* { dg-final { scan-assembler-times "test_0x01:.*\txvpermi\\.d.*-test_0x01" 1 } } */
+
+TEST_FUNC (0x10)
+/* { dg-final { scan-assembler-not "test_0x10:.*\txvld.*xvld.*-test_0x10"} } */
+/* { dg-final { scan-assembler-not "test_0x10:.*\txvpermi.*-test_0x10"} } */
+
+TEST_FUNC (0x11)
+/* { dg-final { scan-assembler-not "test_0x11:.*\txvld.*xvld.*-test_0x11"} } */
+/* { dg-final { scan-assembler-times "test_0x11:.*\txvpermi\\.d.*-test_0x11" 1 } } */
+
+TEST_FUNC (0x22)
+/* { dg-final { scan-assembler-not "test_0x22:.*\txvld.*xvld.*-test_0x22"} } */
+/* { dg-final { scan-assembler-times "test_0x22:.*\txvpermi\\.d.*-test_0x22" 1 } } */
+
+TEST_FUNC (0x23)
+/* { dg-final { scan-assembler-not "test_0x23:.*\txvld.*xvld.*-test_0x23"} } */
+/* { dg-final { scan-assembler-times "test_0x23:.*\txvpermi\\.d.*-test_0x23" 1 } } */
+
+TEST_FUNC (0x32)
+/* { dg-final { scan-assembler-not "test_0x32:.*\txvld.*xvld.*-test_0x32"} } */
+/* { dg-final { scan-assembler-not "test_0x32:.*\txvpermi.*-test_0x32"} } */
+
+TEST_FUNC (0x33)
+/* { dg-final { scan-assembler-not "test_0x33:.*\txvld.*xvld.*-test_0x33"} } */
+/* { dg-final { scan-assembler-times "test_0x33:.*\txvpermi\\.d.*-test_0x33" 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_pack_unpack_256.c b/gcc/testsuite/gcc.target/loongarch/vec_pack_unpack_256.c
index 506b7bdb03e..5b2fd9b0599 100644
--- a/gcc/testsuite/gcc.target/loongarch/vec_pack_unpack_256.c
+++ b/gcc/testsuite/gcc.target/loongarch/vec_pack_unpack_256.c
@@ -55,7 +55,8 @@ test_vec_unpacks_float_hi_lo_v8si (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v8si:.*\tvext2xv\\.d\\.w.*-test_vec_unpacks_hi_lo_v8si" } } */
-/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v8si:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v8si" } } */
+/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v8si:.*\txvpermi\\.d.*-test_vec_unpacks_hi_lo_v8si" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacks_hi_lo_v8si:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v8si" } } */
  void
  test_vec_unpacks_hi_lo_v8si (void)
  {
@@ -64,7 +65,8 @@ test_vec_unpacks_hi_lo_v8si (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v16hi:.*\tvext2xv\\.w\\.h.*-test_vec_unpacks_hi_lo_v16hi" } } */
-/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v16hi:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v16hi" } } */
+/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v16hi:.*\txvpermi\\.d.*-test_vec_unpacks_hi_lo_v16hi" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacks_hi_lo_v16hi:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v16hi" } } */
  void
  test_vec_unpacks_hi_lo_v16hi (void)
  {
@@ -73,7 +75,8 @@ test_vec_unpacks_hi_lo_v16hi (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v32qi:.*\tvext2xv\\.h\\.b.*-test_vec_unpacks_hi_lo_v32qi" } } */
-/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v32qi:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v32qi" } } */
+/* { dg-final { scan-assembler "test_vec_unpacks_hi_lo_v32qi:.*\txvpermi\\.d.*-test_vec_unpacks_hi_lo_v32qi" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacks_hi_lo_v32qi:.*\txvpermi\\.q.*-test_vec_unpacks_hi_lo_v32qi" } } */
  void
  test_vec_unpacks_hi_lo_v32qi (void)
  {
@@ -91,7 +94,8 @@ test_vec_unpacks_hi_lo_v8sf (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v8si:.*\tvext2xv\\.du\\.wu.*-test_vec_unpacku_hi_lo_v8si" } } */
-/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v8si:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v8si" } } */
+/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v8si:.*\txvpermi\\.d.*-test_vec_unpacku_hi_lo_v8si" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacku_hi_lo_v8si:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v8si" } } */
  void
  test_vec_unpacku_hi_lo_v8si (void)
  {
@@ -100,7 +104,8 @@ test_vec_unpacku_hi_lo_v8si (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v16hi:.*\tvext2xv\\.wu\\.hu.*-test_vec_unpacku_hi_lo_v16hi" } } */
-/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v16hi:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v16hi" } } */
+/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v16hi:.*\txvpermi\\.d.*-test_vec_unpacku_hi_lo_v16hi" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacku_hi_lo_v16hi:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v16hi" } } */
  void
  test_vec_unpacku_hi_lo_v16hi (void)
  {
@@ -109,7 +114,8 @@ test_vec_unpacku_hi_lo_v16hi (void)
  }
/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v32qi:.*\tvext2xv\\.hu\\.bu.*-test_vec_unpacku_hi_lo_v32qi" } } */
-/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v32qi:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v32qi" } } */
+/* { dg-final { scan-assembler "test_vec_unpacku_hi_lo_v32qi:.*\txvpermi\\.d.*-test_vec_unpacku_hi_lo_v32qi" } } */
+/* { dg-final { scan-assembler-not "test_vec_unpacku_hi_lo_v32qi:.*\txvpermi\\.q.*-test_vec_unpacku_hi_lo_v32qi" } } */
  void
  test_vec_unpacku_hi_lo_v32qi (void)
  {
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-builtin.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-builtin.c
index 64ff870a4c5..3f34a430c4e 100644
--- a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-builtin.c
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-builtin.c
@@ -3301,7 +3301,7 @@ __lasx_vext2xv_du_bu (v32i8 _1)
  v32i8
  __lasx_xvpermi_q (v32i8 _1, v32i8 _2)
  {
-  return __builtin_lasx_xvpermi_q (_1, _2, 1);
+  return __builtin_lasx_xvpermi_q (_1, _2, 0x20);
  }
  v4i64
  __lasx_xvpermi_d (v4i64 _1)

Reply via email to