Currently, shuffles that select elements from two vectors at
corresponding positions are implemented on LoongArch through the
[x]vshuf instruction, but this introduces an extra copy of the index
vector.  In this case, the [x]vbitsel.v instruction can be used instead.
gcc/ChangeLog:
* config/loongarch/loongarch.cc (loongarch_try_expand_lsx_vshuf_const):
Adjust.
(loongarch_is_bitsel_pattern): Add new check function.
(loongarch_expand_vec_perm_bitsel): Add new implement function.
(loongarch_expand_lsx_shuffle): Adjust.
(loongarch_expand_vec_perm_const): Add new optimize case.
* config/loongarch/lsx.md (lsx_vbitsel_<lsxfmt>): Adjust insn
pattern mode.
(lsx_vbitsel_<lsxfmt_f>): Adjust insn pattern mode.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/vec_perm-xvshuf.c: Move to...
* gcc.target/loongarch/vec_perm-xvbitsel.c: ...here.
* gcc.target/loongarch/vec_perm-vbitsel.c: New test.
---
gcc/config/loongarch/loongarch.cc | 164 +++++++++++++++++-
gcc/config/loongarch/lsx.md | 14 +-
.../gcc.target/loongarch/vec_perm-vbitsel.c | 17 ++
...{vec_perm-xvshuf.c => vec_perm-xvbitsel.c} | 4 +-
4 files changed, 188 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c
rename gcc/testsuite/gcc.target/loongarch/{vec_perm-xvshuf.c =>
vec_perm-xvbitsel.c} (77%)
diff --git a/gcc/config/loongarch/loongarch.cc
b/gcc/config/loongarch/loongarch.cc
index 3ac6a74f15b..a17fc1dfec2 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8372,7 +8372,10 @@ loongarch_try_expand_lsx_vshuf_const (struct
expand_vec_perm_d *d)
else
{
sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
- emit_move_insn (d->target, sel);
+ /* Weaken the dependency chain by copying the indices (for vshuf).  */
+ tmp = gen_reg_rtx (d->vmode);
+ emit_move_insn (tmp, sel);
+ emit_move_insn (d->target, tmp);
}
switch (d->vmode)
@@ -8444,9 +8447,31 @@ loongarch_is_imm_set_shuffle (struct expand_vec_perm_d
*d)
return true;
}
+/* Check whether d->perm meets the requirements of the [x]vbitsel.v insn.  */
+static bool
+loongarch_is_bitsel_pattern (struct expand_vec_perm_d *d)
+{
+ bool result = true;
+
+ for (int i = 0; i < d->nelt; i++)
+ {
+ unsigned char buf = d->perm[i];
+ if ((buf % d->nelt) != i)
+ {
+ result = false;
+ break;
+ }
+ }
+
+ return result;
+}
+
static bool
loongarch_expand_vec_perm_even_odd (struct expand_vec_perm_d *);
+static bool
+loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *);
+
/* Try to match and expand all kinds of 128-bit const vector permutation
cases. */
@@ -8462,6 +8487,9 @@ loongarch_expand_lsx_shuffle (struct expand_vec_perm_d *d)
if (loongarch_expand_vec_perm_even_odd (d))
return true;
+ if (loongarch_expand_vec_perm_bitsel (d))
+ return true;
+
return loongarch_try_expand_lsx_vshuf_const (d);
}
@@ -9122,6 +9150,132 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx
op1, rtx sel)
}
}
+/* Try to use the [x]vbitsel.v insn to optimize the vector shuffle, which
+ can reduce one copy insn in the loop compared to [x]vshuf.  */
+static bool
+loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *d)
+{
+ if (!ISA_HAS_LSX && !ISA_HAS_LASX)
+ return false;
+
+ if (!loongarch_is_bitsel_pattern (d))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ int i, val;
+ rtx tmp, tmp2, sel, op0, op1, target;
+ rtx rperm[MAX_VECT_LEN];
+
+ for (i = 0; i < d->nelt; i += 1)
+ {
+ /* Here -1 means that all bits of the corresponding type are 1
+ (including the sign bit). */
+ val = d->perm[i] >= d->nelt ? -1 : 0;
+ rperm[i] = GEN_INT (val);
+ }
+
+ tmp2 = gen_reg_rtx (d->vmode);
+
+ switch (d->vmode)
+ {
+ case E_V4DFmode:
+ sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt,
+ rperm));
+ /* Because the [x]vbitsel.v insn pattern requires that all src
+ operands and dest operands are of the same type, they need to
+ be type-converted. */
+ tmp = simplify_gen_subreg (E_V4DImode, tmp2, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+ break;
+ case E_V2DFmode:
+ sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt,
+ rperm));
+ tmp = simplify_gen_subreg (E_V2DImode, tmp2, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+ break;
+ case E_V8SFmode:
+ sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt,
+ rperm));
+ tmp = simplify_gen_subreg (E_V8SImode, tmp2, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+ break;
+ case E_V4SFmode:
+ sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt,
+ rperm));
+ tmp = simplify_gen_subreg (E_V4SImode, tmp2, d->vmode, 0);
+ emit_move_insn (tmp, sel);
+ break;
+ default:
+ sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt,
+ rperm));
+ emit_move_insn (tmp2, sel);
+ break;
+ }
+
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->one_vector_p ? d->op0 : d->op1;
+
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ switch (d->vmode)
+ {
+ case E_V2DFmode:
+ emit_insn (gen_lsx_vbitsel_d_f (target, op0, op1, tmp2));
+ break;
+ case E_V2DImode:
+ emit_insn (gen_lsx_vbitsel_d (target, op0, op1, tmp2));
+ break;
+ case E_V4SFmode:
+ emit_insn (gen_lsx_vbitsel_w_f (target, op0, op1, tmp2));
+ break;
+ case E_V4SImode:
+ emit_insn (gen_lsx_vbitsel_w (target, op0, op1, tmp2));
+ break;
+ case E_V8HImode:
+ emit_insn (gen_lsx_vbitsel_h (target, op0, op1, tmp2));
+ break;
+ case E_V16QImode:
+ emit_insn (gen_lsx_vbitsel_b (target, op0, op1, tmp2));
+ break;
+ default:
+ gcc_unreachable ();
+ break;
+ }
+ }
+ else
+ {
+ switch (d->vmode)
+ {
+ case E_V4DFmode:
+ emit_insn (gen_lasx_xvbitsel_d_f (target, op0, op1, tmp2));
+ break;
+ case E_V4DImode:
+ emit_insn (gen_lasx_xvbitsel_d (target, op0, op1, tmp2));
+ break;
+ case E_V8SFmode:
+ emit_insn (gen_lasx_xvbitsel_w_f (target, op0, op1, tmp2));
+ break;
+ case E_V8SImode:
+ emit_insn (gen_lasx_xvbitsel_w (target, op0, op1, tmp2));
+ break;
+ case E_V16HImode:
+ emit_insn (gen_lasx_xvbitsel_h (target, op0, op1, tmp2));
+ break;
+ case E_V32QImode:
+ emit_insn (gen_lasx_xvbitsel_b (target, op0, op1, tmp2));
+ break;
+ default:
+ gcc_unreachable ();
+ break;
+ }
+ }
+
+ return true;
+}
+
/* Following are the assist function for const vector permutation support. */
static bool
loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
@@ -9598,6 +9752,9 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d
*d)
return true;
}
+ if (loongarch_expand_vec_perm_bitsel (d))
+ return true;
+
if (loongarch_if_match_xvshuffle (d))
{
if (d->testing_p)
@@ -9666,7 +9823,10 @@ expand_perm_const_end:
default:
sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt,
rperm));
- emit_move_insn (d->target, sel);
+ /* Weaken the dependency chain by copying the indices (for xvshuf).  */
+ tmp = gen_reg_rtx (d->vmode);
+ emit_move_insn (tmp, sel);
+ emit_move_insn (d->target, tmp);
break;
}
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index ca0066a21ed..0a39b905712 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1073,13 +1073,13 @@ (define_insn "lsx_vbitrevi_<lsxfmt>"
[(set_attr "type" "simd_bit")
(set_attr "mode" "<MODE>")])
-(define_insn "lsx_vbitsel_<lsxfmt>"
- [(set (match_operand:ILSX 0 "register_operand" "=f")
- (ior:ILSX (and:ILSX (not:ILSX
- (match_operand:ILSX 3 "register_operand" "f"))
- (match_operand:ILSX 1 "register_operand" "f"))
- (and:ILSX (match_dup 3)
- (match_operand:ILSX 2 "register_operand" "f"))))]
+(define_insn "lsx_vbitsel_<lsxfmt_f>"
+ [(set (match_operand:LSX 0 "register_operand" "=f")
+ (ior:LSX (and:LSX (not:LSX
+ (match_operand:LSX 3 "register_operand" "f"))
+ (match_operand:LSX 1 "register_operand" "f"))
+ (and:LSX (match_dup 3)
+ (match_operand:LSX 2 "register_operand" "f"))))]
"ISA_HAS_LSX"
"vbitsel.v\t%w0,%w1,%w2,%w3"
[(set_attr "type" "simd_bitmov")
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c
b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c
new file mode 100644
index 00000000000..7a5118273c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlsx" } */
+/* { dg-final { scan-assembler-not "vshuf.w" } } */
+/* { dg-final { scan-assembler-not "vori.b" } } */
+/* { dg-final { scan-assembler "vbitsel.v" } } */
+
+void
+foo (int a[], int b[], int c[])
+{
+ for (int i = 0; i < 100; i += 4)
+ {
+ c[i + 0] = a[i + 0] + b[i + 0];
+ c[i + 1] = a[i + 1] - b[i + 1];
+ c[i + 2] = a[i + 2] - b[i + 2];
+ c[i + 3] = a[i + 3] + b[i + 3];
+ }
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c
similarity index 77%
rename from gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
rename to gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c
index 6b19c2c2fd8..b3808b550e5 100644
--- a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c
@@ -1,8 +1,8 @@
/* { dg-do compile } */
/* { dg-options "-O3 -mlasx" } */
-/* { dg-final { scan-assembler "xvshuf.w" } } */
+/* { dg-final { scan-assembler-not "xvshuf.w" } } */
/* { dg-final { scan-assembler-not "xvperm.w" } } */
-/* { dg-final { scan-assembler-not "xvbitsel.v" } } */
+/* { dg-final { scan-assembler "xvbitsel.v" } } */
void
foo (int a[], int b[], int c[])
--
2.38.1