Currently, shuffles in which LoongArch selects elements from two vectors at corresponding positions are implemented through the [x]vshuf instruction, but this introduces an additional copy of the index vector. In such cases, the [x]vbitsel.v instruction can be used instead as an optimization.
gcc/ChangeLog: * config/loongarch/loongarch.cc (loongarch_try_expand_lsx_vshuf_const): Adjust. (loongarch_is_bitsel_pattern): Add new check function. (loongarch_expand_vec_perm_bitsel): Add new implement function. (loongarch_expand_lsx_shuffle): Adjust. (loongarch_expand_vec_perm_const): Add new optimize case. * config/loongarch/lsx.md (lsx_vbitsel_<lsxfmt>): Adjust insn pattern mode. (lsx_vbitsel_<lsxfmt_f>): Adjust insn pattern mode. gcc/testsuite/ChangeLog: * gcc.target/loongarch/vec_perm-xvshuf.c: Move to... * gcc.target/loongarch/vec_perm-xvbitsel.c: ...here. * gcc.target/loongarch/vec_perm-vbitsel.c: New test. --- gcc/config/loongarch/loongarch.cc | 164 +++++++++++++++++- gcc/config/loongarch/lsx.md | 14 +- .../gcc.target/loongarch/vec_perm-vbitsel.c | 17 ++ ...{vec_perm-xvshuf.c => vec_perm-xvbitsel.c} | 4 +- 4 files changed, 188 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c rename gcc/testsuite/gcc.target/loongarch/{vec_perm-xvshuf.c => vec_perm-xvbitsel.c} (77%) diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 3ac6a74f15b..a17fc1dfec2 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -8372,7 +8372,10 @@ loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d) else { sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm)); - emit_move_insn (d->target, sel); + /* Weakening dependencies by copying indices (for vshuf). */ + tmp = gen_reg_rtx (d->vmode); + emit_move_insn (tmp, sel); + emit_move_insn (d->target, tmp); } switch (d->vmode) @@ -8444,9 +8447,31 @@ loongarch_is_imm_set_shuffle (struct expand_vec_perm_d *d) return true; } +/* Check if the d->perm meets the requirements of the [x]vbitsel.v insn. 
*/ +static bool +loongarch_is_bitsel_pattern (struct expand_vec_perm_d *d) +{ + bool result = true; + + for (int i = 0; i < d->nelt; i++) + { + unsigned char buf = d->perm[i]; + if ((buf % d->nelt) != i) + { + result = false; + break; + } + } + + return result; +} + static bool loongarch_expand_vec_perm_even_odd (struct expand_vec_perm_d *); +static bool +loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *); + /* Try to match and expand all kinds of 128-bit const vector permutation cases. */ @@ -8462,6 +8487,9 @@ loongarch_expand_lsx_shuffle (struct expand_vec_perm_d *d) if (loongarch_expand_vec_perm_even_odd (d)) return true; + if (loongarch_expand_vec_perm_bitsel (d)) + return true; + return loongarch_try_expand_lsx_vshuf_const (d); } @@ -9122,6 +9150,132 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) } } +/* Try to use the [x]vbitsel.v insn to optimize the vector shuffle, which + can reduce one copy insn in the loop compared to [x]vshuff. */ +static bool +loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *d) +{ + if (!ISA_HAS_LSX && !ISA_HAS_LASX) + return false; + + if (!loongarch_is_bitsel_pattern (d)) + return false; + + if (d->testing_p) + return true; + + int i, val; + rtx tmp, tmp2, sel, op0, op1, target; + rtx rperm[MAX_VECT_LEN]; + + for (i = 0; i < d->nelt; i += 1) + { + /* Here -1 means that all bits of the corresponding type are 1 + (including the sign bit). */ + val = d->perm[i] >= d->nelt ? -1 : 0; + rperm[i] = GEN_INT (val); + } + + tmp2 = gen_reg_rtx (d->vmode); + + switch (d->vmode) + { + case E_V4DFmode: + sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt, + rperm)); + /* Because the [x]vbitsel.v insn pattern requires that all src + operands and dest operands are of the same type, they need to + be type-converted. 
*/ + tmp = simplify_gen_subreg (E_V4DImode, tmp2, d->vmode, 0); + emit_move_insn (tmp, sel); + break; + case E_V2DFmode: + sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, + rperm)); + tmp = simplify_gen_subreg (E_V2DImode, tmp2, d->vmode, 0); + emit_move_insn (tmp, sel); + break; + case E_V8SFmode: + sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt, + rperm)); + tmp = simplify_gen_subreg (E_V8SImode, tmp2, d->vmode, 0); + emit_move_insn (tmp, sel); + break; + case E_V4SFmode: + sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, + rperm)); + tmp = simplify_gen_subreg (E_V4SImode, tmp2, d->vmode, 0); + emit_move_insn (tmp, sel); + break; + default: + sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, + rperm)); + emit_move_insn (tmp2, sel); + break; + } + + target = d->target; + op0 = d->op0; + op1 = d->one_vector_p ? d->op0 : d->op1; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + switch (d->vmode) + { + case E_V2DFmode: + emit_insn (gen_lsx_vbitsel_d_f (target, op0, op1, tmp2)); + break; + case E_V2DImode: + emit_insn (gen_lsx_vbitsel_d (target, op0, op1, tmp2)); + break; + case E_V4SFmode: + emit_insn (gen_lsx_vbitsel_w_f (target, op0, op1, tmp2)); + break; + case E_V4SImode: + emit_insn (gen_lsx_vbitsel_w (target, op0, op1, tmp2)); + break; + case E_V8HImode: + emit_insn (gen_lsx_vbitsel_h (target, op0, op1, tmp2)); + break; + case E_V16QImode: + emit_insn (gen_lsx_vbitsel_b (target, op0, op1, tmp2)); + break; + default: + gcc_unreachable (); + break; + } + } + else + { + switch (d->vmode) + { + case E_V4DFmode: + emit_insn (gen_lasx_xvbitsel_d_f (target, op0, op1, tmp2)); + break; + case E_V4DImode: + emit_insn (gen_lasx_xvbitsel_d (target, op0, op1, tmp2)); + break; + case E_V8SFmode: + emit_insn (gen_lasx_xvbitsel_w_f (target, op0, op1, tmp2)); + break; + case E_V8SImode: + emit_insn (gen_lasx_xvbitsel_w (target, op0, op1, tmp2)); + break; + case E_V16HImode: + emit_insn (gen_lasx_xvbitsel_h (target, op0, op1, 
tmp2)); + break; + case E_V32QImode: + emit_insn (gen_lasx_xvbitsel_b (target, op0, op1, tmp2)); + break; + default: + gcc_unreachable (); + break; + } + } + + return true; +} + /* Following are the assist function for const vector permutation support. */ static bool loongarch_is_quad_duplicate (struct expand_vec_perm_d *d) @@ -9598,6 +9752,9 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d) return true; } + if (loongarch_expand_vec_perm_bitsel (d)) + return true; + if (loongarch_if_match_xvshuffle (d)) { if (d->testing_p) @@ -9666,7 +9823,10 @@ expand_perm_const_end: default: sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm)); - emit_move_insn (d->target, sel); + /* Weakening dependencies by copying indices (for xvshuf). */ + tmp = gen_reg_rtx (d->vmode); + emit_move_insn (tmp, sel); + emit_move_insn (d->target, tmp); break; } diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md index ca0066a21ed..0a39b905712 100644 --- a/gcc/config/loongarch/lsx.md +++ b/gcc/config/loongarch/lsx.md @@ -1073,13 +1073,13 @@ (define_insn "lsx_vbitrevi_<lsxfmt>" [(set_attr "type" "simd_bit") (set_attr "mode" "<MODE>")]) -(define_insn "lsx_vbitsel_<lsxfmt>" - [(set (match_operand:ILSX 0 "register_operand" "=f") - (ior:ILSX (and:ILSX (not:ILSX - (match_operand:ILSX 3 "register_operand" "f")) - (match_operand:ILSX 1 "register_operand" "f")) - (and:ILSX (match_dup 3) - (match_operand:ILSX 2 "register_operand" "f"))))] +(define_insn "lsx_vbitsel_<lsxfmt_f>" + [(set (match_operand:LSX 0 "register_operand" "=f") + (ior:LSX (and:LSX (not:LSX + (match_operand:LSX 3 "register_operand" "f")) + (match_operand:LSX 1 "register_operand" "f")) + (and:LSX (match_dup 3) + (match_operand:LSX 2 "register_operand" "f"))))] "ISA_HAS_LSX" "vbitsel.v\t%w0,%w1,%w2,%w3" [(set_attr "type" "simd_bitmov") diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c new file mode 100644 index 
00000000000..7a5118273c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mlsx" } */ +/* { dg-final { scan-assembler-not "vshuf.w" } } */ +/* { dg-final { scan-assembler-not "vori.b" } } */ +/* { dg-final { scan-assembler "vbitsel.v" } } */ + +void +foo (int a[], int b[], int c[]) +{ + for (int i = 0; i < 100; i += 4) + { + c[i + 0] = a[i + 0] + b[i + 0]; + c[i + 1] = a[i + 1] - b[i + 1]; + c[i + 2] = a[i + 2] - b[i + 2]; + c[i + 3] = a[i + 3] + b[i + 3]; + } +} diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c similarity index 77% rename from gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c rename to gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c index 6b19c2c2fd8..b3808b550e5 100644 --- a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c @@ -1,8 +1,8 @@ /* { dg-do compile } */ /* { dg-options "-O3 -mlasx" } */ -/* { dg-final { scan-assembler "xvshuf.w" } } */ +/* { dg-final { scan-assembler-not "xvshuf.w" } } */ /* { dg-final { scan-assembler-not "xvperm.w" } } */ -/* { dg-final { scan-assembler-not "xvbitsel.v" } } */ +/* { dg-final { scan-assembler "xvbitsel.v" } } */ void foo (int a[], int b[], int c[]) -- 2.38.1