In V4SImode, a vec_construct whose load indices are {0, 1, 0, 1} now
uses vldrepl.d, and one whose load indices are {0, 1, 0, 0} uses
vldrepl.d plus vshuf4i.w, reducing the number of scalar loads and
vinsgr2vr.w instructions.
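For example, the following function (with the v4i32 type and x_si
array as defined in the new testcase) is now expected to compile to a
single vldrepl.d followed by one vshuf4i.w instead of a scalar load
and a chain of vinsgr2vr.w:

  v4i32
  vec_construct_v4i32_1 ()
  {
    v4i32 res = { x_si[0], x_si[1], x_si[0], x_si[0] };
    return res;
  }
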
gcc/ChangeLog:
* config/loongarch/lsx.md (lsx_vshuf4i_mem_w_0): New
define_insn_and_split.
(lsx_vldrepl_merge_w_0): New define_insn.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c: Add
vec_construct_v4i32_1 test and adjust the v4i32 scan pattern.
---
gcc/config/loongarch/lsx.md | 62 +++++++++++++++++++
.../vector/lsx/lsx-vec-construct-opt.c | 21 ++++++-
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index cd87757827d..0dea66b572e 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1631,6 +1631,39 @@ (define_insn "lsx_vshuf4i_<lsxfmt_f>"
[(set_attr "type" "simd_shf")
(set_attr "mode" "<MODE>")])
+(define_insn_and_split "lsx_vshuf4i_mem_w_0"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (vec_merge:V4SI
+ (vec_duplicate:V4SI
+ (mem:SI (match_operand:DI 1 "register_operand" "r")))
+ (vec_duplicate:V4SI
+ (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+ (match_operand 2 "const_uimm4_operand" "")))]
+ "ISA_HAS_LSX"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ operands[0] = gen_rtx_REG (V2DImode, REGNO (operands[0]));
+ emit_insn (gen_lsx_vldrepl_d_insn_0 (operands[0], operands[1]));
+
+ operands[0] = gen_rtx_REG (V4SImode, REGNO (operands[0]));
+ rtx sel[4];
+ int op2 = INTVAL (operands[2]);
+ int mask = 1;
+
+  /* Convert the immediate mask to a vshuf4i.w selection.  */
+ for (int i = 0; i < 4; ++i)
+ {
+ sel[i] = (op2 & mask) ? const0_rtx : const1_rtx;
+ mask = mask << 1;
+ }
+
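+  /* For example, res = { x[0], x[1], x[0], x[0] } corresponds to
+     operands[2] == 0b1101 (bits 0, 2 and 3 set), so sel is
+     { 0, 1, 0, 0 } and the vshuf4i.w picks element 0 (x[0]) or
+     element 1 (x[1]) of the { x[0], x[1], x[0], x[1] } value loaded
+     by vldrepl.d.  */
+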
+ rtx shuf4i_mask = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, sel));
+ emit_insn (gen_lsx_vshuf4i_w (operands[0], operands[0], shuf4i_mask));
+ DONE;
+})
+
(define_insn "lsx_vsrar_<lsxfmt>"
[(set (match_operand:ILSX 0 "register_operand" "=f")
(unspec:ILSX [(match_operand:ILSX 1 "register_operand" "f")
@@ -2550,6 +2583,35 @@ (define_insn "lsx_vldrepl_<lsxfmt_f>_insn_0"
(set_attr "mode" "<MODE>")
(set_attr "length" "4")])
+;; In a 128-bit register, this template loads two consecutive SImode
+;; values from memory into both the upper and the lower 64 bits.
+;; Operand 2 describes a vec_merge of the SImode values at two
+;; consecutive addresses whose result ends up in either the lower or
+;; the upper 64 bits.  When operand 3 is 0, the lower 64 bits are
+;; copied to the upper 64 bits; when operand 3 is 1, the upper 64 bits
+;; are copied to the lower 64 bits.
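+;;
+;; For example, operands[2] == 0x5 with operands[3] == 0 satisfies
+;; (0x5 & 0x3) == 0x1: the merge takes element 0 from (mem:SI %1) and
+;; element 1 from (mem:SI (%1 + 4)), and a single vldrepl.d reproduces
+;; that pair in both 64-bit halves.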
+
+(define_insn "lsx_vldrepl_merge_w_0"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (unspec:V4SI
+ [(vec_merge:V4SI
+ (vec_duplicate:V4SI
+ (mem:SI (match_operand:DI 1 "register_operand" "r")))
+ (vec_duplicate:V4SI
+ (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+ (match_operand 2 "const_uimm4_operand" ""))
+ (match_operand 3 "const_0_or_1_operand" "")]
+ UNSPEC_LSX_VREPLVEI_MIRROR))]
+ "ISA_HAS_LSX
+ && (INTVAL (operands[3]) ? (INTVAL (operands[2]) & 0xc) == 0x4
+ : (INTVAL (operands[2]) & 0x3) == 0x1)"
+{
+ return "vldrepl.d\t%w0,%1,0";
+}
+ [(set_attr "type" "simd_load")
+ (set_attr "mode" "V4SI")
+ (set_attr "length" "4")])
+
;; Offset store by sel
(define_expand "lsx_vstelm_<lsxfmt_f>"
[(match_operand:LSX 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
index 92da1c8af9c..a35cda62f12 100644
--- a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
@@ -20,9 +20,9 @@ vec_construct_v2i64 ()
return res;
}
-/* Only load the lowest 2 elements and directly copy them to high half-part,
- reducing more vinsgr2vr.w. */
-/* { dg-final { scan-assembler-times "v4i32:.*\tvreplvei\\.d.*v4i32" 1 } } */
+/* Load the lowest 2 elements and copy them directly to the high half
+   with vldrepl.d.  */
+/* { dg-final { scan-assembler-times "v4i32:.*\tvldrepl\\.d.*v4i32" 1 } } */
v4i32
vec_construct_v4i32 ()
{
@@ -32,6 +32,21 @@ vec_construct_v4i32 ()
return res;
}
+/* Load 2 elements at once with vldrepl.d and shuffle them into place
+   with vshuf4i.w, avoiding vinsgr2vr.w.  */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvldrepl\\.d.*v4i32_1" 1 } } */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvshuf4i\\.w.*v4i32_1" 1 } } */
+v4i32
+vec_construct_v4i32_1 ()
+{
+  v4i32 res
+    = { x_si[0], x_si[1], x_si[0], x_si[0] };
+ return res;
+}
+
/* Only load the lowest 4 elements and directly copy them to high half-part,
reducing more vinsgr2vr.h. */
/* { dg-final { scan-assembler-times "v8i16:.*\tvreplvei\\.d.*v8i16" 1 } } */
--
2.20.1