This patch addresses an issue where GCC generates sub-optimal code for
64-bit (DImode) loads and stores on RV32 targets when the 'zilsd'
extension is enabled. Instead of utilizing the 'ld' instruction
provided by zilsd, the compiler was splitting the operation into two
32-bit 'lw' instructions.
The issue was caused by:
1. The cost model underestimating the cost of splitting vs keeping
DImode operations.
2. The lack of specific shift patterns for RV32 DImode, causing early
splitting in the expand pass.
3. Strict address validity checks rejecting DImode LO_SUM addresses.
This patch:
1. Adjusts riscv_rtx_costs to favor ZILSD DImode operations.
- Load/Store cost is set to COSTS_N_INSNS (1) as ZILSD provides single
ld/sd instructions. This is cheaper than the split cost of 2.
- Register move cost is set to COSTS_N_INSNS (2) - 1. This acts as a
tie-breaker to discourage the lower-subreg pass from splitting
DImode moves, preserving 64-bit integrity for later combine passes.
2. Relaxes riscv_valid_lo_sum_p for ZILSD.
3. Implements deferred splitting for DImode shifts (ashl, ashr, lshr)
using a new helper riscv_expand_di_shift_32bit, which uses a
scratch register to ensure valid immediate handling and avoid ICEs.
Regression tested on riscv-sim (rv32i_zilsd).
Fixed gcc.target/riscv/zilsd-code-gen-split-subreg-2.c.
gcc/ChangeLog:
* config/riscv/riscv-protos.h (riscv_expand_di_shift_32bit): New
prototype.
* config/riscv/riscv.cc (riscv_valid_lo_sum_p): Allow DImode LO_SUM for
ZILSD.
(riscv_address_insns): Adjust cost calculation for ZILSD DImode.
(riscv_rtx_costs): Adjust costs to favor DImode operations on ZILSD;
explicitly set SImode move costs.
(riscv_expand_di_shift_32bit): New helper function to expand 64-bit
shifts into 32-bit instructions using a scratch register.
* config/riscv/riscv.md (<optab>di3): Renamed to...
(<optab>di3_rv64): ...this.
(<optab>di3): New define_expand to dispatch between rv64 and zilsd.
(<optab>di3_rv32): New define_insn_and_split for ZILSD.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/zilsd-code-gen-split-subreg-2.c: Remove XFAIL and
TODO.
Signed-off-by: Xiang'ao Liu <[email protected]>
---
gcc/config/riscv/riscv-protos.h | 2 +
gcc/config/riscv/riscv.cc | 154 +++++++++++++++++-
gcc/config/riscv/riscv.md | 42 ++++-
.../riscv/zilsd-code-gen-split-subreg-2.c | 7 +-
4 files changed, 189 insertions(+), 16 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e3911c49c50..12efb4c0197 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -861,6 +861,8 @@ extern void riscv_reset_previous_fndecl (void);
extern rtx riscv_prefetch_cookie (rtx, rtx);
extern bool riscv_prefetch_offset_address_p (rtx, machine_mode);
+extern void riscv_expand_di_shift_32bit (rtx, rtx, rtx, rtx, enum rtx_code);
+
struct riscv_tune_param;
/* Information about one micro-arch we know about. */
struct riscv_tune_info {
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 4fe2717a3f6..a9b43453ae6 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -2230,7 +2230,8 @@ riscv_valid_lo_sum_p (enum riscv_symbol_type sym_type,
machine_mode mode,
/* We may need to split multiword moves, so make sure that each word
can be accessed without inducing a carry. */
if (size > BITS_PER_WORD
- && (!TARGET_STRICT_ALIGN || size > align))
+ && (!TARGET_STRICT_ALIGN || size > align)
+ && !TARGET_ZILSD)
return false;
return true;
@@ -2570,7 +2571,8 @@ riscv_address_insns (rtx x, machine_mode mode, bool
might_split_p)
/* BLKmode is used for single unaligned loads and stores and should
not count as a multiword mode. */
- if (!riscv_vla_mode_p (mode) && mode != BLKmode && might_split_p)
+ if (!riscv_vla_mode_p (mode) && mode != BLKmode && might_split_p
+ && !(TARGET_ZILSD && mode == DImode && !TARGET_64BIT))
n += (GET_MODE_SIZE (mode).to_constant () + UNITS_PER_WORD - 1) /
UNITS_PER_WORD;
if (addr.type == ADDRESS_LO_SUM)
@@ -4160,23 +4162,28 @@ riscv_rtx_costs (rtx x, machine_mode mode, int
outer_code, int opno ATTRIBUTE_UN
return true;
}
+ if (TARGET_ZILSD && !TARGET_64BIT
+ && REG_P (SET_SRC (x)) && mode == word_mode)
+ {
+ *total = COSTS_N_INSNS (1);
+ return true;
+ }
/* Register move for XLEN * 2. */
if (TARGET_ZILSD
&& register_operand (SET_SRC (x), GET_MODE (SET_SRC (x)))
&& riscv_2x_xlen_mode_p (mode))
{
- /* We still need two instruction for move with ZILSD,
- but let minus one cost to let subreg split don't.
- TODO: Add riscv_tune_param for this. */
+ /* We still need two instructions for move with ZILSD,
+ but use a slightly lower cost to discourage subreg
+ splitting. */
*total = COSTS_N_INSNS (2) - 1;
return true;
}
- /* Load for XLEN * 2. */
+ /* Load for XLEN * 2. ZILSD provides a single ld instruction. */
if (TARGET_ZILSD && MEM_P (SET_SRC (x))
&& riscv_2x_xlen_mode_p (mode))
{
- /* TODO: Add riscv_tune_param for this. */
*total = COSTS_N_INSNS (1);
return true;
}
@@ -4185,11 +4192,10 @@ riscv_rtx_costs (rtx x, machine_mode mode, int
outer_code, int opno ATTRIBUTE_UN
return true;
}
- /* Store for XLEN * 2. */
+ /* Store for XLEN * 2. ZILSD provides a single sd instruction. */
if (TARGET_ZILSD && MEM_P (SET_DEST (x)) && REG_P (SET_SRC (x))
&& riscv_2x_xlen_mode_p (mode))
{
- /* TODO: Add riscv_tune_param for this. */
*total = COSTS_N_INSNS (1);
return true;
}
@@ -16756,6 +16762,136 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode
mode)
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "RISC-V"
+/* Helper to expand 64-bit shifts on RV32 (ZILSD) using 32-bit operations. */
+void
+riscv_expand_di_shift_32bit (rtx dest, rtx src, rtx count, rtx scratch,
+ enum rtx_code code)
+{
+ rtx dest_lo = gen_lowpart (SImode, dest);
+ rtx dest_hi = gen_highpart (SImode, dest);
+ rtx src_lo = gen_lowpart (SImode, src);
+ rtx src_hi = gen_highpart (SImode, src);
+
+ /* Shift amount as SImode register. */
+ rtx sh_reg = gen_reg_rtx (SImode);
+ if (CONST_INT_P (count))
+ emit_move_insn (sh_reg, count);
+ else
+ emit_insn (gen_zero_extendqisi2 (sh_reg, count));
+
+ /* Create temporaries for shift count comparison. */
+ rtx sh_lt_32_label = gen_label_rtx ();
+ rtx done_label = gen_label_rtx ();
+
+ /* sh_minus_32 = sh_reg - 32. */
+ rtx sh_minus_32 = gen_reg_rtx (SImode);
+ emit_insn (gen_addsi3 (sh_minus_32, sh_reg, GEN_INT (-32)));
+
+ /* if (sh_minus_32 < 0) goto sh_lt_32_label (i.e., if sh_reg < 32). */
+ emit_cmp_and_jump_insns (sh_minus_32, const0_rtx, LT, NULL_RTX, SImode, 0,
+ sh_lt_32_label);
+
+ /* --- Case: sh_reg >= 32 --- */
+ if (code == ASHIFTRT)
+ {
+ /* dest_hi = src_hi >> 31 (sign extend). */
+ emit_insn (gen_ashrsi3 (dest_hi, src_hi, GEN_INT (31)));
+ /* dest_lo = src_hi >> (sh_reg - 32). */
+ emit_insn (gen_ashrsi3 (dest_lo, src_hi,
+ gen_lowpart (QImode, sh_minus_32)));
+ }
+ else if (code == LSHIFTRT)
+ {
+ /* dest_hi = 0. */
+ emit_move_insn (dest_hi, const0_rtx);
+ /* dest_lo = src_hi >> (sh_reg - 32). */
+ emit_insn (gen_lshrsi3 (dest_lo, src_hi,
+ gen_lowpart (QImode, sh_minus_32)));
+ }
+ else /* ASHIFT (Left Shift). */
+ {
+ /* dest_lo = 0. */
+ emit_move_insn (dest_lo, const0_rtx);
+ /* dest_hi = src_lo << (sh_reg - 32). */
+ emit_insn (gen_ashlsi3 (dest_hi, src_lo,
+ gen_lowpart (QImode, sh_minus_32)));
+ }
+ emit_jump (done_label); /* Jump to end. */
+
+ /* --- Case: sh_reg < 32 (sh_lt_32_label) --- */
+ emit_label (sh_lt_32_label);
+
+ /* Create a mask that is 0 if sh_reg == 0, and -1 otherwise. */
+ rtx mask = gen_reg_rtx (SImode);
+ rtx tmp_not_zero = gen_reg_rtx (SImode);
+ emit_insn (gen_rtx_SET (tmp_not_zero,
+ gen_rtx_NE (SImode, sh_reg, const0_rtx)));
+ emit_insn (gen_negsi2 (mask, tmp_not_zero));
+
+ rtx sh_complement_32 = gen_reg_rtx (SImode);
+ /* sh_complement_32 = 32 - sh_reg. */
+ emit_move_insn (sh_complement_32, GEN_INT (32));
+ emit_insn (gen_subsi3 (sh_complement_32, sh_complement_32, sh_reg));
+
+ if (code == ASHIFTRT)
+ {
+ /* dest_hi = src_hi >> sh_reg. */
+ emit_insn (gen_ashrsi3 (dest_hi, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp1 = src_lo >> sh_reg. */
+ rtx tmp1 = gen_reg_rtx (SImode);
+ emit_insn (gen_lshrsi3 (tmp1, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp2 = (src_hi << (32 - sh_reg)) & mask. */
+ rtx tmp2 = gen_reg_rtx (SImode);
+ emit_insn (gen_ashlsi3 (tmp2, src_hi,
+ gen_lowpart (QImode, sh_complement_32)));
+ emit_insn (gen_andsi3 (tmp2, tmp2, mask));
+
+ /* dest_lo = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_lo, tmp1, tmp2));
+ }
+ else if (code == LSHIFTRT)
+ {
+ /* dest_hi = src_hi >>> sh_reg. */
+ emit_insn (gen_lshrsi3 (dest_hi, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp1 = src_lo >>> sh_reg. */
+ rtx tmp1 = gen_reg_rtx (SImode);
+ emit_insn (gen_lshrsi3 (tmp1, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp2 = (src_hi << (32 - sh_reg)) & mask. */
+ rtx tmp2 = gen_reg_rtx (SImode);
+ emit_insn (gen_ashlsi3 (tmp2, src_hi,
+ gen_lowpart (QImode, sh_complement_32)));
+ emit_insn (gen_andsi3 (tmp2, tmp2, mask));
+
+ /* dest_lo = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_lo, tmp1, tmp2));
+ }
+ else /* ASHIFT (Left Shift). */
+ {
+ /* dest_lo = src_lo << sh_reg. */
+ emit_insn (gen_ashlsi3 (dest_lo, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp1 = src_hi << sh_reg. */
+ rtx tmp1 = gen_reg_rtx (SImode);
+ emit_insn (gen_ashlsi3 (tmp1, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* tmp2 = (src_lo >>> (32 - sh_reg)) & mask. */
+ rtx tmp2 = gen_reg_rtx (SImode);
+ emit_insn (gen_lshrsi3 (tmp2, src_lo,
+ gen_lowpart (QImode, sh_complement_32)));
+ emit_insn (gen_andsi3 (tmp2, tmp2, mask));
+
+ /* dest_hi = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_hi, tmp1, tmp2));
+ }
+
+ emit_label (done_label); /* End of shift logic. */
+}
+
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 6f8cd26e5c9..2fceb5d3b62 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -777,7 +777,7 @@
[(set_attr "type" "arith")
(set_attr "mode" "DI")])
-(define_expand "addv<mode>4"
+ (define_expand "addv<mode>4"
[(set (match_operand:GPR 0 "register_operand" "=r,r")
(plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
(match_operand:GPR 2 "arith_operand" " r,I")))
@@ -2928,7 +2928,27 @@
}
})
-(define_insn "<optab>di3"
+(define_expand "<optab>di3"
+ [(set (match_operand:DI 0 "register_operand")
+ (any_shift:DI (match_operand:DI 1 "register_operand")
+ (match_operand:QI 2 "arith_operand")))]
+ "TARGET_64BIT || (TARGET_ZILSD && !TARGET_64BIT)"
+{
+ if (TARGET_64BIT)
+ {
+ emit_insn (gen_<optab>di3_rv64 (operands[0], operands[1], operands[2]));
+ DONE;
+ }
+ else
+ {
+ rtx scratch = gen_reg_rtx (SImode);
+ emit_insn (gen_<optab>di3_rv32 (operands[0], operands[1], operands[2],
+ scratch));
+ DONE;
+ }
+})
+
+(define_insn "<optab>di3_rv64"
[(set (match_operand:DI 0 "register_operand" "= r")
(any_shift:DI
(match_operand:DI 1 "register_operand" " r")
@@ -2944,6 +2964,24 @@
[(set_attr "type" "shift")
(set_attr "mode" "DI")])
+(define_insn_and_split "<optab>di3_rv32"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (any_shift:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand:QI 2 "arith_operand" "rI")))
+ (clobber (match_operand:SI 3 "register_operand" "=r"))]
+ "TARGET_ZILSD && !TARGET_64BIT"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
+ operands[3], <CODE>);
+ DONE;
+}
+ [(set_attr "type" "shift")
+ (set_attr "mode" "DI")])
+
+
(define_insn "*<optab><GPR:mode>3_mask_1"
[(set (match_operand:GPR 0 "register_operand" "= r")
(any_shift:GPR
diff --git a/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
b/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
index 3adcd21ea06..8e68efa702a 100644
--- a/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
+++ b/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
@@ -6,11 +6,8 @@ long long foo(long long x)
{
return y >> x;
}
-/* TODO: We should not split that 64 bit load into two 32 bit load if we have
- zilsd, but we split that during the expand time, so it's hard to fix via
cost
- model turning, we could either fix that for expander, or...combine those two
- 32 bit load back later. */
-/* { dg-final { scan-assembler-times "ld\t" 1 { xfail riscv*-*-* } } } */
+
+/* { dg-final { scan-assembler-times "ld\t" 1 } } */
/* Os and Oz will use libcall, so the 64 bit load won't be split. */
/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" } } */
--
2.43.0