https://gcc.gnu.org/g:dcba959fb30dc250eeb6fdd05aa878e5f1fc8c2d
commit r16-2174-gdcba959fb30dc250eeb6fdd05aa878e5f1fc8c2d
Author: Robin Dapp <rd...@ventanamicro.com>
Date:   Thu Jul 10 09:41:48 2025 +0200

    RISC-V: Make zero-stride load broadcast a tunable.

    This patch makes the zero-stride load broadcast idiom dependent on a
    uarch tunable "use_zero_stride_load".  Right now we have quite a few
    paths that reach a strided load, and some of them are not exactly
    straightforward.

    While broadcast is relatively rare on rv64 targets, it is more common
    on rv32 targets that want to vectorize 64-bit elements.

    While the patch is more involved than I would have liked, it could
    have touched even more places.  The whole broadcast-like insn path
    feels a bit hackish due to the several optimizations we employ.  Some
    of the complications stem from the fact that we lump together real
    broadcasts, vector single-element sets, and strided broadcasts.  The
    strided-load alternatives currently require a memory_constraint to
    work properly, which causes more complications when trying to disable
    just these.

    In short, the whole pred_broadcast handling in combination with the
    sew64_scalar_helper could use work in the future.  I was about to
    start on it in this patch but soon realized that it would only
    distract from the original intent.  What can help in the future is to
    split strided and non-strided broadcasts entirely, as well as the
    single-element sets.

    It is still unclear whether we need to pay special attention to
    misaligned strided loads (PR120782).

    I regtested on rv32 and rv64 with strided_load_broadcast_p forced to
    true and false.  With either setting I didn't observe any new
    execution failures, but obviously there are new scan failures with
    strided broadcast turned off.

	PR target/118734

gcc/ChangeLog:

	* config/riscv/constraints.md (Wdm): Use tunable for Wdm
	constraint.
	* config/riscv/riscv-protos.h (emit_avltype_insn): Declare.
	(can_be_broadcasted_p): Rename to...
	(can_be_broadcast_p): ...this.
	(strided_load_broadcast_p): Declare.
	* config/riscv/predicates.md: Use renamed function.
	* config/riscv/riscv-selftests.cc (run_broadcast_selftests): Only
	run broadcast selftest if strided broadcasts are OK.
	* config/riscv/riscv-v.cc (emit_avltype_insn): New function.
	(sew64_scalar_helper): Only emit a pred_broadcast if the new
	tunable says so.
	(can_be_broadcasted_p): Rename to...
	(can_be_broadcast_p): ...this and use new tunable.
	* config/riscv/riscv.cc (struct riscv_tune_param): Add strided
	broadcast tunable.
	(strided_load_broadcast_p): Implement.
	* config/riscv/vector.md: Use strided_load_broadcast_p () and
	work around 64-bit broadcast on rv32 targets.
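To make the trade-off concrete, here is a sketch (not part of the commit;
the function name and the exact assembly are illustrative) of the kind of
code that exercises this path, with the two expansion strategies noted in
comments:

    /* Illustrative only: on rv32gcv a DImode loop invariant cannot live
       in a single GPR, so the vectorizer has to broadcast it into a
       vector register.  */
    void
    broadcast_u64 (unsigned long long *restrict dst, unsigned long long x,
                   int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = x;
    }

    /* With the tunable enabled (the default for every in-tree uarch) the
       scalar is spilled and broadcast via a zero-stride load, roughly:
           vlse64.v    v8,(sp),zero
       With it disabled, the new fallback moves the scalar into a V1DI
       register and broadcasts it with
           vrgather.vi v8,v1,0
       instead.  */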
Diff:
---
 gcc/config/riscv/constraints.md     |  7 ++--
 gcc/config/riscv/predicates.md      |  2 +-
 gcc/config/riscv/riscv-protos.h     |  4 ++-
 gcc/config/riscv/riscv-selftests.cc | 10 ++++--
 gcc/config/riscv/riscv-v.cc         | 58 +++++++++++++++++++++++++++-----
 gcc/config/riscv/riscv.cc           | 20 +++++++++++
 gcc/config/riscv/vector.md          | 66 +++++++++++++++++++++++++++----------
 7 files changed, 133 insertions(+), 34 deletions(-)

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index ccab1a2e29df..5ecaa19eb014 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -237,10 +237,11 @@
   (and (match_code "const_vector")
        (match_test "rtx_equal_p (op, riscv_vector::gen_scalar_move_mask (GET_MODE (op)))")))

-(define_memory_constraint "Wdm"
+(define_constraint "Wdm"
   "Vector duplicate memory operand"
-  (and (match_code "mem")
-       (match_code "reg" "0")))
+  (and (match_test "strided_load_broadcast_p ()")
+       (and (match_code "mem")
+            (match_code "reg" "0"))))

 ;; Vendor ISA extension constraints.

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 8baad2fae7a9..1f9a6b562e53 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -617,7 +617,7 @@

 ;; The scalar operand can be directly broadcast by RVV instructions.
 (define_predicate "direct_broadcast_operand"
-  (match_test "riscv_vector::can_be_broadcasted_p (op)"))
+  (match_test "riscv_vector::can_be_broadcast_p (op)"))

 ;; A CONST_INT operand that has exactly two bits cleared.
 (define_predicate "const_nottwobits_operand"

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 38f63ea84248..a41c4c299fac 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -604,6 +604,7 @@ void emit_vlmax_vsetvl (machine_mode, rtx);
 void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_insn (unsigned, unsigned, rtx *);
 void emit_nonvlmax_insn (unsigned, unsigned, rtx *, rtx);
+void emit_avltype_insn (unsigned, unsigned, rtx *, avl_type, rtx = nullptr);
 void emit_vlmax_insn_lra (unsigned, unsigned, rtx *, rtx);
 enum vlmul_type get_vlmul (machine_mode);
 rtx get_vlmax_rtx (machine_mode);
@@ -760,7 +761,7 @@ uint8_t get_sew (rtx_insn *);
 enum vlmul_type get_vlmul (rtx_insn *);
 int count_regno_occurrences (rtx_insn *, unsigned int);
 bool imm_avl_p (machine_mode);
-bool can_be_broadcasted_p (rtx);
+bool can_be_broadcast_p (rtx);
 bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
 bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
@@ -813,6 +814,7 @@ extern const char *th_output_move (rtx, rtx);
 extern bool th_print_operand_address (FILE *, machine_mode, rtx);
 #endif

+extern bool strided_load_broadcast_p (void);
 extern bool riscv_use_divmod_expander (void);
 void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 extern bool

diff --git a/gcc/config/riscv/riscv-selftests.cc b/gcc/config/riscv/riscv-selftests.cc
index 34d01ac76b75..9ca1ffee394f 100644
--- a/gcc/config/riscv/riscv-selftests.cc
+++ b/gcc/config/riscv/riscv-selftests.cc
@@ -342,9 +342,13 @@ run_broadcast_selftests (void)
 	expand_vector_broadcast (mode, mem);                               \
 	insn = get_last_insn ();                                           \
 	src = SET_SRC (PATTERN (insn));                                    \
-	ASSERT_TRUE (MEM_P (XEXP (src, 0)));                               \
-	ASSERT_TRUE (                                                      \
-	  rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0)))); \
+	if (strided_load_broadcast_p ())                                   \
+	  {                                                                \
+	    ASSERT_TRUE (MEM_P (XEXP (src, 0)));                           \
+	    ASSERT_TRUE (                                                  \
+	      rtx_equal_p (src,                                            \
+			   gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));  \
+	  }                                                                \
 	end_sequence ();                                                   \
 	/* Test vmv.v.x or vfmv.v.f.  */                                   \
 	start_sequence ();                                                 \

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 2efe56afc9eb..242ac087764f 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -437,6 +437,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
   e.emit_insn ((enum insn_code) icode, ops);
 }

+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE.  For a
+   non-VLMAX insn, the length must be specified in VL.  */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+		   avl_type type, rtx vl)
+{
+  if (type != avl_type::VLMAX && vl != NULL_RTX)
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+      e.set_vl (vl);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+  else
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+}
+
 /* Return true if the vector duplicated by a super element which is the fusion
    of consecutive elements.

@@ -2144,21 +2164,40 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
       return false;
     }

+  bool avoid_strided_broadcast = false;
   if (CONST_INT_P (*scalar_op))
     {
       if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
-	*scalar_op = force_const_mem (scalar_mode, *scalar_op);
+	{
+	  if (strided_load_broadcast_p ())
+	    *scalar_op = force_const_mem (scalar_mode, *scalar_op);
+	  else
+	    avoid_strided_broadcast = true;
+	}
       else
 	*scalar_op = force_reg (scalar_mode, *scalar_op);
     }

   rtx tmp = gen_reg_rtx (vector_mode);
-  rtx ops[] = {tmp, *scalar_op};
-  if (type == VLMAX)
-    emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
+  if (!avoid_strided_broadcast)
+    {
+      rtx ops[] = {tmp, *scalar_op};
+      emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
+			 type, vl);
+    }
   else
-    emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
-			vl);
+    {
+      /* Load scalar as V1DI and broadcast via vrgather.vi.  */
+      rtx tmp1 = gen_reg_rtx (V1DImode);
+      emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op,
+					    scalar_mode));
+      tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode);
+
+      rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)};
+      emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode),
+		       BINARY_OP, ops);
+    }
+
   emit_vector_func (operands, tmp);

   return true;

@@ -5769,9 +5808,9 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
   return count;
 }

-/* Return true if the OP can be directly broadcasted.  */
+/* Return true if the OP can be directly broadcast.  */
 bool
-can_be_broadcasted_p (rtx op)
+can_be_broadcast_p (rtx op)
 {
   machine_mode mode = GET_MODE (op);
   /* We don't allow RA (register allocation) reload generate
@@ -5783,7 +5822,8 @@ can_be_broadcasted_p (rtx op)
     return false;

   if (satisfies_constraint_K (op) || register_operand (op, mode)
-      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+      || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op))
+      || rtx_equal_p (op, CONST0_RTX (mode)))
     return true;

   return can_create_pseudo_p () && nonmemory_operand (op, mode);

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 023adc3284df..a4428f0e96d2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -306,6 +306,7 @@ struct riscv_tune_param
   bool vector_unaligned_access;
   bool use_divmod_expansion;
   bool overlap_op_by_pieces;
+  bool use_zero_stride_load;
   bool speculative_sched_vsetvl;
   unsigned int fusible_ops;
   const struct cpu_vector_cost *vec_costs;
@@ -469,6 +470,7 @@ static const struct riscv_tune_param generic_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -492,6 +494,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -515,6 +518,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -538,6 +542,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,	/* fusible_ops */
   &generic_vector_cost,	/* vector cost */
@@ -561,6 +566,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI,	/* fusible_ops */
   &generic_vector_cost,	/* vector cost */
@@ -584,6 +590,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -607,6 +614,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -630,6 +638,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
   true,		/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   true,		/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   &generic_vector_cost,	/* vector cost */
@@ -653,6 +662,7 @@ static const struct riscv_tune_param tt_ascalon_d8_tune_info = {
   true,		/* vector_unaligned_access */
   true,		/* use_divmod_expansion */
   true,		/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   &generic_vector_cost,	/* vector cost */
@@ -676,6 +686,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   false,	/* vector_unaligned_access */
   false,	/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -699,6 +710,7 @@ static const struct riscv_tune_param mips_p8700_tune_info = {
   false,	/* vector_unaligned_access */
   true,		/* use_divmod_expansion */
   false,	/* overlap_op_by_pieces */
+  true,		/* use_zero_stride_load */
   false,	/* speculative_sched_vsetvl */
   RISCV_FUSE_NOTHING,	/* fusible_ops */
   NULL,		/* vector cost */
@@ -12765,6 +12777,14 @@ riscv_lshift_subword (machine_mode mode ATTRIBUTE_UNUSED, rtx value, rtx shift,
			  gen_lowpart (QImode, shift)));
 }

+/* Return TRUE if we should use the zero stride load, FALSE otherwise.  */
+
+bool
+strided_load_broadcast_p ()
+{
+  return tune_param->use_zero_stride_load;
+}
+
 /* Return TRUE if we should use the divmod expander, FALSE otherwise.  This
    allows the behavior to be tuned for specific implementations as well as
    when optimizing for size.  */

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 2046f04a02c5..c5b23b37f7dc 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1580,8 +1580,22 @@
   "&& 1"
   [(const_int 0)]
   {
-    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
-				   riscv_vector::UNARY_OP, operands);
+    if (!strided_load_broadcast_p ()
+	&& TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
+      {
+	/* For Float16, reinterpret as HImode, broadcast and reinterpret
+	   back.  */
+	poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
+	machine_mode vmodehi
+	  = riscv_vector::get_vector_mode (HImode, nunits).require ();
+	rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode),
+		     lowpart_subreg (HImode, operands[1], HFmode)};
+	riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
+				       riscv_vector::UNARY_OP, ops);
+      }
+    else
+      riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
+				     riscv_vector::UNARY_OP, operands);
     DONE;
   }
   [(set_attr "type" "vector")]
@@ -2171,7 +2185,7 @@
	    }
	}
       else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
-	       && (immediate_operand (operands[3], Pmode)
+	       && (immediate_operand (operands[3], Pmode)
		   || (CONST_POLY_INT_P (operands[3])
		       && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
		       && known_le (rtx_to_poly_int64 (operands[3]), GET_MODE_SIZE (<MODE>mode)))))
@@ -2224,12 +2238,7 @@
   "(register_operand (operands[3], <VEL>mode)
     || CONST_POLY_INT_P (operands[3]))
    && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
-  [(set (match_dup 0)
-	(if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4)
-	     (match_dup 5) (match_dup 6) (match_dup 7)
-	     (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
-	  (vec_duplicate:V_VLSI (match_dup 3))
-	  (match_dup 2)))]
+  [(const_int 0)]
   {
     gcc_assert (can_create_pseudo_p ());
     if (CONST_POLY_INT_P (operands[3]))
@@ -2238,12 +2247,6 @@
	emit_move_insn (tmp, operands[3]);
	operands[3] = tmp;
       }
-    rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
-				GET_MODE_ALIGNMENT (<VEL>mode));
-    m = validize_mem (m);
-    emit_move_insn (m, operands[3]);
-    m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0)));
-    operands[3] = m;

     /* For SEW = 64 in RV32 system, we expand vmv.s.x:
	  andi a2,a2,1
@@ -2254,6 +2257,35 @@
     operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
     operands[1] = CONSTM1_RTX (<VM>mode);
   }
+
+  /* If the target doesn't want a strided-load broadcast we go with a regular
+     V1DImode load and a broadcast gather.  */
+  if (strided_load_broadcast_p ())
+    {
+      rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
+				    GET_MODE_ALIGNMENT (<VEL>mode));
+      mem = validize_mem (mem);
+      emit_move_insn (mem, operands[3]);
+      mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
+
+      emit_insn
+	(gen_pred_broadcast<mode>
+	 (operands[0], operands[1], operands[2], mem,
+	  operands[4], operands[5], operands[6], operands[7]));
+    }
+  else
+    {
+      rtx tmp = gen_reg_rtx (V1DImode);
+      emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
+					   <VEL>mode));
+      tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
+
+      emit_insn
+	(gen_pred_gather<mode>_scalar
+	 (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
+	  operands[4], operands[5], operands[6], operands[7]));
+    }
+  DONE;
  }
  [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
   (set_attr "mode" "<MODE>")])
@@ -2293,9 +2325,9 @@
	     (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
	  (vec_duplicate:V_VLSF_ZVFHMIN
-	    (match_operand:<VEL> 3 "direct_broadcast_operand" "Wdm, Wdm, Wdm, Wdm"))
+	    (match_operand:<VEL> 3 "direct_broadcast_operand" "  A,  A,  A,  A"))
	  (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " vu,  0, vu,  0")))]
-  "TARGET_VECTOR"
+  "TARGET_VECTOR && strided_load_broadcast_p ()"
   "@
    vlse<sew>.v\t%0,%3,zero,%1.t
    vlse<sew>.v\t%0,%3,zero,%1.t
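For reference, the control flow the patch adds boils down to one predicate
over the selected tuning.  A minimal standalone sketch (the function and
field names mirror the patch; the struct here is reduced to the single new
field and everything else is invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    /* Reduced stand-in for riscv_tune_param: only the field this patch
       adds.  */
    struct riscv_tune_param_sketch
    {
      bool use_zero_stride_load;
    };

    /* All in-tree tunings currently enable the idiom.  */
    static const struct riscv_tune_param_sketch generic_tune = { true };
    static const struct riscv_tune_param_sketch *tune_param = &generic_tune;

    /* Mirrors strided_load_broadcast_p () in riscv.cc.  */
    static bool
    strided_load_broadcast_p (void)
    {
      return tune_param->use_zero_stride_load;
    }

    int
    main (void)
    {
      /* Expansion code asks this before emitting a zero-stride vlse
         broadcast; when it returns false, the V1DI + vrgather fallback
         is used instead.  */
      printf ("use zero-stride broadcast: %d\n", strided_load_broadcast_p ());
      return 0;
    }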