LGTM :)
On Thu, Jul 10, 2025 at 6:00 PM Robin Dapp <rdapp....@gmail.com> wrote: > > Hi, > > Changes from v1: > - Use Himode broadcast instead of float broadcast, saving two conversion > insns. > > Let's be daring and leave the thorough testing to the CI first while my own > testing is in progress :) > > This patch makes the zero-stride load broadcast idiom dependent on a > uarch-tunable "use_zero_stride_load". Right now we have quite a few > paths that reach a strided load and some of them are not exactly > straightforward. > > While broadcast is relatively rare on rv64 targets it is more common on > rv32 targets that want to vectorize 64-bit elements. > > While the patch is more involved than I would have liked it could have > even touched more places. The whole broadcast-like insn path feels a > bit hackish due to the several optimizations we employ. Some of the > complications stem from the fact that we lump together real broadcasts, > vector single-element sets, and strided broadcasts. The strided-load > alternatives currently require a memory_constraint to work properly > which causes more complications when trying to disable just these. > > In short, the whole pred_broadcast handling in combination with the > sew64_scalar_helper could use work in the future. I was about to start > with it in this patch but soon realized that it would only distract from > the original intent. What can help in the future is split strided and > non-strided broadcast entirely, as well as the single-element sets. > > Yet unclear is whether we need to pay special attention for misaligned > strided loads (PR120782). > > I regtested on rv32 and rv64 with strided_load_broadcast_p forced to > true and false. With either I didn't observe any new execution failures > but obviously there are new scan failures with strided broadcast turned > off. > > Regards > Robin > > PR target/118734 > > gcc/ChangeLog: > > * config/riscv/constraints.md (Wdm): Use tunable for Wdm > constraint. > * config/riscv/riscv-protos.h (emit_avltype_insn): Declare. > (can_be_broadcasted_p): Rename to... > (can_be_broadcast_p): ...this. > * config/riscv/predicates.md: Use renamed function. > (strided_load_broadcast_p): Declare. > * config/riscv/riscv-selftests.cc (run_broadcast_selftests): > Only run broadcast selftest if strided broadcasts are OK. > * config/riscv/riscv-v.cc (emit_avltype_insn): New function. > (sew64_scalar_helper): Only emit a pred_broadcast if the new > tunable says so. > (can_be_broadcasted_p): Rename to... > (can_be_broadcast_p): ...this and use new tunable. > * config/riscv/riscv.cc (struct riscv_tune_param): Add strided > broad tunable. > (strided_load_broadcast_p): Implement. > * config/riscv/vector.md: Use strided_load_broadcast_p () and > work around 64-bit broadcast on rv32 targets. > --- > gcc/config/riscv/constraints.md | 7 +-- > gcc/config/riscv/predicates.md | 2 +- > gcc/config/riscv/riscv-protos.h | 4 +- > gcc/config/riscv/riscv-selftests.cc | 10 +++-- > gcc/config/riscv/riscv-v.cc | 58 +++++++++++++++++++++---- > gcc/config/riscv/riscv.cc | 20 +++++++++ > gcc/config/riscv/vector.md | 66 +++++++++++++++++++++-------- > 7 files changed, 133 insertions(+), 34 deletions(-) > > diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md > index ccab1a2e29d..5ecaa19eb01 100644 > --- a/gcc/config/riscv/constraints.md > +++ b/gcc/config/riscv/constraints.md > @@ -237,10 +237,11 @@ (define_constraint "Wb1" > (and (match_code "const_vector") > (match_test "rtx_equal_p (op, riscv_vector::gen_scalar_move_mask > (GET_MODE (op)))"))) > > -(define_memory_constraint "Wdm" > +(define_constraint "Wdm" > "Vector duplicate memory operand" > - (and (match_code "mem") > - (match_code "reg" "0"))) > + (and (match_test "strided_load_broadcast_p ()") > + (and (match_code "mem") > + (match_code "reg" "0")))) > > ;; Vendor ISA extension constraints. > > diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md > index 8baad2fae7a..1f9a6b562e5 100644 > --- a/gcc/config/riscv/predicates.md > +++ b/gcc/config/riscv/predicates.md > @@ -617,7 +617,7 @@ (define_special_predicate "vector_any_register_operand" > > ;; The scalar operand can be directly broadcast by RVV instructions. > (define_predicate "direct_broadcast_operand" > - (match_test "riscv_vector::can_be_broadcasted_p (op)")) > + (match_test "riscv_vector::can_be_broadcast_p (op)")) > > ;; A CONST_INT operand that has exactly two bits cleared. > (define_predicate "const_nottwobits_operand" > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index 38f63ea8424..a41c4c299fa 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -604,6 +604,7 @@ void emit_vlmax_vsetvl (machine_mode, rtx); > void emit_hard_vlmax_vsetvl (machine_mode, rtx); > void emit_vlmax_insn (unsigned, unsigned, rtx *); > void emit_nonvlmax_insn (unsigned, unsigned, rtx *, rtx); > +void emit_avltype_insn (unsigned, unsigned, rtx *, avl_type, rtx = nullptr); > void emit_vlmax_insn_lra (unsigned, unsigned, rtx *, rtx); > enum vlmul_type get_vlmul (machine_mode); > rtx get_vlmax_rtx (machine_mode); > @@ -760,7 +761,7 @@ uint8_t get_sew (rtx_insn *); > enum vlmul_type get_vlmul (rtx_insn *); > int count_regno_occurrences (rtx_insn *, unsigned int); > bool imm_avl_p (machine_mode); > -bool can_be_broadcasted_p (rtx); > +bool can_be_broadcast_p (rtx); > bool gather_scatter_valid_offset_p (machine_mode); > HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); > bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); > @@ -813,6 +814,7 @@ extern const char *th_output_move (rtx, rtx); > extern bool th_print_operand_address (FILE *, machine_mode, rtx); > #endif > > +extern bool strided_load_broadcast_p (void); > extern bool riscv_use_divmod_expander (void); > void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); > extern bool > diff --git a/gcc/config/riscv/riscv-selftests.cc > b/gcc/config/riscv/riscv-selftests.cc > index 34d01ac76b7..9ca1ffee394 100644 > --- a/gcc/config/riscv/riscv-selftests.cc > +++ b/gcc/config/riscv/riscv-selftests.cc > @@ -342,9 +342,13 @@ run_broadcast_selftests (void) > expand_vector_broadcast (mode, mem); > \ > insn = get_last_insn (); > \ > src = SET_SRC (PATTERN (insn)); > \ > - ASSERT_TRUE (MEM_P (XEXP (src, 0))); > \ > - ASSERT_TRUE ( > \ > - rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0)))); > \ > + if (strided_load_broadcast_p ()) > \ > + { > \ > + ASSERT_TRUE (MEM_P (XEXP (src, 0))); > \ > + ASSERT_TRUE ( > \ > + rtx_equal_p (src, > \ > + gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0)))); > \ > + } > \ > end_sequence (); > \ > /* Test vmv.v.x or vfmv.v.f. */ > \ > start_sequence (); > \ > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index 22d194909cf..0e7adcd08f5 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -437,6 +437,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, > rtx *ops, rtx vl) > e.emit_insn ((enum insn_code) icode, ops); > } > > +/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE. For a > + non-VLMAX insn, the length must be specified in VL. */ > + > +void > +emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops, > + avl_type type, rtx vl) > +{ > + if (type != avl_type::VLMAX && vl != NULL_RTX) > + { > + insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false); > + e.set_vl (vl); > + e.emit_insn ((enum insn_code) icode, ops); > + } > + else > + { > + insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true); > + e.emit_insn ((enum insn_code) icode, ops); > + } > +} > + > /* Return true if the vector duplicated by a super element which is the > fusion > of consecutive elements. > > @@ -2144,21 +2164,40 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, > rtx vl, > return false; > } > > + bool avoid_strided_broadcast = false; > if (CONST_INT_P (*scalar_op)) > { > if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode))) > - *scalar_op = force_const_mem (scalar_mode, *scalar_op); > + { > + if (strided_load_broadcast_p ()) > + *scalar_op = force_const_mem (scalar_mode, *scalar_op); > + else > + avoid_strided_broadcast = true; > + } > else > *scalar_op = force_reg (scalar_mode, *scalar_op); > } > > rtx tmp = gen_reg_rtx (vector_mode); > - rtx ops[] = {tmp, *scalar_op}; > - if (type == VLMAX) > - emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops); > + if (!avoid_strided_broadcast) > + { > + rtx ops[] = {tmp, *scalar_op}; > + emit_avltype_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, > ops, > + type, vl); > + } > else > - emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops, > - vl); > + { > + /* Load scalar as V1DI and broadcast via vrgather.vi. */ > + rtx tmp1 = gen_reg_rtx (V1DImode); > + emit_move_insn (tmp1, lowpart_subreg (V1DImode, *scalar_op, > + scalar_mode)); > + tmp1 = lowpart_subreg (vector_mode, tmp1, V1DImode); > + > + rtx ops[] = {tmp, tmp1, CONST0_RTX (Pmode)}; > + emit_vlmax_insn (code_for_pred_gather_scalar (vector_mode), > + BINARY_OP, ops); > + } > + > emit_vector_func (operands, tmp); > > return true; > @@ -5768,9 +5807,9 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int > regno) > return count; > } > > -/* Return true if the OP can be directly broadcasted. */ > +/* Return true if the OP can be directly broadcast. */ > bool > -can_be_broadcasted_p (rtx op) > +can_be_broadcast_p (rtx op) > { > machine_mode mode = GET_MODE (op); > /* We don't allow RA (register allocation) reload generate > @@ -5782,7 +5821,8 @@ can_be_broadcasted_p (rtx op) > return false; > > if (satisfies_constraint_K (op) || register_operand (op, mode) > - || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX > (mode))) > + || (strided_load_broadcast_p () && satisfies_constraint_Wdm (op)) > + || rtx_equal_p (op, CONST0_RTX (mode))) > return true; > > return can_create_pseudo_p () && nonmemory_operand (op, mode); > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > index 58bb1dbe8c4..7a903dba855 100644 > --- a/gcc/config/riscv/riscv.cc > +++ b/gcc/config/riscv/riscv.cc > @@ -302,6 +302,7 @@ struct riscv_tune_param > bool vector_unaligned_access; > bool use_divmod_expansion; > bool overlap_op_by_pieces; > + bool use_zero_stride_load; > bool speculative_sched_vsetvl; > unsigned int fusible_ops; > const struct cpu_vector_cost *vec_costs; > @@ -465,6 +466,7 @@ static const struct riscv_tune_param generic_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -488,6 +490,7 @@ static const struct riscv_tune_param rocket_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -511,6 +514,7 @@ static const struct riscv_tune_param sifive_7_tune_info = > { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -534,6 +538,7 @@ static const struct riscv_tune_param > sifive_p400_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */ > &generic_vector_cost, /* vector cost */ > @@ -557,6 +562,7 @@ static const struct riscv_tune_param > sifive_p600_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */ > &generic_vector_cost, /* vector cost */ > @@ -580,6 +586,7 @@ static const struct riscv_tune_param thead_c906_tune_info > = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -603,6 +610,7 @@ static const struct riscv_tune_param > xiangshan_nanhu_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH, /* fusible_ops */ > NULL, /* vector cost */ > @@ -626,6 +634,7 @@ static const struct riscv_tune_param > generic_ooo_tune_info = { > true, /* > vector_unaligned_access */ > false, /* use_divmod_expansion */ > true, /* > overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > &generic_vector_cost, /* vector cost */ > @@ -649,6 +658,7 @@ static const struct riscv_tune_param > tt_ascalon_d8_tune_info = { > true, /* > vector_unaligned_access */ > true, /* > use_divmod_expansion */ > true, /* > overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > &generic_vector_cost, /* vector cost */ > @@ -672,6 +682,7 @@ static const struct riscv_tune_param > optimize_size_tune_info = { > false, /* vector_unaligned_access */ > false, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -695,6 +706,7 @@ static const struct riscv_tune_param mips_p8700_tune_info > = { > false, /* vector_unaligned_access */ > true, /* use_divmod_expansion */ > false, /* overlap_op_by_pieces */ > + true, /* > use_zero_stride_load */ > false, /* speculative_sched_vsetvl */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > NULL, /* vector cost */ > @@ -12400,6 +12412,14 @@ riscv_lshift_subword (machine_mode mode > ATTRIBUTE_UNUSED, rtx value, rtx shift, > gen_lowpart (QImode, > shift))); > } > > +/* Return TRUE if we should use the zero stride load, FALSE otherwise. */ > + > +bool > +strided_load_broadcast_p () > +{ > + return tune_param->use_zero_stride_load; > +} > + > /* Return TRUE if we should use the divmod expander, FALSE otherwise. This > allows the behavior to be tuned for specific implementations as well as > when optimizing for size. */ > diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md > index 6753b01db59..292e40a0200 100644 > --- a/gcc/config/riscv/vector.md > +++ b/gcc/config/riscv/vector.md > @@ -1580,8 +1580,22 @@ (define_insn_and_split "*vec_duplicate<mode>" > "&& 1" > [(const_int 0)] > { > - riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), > - riscv_vector::UNARY_OP, operands); > + if (!strided_load_broadcast_p () > + && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode) > + { > + /* For Float16, reinterpret as HImode, broadcast and reinterpret > + back. */ > + poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode); > + machine_mode vmodehi > + = riscv_vector::get_vector_mode (HImode, nunits).require (); > + rtx ops[] = {lowpart_subreg (vmodehi, operands[0], <MODE>mode), > + lowpart_subreg (HImode, operands[1], HFmode)}; > + riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi), > + riscv_vector::UNARY_OP, ops); > + } > + else > + riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode), > + riscv_vector::UNARY_OP, operands); > DONE; > } > [(set_attr "type" "vector")] > @@ -2171,7 +2185,7 @@ (define_expand "@pred_broadcast<mode>" > } > } > else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode) > - && (immediate_operand (operands[3], Pmode) > + && (immediate_operand (operands[3], Pmode) > || (CONST_POLY_INT_P (operands[3]) > && known_ge (rtx_to_poly_int64 (operands[3]), 0U) > && known_le (rtx_to_poly_int64 (operands[3]), > GET_MODE_SIZE (<MODE>mode))))) > @@ -2224,12 +2238,7 @@ (define_insn_and_split "*pred_broadcast<mode>" > "(register_operand (operands[3], <VEL>mode) > || CONST_POLY_INT_P (operands[3])) > && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)" > - [(set (match_dup 0) > - (if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4) > - (match_dup 5) (match_dup 6) (match_dup 7) > - (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > - (vec_duplicate:V_VLSI (match_dup 3)) > - (match_dup 2)))] > + [(const_int 0)] > { > gcc_assert (can_create_pseudo_p ()); > if (CONST_POLY_INT_P (operands[3])) > @@ -2238,12 +2247,6 @@ (define_insn_and_split "*pred_broadcast<mode>" > emit_move_insn (tmp, operands[3]); > operands[3] = tmp; > } > - rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode), > - GET_MODE_ALIGNMENT (<VEL>mode)); > - m = validize_mem (m); > - emit_move_insn (m, operands[3]); > - m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0))); > - operands[3] = m; > > /* For SEW = 64 in RV32 system, we expand vmv.s.x: > andi a2,a2,1 > @@ -2254,6 +2257,35 @@ (define_insn_and_split "*pred_broadcast<mode>" > operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]); > operands[1] = CONSTM1_RTX (<VM>mode); > } > + > + /* If the target doesn't want a strided-load broadcast we go with a > regular > + V1DImode load and a broadcast gather. */ > + if (strided_load_broadcast_p ()) > + { > + rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode), > + GET_MODE_ALIGNMENT (<VEL>mode)); > + mem = validize_mem (mem); > + emit_move_insn (mem, operands[3]); > + mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0))); > + > + emit_insn > + (gen_pred_broadcast<mode> > + (operands[0], operands[1], operands[2], mem, > + operands[4], operands[5], operands[6], operands[7])); > + } > + else > + { > + rtx tmp = gen_reg_rtx (V1DImode); > + emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3], > + <VEL>mode)); > + tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode); > + > + emit_insn > + (gen_pred_gather<mode>_scalar > + (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode), > + operands[4], operands[5], operands[6], operands[7])); > + } > + DONE; > } > [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv") > (set_attr "mode" "<MODE>")]) > @@ -2293,9 +2325,9 @@ (define_insn "*pred_broadcast<mode>_zvfhmin" > (reg:SI VL_REGNUM) > (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > (vec_duplicate:V_VLSF_ZVFHMIN > - (match_operand:<VEL> 3 "direct_broadcast_operand" > "Wdm, Wdm, Wdm, Wdm")) > + (match_operand:<VEL> 3 "direct_broadcast_operand" " > A, A, A, A")) > (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand" " > vu, 0, vu, 0")))] > - "TARGET_VECTOR" > + "TARGET_VECTOR && strided_load_broadcast_p ()" > "@ > vlse<sew>.v\t%0,%3,zero,%1.t > vlse<sew>.v\t%0,%3,zero,%1.t > -- > 2.50.0 > >