> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 6753b01db59..866aaf1e8a0 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -1580,8 +1580,27 @@ (define_insn_and_split "*vec_duplicate<mode>"
>    "&& 1"
>    [(const_int 0)]
>    {
> -    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
> -                                   riscv_vector::UNARY_OP, operands);
> +    if (!strided_load_broadcast_p ()
> +       && TARGET_ZVFHMIN && !TARGET_ZVFH && <VEL>mode == HFmode)
> +      {
> +       /* For Float16, load, convert to float, then broadcast and
> +          truncate.  */
> +       rtx tmpsf = gen_reg_rtx (SFmode);
> +       emit_insn (gen_extendhfsf2 (tmpsf, operands[1]));
> +       poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);

This could instead be an HF -> HI bitcast, then an HI pred_broadcast,
then a bitcast back to an HF vector again. That would let us avoid
introducing the trunc here, and I would prefer to improve this since
the RVA23 profile only mandates Zvfhmin, not Zvfh.
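
Something along these lines (an untested sketch reusing the helpers
already used in this patch; I have not checked the operand details):

    /* Sketch only: reinterpret the HFmode scalar as HImode, broadcast
       it via the integer pattern, then view the result as an HF vector
       again, so no float conversion is needed.  */
    rtx tmphi = force_reg (HImode,
                           lowpart_subreg (HImode, operands[1], HFmode));
    poly_uint64 nunits = GET_MODE_NUNITS (<MODE>mode);
    machine_mode vmodehi
      = riscv_vector::get_vector_mode (HImode, nunits).require ();
    rtx tmp = gen_reg_rtx (vmodehi);
    rtx ops[] = {tmp, tmphi};
    riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodehi),
                                   riscv_vector::UNARY_OP, ops);
    emit_move_insn (operands[0],
                    lowpart_subreg (<MODE>mode, tmp, vmodehi));

That would keep the shape of the SFmode fallback but stay entirely in
integer modes.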

> +       machine_mode vmodesf
> +         = riscv_vector::get_vector_mode (SFmode, nunits).require ();
> +       rtx tmp = gen_reg_rtx (vmodesf);
> +       rtx ops[] =  {tmp, tmpsf};
> +       riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (vmodesf),
> +                                      riscv_vector::UNARY_OP, ops);
> +       rtx ops2[] = {operands[0], tmp};
> +       riscv_vector::emit_vlmax_insn (code_for_pred_trunc (vmodesf),
> +                                      riscv_vector::UNARY_OP_FRM_DYN, ops2);
> +      }
> +    else
> +      riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
> +                                    riscv_vector::UNARY_OP, operands);
>      DONE;
>    }
>    [(set_attr "type" "vector")]
> @@ -2171,7 +2190,7 @@ (define_expand "@pred_broadcast<mode>"
>         }
>      }
>    else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
> -           && (immediate_operand (operands[3], Pmode)
> +          && (immediate_operand (operands[3], Pmode)
>                || (CONST_POLY_INT_P (operands[3])
>                    && known_ge (rtx_to_poly_int64 (operands[3]), 0U)
>                    && known_le (rtx_to_poly_int64 (operands[3]), 
> GET_MODE_SIZE (<MODE>mode)))))
> @@ -2224,12 +2243,7 @@ (define_insn_and_split "*pred_broadcast<mode>"
>    "(register_operand (operands[3], <VEL>mode)
>    || CONST_POLY_INT_P (operands[3]))
>    && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)"
> -  [(set (match_dup 0)
> -       (if_then_else:V_VLSI (unspec:<VM> [(match_dup 1) (match_dup 4)
> -            (match_dup 5) (match_dup 6) (match_dup 7)
> -            (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> -         (vec_duplicate:V_VLSI (match_dup 3))
> -         (match_dup 2)))]
> +  [(const_int 0)]
>    {
>      gcc_assert (can_create_pseudo_p ());
>      if (CONST_POLY_INT_P (operands[3]))
> @@ -2238,12 +2252,6 @@ (define_insn_and_split "*pred_broadcast<mode>"
>         emit_move_insn (tmp, operands[3]);
>         operands[3] = tmp;
>        }
> -    rtx m = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
> -                               GET_MODE_ALIGNMENT (<VEL>mode));
> -    m = validize_mem (m);
> -    emit_move_insn (m, operands[3]);
> -    m = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (m, 0)));
> -    operands[3] = m;
>
>      /* For SEW = 64 in RV32 system, we expand vmv.s.x:
>         andi a2,a2,1
> @@ -2254,6 +2262,35 @@ (define_insn_and_split "*pred_broadcast<mode>"
>         operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
>         operands[1] = CONSTM1_RTX (<VM>mode);
>        }
> +
> +    /* If the target doesn't want a strided-load broadcast we go with a regular
> +       V1DImode load and a broadcast gather.  */
> +    if (strided_load_broadcast_p ())
> +      {
> +       rtx mem = assign_stack_local (<VEL>mode, GET_MODE_SIZE (<VEL>mode),
> +                                     GET_MODE_ALIGNMENT (<VEL>mode));
> +       mem = validize_mem (mem);
> +       emit_move_insn (mem, operands[3]);
> +       mem = gen_rtx_MEM (<VEL>mode, force_reg (Pmode, XEXP (mem, 0)));
> +
> +       emit_insn
> +         (gen_pred_broadcast<mode>
> +          (operands[0], operands[1], operands[2], mem,
> +           operands[4], operands[5], operands[6], operands[7]));
> +      }
> +    else
> +      {
> +       rtx tmp = gen_reg_rtx (V1DImode);
> +       emit_move_insn (tmp, lowpart_subreg (V1DImode, operands[3],
> +                                            <VEL>mode));
> +       tmp = lowpart_subreg (<MODE>mode, tmp, V1DImode);
> +
> +       emit_insn
> +         (gen_pred_gather<mode>_scalar
> +          (operands[0], operands[1], operands[2], tmp, CONST0_RTX (Pmode),
> +           operands[4], operands[5], operands[6], operands[7]));
> +      }
> +    DONE;
>    }
>    [(set_attr "type" "vimov,vimov,vlds,vlds,vlds,vlds,vimovxv,vimovxv")
>     (set_attr "mode" "<MODE>")])
> @@ -2293,9 +2330,9 @@ (define_insn "*pred_broadcast<mode>_zvfhmin"
>              (reg:SI VL_REGNUM)
>              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>           (vec_duplicate:V_VLSF_ZVFHMIN
> -           (match_operand:<VEL>        3 "direct_broadcast_operand"      "Wdm, Wdm, Wdm, Wdm"))
> +           (match_operand:<VEL>        3 "direct_broadcast_operand"      "  A,   A,   A,   A"))
>           (match_operand:V_VLSF_ZVFHMIN 2 "vector_merge_operand"          " vu,   0,  vu,   0")))]
> -  "TARGET_VECTOR"
> +  "TARGET_VECTOR && strided_load_broadcast_p ()"
>    "@
>     vlse<sew>.v\t%0,%3,zero,%1.t
>     vlse<sew>.v\t%0,%3,zero,%1.t
> --
> 2.50.0