On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
> From: Chao Liu <[email protected]>
>
> This commit improves the performance of QEMU when emulating strided vector
> loads and stores by substituting the call for the helper function with the
> generation of equivalent TCG operations.
>
> PS:
>
> An implementation is permitted to cause an illegal instruction if vstart
> is not 0 and it is set to a value that can not be produced implicitly by
> the implementation, but memory accesses will generally always need to
> deal with page faults.
>
> So, if a strided vector memory access instruction has non-zero vstart,
> check it through vlse/vsse helpers function.
[...]
> typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
> TCGv, TCGv_env, TCGv_i32);
>
> static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
> uint32_t data, gen_helper_ldst_stride *fn,
> - DisasContext *s)
> + DisasContext *s, bool is_load)
> {
> - TCGv_ptr dest, mask;
> - TCGv base, stride;
> - TCGv_i32 desc;
> + if (!s->vstart_eq_zero) {
> + TCGv_ptr dest, mask;
> + TCGv base, stride;
> + TCGv_i32 desc;
>
> - dest = tcg_temp_new_ptr();
> - mask = tcg_temp_new_ptr();
> - base = get_gpr(s, rs1, EXT_NONE);
> - stride = get_gpr(s, rs2, EXT_NONE);
> - desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> - s->cfg_ptr->vlenb, data));
> + dest = tcg_temp_new_ptr();
> + mask = tcg_temp_new_ptr();
> + base = get_gpr(s, rs1, EXT_NONE);
> + stride = get_gpr(s, rs2, EXT_NONE);
> + desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> + s->cfg_ptr->vlenb, data));
>
> - tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> - tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
> + tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> + tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
> + mark_vs_dirty(s);
> + fn(dest, mask, base, stride, tcg_env, desc);
> + return true;
Most of the lines changed here should just be indenting the
existing code into the 'if' branch. So maybe to split the patch
up a little and make less churn, you could do patch 1 that moves
this code into a function like gen_call_helper_ldst_stride().
Then after patch 2 it would be
if (!s->vstart_eq_zero) {
/* vstart != 0 helper slowpath */
gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, is, is_load);
return true;
}
[...]
> + }
> +
> + TCGv dest = tcg_temp_new();
> +
> + uint32_t nf = FIELD_EX32(data, VDATA, NF);
> + uint32_t vm = FIELD_EX32(data, VDATA, VM);
> +
> + /* Destination register and mask register */
> + tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
> +
> + /*
> + * Select the appropriate load/tore to retrieve data from the vector
^^^^ store
[...]
> @@ -899,7 +1165,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a,
> uint8_t eew)
> {
> uint32_t data = 0;
> gen_helper_ldst_stride *fn;
> - static gen_helper_ldst_stride * const fns[4] = {
> + static gen_helper_ldst_stride *const fns[4] = {
> gen_helper_vlse8_v, gen_helper_vlse16_v,
> gen_helper_vlse32_v, gen_helper_vlse64_v
> };
This probably comes from my patch, just remove the hunk to
reduce patch size. Ditto for any other stray "cleanups".
> @@ -915,7 +1181,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a,
> uint8_t eew)
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> data = FIELD_DP32(data, VDATA, VTA, s->vta);
> data = FIELD_DP32(data, VDATA, VMA, s->vma);
> - return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
> + return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
> }
>
> static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
> @@ -933,23 +1199,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op,
> ld_stride_check)
> static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
> {
> uint32_t data = 0;
> - gen_helper_ldst_stride *fn;
> - static gen_helper_ldst_stride * const fns[4] = {
> - /* masked stride store */
> - gen_helper_vsse8_v, gen_helper_vsse16_v,
> - gen_helper_vsse32_v, gen_helper_vsse64_v
> - };
I gave you an old patch without the stores done, sorry. You
just need to pass the store helper fn through here similarly
as for loads (i.e., this hunk should just be the one liner
change to add extra 'is_load=false' argument to the
ldst_stride_trans() call.).
Thanks,
Nick