On 2025/9/4 12:37, Nicholas Piggin wrote:
> On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
>> From: Chao Liu <[email protected]>
>>
>> This commit improves QEMU's performance when emulating strided vector
>> loads and stores by replacing the helper function call with the
>> generation of equivalent TCG operations.
>>
>> PS:
>>
>> An implementation is permitted to raise an illegal-instruction exception
>> if vstart is nonzero and set to a value that cannot be produced
>> implicitly by the implementation, but memory accesses will generally
>> always need to deal with page faults.
>>
>> So, if a strided vector memory access instruction has a non-zero vstart,
>> handle it through the vlse/vsse helper functions.
> 
> [...]
> 
>>  typedef void gen_helper_ldst_stride(TCGv_ptr, TCGv_ptr, TCGv,
>>                                      TCGv, TCGv_env, TCGv_i32);
>>  
>>  static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
>>                                uint32_t data, gen_helper_ldst_stride *fn,
>> -                              DisasContext *s)
>> +                              DisasContext *s, bool is_load)
>>  {
>> -    TCGv_ptr dest, mask;
>> -    TCGv base, stride;
>> -    TCGv_i32 desc;
>> +    if (!s->vstart_eq_zero) {
>> +        TCGv_ptr dest, mask;
>> +        TCGv base, stride;
>> +        TCGv_i32 desc;
>>  
>> -    dest = tcg_temp_new_ptr();
>> -    mask = tcg_temp_new_ptr();
>> -    base = get_gpr(s, rs1, EXT_NONE);
>> -    stride = get_gpr(s, rs2, EXT_NONE);
>> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
>> -                                      s->cfg_ptr->vlenb, data));
>> +        dest = tcg_temp_new_ptr();
>> +        mask = tcg_temp_new_ptr();
>> +        base = get_gpr(s, rs1, EXT_NONE);
>> +        stride = get_gpr(s, rs2, EXT_NONE);
>> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
>> +                                          s->cfg_ptr->vlenb, data));
>>  
>> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
>> -    tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
>> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
>> +        tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
>> +        mark_vs_dirty(s);
>> +        fn(dest, mask, base, stride, tcg_env, desc);
>> +        return true;
> 
> Most of the lines changed here should just be indenting the
> existing code into the 'if' branch. So maybe to split the patch
> up a little and make less churn, you could do patch 1 that moves
> this code into a function like gen_call_helper_ldst_stride().
> 
> Then after patch 2 it would be
> 
>     if (!s->vstart_eq_zero) {
>         /* vstart != 0 helper slowpath */
>         gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, s, is_load);
>         return true;
>     }
> 
>     [...]
> 

Good idea. I've given it a try; splitting gen_call_helper_ldst_stride() out
into a separate patch also makes review much easier.

I'll include this change in the next version of the patch.
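
Concretely, I'm thinking of something like the following (a rough sketch
against the current patch; the name follows your suggestion and the details
may still change):

    /* vstart != 0 slow path: generate a call to the existing helper */
    static void gen_call_helper_ldst_stride(uint32_t vd, uint32_t rs1,
                                            uint32_t rs2, uint32_t data,
                                            gen_helper_ldst_stride *fn,
                                            DisasContext *s)
    {
        TCGv_ptr dest = tcg_temp_new_ptr();
        TCGv_ptr mask = tcg_temp_new_ptr();
        TCGv base = get_gpr(s, rs1, EXT_NONE);
        TCGv stride = get_gpr(s, rs2, EXT_NONE);
        TCGv_i32 desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
                                                   s->cfg_ptr->vlenb, data));

        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
        tcg_gen_addi_ptr(mask, tcg_env, vreg_ofs(s, 0));
        mark_vs_dirty(s);
        fn(dest, mask, base, stride, tcg_env, desc);
    }

so that ldst_stride_trans() becomes the dispatch you describe:

    static bool ldst_stride_trans(uint32_t vd, uint32_t rs1, uint32_t rs2,
                                  uint32_t data, gen_helper_ldst_stride *fn,
                                  DisasContext *s, bool is_load)
    {
        if (!s->vstart_eq_zero) {
            /* vstart != 0 helper slowpath */
            gen_call_helper_ldst_stride(vd, rs1, rs2, data, fn, s);
            return true;
        }

        /* inline TCG fastpath for vstart == 0 follows */
        ...
    }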

>> +    }
>> +
>> +    TCGv dest = tcg_temp_new();
>> +
>> +    uint32_t nf = FIELD_EX32(data, VDATA, NF);
>> +    uint32_t vm = FIELD_EX32(data, VDATA, VM);
>> +
>> +    /* Destination register and mask register */
>> +    tcg_gen_addi_tl(dest, (TCGv)tcg_env, vreg_ofs(s, vd));
>> +
>> +    /*
>> +     * Select the appropriate load/tore to retrieve data from the vector
>                                       ^^^^ store
> 

Got it, will fix.

> [...]
> 
>> @@ -899,7 +1165,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>  {
>>      uint32_t data = 0;
>>      gen_helper_ldst_stride *fn;
>> -    static gen_helper_ldst_stride * const fns[4] = {
>> +    static gen_helper_ldst_stride *const fns[4] = {
>>          gen_helper_vlse8_v, gen_helper_vlse16_v,
>>          gen_helper_vlse32_v, gen_helper_vlse64_v
>>      };
> 
> This probably comes from my patch, just remove the hunk to
> reduce patch size. Ditto for any other stray "cleanups".
> 
>> @@ -915,7 +1181,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>      data = FIELD_DP32(data, VDATA, NF, a->nf);
>>      data = FIELD_DP32(data, VDATA, VTA, s->vta);
>>      data = FIELD_DP32(data, VDATA, VMA, s->vma);
>> -    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s);
>> +    return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
>>  }
>>  
>>  static bool ld_stride_check(DisasContext *s, arg_rnfvm* a, uint8_t eew)
>> @@ -933,23 +1199,13 @@ GEN_VEXT_TRANS(vlse64_v, MO_64, rnfvm, ld_stride_op, ld_stride_check)
>>  static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
>>  {
>>      uint32_t data = 0;
>> -    gen_helper_ldst_stride *fn;
>> -    static gen_helper_ldst_stride * const fns[4] = {
>> -        /* masked stride store */
>> -        gen_helper_vsse8_v,  gen_helper_vsse16_v,
>> -        gen_helper_vsse32_v,  gen_helper_vsse64_v
>> -    };
> 
> I gave you an old patch without the stores done, sorry. You
> just need to pass the store helper fn through here, similarly
> to the loads (i.e., this hunk should just be the one-line
> change that adds the extra 'is_load=false' argument to the
> ldst_stride_trans() call).
> 
> Thanks,
> Nick

Got it, will fix in the next version.
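
So st_stride_op() keeps the existing store helper table and the hunk
reduces to the one-liner, roughly (a sketch; the VDATA setup stays
exactly as in the current code):

    static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
    {
        uint32_t data = 0;
        gen_helper_ldst_stride *fn;
        static gen_helper_ldst_stride * const fns[4] = {
            /* masked stride store */
            gen_helper_vsse8_v,  gen_helper_vsse16_v,
            gen_helper_vsse32_v, gen_helper_vsse64_v
        };

        fn = fns[eew];

        /* VDATA field setup unchanged from the current store path */
        ...

        return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
    }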

