On Tue, Sep 29, 2020 at 12:55 PM 夏 晋 via Gcc <[email protected]> wrote:
>
> Hi everyone,
> I tried to set the "vlen" after the addition and multiplication, as shown in
> the following code:
> ➜
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
> vf32 add = x3 + x4;
> vf32 mul = x3 * x4;
> __builtin_riscv_vlen(vlen); //<----
> storevf(&output[0], add);
> storevf(&output[4], mul);
> }
> but after compilation, the "vlen" is reordered:
> ➜
> foo1:
> lui a5,%hi(.LANCHOR0)
> addi a5,a5,%lo(.LANCHOR0)
> addi a4,a5,64
> vfld v0,a5
> vfld v1,a4
> csrw vlen,a2 //<----
> vfadd v2,v0,v1
> addi a5,a1,8
> vfmul v0,v0,v1
> vfst v2,a1
> vfst v0,a5
> ret
> I've also tried adding barrier code, as shown in the following two variants:
> ➜
> #define barrier() __asm__ __volatile__("": : :"memory")
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
> vf32 add = x3 + x4;
> vf32 mul = x3 * x4;
> barrier();
> __builtin_riscv_vlen(vlen);
> barrier();
> storevf(&output[0], add);
> storevf(&output[4], mul);
> }
> ➜
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
> vf32 add = x3 + x4;
> vf32 mul = x3 * x4;
> __asm__ __volatile__ ("csrw\tvlen,%0" : : "rJ"(vlen) : "memory");
> storevf(&output[0], add);
> storevf(&output[4], mul);
> }
> Both methods compiled to the same incorrect assembly.
> =======
> But if I try code like the following (the add and the multiply use different operands):
> ➜
> vf32 x1,x2;
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
> vf32 add = x3 + x4;
> vf32 mul = x1 * x2;
> __builtin_riscv_vlen(vlen);
> storevf(&output[0], add);
> storevf(&output[4], mul);
> }
> the assembly will be right:
> ➜
> foo1:
> lui a5,%hi(.LANCHOR0)
> addi a5,a5,%lo(.LANCHOR0)
> addi a0,a5,64
> addi a3,a5,128
> addi a4,a5,192
> vfld v1,a5
> vfld v3,a0
> vfld v0,a3
> vfld v2,a4
> vfadd v1,v1,v3
> vfmul v0,v0,v2
> csrw vlen,a2 <----
> addi a5,a1,8
> vfst v1,a1
> vfst v0,a5
> ret
>
> Is there another way to write the code, or a GCC compilation option, to deal
> with this issue?
> Any suggestion would be appreciated. Thank you very much!
You need to present GCC with a data dependence that prevents the re-ordering,
for example by adding inputs/outputs for add/mul like
asm volatile ("csrw\tvlen, %4" : "=r" (add), "=r" (mul) : "0" (add),
"1" (mul), "rJ" (vlen));
Richard.
> Best,
> Jin