https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111313
Bug ID: 111313
Summary: RISC-V: Incorrect code gen for 2 level loop
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: pan2.li at intel dot com
Target Milestone: ---
Created attachment 55846
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=55846&action=edit
Reproduce code
Given the example code below:
#define K 32
signed short in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
signed short coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));

__attribute__ ((noinline)) void
test ()
{
  for (int j = 0; j < K; j++)
    {
      for (int i = 0; i < 2*K; i++)
        in[i][j] = i+j;
      for (int i = 0; i < K; i++)
        coeff[i][j] = i + 2;
    }
}
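For reference, a minimal check routine could look like the following. It is not
part of the attached reproducer (the helper name and main driver are made up
here); it only spells out the values the two loops are expected to produce, so
a miscompiled build would trip the abort.

/* Hypothetical checker, not from the attachment: after test() runs,
   every element should satisfy in[i][j] == i + j and coeff[i][j] == i + 2.  */
__attribute__ ((noinline)) void
check ()
{
  for (int j = 0; j < K; j++)
    {
      for (int i = 0; i < 2 * K; i++)
        if (in[i][j] != i + j)
          __builtin_abort ();
      for (int i = 0; i < K; i++)
        if (coeff[i][j] != i + 2)
          __builtin_abort ();
    }
}

int
main ()
{
  test ();
  check ();
  return 0;
}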
When compiled with options similar to "-march=rv64imafdcv -mabi=lp64d
-mcmodel=medlow -fdiagnostics-plain-output -flto -ffat-lto-objects --param
riscv-autovec-preference=scalable -Wno-psabi -ftree-vectorize
-fno-tree-loop-distribute-patterns -fno-vect-cost-model -fno-common
-fdump-tree-vect-details", the generated assembly is:
init_in:
        lui t1,%hi(coeff)
        lui a7,%hi(in)
        csrr a0,vlenb
        addi t1,t1,%lo(coeff)
        addi a7,a7,%lo(in)
        srli a0,a0,2
        li a6,0
        li t3,32
        vsetvli a1,zero,e16,mf2,ta,ma
        vid.v v3
        vsll.vi v3,v3,6
.L2:
        mv a2,a7
        li a4,64
        vmv.v.x v4,a6    <= this insn executes under e16 the first time, but
                            under e32 when the loop branches back to .L2 (the
                            last vsetvli before the back edge selects e32,m1)
        vsetvli zero,zero,e32,m1,ta,ma
        vid.v v2
.L3:
        vsetvli zero,zero,e16,mf2,ta,ma
        vmv1r.v v1,v2
        vncvt.x.x.w v1,v1
        vsetvli a5,a4,e8,mf4,ta,ma
        vsetvli a3,zero,e16,mf2,ta,ma
        sub a4,a4,a5
        vadd.vv v1,v1,v4
        vsetvli zero,a5,e16,mf2,ta,ma
        slli a5,a5,6
        vsuxei16.v v1,(a2),v3
        vsetvli a1,zero,e32,m1,ta,ma
        add a2,a2,a5
        vmv.v.x v1,a0
        vadd.vv v2,v2,v1
        bne a4,zero,.L3
        mv a2,t1
        li a4,32
        vid.v v2
.L4:
        vsetvli zero,zero,e16,mf2,ta,ma
        vmv1r.v v1,v2
        vncvt.x.x.w v1,v1
        vsetvli a5,a4,e8,mf4,ta,ma
        vsetvli a3,zero,e16,mf2,ta,ma
        sub a4,a4,a5
        vadd.vi v1,v1,2
        vsetvli zero,a5,e16,mf2,ta,ma
        slli a5,a5,6
        vsuxei16.v v1,(a2),v3
        vsetvli a1,zero,e32,m1,ta,ma
        add a2,a2,a5
        vmv.v.x v1,a0
        vadd.vv v2,v2,v1
        bne a4,zero,.L4
        addiw a6,a6,1
        addi t1,t1,2
        addi a7,a7,2
        bne a6,t3,.L2
        ret