https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111888
Bug ID: 111888 Summary: RISC-V: Horrible redundant number vsetvl instructions in vectorized codes Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- https://godbolt.org/z/9G5MMa3Tq void foo (int32_t *__restrict a, int32_t *__restrict b, int32_t *__restrict c, int32_t *__restrict a2, int32_t *__restrict b2, int32_t *__restrict c2, int32_t *__restrict a3, int32_t *__restrict b3, int32_t *__restrict c3, int32_t *__restrict a4, int32_t *__restrict b4, int32_t *__restrict c4, int32_t *__restrict a5, int32_t *__restrict b5, int32_t *__restrict c5, int32_t *__restrict d, int32_t *__restrict d2, int32_t *__restrict d3, int32_t *__restrict d4, int32_t *__restrict d5, int n) { for (int i = 0; i < n; i++) { a[i] = b[i] + c[i]; b5[i] = b[i] + c[i]; a2[i] = b2[i] + c2[i]; a3[i] = b3[i] + c3[i]; a4[i] = b4[i] + c4[i]; a5[i] = a[i] + a4[i]; d2[i] = a2[i] + c2[i]; d3[i] = a3[i] + c3[i]; d4[i] = a4[i] + c4[i]; d5[i] = a[i] + a4[i]; a[i] = a5[i] + b5[i] + a[i]; c2[i] = a[i] + c[i]; c3[i] = b5[i] * a5[i]; c4[i] = a2[i] * a3[i]; c5[i] = b5[i] * a2[i]; c[i] = a[i] + c3[i]; c2[i] = a[i] + c4[i]; a5[i] = a[i] + a4[i]; a[i] = a[i] + b5[i] + a[i] * a2[i] * a3[i] * a4[i] * a5[i] * c[i] * c2[i] * c3[i] * c4[i] * c5[i] * d[i] * d2[i] * d3[i] * d4[i] * d5[i]; } } Loop body: vsetvli t1,t4,e8,mf4,ta,ma vle32.v v1,0(a1) vle32.v v4,0(a2) vle32.v v2,0(s10) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v4,v4,v1 vsetvli zero,t4,e32,m1,ta,ma vle32.v v7,0(s9) vle32.v v1,0(a4) vse32.v v4,0(t0) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v2,v7,v2 vsetvli zero,t4,e32,m1,ta,ma vse32.v v2,0(t5) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v5,v2,v4 vsetvli zero,t4,e32,m1,ta,ma vse32.v v5,0(s3) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v3,v5,v4 vsetvli zero,t4,e32,m1,ta,ma vle32.v v9,0(a5) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v3,v3,v4 vsetvli zero,t4,e32,m1,ta,ma vle32.v v6,0(a7) vsetvli 
t3,zero,e32,m1,ta,ma vadd.vv v1,v9,v1 vsetvli zero,t4,e32,m1,ta,ma vle32.v v8,0(s8) vse32.v v1,0(a3) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v6,v8,v6 vsetvli zero,t4,e32,m1,ta,ma vse32.v v6,0(a6) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v11,v5,v4 vsetvli zero,t4,e32,m1,ta,ma vse32.v v11,0(s4) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v13,v11,v3 vsetvli zero,t4,e32,m1,ta,ma vse32.v v13,0(s6) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v10,v6,v1 vsetvli zero,t4,e32,m1,ta,ma vse32.v v10,0(s5) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v12,v1,v4 vsetvli zero,t4,e32,m1,ta,ma vse32.v v12,0(t2) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v9,v1,v9 vsetvli zero,t4,e32,m1,ta,ma vse32.v v9,0(s0) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v8,v6,v8 vsetvli zero,t4,e32,m1,ta,ma vse32.v v8,0(s1) vsetvli t3,zero,e32,m1,ta,ma vadd.vv v7,v2,v7 vsetvli zero,t4,e32,m1,ta,ma vse32.v v7,0(s2) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v1,v3,v1 vmul.vv v1,v1,v6 vadd.vv v6,v10,v3 vmul.vv v1,v1,v2 vadd.vv v2,v3,v2 vmul.vv v1,v1,v2 vmul.vv v1,v1,v13 vsetvli zero,t1,e32,m1,ta,ma vse32.v v6,0(s7) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v1,v1,v6 vsetvli zero,t1,e32,m1,ta,ma vse32.v v2,0(t6) vsetvli t3,zero,e32,m1,ta,ma vmul.vv v1,v1,v11 vsetvli zero,t1,e32,m1,ta,ma vle32.v v2,0(s11) vsetvli t3,zero,e32,m1,ta,ma slli t3,t1,2 vmul.vv v1,v1,v10 vadd.vv v3,v3,v4 vmul.vv v1,v1,v12 sub t4,t4,t1 vmul.vv v1,v1,v2 vmul.vv v1,v1,v9 vmul.vv v1,v1,v8 vmul.vv v1,v1,v7 vmadd.vv v5,v1,v3 vsetvli zero,t1,e32,m1,ta,ma vse32.v v5,0(a0) There is a great deal of redundant AVL toggling here. Ideally, there should be only a single vsetvl instruction in the loop header; all the other vsetvls should be elided. This has been a known issue for a long time, and I will be working on it soon, based on the refactored VSETVL pass.