https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106694
Bug ID: 106694 Summary: Redundant move instructions in ARM SVE intrinsics use cases Product: gcc Version: rust/master Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- Here is the example: https://godbolt.org/z/4zPK7j8vT The code: #include "arm_sve.h" int coalesce (svbool_t pg, int64_t* base, int n, int32_t *in1, int64_t *in2, int64_t*out) { svint64x4_t result = svld4_s64 (pg, base); svint64_t v0 = svget4_s64(result, 0); svint64_t v1 = svget4_s64(result, 1); svint64_t v2 = svget4_s64(result, 2); svint64_t v3 = svget4_s64(result, 3); for (int i = 0; i < n; i += 1) { svint64_t v18 = svld1_s64(pg, in1); svint64_t v19 = svld1_s64(pg, in2); v0 = svmad_s64_z(pg, v0, v18, v19); v1 = svmad_s64_z(pg, v1, v18, v19); v2 = svmad_s64_z(pg, v2, v18, v19); v3 = svmad_s64_z(pg, v3, v18, v19); } svst1_s64(pg, out+0,v0); svst1_s64(pg, out+1,v1); svst1_s64(pg, out+2,v2); svst1_s64(pg, out+3,v3); } Assembly: coalesce: ld4d {z24.d - z27.d}, p0/z, [x0] mov z5.d, z24.d mov z4.d, z25.d mov z3.d, z26.d mov z2.d, z27.d cmp w1, 0 ble .L2 mov w0, 0 ld1d z1.d, p0/z, [x2] ld1d z0.d, p0/z, [x3] .L3: add w0, w0, 1 movprfx z5.d, p0/z, z5.d mad z5.d, p0/m, z1.d, z0.d movprfx z4.d, p0/z, z4.d mad z4.d, p0/m, z1.d, z0.d movprfx z3.d, p0/z, z3.d mad z3.d, p0/m, z1.d, z0.d movprfx z2.d, p0/z, z2.d mad z2.d, p0/m, z1.d, z0.d cmp w1, w0 bne .L3 .L2: add x3, x4, 8 add x2, x4, 16 add x1, x4, 24 st1d z5.d, p0, [x4] st1d z4.d, p0, [x3] st1d z3.d, p0, [x2] st1d z2.d, p0, [x1] ret The four vector "mov" instructions that follow the ld4d (copying z24-z27 into z5-z2) are redundant. I think the issue is that GCC doesn't have register coalescing like LLVM does. The subreg cannot propagate across basic blocks. Can someone implement register coalescing or subreg forwarding across basic blocks? Thanks.