https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106694

            Bug ID: 106694
           Summary: Redundant move instructions in ARM SVE intrinsics use
                    cases
           Product: gcc
           Version: rust/master
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Here is the example: https://godbolt.org/z/4zPK7j8vT

The code:
#include "arm_sve.h"

/* Reproducer for the redundant-move report.  Loads a four-vector tuple with
   svld4_s64, extracts each element with svget4_s64, passes every element
   through svmad_s64_z once per loop iteration (i = 0 .. n-1), and finally
   stores the four results with svst1_s64.

   NOTE(review): the function is declared to return int but contains no
   return statement.
   NOTE(review): in1 is declared int32_t * yet is passed to svld1_s64 exactly
   like the int64_t * in2 -- confirm the intended pointer type.  */
int coalesce (svbool_t pg, int64_t* base, int n, int32_t *in1, int64_t *in2,
int64_t*out)
{
  /* One tuple load, then one svget4_s64 per tuple element (indices 0-3).  */
  svint64x4_t result = svld4_s64 (pg, base);
  svint64_t v0 = svget4_s64(result, 0);
  svint64_t v1 = svget4_s64(result, 1);
  svint64_t v2 = svget4_s64(result, 2);
  svint64_t v3 = svget4_s64(result, 3);

  for (int i = 0; i < n; i += 1)
    {
        /* Neither load depends on i, and in1/in2 are not modified in the
           loop body.  */
        svint64_t v18 = svld1_s64(pg, in1);
        svint64_t v19 = svld1_s64(pg, in2);
        /* Each of v0..v3 is both an input to and the result of its own
           svmad_s64_z call, so the values are carried across iterations.  */
        v0 = svmad_s64_z(pg, v0, v18, v19);
        v1 = svmad_s64_z(pg, v1, v18, v19);
        v2 = svmad_s64_z(pg, v2, v18, v19);
        v3 = svmad_s64_z(pg, v3, v18, v19);
    }
  /* Store the four results at out, out+1, out+2 and out+3 (offsets in
     int64_t elements).  */
  svst1_s64(pg, out+0,v0);
  svst1_s64(pg, out+1,v1);
  svst1_s64(pg, out+2,v2);
  svst1_s64(pg, out+3,v3);
}

Assembly:
coalesce:
        // Load the four-vector tuple into z24-z27.
        ld4d    {z24.d - z27.d}, p0/z, [x0]
        // Copy each tuple element into its own register (z5, z4, z3, z2).
        // These four moves are the ones the report calls redundant.
        mov     z5.d, z24.d
        mov     z4.d, z25.d
        mov     z3.d, z26.d
        mov     z2.d, z27.d
        // Skip the loop entirely when w1 <= 0.
        cmp     w1, 0
        ble     .L2
        mov     w0, 0
        // Both ld1d loads are issued once, before the loop label.
        ld1d    z1.d, p0/z, [x2]
        ld1d    z0.d, p0/z, [x3]
.L3:
        add     w0, w0, 1
        // One movprfx/mad pair per register z5, z4, z3, z2.
        movprfx z5.d, p0/z, z5.d
        mad     z5.d, p0/m, z1.d, z0.d
        movprfx z4.d, p0/z, z4.d
        mad     z4.d, p0/m, z1.d, z0.d
        movprfx z3.d, p0/z, z3.d
        mad     z3.d, p0/m, z1.d, z0.d
        movprfx z2.d, p0/z, z2.d
        mad     z2.d, p0/m, z1.d, z0.d
        // Loop until the counter in w0 equals w1.
        cmp     w1, w0
        bne     .L3
.L2:
        // Store z5, z4, z3, z2 at x4, x4+8, x4+16 and x4+24.
        add     x3, x4, 8
        add     x2, x4, 16
        add     x1, x4, 24
        st1d    z5.d, p0, [x4]
        st1d    z4.d, p0, [x3]
        st1d    z3.d, p0, [x2]
        st1d    z2.d, p0, [x1]
        ret

The "mov" instructions are redundant. I think the issue is that GCC does not
perform register coalescing the way LLVM does.

The subreg cannot be propagated across basic blocks.

Can someone implement register coalescing or subreg forwarding across basic
blocks?

Thanks.

Reply via email to