http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53056

             Bug #: 53056
           Summary: bad code generated for ARM NEON with vector types in
                    structs
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: drw...@yahoo.com


Consider the following snippet:

// 16-lane vector of int, declared with the GCC vector_size extension.
typedef int vi16 __attribute__((vector_size(16*sizeof(int))));

// Returns the element-wise sum of two vi16 vectors passed by value.
vi16 add(vi16 a, vi16 b) {
  return a + b;
}

Compile with arm-linux-gnueabi-gcc-4.7 -O2 -march=armv7-a -mhard-float
-mfpu=neon

Some fairly good code is produced:

        sub     sp, sp, #8
        str     r4, [sp, #-4]!
        fstmfdd sp!, {d8, d9, d10, d11, d12, d13}
        add     ip, sp, #52
        add     r4, sp, #116
        vldmia  r4, {d24-d31}
        stmia   ip, {r2, r3}
        vldmia  ip, {d6-d13}
        vadd.i32        q8, q3, q12
        vadd.i32        q9, q4, q13
        vadd.i32        q10, q5, q14
        vadd.i32        q11, q6, q15
        vstmia  r0, {d16-d23}
        fldmfdd sp!, {d8, d9, d10, d11, d12, d13}
        ldmfd   sp!, {r4}
        add     sp, sp, #8
        bx      lr

However, when the vector is embedded in a struct, the code generation becomes
awful.

// Wrapper struct whose only member is a single vi16 vector.
typedef struct A {
  vi16 v;
} A;

// Returns the element-wise sum of the vector members of two A structs
// passed by value.
vi16 add1(A a, A b) {
  return a.v + b.v;
}

// Same code as add1
// Takes the addresses of both vector members, but the pointers are never
// used: the sum is still computed from a.v and b.v directly.
// NOTE(review): add3 returns *av + *bv; that form may have been intended
// here as well -- confirm against the original report.
vi16 add2(A a, A b) {
  vi16* av = &a.v;
  vi16* bv = &b.v;
  return a.v + b.v;
}

Both add1 and add2 produce the same code:

add1:
        @ args = 128, pretend = 8, frame = 128
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        sub     sp, sp, #8
        sub     sp, sp, #128
        add     r1, sp, #128
        stmia   sp, {r2, r3}
        stmia   r1, {r2, r3}
        ldr     r3, [sp, #192]
        ldr     r2, [r1, #8]
        str     r3, [sp, #64]
        ldr     r3, [sp, #196]
        str     r2, [sp, #8]
        ldr     r2, [r1, #12]
        str     r3, [sp, #68]
        ldr     r3, [sp, #200]
        ... lots and lots of load and store instructions.

But adding an "optimization barrier" to add2 produces similar code to the
original example, by making the compiler "forget" the origin of the pointer.

// Variant of add2 that passes both pointers through an empty asm statement
// and then computes the sum by dereferencing them.
vi16 add3(A a, A b) {
  vi16* av = &a.v;
  vi16* bv = &b.v;
  // Empty asm body; av and bv are listed as read-write ("+r") register
  // operands.
  asm("" : "+r"(av), "+r"(bv));  // causes good code to be generated.
  return *av + *bv;
}

Reply via email to