Re: Vectorised copy

Michael Hope Sun, 04 Sep 2011 18:33:32 -0700

On Sat, Sep 3, 2011 at 4:54 AM, Ulrich Weigand
<ulrich.weig...@de.ibm.com> wrote:
> Michael Hope <michael.h...@linaro.org> wrote:
>
>> int *a;
>> int *b;
>> int *c;
>>
>> const int ad[320];
>> const int bd[320];
>> const int cd[320];
>>
>> void fill()
>> {
>>   for (int i = 0; i < 320; i++)
>>     {
>>       a[i] = ad[i];
>>       b[i] = bd[i];
>>       c[i] = cd[i];
>>     }
>> }
> [snip]
>> Can we always use the second form?  What optimisation is preventing it?
>
> Without having looked into this in detail, my guess would be
> it depends on whether the compiler is able to prove that the
> memory pointed to by a, b, and c is distinct (instead of having
> a potential overlap if those are pointers into the same array).
>
> Does it help if you make a, b, and c function arguments to fill,
> and mark them restrict?


Yes, I had a go with that originally.  Here are the variants:

(1) - local source, local destination:

/* Destination arrays, defined in this snippet.  */
int a[320];
int b[320];
int c[320];

/* Source arrays: const-qualified and defined here without an initialiser.  */
const int ad[320];
const int bd[320];
const int cd[320];

/* Copy all 320 elements of ad, bd and cd into a, b and c respectively,
   with the three copies interleaved in a single loop.  */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

gives the best code:

fill:
        push    {r4, r5, r6}
        ldr     r6, .L5
        ldr     r5, .L5+4
        ldr     r4, .L5+8
        sub     r3, r6, #1280
        ldr     r0, .L5+12
        ldr     r1, .L5+16
        ldr     r2, .L5+20
.L2:
        vldmia  r0!, {d16-d17}
        vldmia  r5!, {d18-d19}
        vstmia  r4!, {d18-d19}
        vstmia  r1!, {d16-d17}
        vldmia  r2!, {d16-d17}
        vstmia  r3!, {d16-d17}
        cmp     r3, r6
        bne     .L2
        pop     {r4, r5, r6}
        bx      lr

(2) - extern destination, local source (compiled with
-fno-section-anchors to make the generated code more readable):

/* Destination arrays are only declared here (extern); their definitions
   are not part of this snippet.  */
extern int a[320];
extern int b[320];
extern int c[320];

/* Source arrays: const-qualified and defined here without an initialiser.  */
const int ad[320];
const int bd[320];
const int cd[320];

/* Copy all 320 elements of ad, bd and cd into the extern arrays a, b and c
   respectively, with the three copies interleaved in a single loop.  */
void fill()
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

fill:
        ldr     r2, .L5
        push    {r4, r5, r6, r7, r8}
        ldr     r0, .L5+4
        mov     r3, r2
        add     r8, r2, #1280
        ldr     r7, .L5+8
        ldr     r6, .L5+12
        rsb     ip, r3, r0
        ldr     r1, .L5+16
        ldr     r2, .L5+20
        subs    r7, r7, r3
        subs    r6, r6, r3
.L2:
        add     r5, ip, r3
        adds    r4, r7, r3
        vldmia  r2!, {d16-d17}
        vldmia  r1!, {d18-d19}
        adds    r0, r6, r3
        vst1.32 {q9}, [r5]
        vst1.32 {q8}, [r4]
        vldmia  r3, {d16-d17}
        adds    r3, r3, #16
        cmp     r3, r8
        vst1.32 {q8}, [r0]
        bne     .L2
        pop     {r4, r5, r6, r7, r8}
        bx      lr

(3) destination as arguments, restrict:

/* Copy 320 ints from each of the file-scope arrays ad, bd and cd (not
   declared in this snippet) into the buffers a, b and c.  Each destination
   pointer is __restrict-qualified.  Indices 0..319 of every buffer are
   written, so each must have room for at least 320 ints.  */
void fill3(int * __restrict a, int * __restrict b, int * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      a[i] = ad[i];
      b[i] = bd[i];
      c[i] = cd[i];
    }
}

fill3:
        push    {r4, r5, r6, r7, r8}
        ldr     r6, .L23
        ldr     r5, .L23+4
        ldr     r4, .L23+8
        mov     r3, r6
        subs    r0, r0, r3
        add     r6, r6, #1280
        subs    r1, r1, r3
        subs    r2, r2, r3
.L21:
        add     r8, r3, r0
        add     ip, r3, r1
        vldmia  r4!, {d16-d17}
        vldmia  r5!, {d18-d19}
        adds    r7, r3, r2
        vst1.32 {q9}, [r8]
        vst1.32 {q8}, [ip]
        vldmia  r3, {d16-d17}
        adds    r3, r3, #16
        cmp     r3, r6
        vst1.32 {q8}, [r7]
        bne     .L21
        pop     {r4, r5, r6, r7, r8}
        bx      lr

(4) destination as aligned structs:

/* A block of 320 ints; the struct type carries
   __attribute__((aligned(128))).  */
struct blob
{
  int v[320];
} __attribute__((aligned(128)));

/* Copy 320 ints from each of the file-scope arrays ad, bd and cd (not
   declared in this snippet) into the v members of *a, *b and *c.  Each
   destination is passed as a __restrict-qualified pointer to struct blob.  */
void fill(struct blob * __restrict a, struct blob * __restrict b,
struct blob * __restrict c)
{
  for (int i = 0; i < 320; i++)
    {
      a->v[i] = ad[i];
      b->v[i] = bd[i];
      c->v[i] = cd[i];
    }
}

fill:
        push    {r4, r5, r6}
        add     r6, r2, #1280
        ldr     r3, .L5
        ldr     r4, .L5+4
        ldr     r5, .L5+8
.L2:
        vldmia  r3!, {d16-d17}
        vstmia  r0!, {d16-d17}
        vldmia  r4!, {d16-d17}
        vstmia  r1!, {d16-d17}
        vldmia  r5!, {d16-d17}
        vstmia  r2!, {d16-d17}
        cmp     r2, r6
        bne     .L2
        pop     {r4, r5, r6}
        bx      lr

Version (3) seems to rejigger the destination pointers.  I assume this
is a side effect of not knowing whether the target is aligned?

-- Michael

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

Re: Vectorised copy

Reply via email to