https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65965

            Bug ID: 65965
           Summary: Straight-line memcpy/memset not vectorized when
                    equivalent loop is
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: alalaw01 at gcc dot gnu.org
  Target Milestone: ---

Testcase:
void
test(int *__restrict__ a, int *__restrict__ b)
{
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[5] = 0;
  a[6] = 0;
  a[7] = 0;
  a[8] = 0;
}
produces (at -O3) on AArch64:
test:
        ldp     w4, w3, [x1]
        ldp     w2, w1, [x1, 8]
        stp     w4, w3, [x0]
        stp     w2, w1, [x0, 8]
        stp     wzr, wzr, [x0, 20]
        stp     wzr, wzr, [x0, 28]
        ret
or on x86_64/-mavx:
test:
.LFB0:
        movl    (%rsi), %eax
        movl    $0, 20(%rdi)
        movl    $0, 24(%rdi)
        movl    $0, 28(%rdi)
        movl    $0, 32(%rdi)
        movl    %eax, (%rdi)
        movl    4(%rsi), %eax
        movl    %eax, 4(%rdi)
        movl    8(%rsi), %eax
        movl    %eax, 8(%rdi)
        movl    12(%rsi), %eax
        movl    %eax, 12(%rdi)
        ret
(-fdump-tree-vect produces no dump for this testcase)

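In contrast, for the following testcase (note that it zeroes a[4]..a[7], whereas
the straight-line version above zeroes a[5]..a[8]):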
void
test(int *__restrict__ a, int *__restrict__ b)
{
  for (int i = 0; i < 4; i++) a[i] = b[i];
  for (int i = 0; i < 4; i++) a[i+4] = 0;
}
the memcpy is recognized by ldist, and the 'memset' by slp1 (neither of which
triggers on the first case), producing (superior) code on AArch64:
test:
        movi    v0.4s, 0
        ldp     x2, x3, [x1]
        stp     x2, x3, [x0]
        str     q0, [x0, 16]
        ret
or x86_64:
test:
.LFB0:
        movq    (%rsi), %rax
        movq    8(%rsi), %rdx
        vpxor   %xmm0, %xmm0, %xmm0
        movq    %rax, (%rdi)
        movq    %rdx, 8(%rdi)
        vmovups %xmm0, 16(%rdi)
        ret

Reply via email to