https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120683

H.J. Lu <hjl.tools at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2025-06-16
     Ever confirmed|0                           |1
             Status|UNCONFIRMED                 |NEW
            Summary|vector_loop generates       |vector_loop generates
                   |horrible prologue and       |horrible prologue and
                   |epilogue on memset          |epilogue on memset/memcpy

--- Comment #1 from H.J. Lu <hjl.tools at gmail dot com> ---
memcpy isn't much better:

---
#include <sys/types.h>

/* Minimal reproducer: forwards its three arguments unchanged to
   __builtin_memcpy.  LEN is a function parameter, so the copy size is
   not a compile-time constant here.  */
void
foo (void *dest, void *src, size_t len)
{
  __builtin_memcpy (dest, src, len);
}
---


-O2 -mmemcpy-strategy=vector_loop:256:noalign,libcall:-1:noalign
-minline-all-stringops

generates:

# foo (dest, src, len) -- inline expansion of __builtin_memcpy.
# Syntax: AT&T (source operand first, destination last).
# Register roles on entry, assuming the x86-64 SysV argument order for the
# C prototype shown earlier in this report:
#   %rdi = dest, %rsi = src, %rdx = len
# Control flow:
#   len >= 64 : .L13/.L3 copy 64 bytes per iteration through four XMM
#               registers, then jump to .L2 for the leftover bytes.
#   .L2/.L5   : copy the remaining (len & 63) bytes one byte at a time;
#               for len < 64 this byte loop performs the whole copy.
foo:
.LFB6:
        .cfi_startproc
        movq    %rdi, %r8               # r8  = dest (base for the byte loop)
        movq    %rsi, %rax              # rax = src  (base for the byte loop)
        cmpq    $64, %rdx
        jnb     .L13                    # unsigned len >= 64: take the 64-byte loop
# Tail entry.  Here r8/rax point at the first byte not yet copied, either the
# original dest/src or the addresses advanced past the 64-byte chunks.
.L2:
        andl    $63, %edx               # rdx = len & 63 (32-bit write zero-extends)
        je      .L1                     # no leftover bytes: done
        xorl    %ecx, %ecx              # rcx = byte index i = 0
# Byte loop: one load and one store per byte, i = 0 .. rdx-1 (at most 63 bytes).
.L5:
        movzbl  (%rax,%rcx), %esi       # esi = src_tail[i] (src pointer in rsi is dead here)
        movb    %sil, (%r8,%rcx)        # dest_tail[i] = that byte
        addq    $1, %rcx
        cmpq    %rdx, %rcx
        jb      .L5                     # unsigned i < leftover count: keep going
.L1:
        ret
# The two directives below pad so that .L13 is aligned: to 16 bytes when that
# needs at most 10 bytes of padding, otherwise to 8 bytes.
        .p2align 4,,10
        .p2align 3
# 64-byte loop setup, reached only when len >= 64.
.L13:
        movq    %rdx, %rcx
        xorl    %eax, %eax              # rax = byte offset into both buffers = 0
        andq    $-64, %rcx              # rcx = len rounded down to a multiple of 64
# Each iteration does four 16-byte unaligned loads from src+rax, then four
# 16-byte unaligned stores to dest+rax; all loads are issued before any store.
.L3:
        movdqu  (%rsi,%rax), %xmm3
        movdqu  16(%rsi,%rax), %xmm2
        movdqu  32(%rsi,%rax), %xmm1
        movdqu  48(%rsi,%rax), %xmm0
        movups  %xmm3, (%rdi,%rax)
        movups  %xmm2, 16(%rdi,%rax)
        movups  %xmm1, 32(%rdi,%rax)
        movups  %xmm0, 48(%rdi,%rax)
        addq    $64, %rax
        cmpq    %rcx, %rax
        jb      .L3                     # unsigned offset < rounded length: next chunk
# Loop exit: rax holds the number of bytes copied so far.  Rebuild the two
# tail pointers that .L2 expects in r8 and rax, then handle the leftover.
        leaq    (%rdi,%rax), %r8        # r8  = dest + bytes copied
        addq    %rsi, %rax              # rax = src  + bytes copied
        jmp     .L2
        .cfi_endproc
.LFE6:
        .size   foo, .-foo

Reply via email to