https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120683

            Bug ID: 120683
           Summary: vector_loop generates horrible prologue and epilogue
                    on memset
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hjl.tools at gmail dot com
                CC: hubicka at ucw dot cz, liuhongt at gcc dot gnu.org,
                    ubizjak at gmail dot com
  Target Milestone: ---
            Target: x86-64

For

---
#include <sys/types.h>

/* Reproducer: zero-fill LEN bytes starting at P1 through the memset
   builtin.  LEN is not a compile-time constant, so the expansion has to
   handle an arbitrary length at run time.  */
void
foo (void *p1, size_t len)
{
  __builtin_memset(p1, 0, len);
}
---

-O2 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign
-minline-all-stringops

generates horrible prologue and epilogue:

# void foo (void *p1, size_t len) -- compiler output for the reproducer.
# In:    %rdi = p1, %rsi = len (SysV AMD64 argument registers)
# Shape: .L18/.L3 clear 64 bytes per iteration with four 16-byte stores;
#        .L2/.L5 clear the remaining (len & 63) bytes with 1-byte stores.
foo:
.LFB6:
        .cfi_startproc
        movq    %rdi, %rax              # rax = p1: tail start when the 64-byte loop is skipped
        pxor    %xmm0, %xmm0            # xmm0 = 16 zero bytes, the value stored by the vector loop
        cmpq    $64, %rsi
        jnb     .L18                    # len >= 64 (unsigned): run the 64-byte loop first
# Tail: rax = first byte not yet cleared, rsi = len.
.L2:
        andl    $63, %esi               # rsi = len & 63 = bytes left (32-bit write zero-extends)
        je      .L1                     # nothing left
        xorl    %edx, %edx              # rdx = byte offset into the tail
        testb   $1, %sil
        je      .L5                     # even count: go straight to the 2-bytes-per-pass loop
        movl    $1, %edx                # odd count: clear one byte here so the rest is even
        movb    $0, (%rax)
        cmpq    %rsi, %rdx
        jnb     .L19                    # the count was exactly 1: done
# Byte loop: two 1-byte stores per iteration until rdx reaches rsi.
.L5:
        movb    $0, (%rax,%rdx)
        movb    $0, 1(%rax,%rdx)
        addq    $2, %rdx
        cmpq    %rsi, %rdx
        jb      .L5
.L1:
        ret
        .p2align 4,,10
        .p2align 3
# Main loop setup: rdx = len rounded down to a multiple of 64, rax = offset 0.
.L18:
        movq    %rsi, %rdx
        xorl    %eax, %eax
        andq    $-64, %rdx
# Main loop: 4 x 16-byte unaligned stores = 64 bytes per iteration.
.L3:
        movups  %xmm0, (%rdi,%rax)
        movups  %xmm0, 16(%rdi,%rax)
        movups  %xmm0, 32(%rdi,%rax)
        movups  %xmm0, 48(%rdi,%rax)
        addq    $64, %rax
        cmpq    %rdx, %rax
        jb      .L3
        addq    %rdi, %rax              # rax = p1 + bytes cleared = start of the tail
        jmp     .L2                     # clear the remaining len & 63 bytes
.L19:
        ret                             # return for the single-trailing-byte case
        .cfi_endproc

It uses a loop of 1-byte stores to clear an 8-byte memory region.

Reply via email to