https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101366
Bug ID: 101366
Summary: x86 memset codegen for constant sized is suboptimal
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: kutdanila at yandex dot ru
Target Milestone: ---

https://gcc.godbolt.org/z/hP99MYMEG

void Test(char* dst) {
  __m128i pattern = _mm_set1_epi8(dst[-1]);
  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16 * i), pattern);
  }
}

vs

void TestStd(char* s) {
  memset(s, s[-1], 64);
}

-O3 -msse4.2

Test(char*):
  movzbl -1(%rdi), %eax
  pxor %xmm1, %xmm1
  movd %eax, %xmm0
  pshufb %xmm1, %xmm0
  movups %xmm0, (%rdi)
  movups %xmm0, 16(%rdi)
  movups %xmm0, 32(%rdi)
  movups %xmm0, 48(%rdi)
  ret

TestStd(char*):
  movabsq $72340172838076673, %rdx
  movzbl -1(%rdi), %eax
  movq %rax, %rcx
  imulq %rdx, %rcx
  mulq %rdx
  addq %rcx, %rdx
  movq %rax, (%rdi)
  movq %rdx, 8(%rdi)
  movq %rax, 16(%rdi)
  movq %rdx, 24(%rdi)
  movq %rax, 32(%rdi)
  movq %rdx, 40(%rdi)
  movq %rax, 48(%rdi)
  movq %rdx, 56(%rdi)
  ret