https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120428
Bug ID: 120428 Summary: [15/16 regression] Suboptimal autovec involving blocked permutation and std::copy Product: gcc Version: 15.1.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: shawn at shawnxu dot org Target Milestone: --- On x86-64 with avx512, PR115444 caused the following code to vectorize sub-optimally: template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize> void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) { constexpr std::size_t TotalSize = N * sizeof(T); static_assert(TotalSize % (BlockSize * OrderSize) == 0, "ChunkSize * OrderSize must perfectly divide TotalSize"); constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize; std::array<std::byte, ProcessChunkSize> buffer{}; std::byte* const bytes = reinterpret_cast<std::byte*>(data); for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize) { std::byte* const values = &bytes[i]; for (std::size_t j = 0; j < OrderSize; j++) { auto* const buffer_chunk = &buffer[j * BlockSize]; auto* const value_chunk = &values[order[j] * BlockSize]; std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk); } std::copy(std::begin(buffer), std::end(buffer), values); } } void permute_weights(std::int16_t (&biases)[4096]) { static constexpr std::array<std::size_t, 8> order{0, 2, 4, 6, 1, 3, 5, 7}; permute<16>(biases, order); } * Before PR115444: $ ../gcc-before/bin/g++ -S -O3 -mavx512f permute.cpp $ cat permute.s .file "permute.cpp" .text #APP .globl _ZSt21ios_base_library_initv #NO_APP .p2align 4 .globl _Z15permute_weightsRA4096_s .type _Z15permute_weightsRA4096_s, @function _Z15permute_weightsRA4096_s: .LFB2070: .cfi_startproc leaq 16(%rdi), %rax leaq 8208(%rdi), %rdx .p2align 4 .p2align 3 .L2: vmovdqu 48(%rax), %xmm4 vmovdqu 80(%rax), %xmm3 subq $-128, %rax vmovdqu -128(%rax), %xmm2 vmovdqu -96(%rax), %xmm1 vmovdqu -64(%rax), %xmm0 vmovdqu -112(%rax), %xmm5 vmovdqu %xmm3, -96(%rax) vmovdqu 
%xmm4, -112(%rax) vmovdqu %xmm5, -128(%rax) vmovdqu %xmm2, -80(%rax) vmovdqu %xmm1, -64(%rax) vmovdqu %xmm0, -48(%rax) cmpq %rdx, %rax jne .L2 ret .cfi_endproc .LFE2070: .size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s .ident "GCC: (GNU) 15.0.0 20241016 (experimental)" .section .note.GNU-stack,"",@progbits * After PR115444: $ ../gcc-after/bin/g++ -S -O3 -mavx512f permute.cpp $ cat permute.s .file "permute.cpp" .text #APP .globl _ZSt21ios_base_library_initv #NO_APP .p2align 4 .globl _Z15permute_weightsRA4096_s .type _Z15permute_weightsRA4096_s, @function _Z15permute_weightsRA4096_s: .LFB2059: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rdi, %rax leaq 8192(%rdi), %rdx movq %rsp, %rbp .cfi_def_cfa_register 6 andq $-64, %rsp subq $8, %rsp .p2align 4 .p2align 3 .L2: vmovdqu (%rax), %xmm0 subq $-128, %rax vmovdqa %xmm0, -120(%rsp) vmovdqu -96(%rax), %xmm0 vmovdqa %xmm0, -104(%rsp) vmovdqu -64(%rax), %xmm0 vmovdqa %xmm0, -88(%rsp) vmovdqu -32(%rax), %xmm0 vmovdqa %xmm0, -72(%rsp) vmovdqu -112(%rax), %xmm0 vmovdqa %xmm0, -56(%rsp) vmovdqu -80(%rax), %xmm0 vmovdqa %xmm0, -40(%rsp) vmovdqu -48(%rax), %xmm0 vmovdqa %xmm0, -24(%rsp) vmovdqu -16(%rax), %xmm0 vmovdqa %xmm0, -8(%rsp) vmovdqa64 -120(%rsp), %zmm0 vmovdqu64 %zmm0, -128(%rax) vmovdqa64 -56(%rsp), %zmm0 vmovdqu64 %zmm0, -64(%rax) cmpq %rdx, %rax jne .L2 vzeroupper leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE2059: .size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s .ident "GCC: (GNU) 15.0.0 20241016 (experimental)" .section .note.GNU-stack,"",@progbits Example assembly generation: https://godbolt.org/z/q1hjxajdo No regression observed when replacing std::copy with std::memcpy: https://godbolt.org/z/Kq5ae7ePo Benchmarking on a slightly different (larger array, aligned storage) variant shows 50% slowdown with the single register version: https://pastebin.com/bKrAPFWj