https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120428
Bug ID: 120428
Summary: [15/16 regression] Suboptimal autovec involving
blocked permutation and std::copy
Product: gcc
Version: 15.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: shawn at shawnxu dot org
Target Milestone: ---
On x86-64 with avx512, PR115444 caused the following code to vectorize
sub-optimally:
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t
OrderSize>
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
constexpr std::size_t TotalSize = N * sizeof(T);
static_assert(TotalSize % (BlockSize * OrderSize) == 0,
"ChunkSize * OrderSize must perfectly divide TotalSize");
constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;
std::array<std::byte, ProcessChunkSize> buffer{};
std::byte* const bytes = reinterpret_cast<std::byte*>(data);
for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
{
std::byte* const values = &bytes[i];
for (std::size_t j = 0; j < OrderSize; j++)
{
auto* const buffer_chunk = &buffer[j * BlockSize];
auto* const value_chunk = &values[order[j] * BlockSize];
std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
}
std::copy(std::begin(buffer), std::end(buffer), values);
}
}
void permute_weights(std::int16_t (&biases)[4096]) {
static constexpr std::array<std::size_t, 8> order{0, 2, 4, 6, 1, 3, 5, 7};
permute<16>(biases, order);
}
* Before PR115444:
$ ../gcc-before/bin/g++ -S -O3 -mavx512f permute.cpp
$ cat permute.s
.file "permute.cpp"
.text
#APP
.globl _ZSt21ios_base_library_initv
#NO_APP
.p2align 4
.globl _Z15permute_weightsRA4096_s
.type _Z15permute_weightsRA4096_s, @function
_Z15permute_weightsRA4096_s:
.LFB2070:
.cfi_startproc
leaq 16(%rdi), %rax
leaq 8208(%rdi), %rdx
.p2align 4
.p2align 3
.L2:
vmovdqu 48(%rax), %xmm4
vmovdqu 80(%rax), %xmm3
subq $-128, %rax
vmovdqu -128(%rax), %xmm2
vmovdqu -96(%rax), %xmm1
vmovdqu -64(%rax), %xmm0
vmovdqu -112(%rax), %xmm5
vmovdqu %xmm3, -96(%rax)
vmovdqu %xmm4, -112(%rax)
vmovdqu %xmm5, -128(%rax)
vmovdqu %xmm2, -80(%rax)
vmovdqu %xmm1, -64(%rax)
vmovdqu %xmm0, -48(%rax)
cmpq %rdx, %rax
jne .L2
ret
.cfi_endproc
.LFE2070:
.size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s
.ident "GCC: (GNU) 15.0.0 20241016 (experimental)"
.section .note.GNU-stack,"",@progbits
* After PR115444:
$ ../gcc-after/bin/g++ -S -O3 -mavx512f permute.cpp
$ cat permute.s
.file "permute.cpp"
.text
#APP
.globl _ZSt21ios_base_library_initv
#NO_APP
.p2align 4
.globl _Z15permute_weightsRA4096_s
.type _Z15permute_weightsRA4096_s, @function
_Z15permute_weightsRA4096_s:
.LFB2059:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rdi, %rax
leaq 8192(%rdi), %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-64, %rsp
subq $8, %rsp
.p2align 4
.p2align 3
.L2:
vmovdqu (%rax), %xmm0
subq $-128, %rax
vmovdqa %xmm0, -120(%rsp)
vmovdqu -96(%rax), %xmm0
vmovdqa %xmm0, -104(%rsp)
vmovdqu -64(%rax), %xmm0
vmovdqa %xmm0, -88(%rsp)
vmovdqu -32(%rax), %xmm0
vmovdqa %xmm0, -72(%rsp)
vmovdqu -112(%rax), %xmm0
vmovdqa %xmm0, -56(%rsp)
vmovdqu -80(%rax), %xmm0
vmovdqa %xmm0, -40(%rsp)
vmovdqu -48(%rax), %xmm0
vmovdqa %xmm0, -24(%rsp)
vmovdqu -16(%rax), %xmm0
vmovdqa %xmm0, -8(%rsp)
vmovdqa64 -120(%rsp), %zmm0
vmovdqu64 %zmm0, -128(%rax)
vmovdqa64 -56(%rsp), %zmm0
vmovdqu64 %zmm0, -64(%rax)
cmpq %rdx, %rax
jne .L2
vzeroupper
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2059:
.size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s
.ident "GCC: (GNU) 15.0.0 20241016 (experimental)"
.section .note.GNU-stack,"",@progbits
Example assembly generation: https://godbolt.org/z/q1hjxajdo
No regression observed when replacing std::copy with std::memcpy:
https://godbolt.org/z/Kq5ae7ePo
Benchmarking on a slightly different (larger array, aligned storage) variant
shows 50% slowdown with the single register version:
https://pastebin.com/bKrAPFWj