https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120428
Bug ID: 120428
Summary: [15/16 regression] Suboptimal autovec involving
blocked permutation and std::copy
Product: gcc
Version: 15.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: shawn at shawnxu dot org
Target Milestone: ---
On x86-64 with avx512, PR115444 caused the following code to vectorize
sub-optimally:
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t
OrderSize>
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
constexpr std::size_t TotalSize = N * sizeof(T);
static_assert(TotalSize % (BlockSize * OrderSize) == 0,
"ChunkSize * OrderSize must perfectly divide TotalSize");
constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;
std::array<std::byte, ProcessChunkSize> buffer{};
std::byte* const bytes = reinterpret_cast<std::byte*>(data);
for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
{
std::byte* const values = &bytes[i];
for (std::size_t j = 0; j < OrderSize; j++)
{
auto* const buffer_chunk = &buffer[j * BlockSize];
auto* const value_chunk = &values[order[j] * BlockSize];
std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
}
std::copy(std::begin(buffer), std::end(buffer), values);
}
}
void permute_weights(std::int16_t (&biases)[4096]) {
static constexpr std::array<std::size_t, 8> order{0, 2, 4, 6, 1, 3, 5, 7};
permute<16>(biases, order);
}
* Before PR115444:
$ ../gcc-before/bin/g++ -S -O3 -mavx512f permute.cpp
$ cat permute.s
.file "permute.cpp"
.text
#APP
.globl _ZSt21ios_base_library_initv
#NO_APP
.p2align 4
.globl _Z15permute_weightsRA4096_s
.type _Z15permute_weightsRA4096_s, @function
_Z15permute_weightsRA4096_s:
.LFB2070:
.cfi_startproc
leaq 16(%rdi), %rax
leaq 8208(%rdi), %rdx
.p2align 4
.p2align 3
.L2:
vmovdqu 48(%rax), %xmm4
vmovdqu 80(%rax), %xmm3
subq $-128, %rax
vmovdqu -128(%rax), %xmm2
vmovdqu -96(%rax), %xmm1
vmovdqu -64(%rax), %xmm0
vmovdqu -112(%rax), %xmm5
vmovdqu %xmm3, -96(%rax)
vmovdqu %xmm4, -112(%rax)
vmovdqu %xmm5, -128(%rax)
vmovdqu %xmm2, -80(%rax)
vmovdqu %xmm1, -64(%rax)
vmovdqu %xmm0, -48(%rax)
cmpq %rdx, %rax
jne .L2
ret
.cfi_endproc
.LFE2070:
.size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s
.ident "GCC: (GNU) 15.0.0 20241016 (experimental)"
.section .note.GNU-stack,"",@progbits
* After PR115444:
$ ../gcc-after/bin/g++ -S -O3 -mavx512f permute.cpp
$ cat permute.s
.file "permute.cpp"
.text
#APP
.globl _ZSt21ios_base_library_initv
#NO_APP
.p2align 4
.globl _Z15permute_weightsRA4096_s
.type _Z15permute_weightsRA4096_s, @function
_Z15permute_weightsRA4096_s:
.LFB2059:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rdi, %rax
leaq 8192(%rdi), %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-64, %rsp
subq $8, %rsp
.p2align 4
.p2align 3
.L2:
vmovdqu (%rax), %xmm0
subq $-128, %rax
vmovdqa %xmm0, -120(%rsp)
vmovdqu -96(%rax), %xmm0
vmovdqa %xmm0, -104(%rsp)
vmovdqu -64(%rax), %xmm0
vmovdqa %xmm0, -88(%rsp)
vmovdqu -32(%rax), %xmm0
vmovdqa %xmm0, -72(%rsp)
vmovdqu -112(%rax), %xmm0
vmovdqa %xmm0, -56(%rsp)
vmovdqu -80(%rax), %xmm0
vmovdqa %xmm0, -40(%rsp)
vmovdqu -48(%rax), %xmm0
vmovdqa %xmm0, -24(%rsp)
vmovdqu -16(%rax), %xmm0
vmovdqa %xmm0, -8(%rsp)
vmovdqa64 -120(%rsp), %zmm0
vmovdqu64 %zmm0, -128(%rax)
vmovdqa64 -56(%rsp), %zmm0
vmovdqu64 %zmm0, -64(%rax)
cmpq %rdx, %rax
jne .L2
vzeroupper
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2059:
.size _Z15permute_weightsRA4096_s, .-_Z15permute_weightsRA4096_s
.ident "GCC: (GNU) 15.0.0 20241016 (experimental)"
.section .note.GNU-stack,"",@progbits
Example assembly generation: https://godbolt.org/z/q1hjxajdo
No regression observed when replacing std::copy with std::memcpy:
https://godbolt.org/z/Kq5ae7ePo
Benchmarking on a slightly different (larger array, aligned storage) variant
shows 50% slowdown with the single register version:
https://pastebin.com/bKrAPFWj