https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108506

            Bug ID: 108506
           Summary: bit_cast from 32-byte vector generates worse code than
                    memcpy
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: m.cencora at gmail dot com
  Target Milestone: ---

GCC trunk on x86-64 produces much worse assembly for the 'deserialize' function
than for the equivalent 'deserialize2'.
The two should compile to the same code, since bit_cast should be just a
type-safe equivalent of memcpy (and that is the only difference between the
two functions).

g++ -std=c++23 -O3 -mavx2

// 32-byte vector of unsigned char, declared with the GCC vector_size attribute.
using v32uc = unsigned char __attribute((vector_size(32)));

// Number of loop iterations (one 32-byte chunk per iteration) performed by
// deserialize and deserialize2.
constexpr auto N = 1024;

// Destination record: eight ints, the same size as one v32uc (checked below).
struct Foo
{
    int a[8];
};

// Both functions below move exactly sizeof(v32uc) bytes into a Foo, so the
// two types must have the same size.
static_assert(sizeof(Foo) == sizeof(v32uc));

// Reads N consecutive 32-byte chunks from `input`, reverses the byte order
// inside every 4-byte group of each chunk, and stores each result into
// successive elements of `output`.
//
// This is the variant under report: the shuffled vector is converted to Foo
// with __builtin_bit_cast. It differs from deserialize2 only in that final
// store.
//
// input  - source bytes; N * sizeof(v32uc) bytes are read.
// output - destination array; N Foo objects are written.
void deserialize(const unsigned char* input, Foo* output)
{
    for (auto i = 0u; i != N; ++i)
    {
        // Load one chunk via memcpy and advance past it.
        v32uc vec;
        __builtin_memcpy(&vec, input, sizeof(vec));
        input += sizeof(vec);

        // Mask reverses the bytes within each group of four:
        // (0,1,2,3) -> (3,2,1,0), (4,5,6,7) -> (7,6,5,4), and so on.
        vec = __builtin_shuffle(vec,
            v32uc{
                3, 2, 1, 0,
                7, 6, 5, 4,
                11, 10, 9, 8,
                15, 14, 13, 12,
                19, 18, 17, 16,
                23, 22, 21, 20,
                27, 26, 25, 24,
                31, 30, 29, 28
                }
        );
        // Reinterpret the 32 shuffled bytes as a Foo and store it.
        *output = __builtin_bit_cast(Foo, vec);
        output++;
    }
}

// Reads N consecutive 32-byte chunks from `input`, reverses the byte order
// inside every 4-byte group of each chunk, and stores each result into
// successive elements of `output`.
//
// This is the baseline variant: the shuffled vector is written to the
// destination with __builtin_memcpy. It differs from deserialize only in that
// final store.
//
// input  - source bytes; N * sizeof(v32uc) bytes are read.
// output - destination array; N Foo objects are written.
void deserialize2(const unsigned char* input, Foo* output)
{
    for (auto i = 0u; i != N; ++i)
    {
        // Load one chunk via memcpy and advance past it.
        v32uc vec;
        __builtin_memcpy(&vec, input, sizeof(vec));
        input += sizeof(vec);

        // Mask reverses the bytes within each group of four:
        // (0,1,2,3) -> (3,2,1,0), (4,5,6,7) -> (7,6,5,4), and so on.
        vec = __builtin_shuffle(vec,
            v32uc{
                3, 2, 1, 0,
                7, 6, 5, 4,
                11, 10, 9, 8,
                15, 14, 13, 12,
                19, 18, 17, 16,
                23, 22, 21, 20,
                27, 26, 25, 24,
                31, 30, 29, 28
                }
        );
        // Copy the 32 shuffled bytes straight into the destination Foo.
        __builtin_memcpy(output, &vec, sizeof(vec));
        output++;
    }
}


Disassembly:

deserialize(unsigned char const*, Foo*):
  push rbp
  xor eax, eax
  mov rbp, rsp
  and rsp, -32
  vmovdqa ymm1, YMMWORD PTR .LC0[rip]
.L2:
  vmovdqu ymm3, YMMWORD PTR [rdi+rax]
  vpshufb ymm2, ymm3, ymm1
  vmovdqa YMMWORD PTR [rsp-32], ymm2
  mov rdx, QWORD PTR [rsp-32]
  mov rcx, QWORD PTR [rsp-24]
  vmovdqa xmm4, XMMWORD PTR [rsp-16]
  vmovq xmm0, rdx
  vpinsrq xmm0, xmm0, rcx, 1
  vmovdqu XMMWORD PTR [rsi+16+rax], xmm4
  vmovdqu XMMWORD PTR [rsi+rax], xmm0
  add rax, 32
  cmp rax, 32768
  jne .L2
  vzeroupper
  leave
  ret
deserialize2(unsigned char const*, Foo*):
  vmovdqa ymm1, YMMWORD PTR .LC0[rip]
  xor eax, eax
.L7:
  vmovdqu ymm2, YMMWORD PTR [rdi+rax]
  vpshufb ymm0, ymm2, ymm1
  vmovdqu YMMWORD PTR [rsi+rax], ymm0
  add rax, 32
  cmp rax, 32768
  jne .L7
  vzeroupper
  ret
.LC0:
  .byte 3
  .byte 2
  .byte 1
  .byte 0
  .byte 7
  .byte 6
  .byte 5
  .byte 4
  .byte 11
  .byte 10
  .byte 9
  .byte 8
  .byte 15
  .byte 14
  .byte 13
  .byte 12
  .byte 3
  .byte 2
  .byte 1
  .byte 0
  .byte 7
  .byte 6
  .byte 5
  .byte 4
  .byte 11
  .byte 10
  .byte 9
  .byte 8
  .byte 15
  .byte 14
  .byte 13
  .byte 12

Reply via email to