https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108506
            Bug ID: 108506
           Summary: bit_cast from 32-byte vector generates worse code than
                    memcpy
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: m.cencora at gmail dot com
  Target Milestone: ---

GCC trunk on x86-64 produces much worse assembly for the 'deserialize'
function than for the equivalent 'deserialize2'. The two should compile to the
same code, since bit_cast is just a type-safe equivalent of memcpy, and that is
the only difference between the two functions.

Compiled with: g++ -std=c++23 -O3 -mavx2

using v32uc = unsigned char __attribute((vector_size(32)));

constexpr auto N = 1024;

struct Foo
{
    int a[8];
};

static_assert(sizeof(Foo) == sizeof(v32uc));

void deserialize(const unsigned char* input, Foo* output)
{
    for (auto i = 0u; i != N; ++i)
    {
        v32uc vec;
        __builtin_memcpy(&vec, input, sizeof(vec));
        input += sizeof(vec);

        vec = __builtin_shuffle(vec, v32uc{  3,  2,  1,  0,  7,  6,  5,  4,
                                            11, 10,  9,  8, 15, 14, 13, 12,
                                            19, 18, 17, 16, 23, 22, 21, 20,
                                            27, 26, 25, 24, 31, 30, 29, 28 });

        *output = __builtin_bit_cast(Foo, vec);
        output++;
    }
}

void deserialize2(const unsigned char* input, Foo* output)
{
    for (auto i = 0u; i != N; ++i)
    {
        v32uc vec;
        __builtin_memcpy(&vec, input, sizeof(vec));
        input += sizeof(vec);

        vec = __builtin_shuffle(vec, v32uc{  3,  2,  1,  0,  7,  6,  5,  4,
                                            11, 10,  9,  8, 15, 14, 13, 12,
                                            19, 18, 17, 16, 23, 22, 21, 20,
                                            27, 26, 25, 24, 31, 30, 29, 28 });

        __builtin_memcpy(output, &vec, sizeof(vec));
        output++;
    }
}

Disassembly:

deserialize(unsigned char const*, Foo*):
        push    rbp
        xor     eax, eax
        mov     rbp, rsp
        and     rsp, -32
        vmovdqa ymm1, YMMWORD PTR .LC0[rip]
.L2:
        vmovdqu ymm3, YMMWORD PTR [rdi+rax]
        vpshufb ymm2, ymm3, ymm1
        vmovdqa YMMWORD PTR [rsp-32], ymm2
        mov     rdx, QWORD PTR [rsp-32]
        mov     rcx, QWORD PTR [rsp-24]
        vmovdqa xmm4, XMMWORD PTR [rsp-16]
        vmovq   xmm0, rdx
        vpinsrq xmm0, xmm0, rcx, 1
        vmovdqu XMMWORD PTR [rsi+16+rax], xmm4
        vmovdqu XMMWORD PTR [rsi+rax], xmm0
        add     rax, 32
        cmp     rax, 32768
        jne     .L2
        vzeroupper
        leave
        ret

deserialize2(unsigned char const*, Foo*):
        vmovdqa ymm1, YMMWORD PTR .LC0[rip]
        xor     eax, eax
.L7:
        vmovdqu ymm2, YMMWORD PTR [rdi+rax]
        vpshufb ymm0, ymm2, ymm1
        vmovdqu YMMWORD PTR [rsi+rax], ymm0
        add     rax, 32
        cmp     rax, 32768
        jne     .L7
        vzeroupper
        ret

.LC0:
        .byte   3
        .byte   2
        .byte   1
        .byte   0
        .byte   7
        .byte   6
        .byte   5
        .byte   4
        .byte   11
        .byte   10
        .byte   9
        .byte   8
        .byte   15
        .byte   14
        .byte   13
        .byte   12
        .byte   3
        .byte   2
        .byte   1
        .byte   0
        .byte   7
        .byte   6
        .byte   5
        .byte   4
        .byte   11
        .byte   10
        .byte   9
        .byte   8
        .byte   15
        .byte   14
        .byte   13
        .byte   12
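
For what it's worth, here is a minimal sketch that isolates just the store half
of the two loops (my own reduction, not part of the original testcase; the
function names are made up, and I have not verified that this single-store form
reproduces the same stack round trip as the full 'deserialize' loop):

using v32uc = unsigned char __attribute((vector_size(32)));

struct Foo
{
    int a[8];
};

static_assert(sizeof(Foo) == sizeof(v32uc));

// Store path of 'deserialize': bit_cast the vector to Foo, then assign it.
void store_via_bit_cast(const v32uc& vec, Foo* output)
{
    *output = __builtin_bit_cast(Foo, vec);
}

// Store path of 'deserialize2': copy the vector bytes straight into *output.
void store_via_memcpy(const v32uc& vec, Foo* output)
{
    __builtin_memcpy(output, &vec, sizeof(vec));
}

If this reduced form shows the same difference, comparing the two stores in
isolation may make it easier to see where the bit_cast result gets forced
through a stack temporary.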