https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108506
--- Comment #5 from m.cencora at gmail dot com ---
Almost: there is still an unnecessary stack spill in the bit_cast case:
"deserialize(unsigned char const*, Foo*)":
// Compiler output (Intel syntax) for the bit_cast variant.
// Walks 32768 bytes in 32-byte steps: loads from [rdi+rax], shuffles the bytes
// with vpshufb using the constant at .LC0 as control, and stores to [rsi+rax].
// NOTE(review): rdi/rsi are presumably the two pointer arguments named in the
// label (SysV AMD64 argument order) - confirm against the C++ source.
push rbp
xor eax, eax // rax = 0: byte offset used by both the load and the store
mov rbp, rsp // keep the incoming rsp so `leave` can undo the alignment below
and rsp, -32 // round rsp down to a 32-byte boundary; the aligned vmovdqa store to [rsp-32] needs it
vmovdqa ymm1, YMMWORD PTR .LC0[rip] // loop-invariant vpshufb control vector (contents of .LC0 are not part of this listing)
.L2:
vmovdqu ymm0, YMMWORD PTR [rdi+rax] // unaligned 32-byte load from the input at the current offset
vpshufb ymm0, ymm0, ymm1 // permute the bytes of ymm0 as selected by ymm1
vmovdqa YMMWORD PTR [rsp-32], ymm0 // unnecessary: spills the result to a stack slot that is never read back in this function
vmovdqu YMMWORD PTR [rsi+rax], ymm0 // the store that matters: write the shuffled bytes to the output
add rax, 32 // advance the offset by one 32-byte vector
cmp rax, 32768 // loop ends once 32768 bytes (1024 vectors) have been processed
jne .L2
vzeroupper // clear the upper halves of the ymm registers before returning
leave // rsp = rbp, pop rbp: drops the aligned frame set up for the spill slot
ret
"deserialize2(unsigned char const*, Foo*)":
// Compiler output (Intel syntax) for the second variant, shown for comparison.
// Same load / vpshufb / store loop over 32768 bytes as deserialize in this
// listing, but with no frame setup and no store to the stack.
// NOTE(review): rdi/rsi are presumably the two pointer arguments named in the
// label (SysV AMD64 argument order) - confirm against the C++ source.
vmovdqa ymm1, YMMWORD PTR .LC0[rip] // loop-invariant vpshufb control vector (contents of .LC0 are not part of this listing)
xor eax, eax // rax = 0: byte offset used by both the load and the store
.L7:
vmovdqu ymm0, YMMWORD PTR [rdi+rax] // unaligned 32-byte load from the input at the current offset
vpshufb ymm0, ymm0, ymm1 // permute the bytes of ymm0 as selected by ymm1
vmovdqu YMMWORD PTR [rsi+rax], ymm0 // write the shuffled bytes to the output; the only store in the loop
add rax, 32 // advance the offset by one 32-byte vector
cmp rax, 32768 // loop ends once 32768 bytes (1024 vectors) have been processed
jne .L7
vzeroupper // clear the upper halves of the ymm registers before returning
ret