https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112992
--- Comment #5 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
(In reply to Roger Sayle from comment #0)
> The following four functions should in theory all produce the same code:
>
> typedef unsigned long long v4di __attribute((vector_size(32)));
> typedef unsigned int v8si __attribute((vector_size(32)));
> typedef unsigned short v16hi __attribute((vector_size(32)));
> typedef unsigned char v32qi __attribute((vector_size(32)));
>
> #define MASK 0x01010101
> #define MASKL 0x0101010101010101ULL
> #define MASKS 0x0101
>
> v4di fooq() {
> return (v4di){MASKL,MASKL,MASKL,MASKL};
> }
>
> v8si food() {
> return (v8si){MASK,MASK,MASK,MASK,MASK,MASK,MASK,MASK};
> }
>
> v16hi foow() {
> return (v16hi){MASKS,MASKS,MASKS,MASKS,MASKS,MASKS,MASKS,MASKS,
> MASKS,MASKS,MASKS,MASKS,MASKS,MASKS,MASKS,MASKS};
> }
>
> v32qi foob() {
> return (v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
> }
>
> On x86_64 with -mavx, we currently produce very different implementations:
>
> fooq:
> movabs rax, 72340172838076673
> push rbp
> mov rbp, rsp
> and rsp, -32
> mov QWORD PTR [rsp-8], rax
> vbroadcastsd ymm0, QWORD PTR [rsp-8]
> leave
> ret
> food:
> vbroadcastss ymm0, DWORD PTR .LC2[rip]
> ret
> foow:
> vmovdqa ymm0, YMMWORD PTR .LC3[rip]
> ret
> foob:
> vmovdqa ymm0, YMMWORD PTR .LC4[rip]
> ret
>
> clang currently produces the vbroadcastss for all four.
I guess you mean the .rodata optimization here; I'm not sure about that part.
With the fix, we now generate:
# Compiler-generated assembly for test.c (GAS, AT&T operand order: source, destination).
.file "test.c"
.text
# fooq: leaf function that materializes its 256-bit constant in %ymm0 and returns.
.p2align 4
.globl fooq
.type fooq, @function
fooq:
.LFB0:
.cfi_startproc
# Broadcast the 64-bit element at .LC1 into every qword lane of %ymm0.
# .LC1 is a .set alias of .LC4, so this reads the first 8 bytes of the shared
# 32-byte constant instead of building the value on the stack.
vbroadcastsd .LC1(%rip), %ymm0
ret
.cfi_endproc
.LFE0:
.size fooq, .-fooq
# food: leaf function that materializes its 256-bit constant in %ymm0 and returns.
.p2align 4
.globl food
.type food, @function
food:
.LFB1:
.cfi_startproc
# Broadcast the 32-bit element at .LC3 into every dword lane of %ymm0.
# .LC3 is a .set alias of .LC4, so this reads the first 4 bytes of the shared
# 32-byte constant.
vbroadcastss .LC3(%rip), %ymm0
ret
.cfi_endproc
.LFE1:
.size food, .-food
# foow: leaf function that loads its 256-bit constant into %ymm0 and returns.
.p2align 4
.globl foow
.type foow, @function
foow:
.LFB2:
.cfi_startproc
# Full 32-byte aligned load of .LC4 (the constant is emitted under .align 32).
# Unlike fooq/food, no broadcast is used here: the whole vector is read from memory.
vmovdqa .LC4(%rip), %ymm0
ret
.cfi_endproc
.LFE2:
.size foow, .-foow
# foob: leaf function that loads its 256-bit constant into %ymm0 and returns.
.p2align 4
.globl foob
.type foob, @function
foob:
.LFB3:
.cfi_startproc
# Full 32-byte aligned load through .LC5, which is a .set alias of .LC4,
# so foob and foow read exactly the same bytes.
vmovdqa .LC5(%rip), %ymm0
ret
.cfi_endproc
.LFE3:
.size foob, .-foob
# Constant pool. All four functions share one 32-byte constant: .LC1, .LC3 and
# .LC5 are symbol aliases of .LC4, so only a single copy is emitted.
.set .LC1,.LC4
.set .LC3,.LC4
# Mergeable ("M") allocatable ("a") read-only data section with 32-byte entries.
.section .rodata.cst32,"aM",@progbits,32
.align 32
# Sixteen 16-bit words of 257 (0x0101) = 32 bytes, every byte equal to 0x01.
# Because each byte is identical, the same data is valid whether it is read as
# bytes, words, dwords (0x01010101) or qwords (0x0101010101010101).
.LC4:
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.value 257
.set .LC5,.LC4
# Producer identification string recorded by the compiler.
.ident "GCC: (GNU) 14.0.0 20231212 (experimental)"
# Empty .note.GNU-stack section (conventionally signals that no executable stack is needed).
.section .note.GNU-stack,"",@progbits