https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99411
Bug ID: 99411
Summary: s311 benchmark of TSVC is vectorized by clang better
than by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
// Element type of the benchmark array.
typedef float real_t;
// Base repeat count; main() runs iterations*10 passes of the reduction.
#define iterations 100000
// Number of elements in the 1-D array a[].
#define LEN_1D 32000
// Not referenced by the code shown in this reproducer.
#define LEN_2D 256
// Input array. Nothing shown here writes to it, so as a file-scope object
// it keeps its zero initialization.
real_t a[LEN_1D];
// Reduced s311 kernel: repeatedly sums all LEN_1D elements of a[] into a
// single float accumulator.
// Returns 1 if the sum from the last pass is greater than 4, otherwise 0.
int main()
{
// reductions
// sum reduction
real_t sum;
// Outer repeat loop: iterations*10 (1,000,000) identical passes.
for (int nl = 0; nl < iterations*10; nl++) {
// The accumulator is reset on every pass, so only the last pass's sum
// reaches the return statement.
sum = (real_t)0.;
// Every add depends on the previous value of sum (loop-carried dependence);
// summing in vector lanes changes the order of the float additions.
for (int i = 0; i < LEN_1D; i++) {
sum += a[i];
}
}
// The result of the reduction is consumed here (presumably so the loops
// cannot be discarded as dead code).
return sum > 4;
}
With -O2 -march=znver2, GCC produces:
# Excerpt of GCC's code for main (AT&T syntax). The function entry, including
# the initialization of the %edx repeat counter, is not part of this listing.
# Outer (repeat) loop head.
.L2:
# rax = address of a (32-bit absolute immediate, zero-extended by the movl).
movl $a, %eax
# Zero the single vector accumulator; the VEX-encoded xmm write also clears
# the upper half of ymm0.
vxorps %xmm0, %xmm0, %xmm0
.p2align 4
.p2align 3
# Inner loop: one 8-float vaddps per iteration. All adds go through ymm0, so
# they form one serial dependency chain.
.L3:
vaddps (%rax), %ymm0, %ymm0
addq $32, %rax
# a+128000 is the end of the array: 32000 floats * 4 bytes.
cmpq $a+128000, %rax
jne .L3
# Horizontal reduction of ymm0 to a scalar. It sits between the inner-loop
# branch and the outer-loop branch, so it executes on every outer iteration.
# xmm1 = upper 128 bits of ymm0.
vextractf128 $0x1, %ymm0, %xmm1
# Decrement the repeat counter; the vector instructions that follow do not
# modify EFLAGS, so the jne .L2 below still tests this result.
decl %edx
# Fold upper half into lower half: 4 partial sums remain in xmm1.
vaddps %xmm0, %xmm1, %xmm1
# Bring elements 2,3 down to positions 0,1 and add: 2 partial sums remain.
vmovhlps %xmm1, %xmm1, %xmm0
vaddps %xmm1, %xmm0, %xmm0
# Immediate 85 (0x55) broadcasts element 1; after the add, element 0 holds
# the full sum.
vshufps $85, %xmm0, %xmm0, %xmm1
vaddps %xmm0, %xmm1, %xmm0
jne .L2
# Return value: eax = (sum > constant at .LC0). The constant itself is not
# shown in this listing; presumably 4.0f from the C source.
xorl %eax, %eax
vcomiss .LC0(%rip), %xmm0
seta %al
vzeroupper
ret
.cfi_endproc
clang does:
# clang's code for main (AT&T syntax).
main: # @main
.cfi_startproc
# %bb.0:
# eax = outer (repeat) loop counter, counting up from 0.
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
# Zero four separate vector accumulators (ymm0..ymm3) on every outer pass.
vxorps %xmm0, %xmm0, %xmm0
# rcx = negative byte offset from the end of a; it counts up to zero.
movq $-128000, %rcx # imm = 0xFFFE0C00
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm3, %xmm3, %xmm3
.p2align 4, 0x90
.LBB0_2: # Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# Inner loop unrolled 4x: 32 floats per iteration, each vaddps feeding its own
# accumulator, i.e. four independent dependency chains.
vaddps a+128000(%rcx), %ymm0, %ymm0
vaddps a+128032(%rcx), %ymm1, %ymm1
vaddps a+128064(%rcx), %ymm2, %ymm2
vaddps a+128096(%rcx), %ymm3, %ymm3
# rcx += 128 (written as subtracting -128); the loop exits when rcx reaches 0,
# using the flags set by this instruction.
subq $-128, %rcx
jne .LBB0_2
# %bb.3: # in Loop: Header=BB0_1 Depth=1
incl %eax
# 1000000 = iterations*10 from the C source.
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
# %bb.4:
# Reduction placed after the outer loop, so it executes once. First the four
# accumulators are combined into ymm0.
vaddps %ymm0, %ymm1, %ymm0
# Clear eax for the seta result below.
xorl %eax, %eax
vaddps %ymm0, %ymm2, %ymm0
vaddps %ymm0, %ymm3, %ymm0
# Horizontal reduction of ymm0: fold 8 -> 4 -> 2 -> 1 lanes.
vextractf128 $1, %ymm0, %xmm1
vaddps %xmm1, %xmm0, %xmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %xmm1, %xmm0, %xmm0
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vaddss %xmm1, %xmm0, %xmm0
# Return value: eax = (sum > constant at .LCPI0_0). The constant itself is not
# shown in this listing; presumably 4.0f from the C source.
vucomiss .LCPI0_0(%rip), %xmm0
seta %al
vzeroupper
retq
On Zen 3 hardware, the GCC version runs in 2.4s, while clang's runs in 0.8s.