https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119876
Bug ID: 119876
Summary: suboptimal code for avx512 conditional move
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
/* Global arrays used by test(); each holds 1000 ints. */
int a[1000];
int b[1000];
int c[1000];
/* d is declared but not referenced by test() in this snippet. */
int d[1000];
/*
 * For each of the 1000 indices, store c[i] + 1 into a[i] when b[i] is
 * positive, and c[i] + 2 otherwise.
 */
void test()
{
for (int i = 0; i < 1000; i++)
/* Both arms add a constant to the same c[i]; only the constant depends on b[i]. */
a[i] = b[i] > 0 ? c[i] + 1 : c[i] + 2;
}
is compiled with -O3 -march=znver5 as:
test:
.LFB0:
.cfi_startproc
movl $2, %edx
vpxor %xmm2, %xmm2, %xmm2
xorl %eax, %eax
vpbroadcastd %edx, %zmm3
vpternlogd $0xFF, %zmm2, %zmm2, %zmm2
vpxor %xmm4, %xmm4, %xmm4
vpsrld $31, %zmm2, %zmm2
.p2align 6
.p2align 4
.p2align 3
.L2:
vmovdqa32 b(%rax), %zmm0
vmovdqa32 c(%rax), %zmm1
addq $64, %rax
vpcmpd $6, %zmm4, %zmm0, %k1
vpaddd %zmm3, %zmm1, %zmm0
vpaddd %zmm2, %zmm1, %zmm0{%k1}
vmovdqa32 %zmm0, a-64(%rax)
cmpq $3968, %rax
jne .L2
whereas clang (with -fno-unroll-loops) produces:
vpbroadcastd .LCPI0_0(%rip), %zmm0 # zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
leaq b(%rip), %rax
leaq c(%rip), %rcx
leaq a(%rip), %rdx
xorl %esi, %esi
.p2align 4, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
vpcmpgtd (%rsi,%rax), %zmm0, %k1
vpblendmd .LCPI0_1(%rip){1to16}, %zmm0, %zmm1 {%k1}
vpaddd (%rsi,%rcx), %zmm1, %zmm1
vmovdqu64 %zmm1, (%rsi,%rdx)
addq $64, %rsi
cmpq $3968, %rsi # imm = 0xF80
jne .LBB0_1