https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119876
Bug ID: 119876 Summary: suboptimal code for avx512 conditional move Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- int a[1000]; int b[1000]; int c[1000]; int d[1000]; void test() { for (int i = 0; i < 1000; i++) a[i] = b[i] > 0 ? c[i] + 1 : c[i] + 2; } is compiled with -O3 -march=znver5 as: test: .LFB0: .cfi_startproc movl $2, %edx vpxor %xmm2, %xmm2, %xmm2 xorl %eax, %eax vpbroadcastd %edx, %zmm3 vpternlogd $0xFF, %zmm2, %zmm2, %zmm2 vpxor %xmm4, %xmm4, %xmm4 vpsrld $31, %zmm2, %zmm2 .p2align 6 .p2align 4 .p2align 3 .L2: vmovdqa32 b(%rax), %zmm0 vmovdqa32 c(%rax), %zmm1 addq $64, %rax vpcmpd $6, %zmm4, %zmm0, %k1 vpaddd %zmm3, %zmm1, %zmm0 vpaddd %zmm2, %zmm1, %zmm0{%k1} vmovdqa32 %zmm0, a-64(%rax) cmpq $3968, %rax jne .L2 While clang does (with -fno-unroll-loops): vpbroadcastd .LCPI0_0(%rip), %zmm0 # zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] leaq b(%rip), %rax leaq c(%rip), %rcx leaq a(%rip), %rdx xorl %esi, %esi .p2align 4, 0x90 .LBB0_1: # =>This Inner Loop Header: Depth=1 vpcmpgtd (%rsi,%rax), %zmm0, %k1 vpblendmd .LCPI0_1(%rip){1to16}, %zmm0, %zmm1 {%k1} vpaddd (%rsi,%rcx), %zmm1, %zmm1 vmovdqu64 %zmm1, (%rsi,%rdx) addq $64, %rsi cmpq $3968, %rsi # imm = 0xF80 jne .LBB0_1