https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119876

            Bug ID: 119876
           Summary: suboptimal code for avx512 conditional move
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

/* Reproducer: global int arrays used by test(). test() writes a[] and reads
 * b[] and c[]; d[] is declared but not referenced by test(). */
int a[1000];
int b[1000];
int c[1000];
int d[1000];
/* For each of the 1000 elements, stores c[i] + 1 into a[i] when b[i] > 0,
 * otherwise c[i] + 2. Only the added constant depends on the condition;
 * c[i] is an operand on both arms. */
void test()
{
        for (int i = 0; i < 1000; i++)
                a[i] = b[i] > 0 ? c[i] + 1 : c[i] + 2;
}

is compiled with -O3 -march=znver5 as:

# GCC output for test(): a[i] = b[i] > 0 ? c[i] + 1 : c[i] + 2,
# handling 16 ints (one 64-byte zmm register) per loop iteration.
# Register roles established below:
#   rax  = byte offset into a/b/c
#   zmm2 = 1 in every dword lane
#   zmm3 = 2 in every dword lane
#   zmm4 = 0 in every dword lane
#   k1   = per-lane mask, set where b[i] > 0
# NOTE: this listing ends at the loop branch. The loop bound 3968 = 62 * 64
# bytes covers 992 of the 1000 ints; the code for the remaining 8 elements
# and the function return are not shown here.
test:
.LFB0:
        .cfi_startproc
        movl    $2, %edx
        vpxor   %xmm2, %xmm2, %xmm2               # zero zmm2; overwritten two instructions later (presumably emitted to break the dependency on the old zmm2)
        xorl    %eax, %eax                        # rax = 0: starting byte offset
        vpbroadcastd    %edx, %zmm3               # zmm3 = {2, 2, ..., 2}
        vpternlogd      $0xFF, %zmm2, %zmm2, %zmm2 # truth table 0xFF: every bit of zmm2 = 1
        vpxor   %xmm4, %xmm4, %xmm4               # zmm4 = 0, the comparison operand
        vpsrld  $31, %zmm2, %zmm2                 # 0xFFFFFFFF >> 31 = 1 in each lane
        .p2align 6
        .p2align 4
        .p2align 3
.L2:
        vmovdqa32       b(%rax), %zmm0            # zmm0 = b[i .. i+15]
        vmovdqa32       c(%rax), %zmm1            # zmm1 = c[i .. i+15]
        addq    $64, %rax                         # advance the offset early; the store below compensates with -64
        vpcmpd  $6, %zmm4, %zmm0, %k1             # predicate 6 = signed greater-than: k1 = (b > 0)
        vpaddd  %zmm3, %zmm1, %zmm0               # zmm0 = c + 2 in all lanes
        vpaddd  %zmm2, %zmm1, %zmm0{%k1}          # merge-masked: lanes with b > 0 become c + 1, others keep c + 2
        vmovdqa32       %zmm0, a-64(%rax)         # a[i .. i+15] = zmm0
        cmpq    $3968, %rax
        jne     .L2

While clang (with -fno-unroll-loops) generates:

        vpbroadcastd    .LCPI0_0(%rip), %zmm0   # zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
        # Set up base pointers and the shared byte offset for the loop.
        #   rax = &b, rcx = &c, rdx = &a, rsi = byte offset (starts at 0)
        #   zmm0 = broadcast of .LCPI0_0 loaded above (all 1s per the listing's own comment)
        leaq    b(%rip), %rax
        leaq    c(%rip), %rcx
        leaq    a(%rip), %rdx
        xorl    %esi, %esi
        .p2align        4, 0x90
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        # k1 lane is set where zmm0 > b[i], i.e. 1 > b[i], which is the "else" case b[i] <= 0.
        vpcmpgtd        (%rsi,%rax), %zmm0, %k1
        # Select the addend per lane: broadcast .LCPI0_1 where k1 is set, zmm0 (1) elsewhere.
        # NOTE(review): .LCPI0_1 is not shown in this listing; presumably 2, to match the C source.
        vpblendmd       .LCPI0_1(%rip){1to16}, %zmm0, %zmm1 {%k1}
        # Single add with c[] as a memory operand: zmm1 = c[i .. i+15] + addend.
        vpaddd  (%rsi,%rcx), %zmm1, %zmm1
        vmovdqu64       %zmm1, (%rsi,%rdx)      # a[i .. i+15] = zmm1
        addq    $64, %rsi
        # 0xF80 = 62 * 64 bytes = 992 ints; code for the last 8 elements is not shown here.
        cmpq    $3968, %rsi                     # imm = 0xF80
        jne     .LBB0_1

Reply via email to