https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72805

            Bug ID: 72805
           Summary: AVX512: invalid code generation involving masks
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: wen...@mitsuba-renderer.org
  Target Milestone: ---

Consider the following minimal program, which initializes a 16-element AVX-512
integer vector with -1 entries, does a component-wise "< 0" comparison, and
prints the resulting mask.

Since there are 16 entries, the expected output is "65535". GCC trunk prints
"255" (compilation flags: g++-7 -S -mavx512f test.c -o test.s
-fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-exceptions). The
issue goes away when compiling at higher optimization levels, though that is
clearly not a good solution.

#include <immintrin.h>
#include <stdio.h>

__attribute__((noinline))
int test() {
    __m512i value = _mm512_set1_epi32(-1);
    /* Compare all 16 lanes against zero; the result is a 16-bit mask. */
    return (int) _mm512_cmp_epi32_mask(value, _mm512_setzero_si512(),
                                       1 /* _MM_CMPINT_LT */);
}

int main(int argc, char *argv[]) {
    printf("%i\n", test());
    return 0;
}
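
For reference, the expected value can be derived without any intrinsics: each
of the 16 lanes holds -1, every "-1 < 0" comparison is true, so all 16 mask
bits should be set. A minimal scalar sketch of the same computation (the names
are mine, not part of the reproducer):

#include <stdio.h>

/* Scalar model of the comparison above: builds the same 16-bit mask
   one lane at a time, making the expected result (65535) explicit. */
int reference_mask(void) {
    unsigned mask = 0;
    for (int i = 0; i < 16; ++i) {
        int lane = -1;        /* _mm512_set1_epi32(-1) */
        if (lane < 0)         /* _MM_CMPINT_LT against zero */
            mask |= 1u << i;
    }
    return (int) mask;
}

int main(void) {
    printf("%u\n", reference_mask()); /* prints 65535 */
    return 0;
}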

Looking at the assembly reveals the problem:

__Z4testv:
        leaq    8(%rsp), %r10
        andq    $-64, %rsp
        pushq   -8(%r10)
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %r10
        subq    $112, %rsp
        movl    $-1, -52(%rbp)
        vmovdqa64       -176(%rbp), %zmm0
        movl    $-1, %eax
        kmovw   %eax, %k2
        vpbroadcastd    -52(%rbp), %zmm0{%k2}
        vmovdqa64       %zmm0, -240(%rbp)
        vpxord  %zmm0, %zmm0, %zmm0
        vmovdqa64       %zmm0, %zmm1
        vmovdqa64       -240(%rbp), %zmm0
        movl    $-1, %eax
        kmovw   %eax, %k3
        vpcmpd  $1, %zmm1, %zmm0, %k1{%k3}
        kmovw   %k1, %eax
        movzbl  %al, %eax                    <----- UH OH
        addq    $112, %rsp
        popq    %r10
        popq    %rbp
        leaq    -8(%r10), %rsp
        ret

For some reason, GCC thinks that the mask is only eight bits wide and
zero-extends it with a "movzbl" instruction instead of a "movzwl".

At higher optimization levels, many of the moves are elided, and the mask is
correctly zero-extended to 16 bits ("movzwl") before being copied to %eax.
Very mysterious.

__Z4testv:
        vpternlogd      $0xFF, %zmm0, %zmm0, %zmm0
        vpxord  %zmm1, %zmm1, %zmm1
        vpcmpd  $1, %zmm1, %zmm0, %k1
        kmovw   %k1, %eax
        movzwl  %ax, %eax
        ret
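
For anyone reducing this further, a hypothetical self-check (a sketch; test()
is the function from the reproducer above) that fails under the buggy codegen:

#include <assert.h>

int test(void); /* the reproducer above */

int main(void) {
    /* With the buggy "movzbl", test() returns 0xFF instead of 0xFFFF,
       so this assertion fires at the default optimization level. */
    assert(test() == 0xFFFF);
    return 0;
}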
