https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Konstantin Ananyev from comment #0)
> The code snippet below compiles ok with '-O2' for gcc-9.
> But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings.
> Another thing (which is probably worse): 'gcc-10 -O2' generates code with
> unnecessary loads for ymm registers from the uninitialized portion of the
> stack.
> As I understand it, that's where these -Wuninitialized warnings come from:
> for some reason gcc-10 wants to put the local '__m256i pdata[2]' variable
> on the stack.
> Note that only '-O2' is affected; '-O3' looks good for all versions I tried
> (gcc-9, gcc-10, gcc-11).
> 
> =====================
> $ cat tavx512u5.c
> 
> #include <stddef.h>
> #include <stdint.h>
> #include <x86intrin.h>
> 
> 
> struct flow_avx512 {
>         uint32_t num_packets;
>         uint32_t total_packets;
>         const uint8_t **idata;
> };
> 
> static inline void
> start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num,
>                     uint32_t msk, __m256i pdata[2])
> {
>         uint32_t n, m[2], nm[2];
>         __m256i nd[2];
> 
>         m[0] = msk & 0xF;
>         m[1] = msk >> 4;
> 
>         n = __builtin_popcount(m[0]);
>         nm[0] = (1 << n) - 1;
>         nm[1] = (1 << (num - n)) - 1;
> 
>         nd[0] = _mm256_maskz_loadu_epi64(nm[0],
>                                 flow->idata + flow->num_packets);
>         nd[1] = _mm256_maskz_loadu_epi64(nm[1],
>                         flow->idata + flow->num_packets + n);
> 
>         pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
>         pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
> }
> 
> __m256i
> dummyf1_avx512x8(const struct flow_avx512 *flow)
> {
>         __m256i pdata[2];
> 
>         start_flow_avx512x8(flow, 8, 0xFF, pdata);
>         return _mm256_add_epi64(pdata[0], pdata[1]);
> }
> 
> ====================
> Good version (gcc-9) first:
> gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
> tavx512u5.gcc9-O2.o -c tavx512u5.c
> 
> $ objdump -d tavx512u5.gcc9-O2.o
> 
> tavx512u5.gcc9-O2.o:     file format elf64-x86-64
> 
> Disassembly of section .text:
> 
> 0000000000000000 <dummyf1_avx512x8>:
>    0:   f3 0f 1e fa             endbr64
>    4:   8b 17                   mov    (%rdi),%edx
>    6:   48 8b 47 08             mov    0x8(%rdi),%rax
>    a:   b9 0f 00 00 00          mov    $0xf,%ecx
>    f:   c5 f8 92 c9             kmovw  %ecx,%k1
>   13:   62 f2 fd a9 89 0c d0    vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z}
>   1a:   62 f2 fd a9 89 44 d0    vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z}
>   21:   04
>   22:   c5 f5 d4 c0             vpaddq %ymm0,%ymm1,%ymm0
>   26:   c3                      retq
> 

k1 is 0xf (all four 64-bit lanes of the ymm register are selected), so pdata
is not actually used by _mm256_mask_expand_epi64. However, gcc failed to
simplify vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z} to vpexpandq (%rax,%rdx,8),
%ymm1, presumably because we don't support vpexpandq without a mask.
Clang's codegen seems to be optimal: https://godbolt.org/z/d79v11Gz3

The following is cut from sse.md. It seems we only support vpexpandq with a
mask, since all the corresponding intrinsics take a mask.
----
(define_expand "<avx512>_expand<mode>_maskz"
  [(set (match_operand:VI48F 0 "register_operand")
        (unspec:VI48F
          [(match_operand:VI48F 1 "nonimmediate_operand")
           (match_operand:VI48F 2 "nonimm_or_0_operand")
           (match_operand:<avx512fmaskmode> 3 "register_operand")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512F"
  "operands[2] = CONST0_RTX (<MODE>mode);")

(define_insn "<avx512>_expand<mode>_mask"
  [(set (match_operand:VI48F 0 "register_operand" "=v,v")
        (unspec:VI48F
          [(match_operand:VI48F 1 "nonimmediate_operand" "v,m")
           (match_operand:VI48F 2 "nonimm_or_0_operand" "0C,0C")
           (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512F"
  "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix" "evex")
   (set_attr "memory" "none,load")
   (set_attr "mode" "<sseinsnmode>")])

(define_insn "expand<mode>_mask"
  [(set (match_operand:VI12_AVX512VLBW 0 "register_operand" "=v,v")
        (unspec:VI12_AVX512VLBW
          [(match_operand:VI12_AVX512VLBW 1 "nonimmediate_operand" "v,m")
           (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand" "0C,0C")
           (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512VBMI2"
  "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix" "evex")
   (set_attr "memory" "none,load")
   (set_attr "mode" "<sseinsnmode>")])
----

Reply via email to