https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> --- (In reply to Konstantin Ananyev from comment #0) > The code snippet below compiles ok with '-O2' for gcc-9. > But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings. > Another thing (which is probably worse): 'gcc-10 -O2' generates code with > unnecessary loads for ymm registers from the uninitialized portion of the > stack. > As I understand, that's where these -Wuninitialized warnings come from: > for some reason gcc-10 wants to put the local '__m256i pdata[2]' variables > on the stack. > Note that only '-O2' is affected, '-O3' looks good for all versions I tried > (gcc-9, gcc-10, gcc-11). > > ===================== > $ cat tavx512u5.c > > #include <stddef.h> > #include <stdint.h> > #include <x86intrin.h> > > > struct flow_avx512 { > uint32_t num_packets; > uint32_t total_packets; > const uint8_t **idata; > }; > > static inline void > start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num, > uint32_t msk, __m256i pdata[2]) > { > uint32_t n, m[2], nm[2]; > __m256i nd[2]; > > m[0] = msk & 0xF; > m[1] = msk >> 4; > > n = __builtin_popcount(m[0]); > nm[0] = (1 << n) - 1; > nm[1] = (1 << (num - n)) - 1; > > nd[0] = _mm256_maskz_loadu_epi64(nm[0], > flow->idata + flow->num_packets); > nd[1] = _mm256_maskz_loadu_epi64(nm[1], > flow->idata + flow->num_packets + n); > > pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]); > pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]); > } > > __m256i > dummyf1_avx512x8(const struct flow_avx512 *flow) > { > __m256i pdata[2]; > > start_flow_avx512x8(flow, 8, 0xFF, pdata); > return _mm256_add_epi64(pdata[0], pdata[1]); > } > > ==================== > Good version (gcc-9) first: > gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o > tavx512u5.gcc9-O2.o -c tavx512u5.c > > $ objdump -d tavx512u5.gcc9-O2.o > > tavx512u5.gcc9-O2.o: file format elf64-x86-64 > > Disassembly of section .text: > > 0000000000000000 <dummyf1_avx512x8>: >
0: f3 0f 1e fa endbr64 > 4: 8b 17 mov (%rdi),%edx > 6: 48 8b 47 08 mov 0x8(%rdi),%rax > a: b9 0f 00 00 00 mov $0xf,%ecx > f: c5 f8 92 c9 kmovw %ecx,%k1 > 13: 62 f2 fd a9 89 0c d0 vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z} > 1a: 62 f2 fd a9 89 44 d0 vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z} > 21: 04 > 22: c5 f5 d4 c0 vpaddq %ymm0,%ymm1,%ymm0 > 26: c3 retq > k1 is 0xf, so pdata is not used in _mm256_mask_expand_epi64, but gcc failed to simplify "vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z}" to "vpexpandq (%rax,%rdx,8), %ymm1", since we don't support vpexpandq w/o mask? clang's codegen seems to be optimal: https://godbolt.org/z/d79v11Gz3 The patterns below are cut from sse.md; it seems we only support vpexpandq w/ mask, since all corresponding intrinsics are w/ mask. ---- (define_expand "<avx512>_expand<mode>_maskz" [(set (match_operand:VI48F 0 "register_operand") (unspec:VI48F [(match_operand:VI48F 1 "nonimmediate_operand") (match_operand:VI48F 2 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 3 "register_operand")] UNSPEC_EXPAND))] "TARGET_AVX512F" "operands[2] = CONST0_RTX (<MODE>mode);") (define_insn "<avx512>_expand<mode>_mask" [(set (match_operand:VI48F 0 "register_operand" "=v,v") (unspec:VI48F [(match_operand:VI48F 1 "nonimmediate_operand" "v,m") (match_operand:VI48F 2 "nonimm_or_0_operand" "0C,0C") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")] UNSPEC_EXPAND))] "TARGET_AVX512F" "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) (define_insn "expand<mode>_mask" [(set (match_operand:VI12_AVX512VLBW 0 "register_operand" "=v,v") (unspec:VI12_AVX512VLBW [(match_operand:VI12_AVX512VLBW 1 "nonimmediate_operand" "v,m") (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand" "0C,0C") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")] UNSPEC_EXPAND))] "TARGET_AVX512VBMI2" "v<sseintprefix>expand<ssemodesuffix>\t{%1, 
%0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) ----