https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103750
--- Comment #11 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Thiago Macieira from comment #6)
> It got worse. Now I'm seeing:
>
> .L807:
>         vmovdqu16       (%rsi), %ymm2
>         vmovdqu16       32(%rsi), %ymm3
>         vpcmpuw $6, %ymm0, %ymm2, %k2
>         vpcmpuw $6, %ymm0, %ymm3, %k3
>         kmovw   %k2, %eax
>         kmovw   %k3, %edx
>         kmovd   %eax, %k4
>         kmovd   %edx, %k5
>         kortestd        %k5, %k4
>         je      .L814
>

(define_insn "*zero_extend<mode>si2"
  [(set (match_operand:SI 0 "register_operand" "=r,*r,*k")
        (zero_extend:SI
          (match_operand:SWI12 1 "nonimmediate_operand" "<r>m,*k,*km")))]
  "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"

zero_extendhisi is supported with a k alternative, so it should be optimized to:

        vmovdqu16       (%rsi), %ymm2
        vmovdqu16       32(%rsi), %ymm3
        vpcmpuw $6, %ymm0, %ymm2, %k2
        vpcmpuw $6, %ymm0, %ymm3, %k3
        kmovw   %k2, %k4
        kmovw   %k3, %k5
        kortestd        %k5, %k4

And considering that vpcmpuw implicitly zero-extends k2 and k3, it can be further optimized to:

        vmovdqu16       (%rsi), %ymm2
        vmovdqu16       32(%rsi), %ymm3
        vpcmpuw $6, %ymm0, %ymm2, %k2
        vpcmpuw $6, %ymm0, %ymm3, %k3
        kortestd        %k3, %k2

> Code snippet:
>
>     auto loadAndCompare = [maxval](const Char *ptr, unsigned mask = ~0U)
>     {
>         if constexpr (sizeof(Char) == 1) {
>             __m256i mval = _mm256_set1_epi8(maxval);
>             __m256i data = _mm256_maskz_loadu_epi8(mask, ptr);
>             return _mm256_cmpgt_epu8_mask(data, mval);
>         } else if constexpr (sizeof(Char) == 2) {
>             __m256i mval = _mm256_set1_epi16(maxval);
>             __m256i data = _mm256_maskz_loadu_epi16(mask, ptr);
>             return _mm256_cmpgt_epu16_mask(data, mval);
>         } else if constexpr (sizeof(Char) == 4) {
>             __m256i mval = _mm256_set1_epi32(maxval);
>             __m256i data = _mm256_maskz_loadu_epi32(mask, ptr);
>             return _mm256_cmpgt_epu32_mask(data, mval);
>         }
>     };
>     /*...*/
>     auto mask1 = loadAndCompare(n1);
>     auto mask2 = loadAndCompare(n2);
>
> I can make a compilable version if you need me to

Yes, please.