https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103750
--- Comment #6 from Thiago Macieira <thiago at kde dot org> ---
It got worse. Now I'm seeing:

.L807:
        vmovdqu16       (%rsi), %ymm2
        vmovdqu16       32(%rsi), %ymm3
        vpcmpuw $6, %ymm0, %ymm2, %k2
        vpcmpuw $6, %ymm0, %ymm3, %k3
        kmovw   %k2, %eax
        kmovw   %k3, %edx
        kmovd   %eax, %k4
        kmovd   %edx, %k5
        kortestd        %k5, %k4
        je      .L814

Code snippet:

    auto loadAndCompare = [maxval](const Char *ptr, unsigned mask = ~0U) {
        if constexpr (sizeof(Char) == 1) {
            __m256i mval = _mm256_set1_epi8(maxval);
            __m256i data = _mm256_maskz_loadu_epi8(mask, ptr);
            return _mm256_cmpgt_epu8_mask(data, mval);
        } else if constexpr (sizeof(Char) == 2) {
            __m256i mval = _mm256_set1_epi16(maxval);
            __m256i data = _mm256_maskz_loadu_epi16(mask, ptr);
            return _mm256_cmpgt_epu16_mask(data, mval);
        } else if constexpr (sizeof(Char) == 4) {
            __m256i mval = _mm256_set1_epi32(maxval);
            __m256i data = _mm256_maskz_loadu_epi32(mask, ptr);
            return _mm256_cmpgt_epu32_mask(data, mval);
        }
    };
    /*...*/
    auto mask1 = loadAndCompare(n1);
    auto mask2 = loadAndCompare(n2);

I can make a compilable version if you need me to.