https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103750

--- Comment #6 from Thiago Macieira <thiago at kde dot org> ---
It got worse. Now I'm seeing:

.L807:
        vmovdqu16       (%rsi), %ymm2
        vmovdqu16       32(%rsi), %ymm3
        vpcmpuw $6, %ymm0, %ymm2, %k2
        vpcmpuw $6, %ymm0, %ymm3, %k3
        kmovw   %k2, %eax
        kmovw   %k3, %edx
        kmovd   %eax, %k4
        kmovd   %edx, %k5
        kortestd        %k5, %k4
        je      .L814

Code snippet:

        // Helper that loads one 256-bit vector of Char elements from ptr and
        // returns a bitmask of the elements that compare (unsigned) greater
        // than maxval.
        //
        // maxval is captured by copy. mask is passed to the zero-masking load,
        // so elements whose mask bit is clear are loaded as zero; the default
        // ~0U has every bit set.
        //
        // The element width is selected at compile time from sizeof(Char), so
        // the mask type returned by the compare intrinsic differs per branch.
        // NOTE(review): Char is declared outside this snippet - presumably a
        // template parameter of the enclosing function; confirm.
        auto loadAndCompare = [maxval](const Char *ptr, unsigned mask = ~0U) {
            if constexpr (sizeof(Char) == 1) {
                // 8-bit elements: broadcast maxval, masked load, unsigned compare.
                __m256i mval = _mm256_set1_epi8(maxval);
                __m256i data = _mm256_maskz_loadu_epi8(mask, ptr);
                return _mm256_cmpgt_epu8_mask(data, mval);
            } else if constexpr (sizeof(Char) == 2) {
                // 16-bit elements: same sequence with the epi16/epu16 variants.
                __m256i mval = _mm256_set1_epi16(maxval);
                __m256i data = _mm256_maskz_loadu_epi16(mask, ptr);
                return _mm256_cmpgt_epu16_mask(data, mval);
            } else if constexpr (sizeof(Char) == 4) {
                // 32-bit elements: same sequence with the epi32/epu32 variants.
                __m256i mval = _mm256_set1_epi32(maxval);
                __m256i data = _mm256_maskz_loadu_epi32(mask, ptr);
                return _mm256_cmpgt_epu32_mask(data, mval);
            }
            // No return statement is reached for any other sizeof(Char).
        };
/*...*/
            auto mask1 = loadAndCompare(n1);
            auto mask2 = loadAndCompare(n2);

I can make a compilable version if you need me to.

Reply via email to