https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103774

--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include <immintrin.h>

#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
// 16-bit trailing-zero count, defined here only when building with the Intel
// compiler or MSVC.
//
// The argument is taken as a full `unsigned` (the narrower type is left in
// the comment inside the parameter list).
//
// NOTE(review): Q_CC_INTEL is not defined anywhere in the visible source;
// presumably it comes from Qt's headers - confirm. Without it, the inline-asm
// branch below is never compiled.
auto __tzcnt_u16(unsigned /*short*/ value)
{
#    if defined(Q_CC_INTEL)
    // Issue the 16-bit TZCNT instruction directly on the low word of `value`.
    unsigned short count;
    asm("tzcntw %w1, %0" : "=r" (count) : "r" (value));
    return count;
#    else
    // Setting bits 16..31 guarantees a set bit at position 16, so the 32-bit
    // count never exceeds 16 even when the low 16 bits are all zero.
    const unsigned widened = value | 0xffff0000U;
    return _tzcnt_u32(widened);
#    endif
}
#endif

const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
{
    __m256i mch256 = _mm256_set1_epi16(c);
    for ( ; n < e; n += 32) {
        __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i
*>(n));
        __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)
+ 1);
        __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
        __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
        if (_kortestz_mask16_u8(mask1, mask2))
            continue;

        unsigned idx = _tzcnt_u32(mask1);
        if (mask1 == 0) {
            idx = __tzcnt_u16(mask2);
            n += 16;
        }
        return n + idx;
    }
    return e;
}

const char16_t *qustrchr2(char16_t *n, char16_t *e, char16_t c) noexcept
{
    __m256i mch256 = _mm256_set1_epi16(c);
    for ( ; n < e; n += 32) {
        __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i
*>(n));
        __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)
+ 1);
        __mmask16 mask1 = _mm256_cmpeq_epu16_mask(mch256, data1);
        __mmask16 mask2 = _mm256_cmpeq_epu16_mask(mch256, data2);
        if (_kortestz_mask16_u8(mask1, mask2))
            continue;

        unsigned idx = _tzcnt_u32(mask1);
        if (mask1 == 0) {
            idx = __tzcnt_u16(mask2);
            n += 16;
        }
        return n + idx;
    }
    return e;
}

Reply via email to