https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103774
--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include <immintrin.h>

// Fallback definition of __tzcnt_u16, compiled only when __INTEL_COMPILER or
// _MSC_VER is defined.
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
// Counts the trailing zero bits of the low 16 bits of `value`; both branches
// yield 16 when those bits are all zero. The parameter is declared `unsigned`
// with `short` commented out.
auto __tzcnt_u16(unsigned /*short*/ value)
{
# ifdef Q_CC_INTEL
    // NOTE(review): Q_CC_INTEL is not defined anywhere in this snippet --
    // presumably it comes from Qt's compiler-detection header; confirm. Without
    // it this branch is never compiled, even under __INTEL_COMPILER.
    unsigned short res;
    // 16-bit TZCNT via inline asm; the `w` modifier selects the 16-bit
    // register name for the input operand.
    asm("tzcntw %w1, %0" : "=r" (res) : "r" (value));
    return res;
# else
    // Forcing bits 16..31 to one caps the 32-bit trailing-zero count at 16.
    return _tzcnt_u32(value | 0xffff0000U);
# endif
}
#endif

// Scans forward from `n` for the first char16_t equal to `c`, comparing
// 32 elements (two 256-bit vectors) per iteration.
// Returns a pointer to the first matching element found, or `e` if the loop
// finishes without a match.
// NOTE(review): there is no tail handling -- every iteration loads 32 elements
// even when fewer remain before `e`, so a load can read past `e` and a match
// found there yields a pointer beyond `e`. Presumably callers guarantee a
// length that is a multiple of 32 (or this is acceptable in a reduced test
// case); confirm.
const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept
{
    // Broadcast the needle into every 16-bit lane.
    __m256i mch256 = _mm256_set1_epi16(c);
    for ( ; n < e; n += 32) {
        // Two unaligned 256-bit loads: elements [0, 16) and [16, 32) from n.
        __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
        __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
        // One mask bit per 16-bit lane; a set bit marks a lane equal to c.
        // Operand order here is (data, needle).
        __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256);
        __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256);
        // Non-zero result means (mask1 | mask2) == 0: no match in this chunk.
        if (_kortestz_mask16_u8(mask1, mask2))
            continue;

        // Index of the lowest set bit in the first vector's mask.
        unsigned idx = _tzcnt_u32(mask1);
        if (mask1 == 0) {
            // Nothing in the first vector, so the match is in the second:
            // recompute the index from mask2 and skip the first 16 elements.
            idx = __tzcnt_u16(mask2);
            n += 16;
        }
        return n + idx;
    }
    return e;
}

// Same search as qustrchr1; the only difference in the code is the operand
// order passed to _mm256_cmpeq_epu16_mask (needle first, data second).
// Returns a pointer to the first matching element found, or `e` if the loop
// finishes without a match.
// NOTE(review): as in qustrchr1, each iteration loads 32 elements with no
// tail handling, so loads (and a returned pointer) can extend past `e`;
// confirm this is intended.
const char16_t *qustrchr2(char16_t *n, char16_t *e, char16_t c) noexcept
{
    // Broadcast the needle into every 16-bit lane.
    __m256i mch256 = _mm256_set1_epi16(c);
    for ( ; n < e; n += 32) {
        // Two unaligned 256-bit loads: elements [0, 16) and [16, 32) from n.
        __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
        __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1);
        // One mask bit per 16-bit lane; a set bit marks a lane equal to c.
        // Operand order here is (needle, data).
        __mmask16 mask1 = _mm256_cmpeq_epu16_mask(mch256, data1);
        __mmask16 mask2 = _mm256_cmpeq_epu16_mask(mch256, data2);
        // Non-zero result means (mask1 | mask2) == 0: no match in this chunk.
        if (_kortestz_mask16_u8(mask1, mask2))
            continue;

        // Index of the lowest set bit in the first vector's mask.
        unsigned idx = _tzcnt_u32(mask1);
        if (mask1 == 0) {
            // Nothing in the first vector, so the match is in the second:
            // recompute the index from mask2 and skip the first 16 elements.
            idx = __tzcnt_u16(mask2);
            n += 16;
        }
        return n + idx;
    }
    return e;
}