================ @@ -274,6 +278,23 @@ StringRef::size_type StringRef::find_first_not_of(StringRef Chars, /// Note: O(size() + Chars.size()) StringRef::size_type StringRef::find_last_of(StringRef Chars, size_t From) const { +#ifdef __SSE2__ + if (Chars.size() == 2) { + __m128i Needle0 = _mm_set1_epi8(Chars[0]); + __m128i Needle1 = _mm_set1_epi8(Chars[1]); + size_type Sz = std::min(From, Length); + do { + Sz = Sz < 16 ? 0 : Sz - 16; + __m128i Buffer = _mm_loadu_si128((const __m128i *)(Data + Sz)); + unsigned Mask = _mm_movemask_epi8(_mm_or_si128( + _mm_cmpeq_epi8(Buffer, Needle0), _mm_cmpeq_epi8(Buffer, Needle1))); + if (Mask != 0) { + return Sz + sizeof(Mask) * CHAR_BIT - llvm::countl_zero(Mask); + } + } while (Sz); + return npos; + } +#endif ---------------- joker-eph wrote:
Can this be abstracted or moved out-of-line? I'm concerned about how well inline HW-specific intrinsics will scale (anticipating the incoming `#elif defined(ARM64)`...) https://github.com/llvm/llvm-project/pull/71865 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits