================ @@ -614,12 +623,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the /// destination. \n -/// Bits [3:0] select the source byte to be copied. +/// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector( + (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), ---------------- jyknight wrote:
For the 64-bit operation, only the bottom 3 bits (bits [2:0]) of each control byte in `__b` are supposed to select the source byte (bit 7 still controls clearing). The 128-bit operation, however, uses the bottom 4 bits (bits [3:0]) as the index. By duplicating the 64-bit input vector into both halves of the 128-bit operand, indices `i` and `i + 8` select the same source byte, so we automatically get the proper behavior without having to mask out bit 3 of each byte of `__b`. https://github.com/llvm/llvm-project/pull/96540 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits