================
@@ -124,10 +143,11 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+    return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1),
----------------
jyknight wrote:
So, the current version assembles to:
```
 0: 66 0f 63 c1       packsswb %xmm1, %xmm0
 4: 66 0f 70 c0 e8    pshufd $0xe8, %xmm0, %xmm0    # xmm0 = xmm0[0,2,2,3]
 9: c3                retq
```

You're suggesting to instead shuffle the inputs, like:
```
return __trunc64(__builtin_ia32_packsswb128(
    __builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
```

I agree, that's better. It saves 1 byte of code, and also reduces register pressure.
```
 0: 66 0f 6c c1       punpcklqdq %xmm1, %xmm0       # xmm0 = xmm0[0],xmm1[0]
 4: 66 0f 63 c0       packsswb %xmm0, %xmm0
 8: c3                retq
```

Done -- eliminated all uses of `__extract2_32`.

https://github.com/llvm/llvm-project/pull/96540
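For readers following along, here is a minimal standalone sketch of the shuffle-then-pack approach discussed above, assuming clang with SSE2 enabled. The `__trunc64` definition is a plausible reconstruction of the patch's helper (its real definition is not shown in this excerpt), and `packs_pi16_sketch` is a hypothetical name used for illustration:

```
#include <emmintrin.h>

/* Assumed reconstruction of the patch's __trunc64 helper: keep only the
   low 64 bits of a 128-bit vector. */
#define __trunc64(x)                                                        \
  (__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)

/* Hypothetical standalone version of the suggested _mm_packs_pi16 body. */
static __inline__ __m64
packs_pi16_sketch(__m64 __m1, __m64 __m2)
{
  /* Concatenate the two 64-bit inputs into one 128-bit vector; clang
     lowers this shuffle to a single punpcklqdq. */
  __v8hi __concat = (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1);
  /* One 128-bit saturating pack; only the low 8 bytes of the result
     are meaningful, and __trunc64 extracts them. */
  return __trunc64(__builtin_ia32_packsswb128(__concat, (__v8hi){}));
}
```

The design point is that shuffling the inputs first needs only one shuffle before the pack, whereas packing first leaves the two result halves in the wrong lanes and requires a `pshufd` afterwards, as the two disassembly listings above show.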