Hello, this time: +, - and * for 128-bit integer vectors. I am using an unsigned type so the compiler knows that we expect wrapping. I don't know why Intel's description of mullo insists that the multiplication is signed; that only matters for the high part of the product (the low half is the same whether the operands are treated as signed or unsigned)...
Next parts (waiting for approval of this one) should be:
- the same thing for 256-bit and 512-bit integer vectors
- & | ^ (integer only)

Maybe (or it can wait until the next release):
- < > == abs min max (integer only)

2014-10-20  Marc Glisse  <marc.gli...@inria.fr>

	* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New typedefs.
	(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
	_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
	_mm_mullo_epi16): Use vector extensions instead of builtins.
	* config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.

--
Marc Glisse
Index: gcc/config/i386/emmintrin.h =================================================================== --- gcc/config/i386/emmintrin.h (revision 216422) +++ gcc/config/i386/emmintrin.h (working copy) @@ -32,23 +32,27 @@ #ifndef __SSE2__ #pragma GCC push_options #pragma GCC target("sse2") #define __DISABLE_SSE2__ #endif /* __SSE2__ */ /* SSE2 */ typedef double __v2df __attribute__ ((__vector_size__ (16))); typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); typedef int __v4si __attribute__ ((__vector_size__ (16))); +typedef unsigned int __v4su __attribute__ ((__vector_size__ (16))); typedef short __v8hi __attribute__ ((__vector_size__ (16))); +typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16))); typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16))); /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); /* Create a selector for use with the SHUFPD instruction. 
*/ #define _MM_SHUFFLE2(fp1,fp0) \ (((fp1) << 1) | (fp0)) @@ -999,39 +1003,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_epi64 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi8 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); + return (__m128i) ((__v16qu)__A + (__v16qu)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); + return (__m128i) ((__v8hu)__A + (__v8hu)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); + return (__m128i) ((__v4su)__A + (__v4su)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_epi64 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B); + return (__m128i) ((__v2du)__A + (__v2du)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epi8 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epi16 (__m128i __A, __m128i __B) @@ -1047,39 +1051,39 @@ _mm_adds_epu8 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_adds_epu16 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); } extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi8 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); + return (__m128i) ((__v16qu)__A - (__v16qu)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); + return (__m128i) ((__v8hu)__A - (__v8hu)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B); + return (__m128i) ((__v4su)__A - (__v4su)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_epi64 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B); + return (__m128i) ((__v2du)__A - (__v2du)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epi8 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_subs_epi16 (__m128i __A, __m128i __B) @@ -1107,21 +1111,21 @@ _mm_madd_epi16 (__m128i __A, __m128i __B extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mulhi_epi16 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); + return (__m128i) ((__v8hu)__A * (__v8hu)__B); } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_su32 (__m64 __A, __m64 __B) { return 
(__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32 (__m128i __A, __m128i __B) Index: gcc/config/i386/smmintrin.h =================================================================== --- gcc/config/i386/smmintrin.h (revision 216422) +++ gcc/config/i386/smmintrin.h (working copy) @@ -318,21 +318,21 @@ extern __inline __m128i __attribute__((_ _mm_max_epu32 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); } /* Packed integer 32-bit multiplication with truncation of upper halves of results. */ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mullo_epi32 (__m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y); + return (__m128i) ((__v4su)__X * (__v4su)__Y); } /* Packed integer 32-bit multiplication of 2 pairs of operands with two 64-bit results. */ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epi32 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); }