[PATCH] D86855: Convert __m64 intrinsics to unconditionally use SSE2 instead of MMX instructions.

James Y Knight via Phabricator via cfe-commits Sun, 30 Aug 2020 15:29:37 -0700

jyknight created this revision.
jyknight added reviewers: craig.topper, spatel, RKSimon.
Herald added subscribers: cfe-commits, danielkiss.
Herald added a project: clang.
jyknight requested review of this revision.


Preliminary patch, posted to go along with discussion on llvm-dev.

3DNow! intrinsics are not converted, as of yet.

Tests have not been updated to match new expected IR output. Currently failing:
 Clang :: CodeGen/attr-target-x86-mmx.c
 Clang :: CodeGen/mmx-builtins.c
 Clang :: CodeGen/mmx-shift-with-immediate.c
 Clang :: Headers/xmmintrin.c


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D86855

Files:
  clang/lib/Headers/emmintrin.h
  clang/lib/Headers/mmintrin.h
  clang/lib/Headers/tmmintrin.h
  clang/lib/Headers/xmmintrin.h
  clang/test/Sema/x86-builtin-palignr.c

Index: clang/test/Sema/x86-builtin-palignr.c
===================================================================
--- clang/test/Sema/x86-builtin-palignr.c
+++ clang/test/Sema/x86-builtin-palignr.c
@@ -4,5 +4,5 @@
 #include <tmmintrin.h>
 
 __m64 test1(__m64 a, __m64 b, int c) {
-   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
+   return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}}
 }
Index: clang/lib/Headers/xmmintrin.h
===================================================================
--- clang/lib/Headers/xmmintrin.h
+++ clang/lib/Headers/xmmintrin.h
@@ -29,7 +29,12 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64)))
+
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __zext128(x) (__m128i)__builtin_shufflevector((__v1di)(x), __extension__ (__v1di){}, 0, 1)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v1di)(x), __extension__ (__v1di){}, 0, -1)
+#define __zeroupper64(x) (__m128i)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0, 2)
 
 /// Adds the 32-bit float values in the low-order bits of the operands.
 ///
@@ -1354,10 +1359,10 @@
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts two low-order float values in a 128-bit vector of
@@ -1370,7 +1375,7 @@
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
@@ -1447,10 +1452,10 @@
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvttps_pi32(__m128 __a)
 {
-  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
 }
 
 /// Converts two low-order float values in a 128-bit vector of [4 x
@@ -1464,7 +1469,7 @@
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
@@ -1559,10 +1564,13 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
-  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+  return (__m128)__builtin_shufflevector(
+      (__v4sf)__a,
+      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
+      4, 5, 2, 3);
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -1582,7 +1590,7 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
@@ -2116,10 +2124,10 @@
 ///    A pointer to an aligned memory location used to store the register value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(__m64 *__p, __m64 __a)
 {
-  __builtin_ia32_movntq(__p, __a);
+  __builtin_nontemporal_store(__a, __p);
 }
 
 /// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2181,7 +2189,7 @@
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
+  (int)(((__v4hi)(a))[n])
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
@@ -2212,7 +2220,7 @@
 /// \returns A 64-bit integer vector containing the copied packed data from the
 ///    operands.
 #define _mm_insert_pi16(a, d, n) \
-  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
+  __extension__ ({ __m64 __m = (a); __m[(n)] = (int)(d); __m; })
 
 /// Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the greater value to the
@@ -2227,10 +2235,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pmaxsw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2246,10 +2254,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pmaxub128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// Compares each of the corresponding packed 16-bit integer values of
@@ -2265,10 +2273,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pminsw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2284,10 +2292,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pminub128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// Takes the most significant bit from each 8-bit element in a 64-bit
@@ -2302,10 +2310,10 @@
 ///    A 64-bit integer vector containing the values with bits to be extracted.
 /// \returns The most significant bit from each 8-bit element in \a __a,
 ///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_movemask_pi8(__m64 __a)
 {
-  return __builtin_ia32_pmovmskb((__v8qi)__a);
+  return __builtin_ia32_pmovmskb128((__v16qi)__anyext128(__a));
 }
 
 /// Multiplies packed 16-bit unsigned integer values and writes the
@@ -2321,10 +2329,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@@ -2359,7 +2367,7 @@
 ///    11: assigned from bits [63:48] of \a a.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
-  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
+  (__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__ (__v4hi){}, (n) & 0x3, ((n) >> 2) & 0x3, ((n) >> 4) & 0x3, ((n) >> 6) & 0x3)
 
 /// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
@@ -2384,10 +2392,22 @@
 ///    A pointer to a 64-bit memory location that will receive the conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
-  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+  // This is complex, because we need to support the case where __p is pointing
+  // within the last 15-8 bytes of a page. In that case, the 128-bit maskmovdqu
+  // may trap, where a 64-bit maskmovq would not. (Even with the mask bits 0, it
+  // still traps on unmapped memory).
+  __m128i __d128  = __anyext128(__d);
+  __m128i __n128  = __zext128(__n);
+  if (((__SIZE_TYPE__)__p & 0xfff) > 4096-15 && ((__SIZE_TYPE__)__p & 0xfff) < 4096-8) {
+    __p -= 8;
+    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
+    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
+  }
+
+  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
 }
 
 /// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -2403,10 +2423,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// Computes the rounded averages of the packed unsigned 16-bit integer
@@ -2422,10 +2442,10 @@
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -2444,10 +2464,10 @@
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a), (__v16qi)__zext128(__b)));
 }
 
 #if defined(__cplusplus)
@@ -2725,7 +2745,7 @@
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -2755,7 +2775,7 @@
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu16_ps(__m64 __a)
 {
   __m64 __b, __c;
@@ -2784,7 +2804,7 @@
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi8_ps(__m64 __a)
 {
   __m64 __b;
@@ -2809,7 +2829,7 @@
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpu8_ps(__m64 __a)
 {
   __m64 __b;
@@ -2836,7 +2856,7 @@
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
   __m128 __c;
@@ -2865,7 +2885,7 @@
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi16(__m128 __a)
 {
   __m64 __b, __c;
@@ -2895,7 +2915,7 @@
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the uppper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtps_pi8(__m128 __a)
 {
   __m64 __b, __c;
@@ -2997,8 +3017,12 @@
 #define _m_ _mm_
 #define _m_ _mm_
 
+#undef __trunc64
+#undef __zext128
+#undef __anyext128
+#undef __zeroupper64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Ugly hack for backwards-compatibility (compatible with gcc) */
 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
Index: clang/lib/Headers/tmmintrin.h
===================================================================
--- clang/lib/Headers/tmmintrin.h
+++ clang/lib/Headers/tmmintrin.h
@@ -14,7 +14,9 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v1di)(x), __extension__ (__v1di){}, 0, -1)
+#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2);
 
 /// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
@@ -28,10 +30,10 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+    return __trunc64(__builtin_ia32_pabsb128((__v16qi)__anyext128(__a)));
 }
 
 /// Computes the absolute value of each of the packed 8-bit signed
@@ -64,10 +66,10 @@
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+    return __trunc64(__builtin_ia32_pabsw128((__v8hi)__anyext128(__a)));
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -100,10 +102,10 @@
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
-    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+    return __trunc64(__builtin_ia32_pabsd128((__v4si)__anyext128(__a)));
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -168,7 +170,10 @@
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
+  (__m64)__builtin_shufflevector(                                       \
+      __builtin_ia32_psrldqi128_byteshift(                              \
+          __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
+          (n)), __extension__ (__v2di){}, 0)
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
@@ -233,10 +238,10 @@
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+    return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -256,10 +261,10 @@
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+    return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a), (__v4si)__anyext128(__b)));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -306,10 +311,10 @@
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+    return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -375,10 +380,10 @@
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+    return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -398,10 +403,10 @@
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+    return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a), (__v4si)__anyext128(__b)));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -448,10 +453,10 @@
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+    return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -512,10 +517,10 @@
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+    return __extract2_32(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -552,10 +557,10 @@
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -603,10 +608,10 @@
 ///    destination. \n
 ///    Bits [3:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_pshufb128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
@@ -707,10 +712,10 @@
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), (__v16qi)__anyext128(__b)));
 }
 
 /// For each 16-bit integer in the first source operand, perform one of
@@ -733,10 +738,10 @@
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), (__v8hi)__anyext128(__b)));
 }
 
 /// For each 32-bit integer in the first source operand, perform one of
@@ -759,13 +764,15 @@
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
-    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), (__v4si)__anyext128(__b)));
 }
 
+#undef __extract2_32
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #endif /* __TMMINTRIN_H */
Index: clang/lib/Headers/mmintrin.h
===================================================================
--- clang/lib/Headers/mmintrin.h
+++ clang/lib/Headers/mmintrin.h
@@ -17,8 +17,19 @@
 typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
+/* SSE/SSE2 types */
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64)))
+
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v1di)(x), __extension__ (__v1di){}, 0, -1)
+#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2);
 
 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@@ -27,11 +38,15 @@
 ///
 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
 ///
+#ifdef __MMX__ // FIXME: make the builtin just be a no-op when mmx is disabled, remove ifdef.
 static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
 _mm_empty(void)
 {
     __builtin_ia32_emms();
 }
+#else
+static __inline__ void  __attribute__((__always_inline__, __nodebug__)) _mm_empty(void) {}
+#endif
 
 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
@@ -44,10 +59,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi32_si64(int __i)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+    return __extension__ (__m64)(__v2si){__i, 0};
 }
 
 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -61,10 +76,10 @@
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_si32(__m64 __m)
 {
-    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+    return ((__v2si)__m)[0];
 }
 
 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -77,7 +92,7 @@
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
@@ -93,7 +108,7 @@
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
@@ -123,10 +138,10 @@
 ///    [4 x i8] values are written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+    return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Converts 32-bit signed integers from both 64-bit integer vector
@@ -153,10 +168,10 @@
 ///    [2 x i16] values are written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+    return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1), (__v4si)__anyext128(__m2)));
 }
 
 /// Converts 16-bit signed integers from both 64-bit integer vector
@@ -183,10 +198,10 @@
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+    return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -210,10 +225,10 @@
 ///    Bits [63:56] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 12, 5, 13, 6, 14, 7, 15);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -233,10 +248,10 @@
 ///    Bits [63:48] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 6, 3, 7);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -254,10 +269,10 @@
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -281,10 +296,10 @@
 ///    Bits [31:24] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8, 1, 9, 2, 10, 3, 11);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -304,10 +319,10 @@
 ///    Bits [31:16] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4, 1, 5);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -325,10 +340,10 @@
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
 }
 
 /// Adds each 8-bit integer element of the first 64-bit integer vector
@@ -346,10 +361,10 @@
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qi)__m1) + ((__v8qi)__m2));
 }
 
 /// Adds each 16-bit integer element of the first 64-bit integer vector
@@ -367,10 +382,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) + ((__v4hi)__m2));
 }
 
 /// Adds each 32-bit integer element of the first 64-bit integer vector
@@ -388,10 +403,10 @@
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2si)__m1) + ((__v2si)__m2));
 }
 
 /// Adds each 8-bit signed integer element of the first 64-bit integer
@@ -410,10 +425,10 @@
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+    return __trunc64(__builtin_ia32_paddsb128((__v16qi)__anyext128(__m1), (__v16qi)__anyext128(__m2)));
 }
 
 /// Adds each 16-bit signed integer element of the first 64-bit integer
@@ -433,10 +448,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_paddsw128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Adds each 8-bit unsigned integer element of the first 64-bit integer
@@ -455,10 +470,10 @@
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+    return __trunc64(__builtin_ia32_paddusb128((__v16qi)__anyext128(__m1), (__v16qi)__anyext128(__m2)));
 }
 
 /// Adds each 16-bit unsigned integer element of the first 64-bit integer
@@ -477,10 +492,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_paddusw128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Subtracts each 8-bit integer element of the second 64-bit integer
@@ -498,10 +513,10 @@
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qi)__m1) - ((__v8qi)__m2));
 }
 
 /// Subtracts each 16-bit integer element of the second 64-bit integer
@@ -519,10 +534,10 @@
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) - ((__v4hi)__m2));
 }
 
 /// Subtracts each 32-bit integer element of the second 64-bit integer
@@ -540,10 +555,10 @@
 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2si)__m1) - ((__v2si)__m2));
 }
 
 /// Subtracts each 8-bit signed integer element of the second 64-bit
@@ -563,10 +578,10 @@
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+    return __trunc64(__builtin_ia32_psubsb128((__v16qi)__anyext128(__m1), (__v16qi)__anyext128(__m2)));
 }
 
 /// Subtracts each 16-bit signed integer element of the second 64-bit
@@ -586,10 +601,10 @@
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_psubsw128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
@@ -610,10 +625,10 @@
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+    return __trunc64(__builtin_ia32_psubusb128((__v16qi)__anyext128(__m1), (__v16qi)__anyext128(__m2)));
 }
 
 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
@@ -634,10 +649,10 @@
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_psubusw128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -661,10 +676,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
 ///    products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -682,10 +697,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -703,10 +718,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) * ((__v4hi)__m2));
 }
 
 /// Left-shifts each 16-bit signed integer element of the first
@@ -726,10 +741,10 @@
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), (__v8hi)__anyext128(__count)));
 }
 
 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
@@ -748,10 +763,10 @@
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), __count));
 }
 
 /// Left-shifts each 32-bit signed integer element of the first
@@ -771,10 +786,10 @@
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), (__v4si)__anyext128(__count)));
 }
 
 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
@@ -793,10 +808,10 @@
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), __count));
 }
 
 /// Left-shifts the first 64-bit integer parameter by the number of bits
@@ -813,10 +828,10 @@
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), __anyext128(__count)));
 }
 
 /// Left-shifts the first parameter, which is a 64-bit integer, by the
@@ -833,10 +848,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_slli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -857,10 +872,10 @@
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -880,10 +895,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -904,10 +919,10 @@
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -927,10 +942,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srai_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), __count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -950,10 +965,10 @@
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -972,10 +987,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi16(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+    return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), __count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -995,10 +1010,10 @@
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1017,10 +1032,10 @@
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_pi32(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+    return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), __count));
 }
 
 /// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1037,10 +1052,10 @@
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), __anyext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1058,10 +1073,10 @@
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srli_si64(__m64 __m, int __count)
 {
-    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+    return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), __count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
@@ -1076,10 +1091,10 @@
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1di)__m1) & ((__v1di)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1097,10 +1112,10 @@
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(~((__v1di)__m1) & ((__v1di)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1115,10 +1130,10 @@
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1di)__m1) | ((__v1di)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1133,10 +1148,10 @@
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
-    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+    return (__m64)(((__v1di)__m1) ^ ((__v1di)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1155,10 +1170,10 @@
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1177,10 +1192,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1199,10 +1214,10 @@
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1221,10 +1236,10 @@
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+    return (__m64)((__v8qi)__m1 > (__v8qi)__m2);
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1243,10 +1258,10 @@
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+    return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1265,10 +1280,10 @@
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
-    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+    return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
@@ -1278,7 +1293,7 @@
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setzero_si64(void)
 {
     return __extension__ (__m64){ 0LL };
@@ -1299,10 +1314,10 @@
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi32(int __i1, int __i0)
 {
-    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+    return __extension__ (__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1322,10 +1337,10 @@
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
-    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+    return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1353,12 +1368,11 @@
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
 {
-    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
-                                               __b4, __b5, __b6, __b7);
+    return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1374,10 +1388,10 @@
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi32(int __i)
 {
-    return _mm_set_pi32(__i, __i);
+    return __extension__ (__m64)(__v2si){__i, __i};
 }
 
 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
@@ -1393,10 +1407,10 @@
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi16(short __w)
 {
-    return _mm_set_pi16(__w, __w, __w, __w);
+    return __extension__ (__m64)(__v4hi){__w, __w, __w, __w};
 }
 
 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
@@ -1411,10 +1425,10 @@
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set1_pi8(char __b)
 {
-    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+    return __extension__ (__m64)(__v8qi){__b, __b, __b, __b, __b, __b, __b, __b};
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1432,10 +1446,10 @@
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi32(int __i0, int __i1)
 {
-    return _mm_set_pi32(__i1, __i0);
+    return __extension__ (__m64)(__v2si){__i1, __i0};
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1455,10 +1469,10 @@
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
-    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+    return __extension__ (__m64)(__v4hi){__w3, __w2, __w1, __w0};
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1486,14 +1500,17 @@
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
 {
-    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+    return __extension__ (__m64)(__v8qi){__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0};
 }
 
-#undef __DEFAULT_FN_ATTRS
+#undef __extract2_32
+#undef __anyext128
+#undef __trunc64
+#undef __DEFAULT_FN_ATTRS_SSE2
 
 /* Aliases for compatibility. */
 #define _m_empty _mm_empty
Index: clang/lib/Headers/emmintrin.h
===================================================================
--- clang/lib/Headers/emmintrin.h
+++ clang/lib/Headers/emmintrin.h
@@ -35,7 +35,9 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
+
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v1di)(x), __extension__ (__v1di){}, 0, -1)
 
 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@@ -1504,10 +1506,10 @@
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtpd_pi32(__m128d __a)
 {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
 }
 
 /// Converts the two double-precision floating-point elements of a
@@ -1524,10 +1526,10 @@
 /// \param __a
 ///    A 128-bit vector of [2 x double].
 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttpd_pi32(__m128d __a)
 {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
 }
 
 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1541,10 +1543,10 @@
 /// \param __a
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtpi32_pd(__m64 __a)
 {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
 }
 
 /// Returns the low-order element of a 128-bit vector of [2 x double] as
@@ -2175,10 +2177,10 @@
 /// \param __b
 ///    A 64-bit integer.
 /// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_si64(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
 }
 
 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@@ -2507,10 +2509,10 @@
 /// \param __b
 ///    A 64-bit integer containing one of the source operands.
 /// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), (__v4si)__anyext128(__b)));
 }
 
 /// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2621,10 +2623,10 @@
 ///    A 64-bit integer vector containing the subtrahend.
 /// \returns A 64-bit integer vector containing the difference of the values in
 ///    the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
 }
 
 /// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4965,8 +4967,10 @@
 #if defined(__cplusplus)
 } // extern "C"
 #endif
+
+#undef __anyext128
+#undef __trunc64
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D86855: Convert __m64 intrinsics to unconditionally use SSE2 instead of MMX instructions.

Reply via email to