[clang] [llvm] Clang: convert `__m64` intrinsics to unconditionally use SSE2 instead of MMX. (PR #96540)

via cfe-commits Mon, 24 Jun 2024 12:30:20 -0700

github-actions[bot] wrote:

<!--LLVM CODE FORMAT COMMENT: {clang-format}-->



:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:

<details>
<summary>
You can test this locally with the following command:
</summary>

``````````bash
git-clang-format --diff af6acd7442646fde56de919964bd52d7bb7922b2 
a17a0df1c3551693283dd806b901d3020f33e67f --extensions 'c,h,cpp' -- 
mmx-tests/test.c clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/emmintrin.h 
clang/lib/Headers/mmintrin.h clang/lib/Headers/tmmintrin.h 
clang/lib/Headers/xmmintrin.h clang/test/CodeGen/X86/mmx-builtins.c 
clang/test/CodeGen/X86/mmx-inline-asm.c 
clang/test/CodeGen/X86/mmx-shift-with-immediate.c 
clang/test/CodeGen/attr-target-x86-mmx.c clang/test/Headers/xmmintrin.c 
clang/test/Sema/x86-builtin-palignr.c
``````````

</details>

<details>
<summary>
View the diff from clang-format here.
</summary>

``````````diff
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 02160285d5..a3176570a4 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -49,10 +49,15 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), 
__aligned__(16)));
 #endif
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, 
__target__("sse2,no-evex512"), __min_vector_width__(128)))
-
-#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ 
(__v2di){}, 0)
-#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), 
__extension__ (__v2si){}, 0, 1, -1, -1)
+#define __DEFAULT_FN_ATTRS                                                     
\
+  __attribute__((__always_inline__, __nodebug__,                               
\
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
+
+#define __trunc64(x)                                                           
\
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         
\
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   
\
+                                    1, -1, -1)
 
 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 71d7487673..d4ccb3c92f 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -22,8 +22,8 @@ typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));
 
 /* Unsigned types */
-typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
-typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+typedef unsigned long long __v1du __attribute__((__vector_size__(8)));
+typedef unsigned int __v2su __attribute__((__vector_size__(8)));
 typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
 typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
 
@@ -33,17 +33,23 @@ typedef signed char __v8qs 
__attribute__((__vector_size__(8)));
 
 /* SSE/SSE2 types */
 typedef long long __m128i __attribute__((__vector_size__(16), 
__aligned__(16)));
-typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
 typedef int __v4si __attribute__((__vector_size__(16)));
 typedef short __v8hi __attribute__((__vector_size__(16)));
 typedef char __v16qi __attribute__((__vector_size__(16)));
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, 
__target__("sse2,no-evex512"), __min_vector_width__(64)))
-
-#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ 
(__v2di){}, 0)
-#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), 
__extension__ (__v2si){}, 0, 1, -1, -1)
-#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), 
__extension__ (__v4si){}, 0, 2);
+#define __DEFAULT_FN_ATTRS_SSE2                                                
\
+  __attribute__((__always_inline__, __nodebug__,                               
\
+                 __target__("sse2,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x)                                                           
\
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         
\
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   
\
+                                    1, -1, -1)
+#define __extract2_32(a)                                                       
\
+  (__m64) __builtin_shufflevector((__v4si)(a), __extension__(__v4si){}, 0, 2);
 
 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@@ -69,10 +75,8 @@ _mm_empty(void) {
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of 
the
 ///    parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtsi32_si64(int __i)
-{
-    return __extension__ (__m64)(__v2si){__i, 0};
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) {
+  return __extension__(__m64)(__v2si){__i, 0};
 }
 
 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -86,10 +90,8 @@ _mm_cvtsi32_si64(int __i)
 ///    A 64-bit integer vector.
 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
 ///    parameter.
-static __inline__ int __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtsi64_si32(__m64 __m)
-{
-    return ((__v2si)__m)[0];
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) {
+  return ((__v2si)__m)[0];
 }
 
 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -102,10 +104,8 @@ _mm_cvtsi64_si32(__m64 __m)
 ///    A 64-bit signed integer.
 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtsi64_m64(long long __i)
-{
-    return (__m64)__i;
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) 
{
+  return (__m64)__i;
 }
 
 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
@@ -118,10 +118,8 @@ _mm_cvtsi64_m64(long long __i)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
 ///    parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtm64_si64(__m64 __m)
-{
-    return (long long)__m;
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) 
{
+  return (long long)__m;
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -143,11 +141,10 @@ _mm_cvtm64_si64(__m64 __m)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
-    return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1),
-                                                    
(__v8hi)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1),
+                                                  (__v8hi)__anyext128(__m2)));
 }
 
 /// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -169,11 +166,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
-    return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1),
-                                                    
(__v4si)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1,
+                                                               __m64 __m2) {
+  return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1),
+                                                  (__v4si)__anyext128(__m2)));
 }
 
 /// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -195,11 +191,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
 ///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
-    return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1),
-                                                    
(__v8hi)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1,
+                                                               __m64 __m2) {
+  return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1),
+                                                  (__v8hi)__anyext128(__m2)));
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -223,11 +218,10 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
 ///    Bits [63:56] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
-                                          4, 12, 5, 13, 6, 14, 7, 15);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1,
+                                                                 __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 12, 5,
+                                        13, 6, 14, 7, 15);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -247,11 +241,9 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [63:48] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
-                                          2, 6, 3, 7);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1,
+                                                                  __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 6, 3, 
7);
 }
 
 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -269,10 +261,9 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1,
+                                                                  __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -296,11 +287,10 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 ///    Bits [31:24] are written to bits [63:56] of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
-                                          0, 8, 1, 9, 2, 10, 3, 11);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1,
+                                                                 __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8, 1, 9,
+                                        2, 10, 3, 11);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -320,11 +310,9 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 ///    Bits [31:16] are written to bits [63:48] of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
-                                          0, 4, 1, 5);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1,
+                                                                  __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4, 1, 
5);
 }
 
 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -342,10 +330,9 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 ///    the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1,
+                                                                  __m64 __m2) {
+  return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
 }
 
 /// Adds each 8-bit integer element of the first 64-bit integer vector
@@ -363,10 +350,9 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_add_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1,
+                                                            __m64 __m2) {
+  return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
 }
 
 /// Adds each 16-bit integer element of the first 64-bit integer vector
@@ -384,10 +370,9 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_add_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
 }
 
 /// Adds each 32-bit integer element of the first 64-bit integer vector
@@ -405,10 +390,9 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_add_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
 }
 
 /// Adds, with saturation, each 8-bit signed integer element of the first
@@ -429,10 +413,9 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_adds_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Adds, with saturation, each 16-bit signed integer element of the first
@@ -453,10 +436,9 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
 ///    of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_adds_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Adds, with saturation, each 8-bit unsigned integer element of the first
@@ -476,10 +458,9 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_adds_pu8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Adds, with saturation, each 16-bit unsigned integer element of the first
@@ -499,10 +480,9 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_adds_pu16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Subtracts each 8-bit integer element of the second 64-bit integer
@@ -520,10 +500,9 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sub_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1,
+                                                            __m64 __m2) {
+  return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
 }
 
 /// Subtracts each 16-bit integer element of the second 64-bit integer
@@ -541,10 +520,9 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sub_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
 }
 
 /// Subtracts each 32-bit integer element of the second 64-bit integer
@@ -562,10 +540,9 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
 ///    both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sub_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
 }
 
 /// Subtracts, with saturation, each 8-bit signed integer element of the second
@@ -586,10 +563,9 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_subs_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
 }
 
 /// Subtracts, with saturation, each 16-bit signed integer element of the
@@ -610,10 +586,9 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_subs_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
 }
 
 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
@@ -634,10 +609,9 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_subs_pu8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
 }
 
 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
@@ -658,10 +632,9 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
 ///    differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_subs_pu16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -685,11 +658,10 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
 ///    products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
-                                               (__v8hi)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1,
+                                                              __m64 __m2) {
+  return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
+                                             (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -707,11 +679,10 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
-{
-    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
-                                              (__v8hi)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
+                                            (__v8hi)__anyext128(__m2)));
 }
 
 /// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -729,10 +700,9 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
 ///    of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_mullo_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
 }
 
 /// Left-shifts each 16-bit signed integer element of the first
@@ -752,11 +722,10 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to 
all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sll_pi16(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
-                                             (__v8hi)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
+                                           (__v8hi)__anyext128(__count)));
 }
 
 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
@@ -775,11 +744,9 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 ///    values. If \a __count is greater or equal to 16, the result is set to 
all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_slli_pi16(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), 
__count));
 }
 
 /// Left-shifts each 32-bit signed integer element of the first
@@ -799,11 +766,10 @@ _mm_slli_pi16(__m64 __m, int __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to 
all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sll_pi32(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
-                                             (__v4si)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
+                                           (__v4si)__anyext128(__count)));
 }
 
 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
@@ -822,11 +788,9 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 ///    values. If \a __count is greater or equal to 32, the result is set to 
all
 ///    0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_slli_pi32(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), 
__count));
 }
 
 /// Left-shifts the first 64-bit integer parameter by the number of bits
@@ -843,11 +807,10 @@ _mm_slli_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sll_si64(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
-                                             __anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(
+      __builtin_ia32_psllq128((__v2di)__anyext128(__m), __anyext128(__count)));
 }
 
 /// Left-shifts the first parameter, which is a 64-bit integer, by the
@@ -864,11 +827,9 @@ _mm_sll_si64(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the left-shifted value. If
 ///     \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_slli_si64(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), 
__count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -889,11 +850,10 @@ _mm_slli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sra_pi16(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
-                                             (__v8hi)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
+                                           (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -913,11 +873,9 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srai_pi16(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), 
__count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -938,11 +896,10 @@ _mm_srai_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sra_pi32(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
-                                             (__v4si)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
+                                           (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -962,11 +919,9 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srai_pi32(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), 
__count));
 }
 
 /// Right-shifts each 16-bit integer element of the first parameter,
@@ -986,11 +941,10 @@ _mm_srai_pi32(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srl_pi16(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
-                                             (__v8hi)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
+                                           (__v8hi)__anyext128(__count)));
 }
 
 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -1009,11 +963,9 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srli_pi16(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), 
__count));
 }
 
 /// Right-shifts each 32-bit integer element of the first parameter,
@@ -1033,11 +985,10 @@ _mm_srli_pi16(__m64 __m, int __count)
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srl_pi32(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
-                                             (__v4si)__anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
+                                           (__v4si)__anyext128(__count)));
 }
 
 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1056,11 +1007,9 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srli_pi32(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), 
__count));
 }
 
 /// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1077,11 +1026,10 @@ _mm_srli_pi32(__m64 __m, int __count)
 /// \param __count
 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srl_si64(__m64 __m, __m64 __count)
-{
-    return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
-                                             __anyext128(__count)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m,
+                                                             __m64 __count) {
+  return __trunc64(
+      __builtin_ia32_psrlq128((__v2di)__anyext128(__m), __anyext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1099,11 +1047,9 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_srli_si64(__m64 __m, int __count)
-{
-    return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
-                                              __count));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m,
+                                                              int __count) {
+  return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), 
__count));
 }
 
 /// Performs a bitwise AND of two 64-bit integer vectors.
@@ -1118,10 +1064,9 @@ _mm_srli_si64(__m64 __m, int __count)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_and_si64(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1139,10 +1084,9 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_andnot_si64(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1,
+                                                                __m64 __m2) {
+  return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1157,10 +1101,9 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_or_si64(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1,
+                                                            __m64 __m2) {
+  return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1175,10 +1118,9 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of 
both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_xor_si64(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1197,10 +1139,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1219,10 +1160,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1241,10 +1181,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1263,12 +1202,11 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1,
+                                                              __m64 __m2) {
   /* This function always performs a signed comparison, but __v8qi is a char
      which may be signed or unsigned, so use __v8qs. */
-    return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
+  return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1287,10 +1225,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
-{
-    return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1309,10 +1246,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
-{
-    return (__m64)((__v2si)__m1 > (__v2si)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
@@ -1322,10 +1258,8 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to 
zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setzero_si64(void)
-{
-    return __extension__ (__m64){ 0LL };
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) {
+  return __extension__(__m64){0LL};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1343,10 +1277,9 @@ _mm_setzero_si64(void)
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set_pi32(int __i1, int __i0)
-{
-    return __extension__ (__m64)(__v2si){__i0, __i1};
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1,
+                                                             int __i0) {
+  return __extension__(__m64)(__v2si){__i0, __i1};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1366,10 +1299,11 @@ _mm_set_pi32(int __i1, int __i0)
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
-{
-    return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3,
+                                                             short __s2,
+                                                             short __s1,
+                                                             short __s0) {
+  return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
 
 /// Constructs a 64-bit integer vector initialized with the specified
@@ -1399,10 +1333,9 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short 
__s0)
 /// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
-            char __b1, char __b0)
-{
-    return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
-                                         __b4, __b5, __b6, __b7};
+            char __b1, char __b0) {
+  return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
+                                      __b4, __b5, __b6, __b7};
 }
 
 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1418,10 +1351,8 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, 
char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi32(int __i)
-{
-    return _mm_set_pi32(__i, __i);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) {
+  return _mm_set_pi32(__i, __i);
 }
 
 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
@@ -1437,10 +1368,8 @@ _mm_set1_pi32(int __i)
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi16(short __w)
-{
-    return _mm_set_pi16(__w, __w, __w, __w);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) {
+  return _mm_set_pi16(__w, __w, __w, __w);
 }
 
 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
@@ -1455,10 +1384,8 @@ _mm_set1_pi16(short __w)
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_set1_pi8(char __b)
-{
-    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) {
+  return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1476,10 +1403,9 @@ _mm_set1_pi8(char __b)
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setr_pi32(int __i0, int __i1)
-{
-    return _mm_set_pi32(__i1, __i0);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0,
+                                                              int __i1) {
+  return _mm_set_pi32(__i1, __i0);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1499,10 +1425,11 @@ _mm_setr_pi32(int __i0, int __i1)
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
-{
-    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0,
+                                                              short __w1,
+                                                              short __w2,
+                                                              short __w3) {
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
 /// Constructs a 64-bit integer vector, initialized in reverse order with
@@ -1532,9 +1459,8 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short 
__w3)
 /// \returns An initialized 64-bit integer vector.
 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
-             char __b6, char __b7)
-{
-    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+             char __b6, char __b7) {
+  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
 #undef __extract2_32
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index c448cd3f00..bac1ab2208 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -17,11 +17,17 @@
 #include <pmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, 
__target__("ssse3,no-evex512"), __min_vector_width__(64)))
-
-#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ 
(__v2di){}, 0)
-#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), 
__extension__ (__v2si){}, 0, 1, -1, -1)
-#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), 
__extension__ (__v4si){}, 0, 2);
+#define __DEFAULT_FN_ATTRS                                                     
\
+  __attribute__((__always_inline__, __nodebug__,                               
\
+                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x)                                                           
\
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __anyext128(x)                                                         
\
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   
\
+                                    1, -1, -1)
+#define __extract2_32(a)                                                       
\
+  (__m64) __builtin_shufflevector((__v4si)(a), __extension__(__v4si){}, 0, 2);
 
 /// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
@@ -35,9 +41,7 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_abs_pi8(__m64 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) {
   return (__m64)__builtin_elementwise_abs((__v8qs)__a);
 }
 
@@ -71,10 +75,8 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_abs_pi16(__m64 __a)
-{
-    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) {
+  return (__m64)__builtin_elementwise_abs((__v4hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -107,10 +109,8 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_abs_pi32(__m64 __a)
-{
-    return (__m64)__builtin_elementwise_abs((__v2si)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) {
+  return (__m64)__builtin_elementwise_abs((__v2si)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -174,11 +174,11 @@ _mm_abs_epi32(__m128i __a)
 ///    An immediate operand specifying how many bytes to right-shift the 
result.
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
-#define _mm_alignr_pi8(a, b, n) \
-  ((__m64)__builtin_shufflevector(                                       \
-       __builtin_ia32_psrldqi128_byteshift(                              \
-           __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
-           (n)), __extension__ (__v2di){}, 0))
+#define _mm_alignr_pi8(a, b, n)                                                
\
+  ((__m64)__builtin_shufflevector(                                             
\
+      __builtin_ia32_psrldqi128_byteshift(                                     
\
+          __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), (n)),       
\
+      __extension__(__v2di){}, 0))
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
@@ -243,11 +243,9 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a),
-                                                  (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) 
{
+  return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a),
+                                                (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -267,11 +265,9 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a),
-                                                  (__v4si)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) 
{
+  return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a),
+                                                (__v4si)__anyext128(__b)));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -320,11 +316,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a),
-                                                   (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a,
+                                                          __m64 __b) {
+  return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a),
+                                                 (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -390,11 +385,9 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a),
-                                                  (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) 
{
+  return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a),
+                                                (__v8hi)__anyext128(__b)));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -414,11 +407,9 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a),
-                                                  (__v4si)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) 
{
+  return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a),
+                                                (__v4si)__anyext128(__b)));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -467,11 +458,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
-    return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a),
-                                                   (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a,
+                                                          __m64 __b) {
+  return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a),
+                                                 (__v8hi)__anyext128(__b)));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -532,11 +522,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
-                                                 (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a,
+                                                            __m64 __b) {
+  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                               (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -573,11 +562,10 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mulhrs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
-                                                (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a,
+                                                           __m64 __b) {
+  return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
+                                              (__v8hi)__anyext128(__b)));
 }
 
 /// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -625,13 +613,12 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    destination. \n
 ///    Bits [2:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_shuffle_pi8(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pshufb128(
-        (__v16qi)__builtin_shufflevector(
-            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
-        (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a,
+                                                           __m64 __b) {
+  return __trunc64(__builtin_ia32_pshufb128(
+      (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
+                                       0, 1, 0, 1),
+      (__v16qi)__anyext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
@@ -732,11 +719,9 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sign_pi8(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
-                                              (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
+                                            (__v16qi)__anyext128(__b)));
 }
 
 /// For each 16-bit integer in the first source operand, perform one of
@@ -759,11 +744,9 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sign_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
-                                              (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) 
{
+  return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
+                                            (__v8hi)__anyext128(__b)));
 }
 
 /// For each 32-bit integer in the first source operand, perform one of
@@ -786,11 +769,9 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sign_pi32(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
-                                              (__v4si)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) 
{
+  return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
+                                            (__v4si)__anyext128(__b)));
 }
 
 #undef __extract2_32
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index b6b413d15e..cbb9a96668 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -32,13 +32,24 @@ typedef unsigned int __v4su 
__attribute__((__vector_size__(16)));
 #endif
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, 
__target__("sse,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, 
__target__("sse2,no-evex512"), __min_vector_width__(64)))
-
-#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ 
(__v2di){}, 0)
-#define __zext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), 
__extension__ (__v2si){}, 0, 1, 2, 3)
-#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), 
__extension__ (__v2si){}, 0, 1, -1, -1)
-#define __zeroupper64(x) (__m128i)__builtin_shufflevector((__v4si)(x), 
__extension__ (__v4si){}, 0, 1, 4, 5)
+#define __DEFAULT_FN_ATTRS                                                     
\
+  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), 
\
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_SSE2                                                
\
+  __attribute__((__always_inline__, __nodebug__,                               
\
+                 __target__("sse2,no-evex512"), __min_vector_width__(64)))
+
+#define __trunc64(x)                                                           
\
+  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x)                                                           
\
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   
\
+                                    1, 2, 3)
+#define __anyext128(x)                                                         
\
+  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   
\
+                                    1, -1, -1)
+#define __zeroupper64(x)                                                       
\
+  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   
\
+                                    1, 4, 5)
 
 /// Adds the 32-bit float values in the low-order bits of the operands.
 ///
@@ -1449,9 +1460,7 @@ _mm_cvtss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtps_pi32(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a) {
   return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
 }
 
@@ -1469,9 +1478,7 @@ _mm_cvtps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvt_ps2pi(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a) {
   return _mm_cvtps_pi32(__a);
 }
 
@@ -1559,9 +1566,7 @@ _mm_cvttss_si64(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvttps_pi32(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a) {
   return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
 }
 
@@ -1580,9 +1585,7 @@ _mm_cvttps_pi32(__m128 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float].
 /// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtt_ps2pi(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a) {
   return _mm_cvttps_pi32(__a);
 }
 
@@ -1675,13 +1678,11 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpi32_ps(__m128 __a, __m64 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a,
+                                                                __m64 __b) {
   return (__m128)__builtin_shufflevector(
-      (__v4sf)__a,
-      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
-      4, 5, 2, 3);
+      (__v4sf)__a, __builtin_convertvector((__v4si)__zext128(__b), __v4sf), 4,
+      5, 2, 3);
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -1701,9 +1702,8 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvt_pi2ps(__m128 __a, __m64 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a,
+                                                               __m64 __b) {
   return _mm_cvtpi32_ps(__a, __b);
 }
 
@@ -2235,9 +2235,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    A pointer to an aligned memory location used to store the register 
value.
 /// \param __a
 ///    A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_pi(void *__p, __m64 __a)
-{
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a) {
   __builtin_nontemporal_store(__a, (__m64 *)__p);
 }
 
@@ -2299,7 +2297,7 @@ void _mm_sfence(void);
 ///    2: Bits [47:32] are copied to the destination. \n
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
-#define _mm_extract_pi16(a, n) \
+#define _mm_extract_pi16(a, n)                                                 
\
   ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
@@ -2346,9 +2344,8 @@ void _mm_sfence(void);
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_max_pi16(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a,
+                                                             __m64 __b) {
   return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
 }
 
@@ -2365,9 +2362,8 @@ _mm_max_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_max_pu8(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a,
+                                                            __m64 __b) {
   return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
 }
 
@@ -2384,9 +2380,8 @@ _mm_max_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_min_pi16(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a,
+                                                             __m64 __b) {
   return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
 }
 
@@ -2403,9 +2398,8 @@ _mm_min_pi16(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_min_pu8(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a,
+                                                            __m64 __b) {
   return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
 }
 
@@ -2421,9 +2415,7 @@ _mm_min_pu8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing the values with bits to be extracted.
 /// \returns The most significant bit from each 8-bit element in \a __a,
 ///    written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_SSE2
-_mm_movemask_pi8(__m64 __a)
-{
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a) {
   return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
 }
 
@@ -2440,9 +2432,8 @@ _mm_movemask_pi8(__m64 __a)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_mulhi_pu16(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a,
+                                                               __m64 __b) {
   return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                              (__v8hi)__anyext128(__b)));
 }
@@ -2481,9 +2472,9 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
 ///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) \
-  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__ 
(__v4hi){}, \
-                                  (n) & 0x3, ((n) >> 2) & 0x3, \
+#define _mm_shuffle_pi16(a, n)                                                 
\
+  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, 
\
+                                  (n) & 0x3, ((n) >> 2) & 0x3,                 
\
                                   ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
 
 /// Conditionally copies the values from each 8-bit element in the first
@@ -2509,17 +2500,17 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    A pointer to a 64-bit memory location that will receive the 
conditionally
 ///    copied integer values. The address of the memory location does not have
 ///    to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_SSE2
-_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d,
+                                                                 __m64 __n,
+                                                                 char *__p) {
   // This is complex, because we need to support the case where __p is pointing
   // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
   // write might cause a trap where a 64-bit maskmovq would not. (Memory
   // locations not selected by the mask bits might still cause traps.)
-  __m128i __d128  = __anyext128(__d);
-  __m128i __n128  = __zext128(__n);
-  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
-      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
+  __m128i __d128 = __anyext128(__d);
+  __m128i __n128 = __zext128(__n);
+  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096 - 15 &&
+      ((__SIZE_TYPE__)__p & 0xfff) <= 4096 - 8) {
     // If there's a risk of spurious trap due to a 128-bit write, back up the
     // pointer by 8 bytes and shift values in registers to match.
     __p -= 8;
@@ -2543,9 +2534,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_avg_pu8(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a,
+                                                            __m64 __b) {
   return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
                                            (__v16qi)__anyext128(__b)));
 }
@@ -2563,9 +2553,8 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_avg_pu16(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a,
+                                                             __m64 __b) {
   return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
                                            (__v8hi)__anyext128(__b)));
 }
@@ -2586,9 +2575,8 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of 
the
 ///    sets of absolute differences between both operands. The upper bits are
 ///    cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_sad_pu8(__m64 __a, __m64 __b)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a,
+                                                            __m64 __b) {
   return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
                                             (__v16qi)__zext128(__b)));
 }
@@ -2871,9 +2859,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and 
converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpi16_ps(__m64 __a)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a) {
   return __builtin_convertvector((__v4hi)__a, __v4sf);
 }
 
@@ -2889,9 +2875,7 @@ _mm_cvtpi16_ps(__m64 __a)
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and 
converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpu16_ps(__m64 __a)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a) {
   return __builtin_convertvector((__v4hu)__a, __v4sf);
 }
 
@@ -2907,12 +2891,10 @@ _mm_cvtpu16_ps(__m64 __a)
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and 
converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpi8_ps(__m64 __a)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a) {
   return __builtin_convertvector(
-      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
-                              0, 1, 2, 3), __v4sf);
+      __builtin_shufflevector((__v8qs)__a, __extension__(__v8qs){}, 0, 1, 2, 
3),
+      __v4sf);
 }
 
 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
@@ -2928,12 +2910,10 @@ _mm_cvtpi8_ps(__m64 __a)
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and 
converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpu8_ps(__m64 __a)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a) {
   return __builtin_convertvector(
-      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
-                              0, 1, 2, 3), __v4sf);
+      __builtin_shufflevector((__v8qu)__a, __extension__(__v8qu){}, 0, 1, 2, 
3),
+      __v4sf);
 }
 
 /// Converts the two 32-bit signed integer values from each 64-bit vector
@@ -2952,12 +2932,10 @@ _mm_cvtpu8_ps(__m64 __a)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a,
+                                                                  __m64 __b) {
   return __builtin_convertvector(
-      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
-                              0, 1, 2, 3), __v4sf);
+      __builtin_shufflevector((__v2si)__a, (__v2si)__b, 0, 1, 2, 3), __v4sf);
 }
 
 /// Converts each single-precision floating-point element of a 128-bit
@@ -2977,9 +2955,7 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 ///    A 128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtps_pi16(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a) {
   return __trunc64(__builtin_ia32_packssdw128(
       (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
 }
@@ -3002,9 +2978,7 @@ _mm_cvtps_pi16(__m128 __a)
 ///    128-bit floating-point vector of [4 x float].
 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
 ///    converted values and the uppper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_cvtps_pi8(__m128 __a)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a) {
   __m64 __b, __c;
 
   __b = _mm_cvtps_pi16(__a);

``````````

</details>


https://github.com/llvm/llvm-project/pull/96540
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] Clang: convert `__m64` intrinsics to unconditionally use SSE2 instead of MMX. (PR #96540)

Reply via email to