tkrupa created this revision.
tkrupa added reviewers: craig.topper, RKSimon, spatel, sroland.
Herald added a subscriber: cfe-commits.

I did the div differently because it got split to three basic blocks with
a branch condition (due to div being an expensive operation)
and couldn't be combined back.

Corresponding LLVM revision: https://reviews.llvm.org/D47978


Repository:
  rC Clang

https://reviews.llvm.org/D47979

Files:
  lib/CodeGen/CGBuiltin.cpp
  lib/Headers/avx512fintrin.h
  test/CodeGen/avx512f-builtins.c

Index: test/CodeGen/avx512f-builtins.c
===================================================================
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -2298,12 +2298,29 @@
 }
 __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_mask_add_ss
-  // CHECK: @llvm.x86.avx512.mask.add.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fadd float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_mask_add_ss(__W,__U,__A,__B); 
 }
 __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_maskz_add_ss
-  // CHECK: @llvm.x86.avx512.mask.add.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fadd float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_maskz_add_ss(__U,__A,__B); 
 }
 __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) {
@@ -2323,12 +2340,29 @@
 }
 __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_mask_add_sd
-  // CHECK: @llvm.x86.avx512.mask.add.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fadd double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mask_add_sd(__W,__U,__A,__B); 
 }
 __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_maskz_add_sd
-  // CHECK: @llvm.x86.avx512.mask.add.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fadd double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_maskz_add_sd(__U,__A,__B); 
 }
 __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) {
@@ -2402,12 +2436,29 @@
 }
 __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_mask_sub_ss
-  // CHECK: @llvm.x86.avx512.mask.sub.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fsub float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_mask_sub_ss(__W,__U,__A,__B); 
 }
 __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_maskz_sub_ss
-  // CHECK: @llvm.x86.avx512.mask.sub.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fsub float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_maskz_sub_ss(__U,__A,__B); 
 }
 __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) {
@@ -2427,12 +2478,29 @@
 }
 __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_mask_sub_sd
-  // CHECK: @llvm.x86.avx512.mask.sub.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fsub double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mask_sub_sd(__W,__U,__A,__B); 
 }
 __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_maskz_sub_sd
-  // CHECK: @llvm.x86.avx512.mask.sub.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fsub double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_maskz_sub_sd(__U,__A,__B); 
 }
 __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) {
@@ -2506,12 +2574,29 @@
 }
 __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_mask_mul_ss
-  // CHECK: @llvm.x86.avx512.mask.mul.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fmul float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_mask_mul_ss(__W,__U,__A,__B); 
 }
 __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_maskz_mul_ss
-  // CHECK: @llvm.x86.avx512.mask.mul.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fmul float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_maskz_mul_ss(__U,__A,__B); 
 }
 __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) {
@@ -2531,12 +2616,29 @@
 }
 __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_mask_mul_sd
-  // CHECK: @llvm.x86.avx512.mask.mul.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fmul double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mask_mul_sd(__W,__U,__A,__B); 
 }
 __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_maskz_mul_sd
-  // CHECK: @llvm.x86.avx512.mask.mul.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fmul double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> {{.*}}, i32 0
+  // CHECK: and i32 {{.*}}, 1
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: br {{.*}}, {{.*}}, {{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_maskz_mul_sd(__U,__A,__B); 
 }
 __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) {
@@ -2620,12 +2722,27 @@
 }
 __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_mask_div_ss
-  // CHECK: @llvm.x86.avx512.mask.div.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.div.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: fdiv float %{{.*}}, %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_mask_div_ss(__W,__U,__A,__B); 
 }
 __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_maskz_div_ss
-  // CHECK: @llvm.x86.avx512.mask.div.ss.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.div.ss.round
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: fdiv float %{{.*}}, %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_maskz_div_ss(__U,__A,__B); 
 }
 __m128d test_mm_div_round_sd(__m128d __A, __m128d __B) {
@@ -2645,12 +2762,27 @@
 }
 __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_mask_div_sd
-  // CHECK: @llvm.x86.avx512.mask.div.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.div.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: fdiv double %{{.*}}, %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_mask_div_sd(__W,__U,__A,__B); 
 }
 __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_maskz_div_sd
-  // CHECK: @llvm.x86.avx512.mask.div.sd.round
+  // CHECK-NOT: @llvm.x86.avx512.mask.div.sd.round
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: fdiv double %{{.*}}, %{{.*}}
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_maskz_div_sd(__U,__A,__B); 
 }
 __m128 test_mm_max_round_ss(__m128 __A, __m128 __B) {
Index: lib/Headers/avx512fintrin.h
===================================================================
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -1962,20 +1962,16 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 
 #define _mm_add_round_ss(A, B, R) \
@@ -1998,20 +1994,16 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_add_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 #define _mm_add_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
@@ -2097,20 +2089,16 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 #define _mm_sub_round_ss(A, B, R) \
   (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
@@ -2132,20 +2120,16 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_sub_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 
 #define _mm_sub_round_sd(A, B, R) \
@@ -2232,20 +2216,16 @@
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf)  _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_ss(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 #define _mm_mul_round_ss(A, B, R) \
   (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
@@ -2267,20 +2247,16 @@
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : __W[0];
+  return __A;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df)  _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_mul_sd(__A, __B);
+  __A[0] = (__U & 1) ? __A[0] : 0;
+  return __A;
 }
 
 #define _mm_mul_round_sd(A, B, R) \
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -9902,6 +9902,35 @@
   case X86::BI__builtin_ia32_pternlogq256_maskz:
     return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
 
+  case X86::BI__builtin_ia32_divss_round_mask:
+  case X86::BI__builtin_ia32_divsd_round_mask: {
+    Intrinsic::ID ID;
+    switch (BuiltinID) {
+    default: llvm_unreachable("Unsupported intrinsic!");
+    case X86::BI__builtin_ia32_divss_round_mask:
+      ID = Intrinsic::x86_avx512_mask_div_ss_round; break;
+    case X86::BI__builtin_ia32_divsd_round_mask:
+      ID = Intrinsic::x86_avx512_mask_div_sd_round; break;
+    }
+    Function *Intr = CGM.getIntrinsic(ID);
+
+    // If round parameter is not _MM_FROUND_CUR_DIRECTION, don't lower.
+    if (cast<llvm::ConstantInt>(Ops[4])->getZExtValue() != (uint64_t)4)
+      return Builder.CreateCall(Intr, Ops);
+
+    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+    Value *B = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+    Value *C = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+    Value *Mask = Ops[3];
+    Value *Div = Builder.CreateFDiv(A, B);
+    llvm::VectorType *MaskTy = llvm::VectorType::get(Builder.getInt1Ty(),
+                             cast<IntegerType>(Mask->getType())->getBitWidth());
+    Mask = Builder.CreateBitCast(Mask, MaskTy);
+    Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
+    Value *Select = Builder.CreateSelect(Mask, Div, C);
+    return Builder.CreateInsertElement(Ops[0], Select, (uint64_t)0);
+  }
+
   // 3DNow!
   case X86::BI__builtin_ia32_pswapdsf:
   case X86::BI__builtin_ia32_pswapdsi: {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to