pengfei created this revision.
pengfei added reviewers: skan, RKSimon, craig.topper, FreddyYe, LuoYuanke.
pengfei requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

As discussed on D120395 <https://reviews.llvm.org/D120395>, we should prohibit
arithmetic operations on __m[128|256|512]bh as well. However, these types may
still be used as ABI types in the future, so deprecate them and replace their
uses in the intrinsics with __m[128|256|512]i.
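
For illustration, a minimal sketch (modeled on the new avx512bf16-error.c test)
of the pattern the deprecation catches and the intended replacement; the
function names are hypothetical and it assumes -mavx512bf16 -mavx512vl:

  #include <immintrin.h>

  // Arithmetic on the bf16 vector type now triggers the deprecation warning.
  __m128bh add_bh(__m128bh a, __m128bh b) {
    return a + b; // warning: '__m128bh' is deprecated: use __m128i instead
  }

  // Intended usage: carry packed BF16 data in an integer vector and go
  // through the intrinsics, which now take and return __m128i.
  __m128i cvt2_ps_to_bh(__m128 a, __m128 b) {
    return _mm_cvtne2ps_pbh(a, b);
  }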


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D120411

Files:
  clang/lib/Headers/avx512bf16intrin.h
  clang/lib/Headers/avx512vlbf16intrin.h
  clang/test/CodeGen/X86/avx512bf16-builtins.c
  clang/test/CodeGen/X86/avx512bf16-error.c
  clang/test/CodeGen/X86/avx512vlbf16-builtins.c

Index: clang/test/CodeGen/X86/avx512vlbf16-builtins.c
===================================================================
--- clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -4,127 +4,127 @@
 
 #include <immintrin.h>
 
-__m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
+__m128i test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
   // CHECK-LABEL: @test_mm_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_cvtne2ps_pbh(A, B);
 }
 
-__m128bh test_mm_maskz_cvtne2ps2bf16(__m128 A, __m128 B, __mmask8 U) {
+__m128i test_mm_maskz_cvtne2ps2bf16(__m128 A, __m128 B, __mmask8 U) {
   // CHECK-LABEL: @test_mm_maskz_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_maskz_cvtne2ps_pbh(U, A, B);
 }
 
-__m128bh test_mm_mask_cvtne2ps2bf16(__m128bh C, __mmask8 U, __m128 A, __m128 B) {
+__m128i test_mm_mask_cvtne2ps2bf16(__m128i C, __mmask8 U, __m128 A, __m128 B) {
   // CHECK-LABEL: @test_mm_mask_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_mask_cvtne2ps_pbh(C, U, A, B);
 }
 
-__m256bh test_mm256_cvtne2ps2bf16(__m256 A, __m256 B) {
+__m256i test_mm256_cvtne2ps2bf16(__m256 A, __m256 B) {
   // CHECK-LABEL: @test_mm256_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm256_cvtne2ps_pbh(A, B);
 }
 
-__m256bh test_mm256_maskz_cvtne2ps2bf16(__m256 A, __m256 B, __mmask16 U) {
+__m256i test_mm256_maskz_cvtne2ps2bf16(__m256 A, __m256 B, __mmask16 U) {
   // CHECK-LABEL: @test_mm256_maskz_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm256_maskz_cvtne2ps_pbh(U, A, B);
 }
 
-__m256bh test_mm256_mask_cvtne2ps2bf16(__m256bh C, __mmask16 U, __m256 A, __m256 B) {
+__m256i test_mm256_mask_cvtne2ps2bf16(__m256i C, __mmask16 U, __m256 A, __m256 B) {
   // CHECK-LABEL: @test_mm256_mask_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm256_mask_cvtne2ps_pbh(C, U, A, B);
 }
 
-__m512bh test_mm512_cvtne2ps2bf16(__m512 A, __m512 B) {
+__m512i test_mm512_cvtne2ps2bf16(__m512 A, __m512 B) {
   // CHECK-LABEL: @test_mm512_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_cvtne2ps_pbh(A, B);
 }
 
-__m512bh test_mm512_maskz_cvtne2ps2bf16(__m512 A, __m512 B, __mmask32 U) {
+__m512i test_mm512_maskz_cvtne2ps2bf16(__m512 A, __m512 B, __mmask32 U) {
   // CHECK-LABEL: @test_mm512_maskz_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_maskz_cvtne2ps_pbh(U, A, B);
 }
 
-__m512bh test_mm512_mask_cvtne2ps2bf16(__m512bh C, __mmask32 U, __m512 A, __m512 B) {
+__m512i test_mm512_mask_cvtne2ps2bf16(__m512i C, __mmask32 U, __m512 A, __m512 B) {
   // CHECK-LABEL: @test_mm512_mask_cvtne2ps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_mask_cvtne2ps_pbh(C, U, A, B);
 }
 
-__m128bh test_mm_cvtneps2bf16(__m128 A) {
+__m128i test_mm_cvtneps2bf16(__m128 A) {
   // CHECK-LABEL: @test_mm_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_cvtneps_pbh(A);
 }
 
-__m128bh test_mm_mask_cvtneps2bf16(__m128bh C, __mmask8 U, __m128 A) {
+__m128i test_mm_mask_cvtneps2bf16(__m128i C, __mmask8 U, __m128 A) {
   // CHECK-LABEL: @test_mm_mask_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_mask_cvtneps_pbh(C, U, A);
 }
 
-__m128bh test_mm_maskz_cvtneps2bf16(__m128 A, __mmask8 U) {
+__m128i test_mm_maskz_cvtneps2bf16(__m128 A, __mmask8 U) {
   // CHECK-LABEL: @test_mm_maskz_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm_maskz_cvtneps_pbh(U, A);
 }
 
-__m128bh test_mm256_cvtneps2bf16(__m256 A) {
+__m128i test_mm256_cvtneps2bf16(__m256 A) {
   // CHECK-LABEL: @test_mm256_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm256_cvtneps_pbh(A);
 }
 
-__m128bh test_mm256_mask_cvtneps2bf16(__m128bh C, __mmask8 U, __m256 A) {
+__m128i test_mm256_mask_cvtneps2bf16(__m128i C, __mmask8 U, __m256 A) {
   // CHECK-LABEL: @test_mm256_mask_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm256_mask_cvtneps_pbh(C, U, A);
 }
 
-__m128bh test_mm256_maskz_cvtneps2bf16(__m256 A, __mmask8 U) {
+__m128i test_mm256_maskz_cvtneps2bf16(__m256 A, __mmask8 U) {
   // CHECK-LABEL: @test_mm256_maskz_cvtneps2bf16
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  // CHECK: ret <8 x i16> %{{.*}}
+  // CHECK: ret <2 x i64> %{{.*}}
   return _mm256_maskz_cvtneps_pbh(U, A);
 }
 
-__m128 test_mm_dpbf16_ps(__m128 D, __m128bh A, __m128bh B) {
+__m128 test_mm_dpbf16_ps(__m128 D, __m128i A, __m128i B) {
   // CHECK-LABEL: @test_mm_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.128
   // CHECK: ret <4 x float> %{{.*}}
   return _mm_dpbf16_ps(D, A, B);
 }
 
-__m128 test_mm_maskz_dpbf16_ps(__m128 D, __m128bh A, __m128bh B, __mmask8 U) {
+__m128 test_mm_maskz_dpbf16_ps(__m128 D, __m128i A, __m128i B, __mmask8 U) {
   // CHECK-LABEL: @test_mm_maskz_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.128
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
@@ -132,21 +132,21 @@
   return _mm_maskz_dpbf16_ps(U, D, A, B);
 }
 
-__m128 test_mm_mask_dpbf16_ps(__m128 D, __m128bh A, __m128bh B, __mmask8 U) {
+__m128 test_mm_mask_dpbf16_ps(__m128 D, __m128i A, __m128i B, __mmask8 U) {
   // CHECK-LABEL: @test_mm_mask_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.128
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   // CHECK: ret <4 x float> %{{.*}}
   return _mm_mask_dpbf16_ps(D, U, A, B);
 }
-__m256 test_mm256_dpbf16_ps(__m256 D, __m256bh A, __m256bh B) {
+__m256 test_mm256_dpbf16_ps(__m256 D, __m256i A, __m256i B) {
   // CHECK-LABEL: @test_mm256_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.256
   // CHECK: ret <8 x float> %{{.*}}
   return _mm256_dpbf16_ps(D, A, B);
 }
 
-__m256 test_mm256_maskz_dpbf16_ps(__m256 D, __m256bh A, __m256bh B, __mmask8 U) {
+__m256 test_mm256_maskz_dpbf16_ps(__m256 D, __m256i A, __m256i B, __mmask8 U) {
   // CHECK-LABEL: @test_mm256_maskz_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.256
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
@@ -154,7 +154,7 @@
   return _mm256_maskz_dpbf16_ps(U, D, A, B);
 }
 
-__m256 test_mm256_mask_dpbf16_ps(__m256 D, __m256bh A, __m256bh B, __mmask8 U) {
+__m256 test_mm256_mask_dpbf16_ps(__m256 D, __m256i A, __m256i B, __mmask8 U) {
   // CHECK-LABEL: @test_mm256_mask_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.256
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
@@ -169,7 +169,7 @@
   return _mm_cvtness_sbh(A);
 }
 
-__m128 test_mm_cvtpbh_ps(__m128bh A) {
+__m128 test_mm_cvtpbh_ps(__m128i A) {
   // CHECK-LABEL: @test_mm_cvtpbh_ps
   // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
   // CHECK: @llvm.x86.sse2.pslli.d
@@ -178,7 +178,7 @@
   return _mm_cvtpbh_ps(A);
 }
 
-__m256 test_mm256_cvtpbh_ps(__m128bh A) {
+__m256 test_mm256_cvtpbh_ps(__m128i A) {
   // CHECK-LABEL: @test_mm256_cvtpbh_ps
   // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
   // CHECK: @llvm.x86.avx2.pslli.d
@@ -187,7 +187,7 @@
   return _mm256_cvtpbh_ps(A);
 }
 
-__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
+__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128i A) {
   // CHECK-LABEL: @test_mm_maskz_cvtpbh_ps
   // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
@@ -197,7 +197,7 @@
   return _mm_maskz_cvtpbh_ps(M, A);
 }
 
-__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
+__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128i A) {
   // CHECK-LABEL: @test_mm256_maskz_cvtpbh_ps
   // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
@@ -207,7 +207,7 @@
   return _mm256_maskz_cvtpbh_ps(M, A);
 }
 
-__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
+__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128i A) {
   // CHECK-LABEL: @test_mm_mask_cvtpbh_ps
   // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
   // CHECK: @llvm.x86.sse2.pslli.d
@@ -217,7 +217,7 @@
   return _mm_mask_cvtpbh_ps(S, M, A);
 }
 
-__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
+__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128i A) {
   // CHECK-LABEL: @test_mm256_mask_cvtpbh_ps
   // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
   // CHECK: @llvm.x86.avx2.pslli.d
Index: clang/test/CodeGen/X86/avx512bf16-error.c
===================================================================
--- clang/test/CodeGen/X86/avx512bf16-error.c
+++ clang/test/CodeGen/X86/avx512bf16-error.c
@@ -11,3 +11,9 @@
 __bfloat16 bar(__bfloat16 a, __bfloat16 b) {
   return a + b;
 }
+
+// expected-warning@+2 3 {{'__m128bh' is deprecated: use __m128i instead}}
+// expected-note@* 3 {{'__m128bh' has been explicitly marked deprecated here}}
+__m128bh baz(__m128bh a, __m128bh b) {
+  return a + b;
+}
Index: clang/test/CodeGen/X86/avx512bf16-builtins.c
===================================================================
--- clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -13,60 +13,60 @@
   return _mm_cvtsbh_ss(A);
 }
 
-__m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
+__m512i test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
   // CHECK-LABEL: @test_mm512_cvtne2ps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_cvtne2ps_pbh(A, B);
 }
 
-__m512bh test_mm512_maskz_cvtne2ps_pbh(__m512 A, __m512 B, __mmask32 U) {
+__m512i test_mm512_maskz_cvtne2ps_pbh(__m512 A, __m512 B, __mmask32 U) {
   // CHECK-LABEL: @test_mm512_maskz_cvtne2ps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_maskz_cvtne2ps_pbh(U, A, B);
 }
 
-__m512bh test_mm512_mask_cvtne2ps_pbh(__m512bh C, __mmask32 U, __m512 A, __m512 B) {
+__m512i test_mm512_mask_cvtne2ps_pbh(__m512i C, __mmask32 U, __m512 A, __m512 B) {
   // CHECK-LABEL: @test_mm512_mask_cvtne2ps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  // CHECK: ret <32 x i16> %{{.*}}
+  // CHECK: ret <8 x i64> %{{.*}}
   return _mm512_mask_cvtne2ps_pbh(C, U, A, B);
 }
 
-__m256bh test_mm512_cvtneps_pbh(__m512 A) {
+__m256i test_mm512_cvtneps_pbh(__m512 A) {
   // CHECK-LABEL: @test_mm512_cvtneps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm512_cvtneps_pbh(A);
 }
 
-__m256bh test_mm512_mask_cvtneps_pbh(__m256bh C, __mmask16 U, __m512 A) {
+__m256i test_mm512_mask_cvtneps_pbh(__m256i C, __mmask16 U, __m512 A) {
   // CHECK-LABEL: @test_mm512_mask_cvtneps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm512_mask_cvtneps_pbh(C, U, A);
 }
 
-__m256bh test_mm512_maskz_cvtneps_pbh(__m512 A, __mmask16 U) {
+__m256i test_mm512_maskz_cvtneps_pbh(__m512 A, __mmask16 U) {
   // CHECK-LABEL: @test_mm512_maskz_cvtneps_pbh
   // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  // CHECK: ret <16 x i16> %{{.*}}
+  // CHECK: ret <4 x i64> %{{.*}}
   return _mm512_maskz_cvtneps_pbh(U, A);
 }
 
-__m512 test_mm512_dpbf16_ps(__m512 D, __m512bh A, __m512bh B) {
+__m512 test_mm512_dpbf16_ps(__m512 D, __m512i A, __m512i B) {
   // CHECK-LABEL: @test_mm512_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.512
   // CHECK: ret <16 x float> %{{.*}}
   return _mm512_dpbf16_ps(D, A, B);
 }
 
-__m512 test_mm512_maskz_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U) {
+__m512 test_mm512_maskz_dpbf16_ps(__m512 D, __m512i A, __m512i B, __mmask16 U) {
   // CHECK-LABEL: @test_mm512_maskz_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -74,7 +74,7 @@
   return _mm512_maskz_dpbf16_ps(U, D, A, B);
 }
 
-__m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U) {
+__m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512i A, __m512i B, __mmask16 U) {
   // CHECK-LABEL: @test_mm512_mask_dpbf16_ps
   // CHECK: @llvm.x86.avx512bf16.dpbf16ps.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -82,18 +82,18 @@
   return _mm512_mask_dpbf16_ps(D, U, A, B);
 }
 
-__m512 test_mm512_cvtpbh_ps(__m256bh A) {
+__m512 test_mm512_cvtpbh_ps(__m256i A) {
   // CHECK-LABEL: @test_mm512_cvtpbh_ps
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
+  // CHECK: bitcast <8 x i64> %{{.*}} to <16 x i32>
   // CHECK: @llvm.x86.avx512.pslli.d.512
   // CHECK: bitcast <8 x i64> %{{.*}} to <16 x float>
   // CHECK: ret <16 x float> %{{.*}}
   return _mm512_cvtpbh_ps(A);
 }
 
-__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
+__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256i A) {
   // CHECK-LABEL: @test_mm512_maskz_cvtpbh_ps
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
+  // CHECK: bitcast <8 x i64> %{{.*}} to <16 x i32>
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   // CHECK: @llvm.x86.avx512.pslli.d.512
   // CHECK: bitcast <8 x i64> %{{.*}} to <16 x float>
@@ -101,9 +101,9 @@
   return _mm512_maskz_cvtpbh_ps(M, A);
 }
 
-__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
+__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256i A) {
   // CHECK-LABEL: @test_mm512_mask_cvtpbh_ps
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
+  // CHECK: bitcast <8 x i64> %{{.*}} to <16 x i32>
   // CHECK: @llvm.x86.avx512.pslli.d.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   // CHECK: bitcast <8 x i64> %{{.*}} to <16 x float>
Index: clang/lib/Headers/avx512vlbf16intrin.h
===================================================================
--- clang/lib/Headers/avx512vlbf16intrin.h
+++ clang/lib/Headers/avx512vlbf16intrin.h
@@ -13,7 +13,8 @@
 #ifndef __AVX512VLBF16INTRIN_H
 #define __AVX512VLBF16INTRIN_H
 
-typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16),
+                                      deprecated("use __m128i instead")));
 
 #define __DEFAULT_FN_ATTRS128 \
   __attribute__((__always_inline__, __nodebug__, \
@@ -34,10 +35,10 @@
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
-                                                    (__v4sf) __B);
+  return (__m128i)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
+                                                   (__v4sf) __B);
 }
 
 /// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -57,9 +58,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtne2ps_pbh(__m128i __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)__W);
 }
@@ -79,9 +80,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                              (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
                                              (__v8hi)_mm_setzero_si128());
 }
@@ -98,10 +99,10 @@
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 ///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
-                                                    (__v8sf) __B);
+  return (__m256i)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
+                                                   (__v8sf) __B);
 }
 
 /// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -121,9 +122,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 ///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtne2ps_pbh(__m256i __W, __mmask16 __U, __m256 __A, __m256 __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                          (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                          (__v16hi)__W);
 }
@@ -143,9 +144,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
 /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
 ///    conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
-  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                          (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
                                          (__v16hi)_mm256_setzero_si256());
 }
@@ -160,9 +161,9 @@
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtneps_pbh(__m128 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+  return (__m128i)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                   (__v8hi)_mm_undefined_si128(),
                                                   (__mmask8)-1);
 }
@@ -182,11 +183,11 @@
 ///    A 1 means conversion of __A. A 0 means element from __W.
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
-                                                        (__v8hi)__W,
-                                                        (__mmask8)__U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtneps_pbh(__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+                                                       (__v8hi)__W,
+                                                       (__mmask8)__U);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -202,9 +203,9 @@
 ///    A 1 means conversion of __A. A 0 means element is zero.
 /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
 ///    conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+  return (__m128i)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                     (__v8hi)_mm_setzero_si128(),
                                                     (__mmask8)__U);
 }
@@ -218,9 +219,9 @@
 /// \param __A
 ///    A 256-bit vector of [8 x float].
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_cvtneps_pbh(__m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+  return (__m128i)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                   (__v8hi)_mm_undefined_si128(),
                                                   (__mmask8)-1);
 }
@@ -239,11 +240,11 @@
 ///    A 8-bit mask value specifying what is chosen for each element.
 ///    A 1 means conversion of __A. A 0 means element from __W.
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
-                                                        (__v8hi)__W,
-                                                        (__mmask8)__U);
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtneps_pbh(__m128i __W, __mmask8 __U, __m256 __A) {
+  return (__m128i)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+                                                       (__v8hi)__W,
+                                                       (__mmask8)__U);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -258,9 +259,9 @@
 ///    A 8-bit mask value specifying what is chosen for each element.
 ///    A 1 means conversion of __A. A 0 means element is zero.
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+  return (__m128i)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                     (__v8hi)_mm_setzero_si128(),
                                                     (__mmask8)__U);
 }
@@ -280,7 +281,7 @@
 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
+_mm_dpbf16_ps(__m128 __D, __m128i __A, __m128i __B) {
   return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
                                              (__v4si)__A,
                                              (__v4si)__B);
@@ -304,7 +305,7 @@
 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
+_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
                                            (__v4sf)__D);
@@ -328,7 +329,7 @@
 /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
+_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128i __A, __m128i __B) {
   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
                                            (__v4sf)_mm_setzero_si128());
@@ -349,7 +350,7 @@
 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
+_mm256_dpbf16_ps(__m256 __D, __m256i __A, __m256i __B) {
   return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
                                              (__v8si)__A,
                                              (__v8si)__B);
@@ -373,7 +374,7 @@
 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
+_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
                                         (__v8sf)__D);
@@ -397,7 +398,7 @@
 /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
+_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256i __A, __m256i __B) {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
                                         (__v8sf)_mm256_setzero_si256());
@@ -428,7 +429,7 @@
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128i __A) {
   return _mm_castsi128_ps(
       (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
 }
@@ -440,7 +441,7 @@
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128i __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
       (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
 }
@@ -456,7 +457,7 @@
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
+_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128i __A) {
   return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
       (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
 }
@@ -472,7 +473,7 @@
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
+_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128i __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
       (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
 }
@@ -491,7 +492,7 @@
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
+_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128i __A) {
   return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
       (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
       16));
@@ -511,7 +512,7 @@
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
+_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128i __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
       (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
       16));
Index: clang/lib/Headers/avx512bf16intrin.h
===================================================================
--- clang/lib/Headers/avx512bf16intrin.h
+++ clang/lib/Headers/avx512bf16intrin.h
@@ -13,8 +13,10 @@
 #ifndef __AVX512BF16INTRIN_H
 #define __AVX512BF16INTRIN_H
 
-typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
+typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64),
+                                      deprecated("use __m512i instead")));
+typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32),
+                                      deprecated("use __m256i instead")));
 
 /// \typedef __bfloat16
 ///    A target specific type to represent the storage only brain floating-point
@@ -56,10 +58,10 @@
 ///    A 512-bit vector of [16 x float].
 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
 ///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
-                                                    (__v16sf) __B);
+  return (__m512i)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
+                                                   (__v16sf) __B);
 }
 
 /// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -79,9 +81,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
 ///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtne2ps_pbh(__m512i __W, __mmask32 __U, __m512 __A, __m512 __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
                                         (__v32hi)__W);
 }
@@ -101,9 +103,9 @@
 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
 ///    conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
-  return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
                                         (__v32hi)_mm512_setzero_si512());
 }
@@ -117,9 +119,9 @@
 /// \param __A
 ///    A 512-bit vector of [16 x float].
 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_cvtneps_pbh(__m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+  return (__m256i)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
                                               (__v16hi)_mm256_undefined_si256(),
                                               (__mmask16)-1);
 }
@@ -138,11 +140,11 @@
 ///    A 16-bit mask value specifying what is chosen for each element.
 ///    A 1 means conversion of __A. A 0 means element from __W.
 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
-                                                        (__v16hi)__W,
-                                                        (__mmask16)__U);
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtneps_pbh(__m256i __W, __mmask16 __U, __m512 __A) {
+  return (__m256i)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+                                                       (__v16hi)__W,
+                                                       (__mmask16)__U);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -157,9 +159,9 @@
 ///    A 16-bit mask value specifying what is chosen for each element.
 ///    A 1 means conversion of __A. A 0 means element is zero.
 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+  return (__m256i)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
                                                 (__v16hi)_mm256_setzero_si256(),
                                                 (__mmask16)__U);
 }
@@ -179,7 +181,7 @@
 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
+_mm512_dpbf16_ps(__m512 __D, __m512i __A, __m512i __B) {
   return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
                                              (__v16si) __A,
                                              (__v16si) __B);
@@ -203,7 +205,7 @@
 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
+_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512i __A, __m512i __B) {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
                                        (__v16sf)__D);
@@ -227,7 +229,7 @@
 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
 ///  __A, __B and __D
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
+_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512i __A, __m512i __B) {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
                                        (__v16sf)_mm512_setzero_si512());
@@ -240,7 +242,7 @@
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256i __A) {
   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
 }
@@ -256,7 +258,7 @@
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
+_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256i __A) {
   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
       (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
 }
@@ -274,7 +276,7 @@
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
+_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256i __A) {
   return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
       (__m512i)__S, (__mmask16)__U,
       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));