https://github.com/wubowski updated https://github.com/llvm/llvm-project/pull/162927
From a4014ec1c26f93435cf4982f264ddf96e045b723 Mon Sep 17 00:00:00 2001 From: Jason <[email protected]> Date: Fri, 10 Oct 2025 16:29:41 -0500 Subject: [PATCH 1/2] Rewrote bf16->f32 conversion intrinsics --- clang/lib/Headers/avx512bf16intrin.h | 12 ++++-------- clang/lib/Headers/avx512vlbf16intrin.h | 20 ++++++-------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h index 3973f0e389685..4968136f7c7cd 100644 --- a/clang/lib/Headers/avx512bf16intrin.h +++ b/clang/lib/Headers/avx512bf16intrin.h @@ -36,7 +36,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); /// \returns A float data whose sign field and exponent field keep unchanged, /// and fraction field is extended to 23 bits. static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { - return __builtin_ia32_cvtsbf162ss_32(__A); + return float(__A); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -236,8 +236,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { /// A 256-bit vector of [16 x bfloat]. /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); + return (__m512)__builtin_convertvector(__A, __v16sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -252,8 +251,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); + return _mm512_maskz_mov_ps(__U, (__m512)__builtin_convertvector(__A, __v16sf)); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -270,9 +268,7 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( - (__m512i)__S, (__mmask16)__U, - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); + return _mm512_mask_mov_ps(__S, __U, (__m512)__builtin_convertvector(__A, __v16sf)); } #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h index 2d7ea0114d6a5..a06a4bd8923f5 100644 --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -422,8 +422,7 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { /// A 128-bit vector of [4 x bfloat]. /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { - return _mm_castsi128_ps( - (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); + return (__m128)__builtin_convertvector(__A, __v4sf); } /// Convert Packed BF16 Data to Packed float Data. @@ -434,8 +433,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { /// A 128-bit vector of [8 x bfloat]. /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); + return (__m256)__builtin_convertvector(__A, __v8sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -450,8 +448,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_slli_epi32( - (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return __mm_maskz_mov_ps(__U, (__m128)__builtin_convertvector(__A, __v4sf)); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -466,8 +463,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return __mm256_maskz_mov_ps(__U, (__m256)__builtin_convertvector(__A, __v8sf)); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -485,9 +481,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( - (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), - 16)); + return __mm_mask_mov_ps(__S, __U, (__m128)__builtin_convertvector(__A, __v4sf)); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -505,9 +499,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( - (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), - 16)); + return __mm256_mask_mov_ps(__S, __U, (__m256)__builtin_convertvector(__A, __v8sf)); } #undef __DEFAULT_FN_ATTRS128 From 1b66270903baa3276c7c0f78511ac2daf9e4c7f1 Mon Sep 17 00:00:00 2001 From: Jason <[email protected]> Date: Fri, 10 Oct 2025 17:33:41 -0500 Subject: [PATCH 2/2] fixed underscore typo with mask intrinsics --- clang/lib/Headers/avx512vlbf16intrin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h index a06a4bd8923f5..2051694fc9d4f 100644 --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -448,7 +448,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return __mm_maskz_mov_ps(__U, (__m128)__builtin_convertvector(__A, __v4sf)); + return _mm_maskz_mov_ps(__U, (__m128)__builtin_convertvector(__A, __v4sf)); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -463,7 +463,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return __mm256_maskz_mov_ps(__U, (__m256)__builtin_convertvector(__A, __v8sf)); + return _mm256_maskz_mov_ps(__U, (__m256)__builtin_convertvector(__A, __v8sf)); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -481,7 +481,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { - return __mm_mask_mov_ps(__S, __U, (__m128)__builtin_convertvector(__A, __v4sf)); + return _mm_mask_mov_ps(__S, __U, (__m128)__builtin_convertvector(__A, __v4sf)); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -499,7 +499,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { - return __mm256_mask_mov_ps(__S, __U, (__m256)__builtin_convertvector(__A, __v8sf)); + return _mm256_mask_mov_ps(__S, __U, (__m256)__builtin_convertvector(__A, __v8sf)); } #undef __DEFAULT_FN_ATTRS128 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
