llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-mc Author: Freddy Ye (FreddyLeaf) <details> <summary>Changes</summary> Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 --- Patch is 1.24 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101603.diff 30 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.def (+62) - (modified) clang/lib/Basic/Targets/X86.cpp (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+23) - (modified) clang/lib/Headers/CMakeLists.txt (+2) - (added) clang/lib/Headers/avx10_2_512bf16intrin.h (+565) - (added) clang/lib/Headers/avx10_2bf16intrin.h (+1088) - (modified) clang/lib/Headers/immintrin.h (+2) - (added) clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c (+1054) - (added) clang/test/CodeGen/X86/avx10_2bf16-builtins.c (+2018) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+410) - (modified) llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp (+6-2) - (modified) llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp (+11-1) - (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp (+11) - (modified) llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp (+9) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+33-4) - (modified) llvm/lib/Target/X86/X86InstrAVX10.td (+310) - (modified) llvm/lib/Target/X86/X86InstrFragmentsSIMD.td (+10) - (modified) llvm/lib/Target/X86/X86InstrUtils.td (+3-3) - (modified) llvm/lib/Target/X86/X86IntrinsicsInfo.h (+54) - (added) llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll (+587) - (added) llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll (+296) - (added) llvm/test/CodeGen/X86/avx10_2bf16-arith.ll (+1168) - (added) llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll (+536) - (added) llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt (+3015) - (added) llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt (+3015) - (added) llvm/test/MC/X86/avx10.2-bf16-32-att.s (+3014) - (added) llvm/test/MC/X86/avx10.2-bf16-32-intel.s (+3014) - (added) llvm/test/MC/X86/avx10.2-bf16-64-att.s (+3014) - (added) llvm/test/MC/X86/avx10.2-bf16-64-intel.s (+3014) - (modified) llvm/test/TableGen/x86-fold-tables.inc (+494) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index e4aa8661b9a806..48376ee0527980 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:" TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") + +// AVX10.2 BF16 +TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256") + #undef BUILTIN #undef TARGET_BUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index a9cbdb7b10dff8..62c382b67ad14a 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAVX10_1_512 = true; } else if (Feature == "+avx10.2-256") { HasAVX10_2 = true; + HasFullBFloat16 = true; } else if (Feature == "+avx10.2-512") { HasAVX10_2_512 = true; } else if (Feature == "+avx512cd") { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2a733e4d834cfa..94af4e5f723c9a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_storeups512_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_storesbf16128_mask: case X86::BI__builtin_ia32_storesh128_mask: case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: @@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vfmaddph512_mask: case X86::BI__builtin_ia32_vfmaddph512_maskz: case X86::BI__builtin_ia32_vfmaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddnepbh128: + case X86::BI__builtin_ia32_vfmaddnepbh256: + case X86::BI__builtin_ia32_vfmaddnepbh512: case X86::BI__builtin_ia32_vfmaddps512_mask: case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: @@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_loaddqudi512_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_loadsbf16128_mask: case X86::BI__builtin_ia32_loadsh128_mask: case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: @@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_sqrtph256: case X86::BI__builtin_ia32_sqrtph: case X86::BI__builtin_ia32_sqrtph512: + case X86::BI__builtin_ia32_vsqrtnepbf16256: + case X86::BI__builtin_ia32_vsqrtnepbf16: + case X86::BI__builtin_ia32_vsqrtnepbf16512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: { if (Ops.size() == 2) { @@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps128_mask: case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: case X86::BI__builtin_ia32_fpclassph128_mask: case X86::BI__builtin_ia32_fpclassph256_mask: case X86::BI__builtin_ia32_fpclassph512_mask: @@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID; switch (BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_vfpclasspbf16128_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_128; + break; + case X86::BI__builtin_ia32_vfpclasspbf16256_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_256; + break; + case X86::BI__builtin_ia32_vfpclasspbf16512_mask: + ID = Intrinsic::x86_avx10_fpclass_nepbf16_512; + break; case X86::BI__builtin_ia32_fpclassph128_mask: ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; break; @@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vcmppd256_round_mask: case X86::BI__builtin_ia32_vcmpps256_round_mask: case X86::BI__builtin_ia32_vcmpph256_round_mask: + case X86::BI__builtin_ia32_vcmppbf16512_mask: + case X86::BI__builtin_ia32_vcmppbf16256_mask: + case X86::BI__builtin_ia32_vcmppbf16128_mask: IsMaskFCmp = true; [[fallthrough]]; case X86::BI__builtin_ia32_cmpps: diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 5a62538792f301..90d431f8627965 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -147,11 +147,13 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h avx10_2_512niintrin.h avx10_2_512satcvtintrin.h avx10_2convertintrin.h + avx10_2bf16intrin.h avx10_2minmaxintrin.h avx10_2niintrin.h avx10_2satcvtintrin.h diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h new file mode 100644 index 00000000000000..158d5686c8f02f --- /dev/null +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -0,0 +1,565 @@ +/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512BF16INTRIN_H +#define __AVX10_2_512BF16INTRIN_H + +/* Define the default attributes for the functions in this file. */ +typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { + return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) { + return (__m512bh)__builtin_ia32_undef512(); +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) { + return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17, + __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22, + __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27, + __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25, + bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17, + bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9, + bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1}; +} + +#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \ + bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \ + bf29, bf30, bf31, bf32) \ + _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \ + (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \ + (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \ + (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \ + (bf3), (bf2), (bf1)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_ps(__m512bh __a) { + return (__m512)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_pd(__m512bh __a) { + return (__m512d)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castpbf16_si512(__m512bh __a) { + return (__m512i)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpd_pbh(__m512d __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castsi512_pbh(__m512i __a) { + return (__m512bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh128(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16512_pbh256(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector( + __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextpbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) { + return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), + (__m512i)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_load_pbh(void const *__p) { + return *(const __m512bh *)__p; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P, + __m512bh __A) { + *(__m512bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P, + __m512bh __A) { + struct __storeu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W, + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_addne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A + (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_subne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A - (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mulne_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)((__v32bf)__A * (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_sel... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/101603 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits