Author: Malay Sanghi Date: 2024-11-12T15:05:43+08:00 New Revision: f77101ea7913ab6a6b28ad03c152c615a89900f6
URL: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6 DIFF: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6.diff LOG: [X86][AMX] Support AMX-MOVRS (#115151) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 Added: clang/lib/Headers/amxmovrsintrin.h clang/lib/Headers/amxmovrstransposeintrin.h clang/test/CodeGen/X86/amx_movrs.c clang/test/CodeGen/X86/amx_movrs_api.c clang/test/CodeGen/X86/amx_movrs_errors.c clang/test/CodeGen/X86/amx_movrs_tranpose.c clang/test/CodeGen/X86/amx_movrs_tranpose_api.c clang/test/CodeGen/X86/amx_movrs_transpose_errors.c llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s Modified: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/BuiltinsX86_64.def clang/include/clang/Driver/Options.td clang/lib/Basic/Targets/X86.cpp clang/lib/Basic/Targets/X86.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/CMakeLists.txt clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp llvm/include/llvm/IR/IntrinsicsX86.td llvm/include/llvm/TargetParser/X86TargetParser.def llvm/lib/Target/X86/X86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86InstrPredicates.td llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/lib/TargetParser/Host.cpp llvm/lib/TargetParser/X86TargetParser.cpp Removed: ################################################################################ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4ef48bed58d95c..c8aa053986cc00 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -741,6 +741,7 @@ X86 Support * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. - Support ISA of ``AMX-FP8``. - Support ISA of ``AMX-TRANSPOSE``. +- Support ISA of ``AMX-MOVRS``. - Support ISA of ``AMX-AVX512``. - Support ISA of ``AMX-TF32``. diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 25c10d39df32e2..f853b4313dae07 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") @@ -129,10 +131,15 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") + TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") @@ -147,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") + +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1304ef3c5a228b..9fb7f8bb6489b0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>; def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>; def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>; +def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>; +def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>; def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index dc85e9aa77cd3d..086b4415412e67 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXCOMPLEX = true; } else if (Feature == "+amx-fp8") { HasAMXFP8 = true; + } else if (Feature == "+amx-movrs") { + HasAMXMOVRS = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; } else if (Feature == "+amx-avx512") { @@ -957,6 +959,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_COMPLEX__"); if (HasAMXFP8) Builder.defineMacro("__AMX_FP8__"); + if (HasAMXMOVRS) + Builder.defineMacro("__AMX_MOVRS__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasAMXAVX512) @@ -1094,6 +1098,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-fp8", true) .Case("amx-int8", true) + .Case("amx-movrs", true) .Case("amx-tf32", true) .Case("amx-tile", true) .Case("amx-transpose", true) @@ -1216,6 +1221,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) + .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tf32", HasAMXTF32) .Case("amx-tile", HasAMXTILE) .Case("amx-transpose", HasAMXTRANSPOSE) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 04b1d5d33ea231..06a7eed8177cb2 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; + bool HasAMXMOVRS = false; bool HasAMXTRANSPOSE = false; bool HasAMXAVX512 = false; bool HasAMXTF32 = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 430ac5626f89d7..0807542825f634 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17025,9 +17025,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } // Corresponding to intrisics which will return 2 tiles (tile0_tile1). case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { Intrinsic::ID IID; switch (BuiltinID) { default: @@ -17035,15 +17039,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_t2rpntlvwz0_internal: IID = Intrinsic::x86_t2rpntlvwz0_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + IID = Intrinsic::x86_t2rpntlvwz0rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: IID = Intrinsic::x86_t2rpntlvwz0t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1_internal: IID = Intrinsic::x86_t2rpntlvwz1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + IID = Intrinsic::x86_t2rpntlvwz1rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: IID = Intrinsic::x86_t2rpntlvwz1t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; + break; } // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index a094305bcec5e4..7227df93edece4 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -151,6 +151,8 @@ set(x86_files amxfp16intrin.h amxfp8intrin.h amxintrin.h + amxmovrsintrin.h + amxmovrstransposeintrin.h amxtf32intrin.h amxtf32transposeintrin.h amxtransposeintrin.h diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h new file mode 100644 index 00000000000000..5fe2fdecb8b5dd --- /dev/null +++ b/clang/lib/Headers/amxmovrsintrin.h @@ -0,0 +1,48 @@ +/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-------------------------------------------------------------------=== */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXMOVRSINTRIN_H +#define __AMXMOVRSINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_MOVRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs"))) + +#define _tile_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +#define _tile_stream_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrs64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrst164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS +__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs( + __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride); +} +#undef __DEFAULT_FN_ATTRS_MOVRS +#endif /* __x86_64__ */ +#endif /* __AMXMOVRSINTRIN_H */ diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h new file mode 100644 index 00000000000000..17a9f7506a0421 --- /dev/null +++ b/clang/lib/Headers/amxmovrstransposeintrin.h @@ -0,0 +1,200 @@ +/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H +#define __AMX_MOVRS_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) + +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +#undef __DEFAULT_FN_ATTRS +#endif /* __x86_64__ */ +#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ \ No newline at end of file diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 87a502238ae162..f0dd7160ec7ff4 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -656,6 +656,15 @@ _storebe_i64(void * __P, long long __D) { #include <amxtransposeintrin.h> #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) +#include <amxmovrsintrin.h> +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__)) +#include <amxmovrstransposeintrin.h> +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__) #include <amxavx512intrin.h> #endif diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index d7c8ed351f410a..85b195491025d3 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { return false; case X86::BI__builtin_ia32_tileloadd64: case X86::BI__builtin_ia32_tileloaddt164: + case X86::BI__builtin_ia32_tileloaddrs64: + case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: case X86::BI__builtin_ia32_t2rpntlvwz0: case X86::BI__builtin_ia32_t2rpntlvwz0t1: case X86::BI__builtin_ia32_t2rpntlvwz1: case X86::BI__builtin_ia32_t2rpntlvwz1t1: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1: + case X86::BI__builtin_ia32_t2rpntlvwz1rs: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1: + case X86::BI__builtin_ia32_t2rpntlvwz0rs: case X86::BI__builtin_ia32_tcvtrowps2pbf16h: case X86::BI__builtin_ia32_tcvtrowps2pbf16l: case X86::BI__builtin_ia32_tcvtrowps2phh: diff --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c new file mode 100755 index 00000000000000..4a8f001baafcea --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs.c @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -target-feature +avx512f \ +// RUN: -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrs_internal(row, col, buf, STRIDE); +} + +void test_tile_loaddt1(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrst1_internal(row, col, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c new file mode 100755 index 00000000000000..cf430adf140852 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_api.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loaddt1(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_stream_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loadd_macro(void *data) { + // CHECK-LABEL: define dso_local void @test_tile_loadd_macro( + // CHECK: call void @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + // CHECK: call void @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + _tile_loaddrs(4, data, STRIDE); + _tile_stream_loaddrs(2, data, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c new file mode 100755 index 00000000000000..495ea299236e2f --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -0,0 +1,13 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -verify + +#include <immintrin.h> +#include <stddef.h> + +char buf[1024]; + +void test_tile_load() { + _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}} + _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}} +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c new file mode 100755 index 00000000000000..192c153835e1e6 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +char buf[2048]; +#define STRIDE 32 + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c new file mode 100755 index 00000000000000..b174cc5067bf30 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c @@ -0,0 +1,81 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +char buf[2048]; +#define STRIDE 32 + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rs + // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rs(1, A, B); +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rst1(1, A, B); +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rs + // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rs(1, A, B); +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rst1(1, A, B); +} + +void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c new file mode 100755 index 00000000000000..840b52bbb29bbf --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ +// RUN: -verify + +#include <immintrin.h> +#include <stddef.h> + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index b2d6f44b7927a9..fcb506e9ebbfcc 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in { def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; + def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, @@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + // AMX-MORVS, AMX-TRANSPOSE + def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + // AMX-AVX512 def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">, Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty], @@ -5986,6 +6006,16 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tileloaddrs64_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_tileloaddrst164_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; def int_x86_tdpbssd_internal : ClangBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], @@ -6050,6 +6080,24 @@ let TargetPrefix = "x86" in { llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics + def int_x86_t2rpntlvwz0rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly, IntrReadMem]>; + def int_x86_t2rpntlvwz0rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz0_internal : Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 3b643563775688..5cad8e117f4ee2 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -266,6 +266,7 @@ X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") +X86_FEATURE (AMX_MOVRS, "amx-movrs") X86_FEATURE (AMX_AVX512, "amx-avx512") X86_FEATURE (AMX_TF32, "amx-tf32") // These features aren't really CPU features, but the frontend can set them. diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 35bbffdb20942d..509632183dc016 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", "Support AMX-FP8 instructions", [FeatureAMXTILE]>; +def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", + "Support AMX-MOVRS instructions", + [FeatureAMXTILE]>; def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", "Support AMX amx-transpose instructions", [FeatureAMXTILE]>; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 4f045d78f75fb2..73ca4b09c0aa53 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -559,6 +559,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case X86::PTILELOADDV: case X86::PTILELOADDT1V: + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: case X86::PTCVTROWD2PSrreV: case X86::PTCVTROWD2PSrriV: case X86::PTCVTROWPS2PBF16HrreV: @@ -575,6 +577,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + case X86::PTILELOADDRSV: + Opc = X86::TILELOADDRS; + break; + case X86::PTILELOADDRST1V: + Opc = X86::TILELOADDRST1; + break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); break; @@ -719,7 +727,11 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PT2RPNTLVWZ0V: case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: { + case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { for (unsigned i = 3; i > 0; --i) MI.removeOperand(i); unsigned Opc; @@ -736,6 +748,18 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PT2RPNTLVWZ1T1V: Opc = X86::T2RPNTLVWZ1T1; break; + case X86::PT2RPNTLVWZ0RSV: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1V: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RSV: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1V: + Opc = X86::T2RPNTLVWZ1RST1; + break; default: llvm_unreachable("Impossible Opcode!"); } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 8ad8641d1de485..72de0e0e8761f2 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -337,6 +337,10 @@ namespace { case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: return true; } for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { @@ -5157,6 +5161,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Res); return; } + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: + if (!Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) @@ -5164,9 +5173,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) { auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal - ? X86::PTILELOADDV - : X86::PTILELOADDT1V; + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloaddrs64_internal: + Opc = X86::PTILELOADDRSV; + break; + case Intrinsic::x86_tileloaddrst164_internal: + Opc = X86::PTILELOADDRST1V; + break; + case Intrinsic::x86_tileloadd64_internal: + Opc = X86::PTILELOADDV; + break; + case Intrinsic::x86_tileloaddt164_internal: + Opc = X86::PTILELOADDT1V; + break; + } // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); @@ -5270,6 +5293,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64: + case Intrinsic::x86_tileloaddrst164: + if (!Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_tileloadd64: case Intrinsic::x86_tileloaddt164: case Intrinsic::x86_tilestored64: { @@ -5282,7 +5310,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; + case Intrinsic::x86_tileloaddrs64: + Opc = X86::PTILELOADDRS; + break; case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; + case Intrinsic::x86_tileloaddrst164: + Opc = X86::PTILELOADDRST1; + break; case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; } // FIXME: Match displacement and scale. @@ -5305,6 +5339,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_t2rpntlvwz0rs: + case Intrinsic::x86_t2rpntlvwz0rst1: + case Intrinsic::x86_t2rpntlvwz1rs: + case Intrinsic::x86_t2rpntlvwz1rst1: + if (!Subtarget->hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0: case Intrinsic::x86_t2rpntlvwz0t1: case Intrinsic::x86_t2rpntlvwz1: @@ -5330,6 +5371,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_t2rpntlvwz1t1: Opc = X86::PT2RPNTLVWZ1T1; break; + case Intrinsic::x86_t2rpntlvwz0rs: + Opc = X86::PT2RPNTLVWZ0RS; + break; + case Intrinsic::x86_t2rpntlvwz0rst1: + Opc = X86::PT2RPNTLVWZ0RST1; + break; + case Intrinsic::x86_t2rpntlvwz1rs: + Opc = X86::PT2RPNTLVWZ1RS; + break; + case Intrinsic::x86_t2rpntlvwz1rst1: + Opc = X86::PT2RPNTLVWZ1RST1; + break; } // FIXME: Match displacement and scale. unsigned TIndex = Node->getConstantOperandVal(2); @@ -5340,9 +5393,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); return; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c08efc9f7271e6..db04f3a48d4d03 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27345,6 +27345,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: case Intrinsic::x86_t2rpntlvwz0_internal: case Intrinsic::x86_t2rpntlvwz0t1_internal: case Intrinsic::x86_t2rpntlvwz1_internal: @@ -27368,6 +27372,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::x86_t2rpntlvwz1t1_internal: Opc = X86::PT2RPNTLVWZ1T1V; break; + case Intrinsic::x86_t2rpntlvwz0rs_internal: + Opc = X86::PT2RPNTLVWZ0RSV; + break; + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + Opc = X86::PT2RPNTLVWZ0RST1V; + break; + case Intrinsic::x86_t2rpntlvwz1rs_internal: + Opc = X86::PT2RPNTLVWZ1RSV; + break; + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + Opc = X86::PT2RPNTLVWZ1RST1V; + break; } SDLoc DL(Op); @@ -37533,6 +37549,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); return BB; } + case X86::PTILELOADDRS: + case X86::PTILELOADDRST1: case X86::PTILELOADD: case X86::PTILELOADDT1: case X86::PTILESTORED: { @@ -37550,6 +37568,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; #undef GET_EGPR_IF_ENABLED + case X86::PTILELOADDRS: + Opc = X86::TILELOADDRS; + break; + case X86::PTILELOADDRST1: + Opc = X86::TILELOADDRST1; + break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); @@ -37590,6 +37614,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0RS: + case X86::PT2RPNTLVWZ0RST1: + case X86::PT2RPNTLVWZ1RS: + case X86::PT2RPNTLVWZ1RST1: case X86::PT2RPNTLVWZ0: case X86::PT2RPNTLVWZ0T1: case X86::PT2RPNTLVWZ1: @@ -37611,6 +37639,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1T1: Opc = X86::T2RPNTLVWZ1T1; break; + case X86::PT2RPNTLVWZ0RS: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RS: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1: + Opc = X86::T2RPNTLVWZ1RST1; + break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 04527716e31627..059bfb4c70bcf9 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -370,6 +370,70 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { } } // HasAMXTILE, HasAMXTRANSPOSE +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { + def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + let isPseudo = true in { + def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + } + let usesCustomInserter = 1 in { + def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + } +} // HasAMXMOVRS, HasAMXTRANSPOSE + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { + def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, XD; + def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, PD; + + let isPseudo = true, mayLoad = 1 in { + def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + } + + let usesCustomInserter = 1, mayLoad = 1 in { + def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + } +} // HasAMXMOVRS, In64BitMode + multiclass m_tcvtrowd2ps { let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { let SchedRW = [WriteSystem] in { diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index a9ec5f660ff1d8..eb2e93a94b197c 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; +def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 0e74cfa75e9606..3dc69d4ecb0201 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -227,7 +227,13 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: - case Intrinsic::x86_tilestored64_internal: { + case Intrinsic::x86_tilestored64_internal: + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: { Row = II->getArgOperand(0); Col = II->getArgOperand(1); break; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 09418c9bb74d34..08f6f43c72638b 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1077,6 +1077,8 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTTRANSPOSEDV: + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: case X86::PTMMULTF32PSV: case X86::PTTMMULTF32PSV: { MachineOperand &MO1 = MI->getOperand(1); @@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PT2RPNTLVWZ0V: case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: { + case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); MachineOperand &MO3 = MI->getOperand(3); diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 140e565e1686f2..58ba2553633221 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1882,6 +1882,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; + Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave; bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 6b53424833bd47..0da740743c9b7c 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -600,6 +600,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = FeatureAMX_TILE | FeatureAVX10_2_512; constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE; diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll new file mode 100755 index 00000000000000..da212a1850964e --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s + +define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64) + +define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64) + +define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_t1_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64) + +define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_t1_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64) diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll new file mode 100755 index 00000000000000..146b69773eb186 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 + +define void @test_amx(i64 %stride, i8* %addr1) #0 { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: retq + call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) + ret void +} +declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) + +define void @test_amx2(i8* %base, i64 %stride) #0 { +; O0-LABEL: test_amx2: +; O0: # %bb.0: +; O0-NEXT: xorps %xmm0, %xmm0 +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: tilerelease +; O0-NEXT: retq +; +; O2-LABEL: test_amx2: +; O2: # %bb.0: +; O2-NEXT: xorps %xmm0, %xmm0 +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, %ax +; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: tilerelease +; O2-NEXT: retq + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + ret void +} +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt new file mode 100755 index 00000000000000..6df44c87d2332f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -0,0 +1,98 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL + +# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%rbx), %tmm6 +# INTEL: tileloaddrs tmm6, [rbx + 64] +0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%rbx), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbx + 64] +0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s new file mode 100755 index 00000000000000..d780ad4f0e3691 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -0,0 +1,89 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s new file mode 100755 index 00000000000000..ccc7ac51a98a44 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -0,0 +1,97 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40] + tileloaddrs tmm6, [rbx + 64] + +// CHECK: tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40] + tileloaddrst1 tmm6, [rbx + 64] + +// CHECK: tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 tmm3, [2*rbp - 32] _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits