https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/173545
>From 679dfffe288e8d7a2915058f6f6444ccd7d1a678 Mon Sep 17 00:00:00 2001 From: Phoebe Wang <[email protected]> Date: Thu, 25 Dec 2025 19:21:06 +0800 Subject: [PATCH 1/2] [X86][AMX-AVX512] Remove patterns for immediate variants The immediate variants use the low 6-bit as row index, while register variants use low 16-bit. We cannot select the immediate variants using the same intrinsic. So let's just disable them. --- llvm/lib/Target/X86/X86InstrAMX.td | 36 +++++-------------- .../CodeGen/X86/amx-across-func-tilemovrow.ll | 30 +++++++++------- .../test/CodeGen/X86/amx-avx512-intrinsics.ll | 6 ++-- .../CodeGen/X86/amx-tile-avx512-internals.ll | 13 +++---- 4 files changed, 36 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 6b8b8f720ddd7..d40913db6a777 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -387,57 +387,41 @@ defm TCVTROWD2PS : m_tcvtrowd2ps; let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { - def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), - [(set VR512:$dst, (int_x86_tcvtrowd2ps timm:$src1, imm:$src2))]>; + def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), []>; def PTCVTROWD2PSrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), [(set VR512:$dst, (int_x86_tcvtrowd2ps timm:$src1, GR32:$src2))]>; } def PTCVTROWD2PSrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTCVTROWD2PSrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def 
PTCVTROWPS2BF16HrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2bf16h_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTCVTROWPS2BF16HrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2bf16h_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2BF16LrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTCVTROWPS2BF16LrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2PHHrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTCVTROWPS2PHHrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2PHLrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tcvtrowps2phl_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTCVTROWPS2PHLrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, @@ -495,17 +479,13 @@ defm TILEMOVROW : AMXAVX512_TILEMOVE<0x07, 0x4A, "tilemovrow">; let 
Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { - def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), - [(set VR512:$dst, (int_x86_tilemovrow timm:$src1, imm:$src2))]>; + def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), []>; def PTILEMOVROWrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), [(set VR512:$dst, (int_x86_tilemovrow timm:$src1, GR32:$src2))]>; } def PTILEMOVROWrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), - [(set VR512: $dst, - (int_x86_tilemovrow_internal GR16:$src1, GR16:$src2, - TILE:$src3, imm:$src4))]>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; def PTILEMOVROWrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, diff --git a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll index 885bc805d6552..5f4e6283b57a6 100644 --- a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll +++ b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll @@ -28,15 +28,15 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $2112, %rsp # imm = 0x840 -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bp, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx @@ -51,11 +51,12 @@ define dso_local <16 x 
i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) -; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0 -; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1 +; CHECK-NEXT: movl $2, %eax +; CHECK-NEXT: movabsq $64, %rcx +; CHECK-NEXT: tileloadd 64(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow %eax, %tmm1, %zmm0 +; CHECK-NEXT: tileloadd 1088(%rsp,%rcx), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow %eax, %tmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: addq $2112, %rsp # imm = 0x840 ; CHECK-NEXT: popq %rbx @@ -82,8 +83,9 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; IPRA-NEXT: movl $buf+1024, %eax ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 ; IPRA-NEXT: callq foo -; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0 -; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1 +; IPRA-NEXT: movl $2, %eax +; IPRA-NEXT: tilemovrow %eax, %tmm1, %zmm0 +; IPRA-NEXT: tilemovrow %eax, %tmm0, %zmm1 ; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; IPRA-NEXT: addq $72, %rsp ; IPRA-NEXT: tilerelease @@ -138,8 +140,9 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; O0-NEXT: movl $2, %esi ; O0-NEXT: movw $8, %cx -; O0-NEXT: tilemovrow $2, %tmm0, %zmm0 +; O0-NEXT: tilemovrow %esi, %tmm0, %zmm0 ; O0-NEXT: movl $64, %esi ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; O0-NEXT: movw $8, %cx @@ -148,8 +151,9 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; O0-NEXT: movl $2, %edx ; O0-NEXT: movw 
$8, %cx -; O0-NEXT: tilemovrow $2, %tmm0, %zmm1 +; O0-NEXT: tilemovrow %edx, %tmm0, %zmm1 ; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; O0-NEXT: movq %rbp, %rsp ; O0-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll index 41208d6adb300..a36f93e34a983 100644 --- a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll @@ -13,7 +13,8 @@ define <16 x float> @test_tcvtrowd2ps(i32 %A) { define <16 x float> @test_tcvtrowd2psi() { ; CHECK-LABEL: test_tcvtrowd2psi: ; CHECK: # %bb.0: -; CHECK-NEXT: tcvtrowd2ps $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: movl $127, %eax # encoding: [0xb8,0x7f,0x00,0x00,0x00] +; CHECK-NEXT: tcvtrowd2ps %eax, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x4a,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %ret = call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 127) ret <16 x float> %ret @@ -108,7 +109,8 @@ define <16 x i32> @test_tilemovrow(i32 %A) { define <16 x i32> @test_tilemovrowi() { ; CHECK-LABEL: test_tilemovrowi: ; CHECK: # %bb.0: -; CHECK-NEXT: tilemovrow $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: movl $127, %eax # encoding: [0xb8,0x7f,0x00,0x00,0x00] +; CHECK-NEXT: tilemovrow %eax, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x4a,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %ret = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 127) ret <16 x i32> %ret diff --git a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll index dc8252ae7aca1..1ec6d1cf7bfe9 100644 --- a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll +++ b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll @@ -14,17 +14,18 @@ define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) { ; CHECK-NEXT: movw $8, %ax ; CHECK-NEXT: tileloadd (%rsi,%rcx), %tmm0 ; CHECK-NEXT: tcvtrowd2ps %edx, %tmm0, %zmm0 -; CHECK-NEXT: 
tcvtrowd2ps $16, %tmm0, %zmm0 +; CHECK-NEXT: movl $16, %esi +; CHECK-NEXT: tcvtrowd2ps %esi, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2bf16h %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2bf16h $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16h %esi, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2bf16l %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2bf16l $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16l %esi, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phh %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2phh $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phh %esi, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phl %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2phl $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phl %esi, %tmm0, %zmm0 ; CHECK-NEXT: tilemovrow %edx, %tmm0, %zmm0 -; CHECK-NEXT: tilemovrow $16, %tmm0, %zmm0 +; CHECK-NEXT: tilemovrow %esi, %tmm0, %zmm0 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rcx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper >From 14e85101b31fb2cc6429a61e61e8c72d49feb27f Mon Sep 17 00:00:00 2001 From: Phoebe Wang <[email protected]> Date: Tue, 30 Dec 2025 15:38:58 +0800 Subject: [PATCH 2/2] Add *i intrinsics --- clang/include/clang/Basic/BuiltinsX86_64.td | 6 + clang/lib/Headers/amxavx512intrin.h | 214 ++++++++++++++++++ clang/lib/Sema/SemaX86.cpp | 9 + clang/test/CodeGen/X86/amx_amx512_errors.c | 11 + clang/test/CodeGen/X86/amxavx512-builtins.c | 36 +++ llvm/include/llvm/IR/IntrinsicsX86.td | 18 ++ .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 15 ++ .../Target/X86/AsmParser/X86AsmParserCommon.h | 4 + llvm/lib/Target/X86/AsmParser/X86Operand.h | 9 + llvm/lib/Target/X86/X86InstrAMX.td | 38 +++- llvm/lib/Target/X86/X86InstrOperands.td | 16 ++ .../CodeGen/X86/amx-across-func-tilemovrow.ll | 30 ++- .../test/CodeGen/X86/amx-avx512-intrinsics.ll | 18 +- .../CodeGen/X86/amx-tile-avx512-internals.ll | 13 +- 14 files changed, 394 insertions(+), 43 deletions(-) create mode 100644 clang/test/CodeGen/X86/amx_amx512_errors.c diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td 
b/clang/include/clang/Basic/BuiltinsX86_64.td index 2e68a5a016ef0..2bd62bd5e2663 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.td +++ b/clang/include/clang/Basic/BuiltinsX86_64.td @@ -304,6 +304,12 @@ let Features = "amx-avx512,avx10.2", Attributes = [NoThrow, RequiredVectorWidth< def tcvtrowps2phh : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; def tcvtrowps2phl : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">; def tilemovrow : X86Builtin<"_Vector<16, int>(_Constant unsigned char, unsigned int)">; + def tcvtrowd2psi : X86Builtin<"_Vector<16, float>(_Constant unsigned char, _Constant unsigned int)">; + def tcvtrowps2bf16hi : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, _Constant unsigned int)">; + def tcvtrowps2bf16li : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, _Constant unsigned int)">; + def tcvtrowps2phhi : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, _Constant unsigned int)">; + def tcvtrowps2phli : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, _Constant unsigned int)">; + def tilemovrowi : X86Builtin<"_Vector<16, int>(_Constant unsigned char, _Constant unsigned int)">; } let Features = "amx-fp16", Attributes = [NoThrow] in { diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h index 5f21c25b05ad1..2a88a9377fa99 100644 --- a/clang/lib/Headers/amxavx512intrin.h +++ b/clang/lib/Headers/amxavx512intrin.h @@ -52,6 +52,40 @@ /// The row of the source tile #define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row) +/// Moves a row from a tile register to a zmm destination register, converting +/// the int32 source elements to fp32. The row of the tile is selected by a +/// 8b immediate value. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512i _tile_cvtrowd2psi(__tile tsrc, const unsigned int imm8); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := imm8 & 0x3f +/// row_chunk := (imm8 >> 6) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWD2PS instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param imm8 +/// The row of the source tile +#define _tile_cvtrowd2psi(tsrc, imm8) __builtin_ia32_tcvtrowd2psi(tsrc, imm8) + /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to bf16. It places the resulting bf16 elements /// in the high 16 bits within each dword. The row of the tile is selected @@ -89,6 +123,43 @@ #define _tile_cvtrowps2bf16h(tsrc, row) \ __builtin_ia32_tcvtrowps2bf16h(tsrc, row) +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 8b immediate value. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512i _tile_cvtrowps2bf16hi(__tile tsrc, const unsigned int imm8); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := imm8 & 0x3f +/// row_chunk := (imm8 >> 6) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param imm8 +/// The row of the source tile. +#define _tile_cvtrowps2bf16hi(tsrc, imm8) \ + __builtin_ia32_tcvtrowps2bf16hi(tsrc, imm8) + /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to bf16. It places the resulting bf16 elements /// in the low 16 bits within each dword. The row of the tile is selected @@ -126,6 +197,43 @@ #define _tile_cvtrowps2bf16l(tsrc, row) \ __builtin_ia32_tcvtrowps2bf16l(tsrc, row) +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 8b immediate value. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512i _tile_cvtrowps2bf16li(__tile tsrc, const unsigned int imm8); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := imm8 & 0x3f +/// row_chunk := (imm8 >> 6) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param imm8 +/// The row of the source tile. +#define _tile_cvtrowps2bf16li(tsrc, imm8) \ + __builtin_ia32_tcvtrowps2bf16li(tsrc, imm8) + /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to fp16. It places the resulting fp16 elements /// in the high 16 bits within each dword. The row of the tile is selected @@ -162,6 +270,43 @@ /// The the row of the source tile. #define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row) +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 8b immediate value. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512i _tile_cvtrowps2phhi(__tile tsrc, const unsigned int imm8); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := imm8 & 0x3f +/// row_chunk := (imm8 >> 6) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param imm8 +/// The row of the source tile. +#define _tile_cvtrowps2phhi(tsrc, imm8) \ + __builtin_ia32_tcvtrowps2phhi(tsrc, imm8) + /// Moves a row from a tile register to a zmm destination register, converting /// the fp32 source elements to fp16. It places the resulting fp16 elements /// in the low 16 bits within each dword. The row of the tile is selected @@ -198,6 +343,43 @@ /// The the row of the source tile. #define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row) +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 8b immediate value. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512i _tile_cvtrowps2phli(__tile tsrc, const unsigned int imm8); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := imm8 & 0x3f +/// row_chunk := (imm8 >> 6) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param imm8 +/// The row of the source tile. +#define _tile_cvtrowps2phli(tsrc, imm8) \ + __builtin_ia32_tcvtrowps2phli(tsrc, imm8) + /// Move one row of a tile data to a v16f32 data. /// The row of the tile is selected by a 32b GPR. /// @@ -230,6 +412,38 @@ /// \endcode #define _tile_movrow(a, b) ((__m512i)__builtin_ia32_tilemovrow(a, b)) +/// Move one row of a tile data to a v16f32 data. +/// The row of the tile is selected by a 8b immediate value. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m512i _tile_movrowi(__tile a, const unsigned b); +/// \endcode +/// +/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. +/// +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 8-bit immediate that selects the row of the source tile. +/// \returns +/// The destination v16f32 data. Size is 64 Bytes. 
+/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL>>3 +/// row_index := b&0x3f +/// row_chunk := (b>>6) * VL_bytes +/// FOR i := 0 TO (VL_bytes-1) +/// IF (row_chunk + i >= a.colsb) +/// dst.byte[i] := 0 +/// ELSE +/// dst.byte[i] := a.row[row_index].byte[row_chunk+i] +/// ENDFOR +/// \endcode +#define _tile_movrowi(a, b) ((__m512i)__builtin_ia32_tilemovrowi(a, b)) + /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 2f61bdd9a6540..39799e2203727 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -510,6 +510,15 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdphf8ps: case X86::BI__builtin_ia32_tmmultf32ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); + case X86::BI__builtin_ia32_tcvtrowps2bf16hi: + case X86::BI__builtin_ia32_tcvtrowps2bf16li: + case X86::BI__builtin_ia32_tcvtrowps2phhi: + case X86::BI__builtin_ia32_tcvtrowps2phli: + case X86::BI__builtin_ia32_tcvtrowd2psi: + case X86::BI__builtin_ia32_tilemovrowi: + return CheckBuiltinTileArgumentsRange(TheCall, 0) || + SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 255, + /*RangeIsError=*/false); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git a/clang/test/CodeGen/X86/amx_amx512_errors.c b/clang/test/CodeGen/X86/amx_amx512_errors.c new file mode 100644 index 0000000000000..5ea3d14b93f67 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_amx512_errors.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-avx512 -target-feature +avx10.2 -verify + +#include <immintrin.h> +#include <stddef.h> + +void test_tile_mmultf32ps() { + _tile_cvtrowd2psi(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_cvtrowd2psi(1, 
260); // expected-error {{argument value 260 is outside the valid range [0, 255]}} +} + diff --git a/clang/test/CodeGen/X86/amxavx512-builtins.c b/clang/test/CodeGen/X86/amxavx512-builtins.c index dada53c2d6f83..073adab349764 100644 --- a/clang/test/CodeGen/X86/amxavx512-builtins.c +++ b/clang/test/CodeGen/X86/amxavx512-builtins.c @@ -39,3 +39,39 @@ __m512i test_tile_movrow(unsigned int A) { // CHECK: %1 = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 %{{.*}}) return _tile_movrow(1, A); } + +__m512 test_tile_cvtrowd2psi(void) { + // CHECK-LABEL: @test_tile_cvtrowd2psi( + // CHECK: call <16 x float> @llvm.x86.tcvtrowd2psi(i8 1, i32 2) + return _tile_cvtrowd2psi(1, 2); +} + +__m512bh test_tile_cvtrowps2bf16hi(void) { + // CHECK-LABEL: @test_tile_cvtrowps2bf16hi( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16hi(i8 1, i32 2) + return _tile_cvtrowps2bf16hi(1, 2); +} + +__m512bh test_tile_cvtrowps2bf16li(void) { + // CHECK-LABEL: @test_tile_cvtrowps2bf16li( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16li(i8 1, i32 2) + return _tile_cvtrowps2bf16li(1, 2); +} + +__m512h test_tile_cvtrowps2phhi(void) { + // CHECK-LABEL: @test_tile_cvtrowps2phhi( + // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phhi(i8 1, i32 2) + return _tile_cvtrowps2phhi(1, 2); +} + +__m512h test_tile_cvtrowps2phli(void) { + // CHECK-LABEL: @test_tile_cvtrowps2phli( + // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phli(i8 1, i32 2) + return _tile_cvtrowps2phli(1, 2); +} + +__m512i test_tile_movrowi(void) { + // CHECK-LABEL: @test_tile_movrowi + // CHECK: %0 = call <16 x i32> @llvm.x86.tilemovrowi(i8 1, i32 2) + return _tile_movrowi(1, 2); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index e36187ea54d6f..b75a0485d6263 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5526,6 +5526,24 @@ let TargetPrefix = "x86" in { def int_x86_tilemovrow : ClangBuiltin<"__builtin_ia32_tilemovrow">, 
Intrinsic<[llvm_v16i32_ty], [llvm_i8_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>]>; + def int_x86_tcvtrowd2psi : ClangBuiltin<"__builtin_ia32_tcvtrowd2psi">, + Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + def int_x86_tcvtrowps2bf16hi : ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16hi">, + Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + def int_x86_tcvtrowps2bf16li : ClangBuiltin<"__builtin_ia32_tcvtrowps2bf16li">, + Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + def int_x86_tcvtrowps2phhi : ClangBuiltin<"__builtin_ia32_tcvtrowps2phhi">, + Intrinsic<[llvm_v32f16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + def int_x86_tcvtrowps2phli : ClangBuiltin<"__builtin_ia32_tcvtrowps2phli">, + Intrinsic<[llvm_v32f16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + def int_x86_tilemovrowi : ClangBuiltin<"__builtin_ia32_tilemovrowi">, + Intrinsic<[llvm_v16i32_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; // AMX - internal intrinsics def int_x86_ldtilecfg_internal : diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 90fc66e90cacc..d1dda4debe229 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -4374,6 +4374,13 @@ bool X86AsmParser::matchAndEmitATTInstruction( return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", EmptyRange, MatchingInlineAsm); } + case Match_InvalidImmUnsignedi6: { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 63]", + EmptyRange, MatchingInlineAsm); + } case Match_MissingFeature: return ErrorMissingFeature(IDLoc, 
MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: @@ -4738,6 +4745,14 @@ bool X86AsmParser::matchAndEmitIntelInstruction( EmptyRange, MatchingInlineAsm); } + if (llvm::count(Match, Match_InvalidImmUnsignedi6) == 1) { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 63]", + EmptyRange, MatchingInlineAsm); + } + // If all of these were an outright failure, report it in a useless way. return Error(IDLoc, "unknown instruction mnemonic", EmptyRange, MatchingInlineAsm); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index e9be28ca77b05..b39cb33604ea5 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -39,6 +39,10 @@ inline bool isImmUnsignedi4Value(uint64_t Value) { return isUInt<4>(Value); } +inline bool isImmUnsignedi6Value(uint64_t Value) { + return isUInt<6>(Value); +} + } // End of namespace llvm #endif diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index a31a7c3b4bd0e..acc5caa9c436f 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -288,6 +288,15 @@ struct X86Operand final : public MCParsedAsmOperand { return isImmUnsignedi4Value(CE->getValue()); } + bool isImmUnsignedi6() const { + if (!isImm()) return false; + // If this isn't a constant expr, reject it. The immediate byte is shared + // with a register encoding. We can't have it affected by a relocation. 
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi6Value(CE->getValue()); + } + bool isImmUnsignedi8() const { if (!isImm()) return false; // If this isn't a constant expr, just assume it fits and let relaxation diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d40913db6a777..7b930eac1573f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -387,41 +387,57 @@ defm TCVTROWD2PS : m_tcvtrowd2ps; let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { - def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), []>; + def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), + [(set VR512:$dst, (int_x86_tcvtrowd2psi timm:$src1, timm:$src2))]>; def PTCVTROWD2PSrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), [(set VR512:$dst, (int_x86_tcvtrowd2ps timm:$src1, GR32:$src2))]>; } def PTCVTROWD2PSrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTCVTROWD2PSrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2BF16HrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16h_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTCVTROWPS2BF16HrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2bf16h_internal 
GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2BF16LrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTCVTROWPS2BF16LrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2bf16l_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2PHHrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTCVTROWPS2PHHrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4))]>; def PTCVTROWPS2PHLrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phl_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTCVTROWPS2PHLrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, @@ -449,7 +465,7 @@ multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr, (!cast<Intrinsic>("int_x86_"#Opstr) timm:$src1, GR32:$src2))]>; def "P"#NAME#"rri" : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), [(set VR512:$dst, - (!cast<Intrinsic>("int_x86_"#Opstr) timm:$src1, imm:$src2))]>; + (!cast<Intrinsic>("int_x86_"#Opstr#"i") timm:$src1, timm:$src2))]>; } } } @@ -479,13 +495,17 @@ defm TILEMOVROW : AMXAVX512_TILEMOVE<0x07, 0x4A, "tilemovrow">; let Predicates = [HasAMXAVX512, HasAVX10_2, 
In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { - def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), []>; + def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), + [(set VR512:$dst, (int_x86_tilemovrowi timm:$src1, timm:$src2))]>; def PTILEMOVROWrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), [(set VR512:$dst, (int_x86_tilemovrow timm:$src1, GR32:$src2))]>; } def PTILEMOVROWrriV : PseudoI<(outs VR512:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), []>; + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u6imm:$src4), + [(set VR512: $dst, + (int_x86_tilemovrow_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; def PTILEMOVROWrreV : PseudoI<(outs VR512:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), [(set VR512: $dst, diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 69ce4f8552609..4373f39c360a8 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -415,6 +415,14 @@ def ImmUnsignedi4AsmOperand : AsmOperandClass { let DiagnosticType = "InvalidImmUnsignedi4"; } +// 6-bit immediate used by some AMX instructions +// [0, 0x3F] +def ImmUnsignedi6AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi6"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidImmUnsignedi6"; +} + // Unsigned immediate used by SSE/AVX instructions // [0, 0xFF] // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] @@ -475,6 +483,14 @@ def i16u8imm : Operand<i16> { let OperandType = "OPERAND_IMMEDIATE"; } +// 32-bit immediate but only 6-bits are significant and they are unsigned. +// Used by some AMX instructions that use intrinsics. 
+def i32u6imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi6AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // 32-bit immediate but only 8-bits are significant and they are unsigned. // Used by some SSE/AVX instructions that use intrinsics. def i32u8imm : Operand<i32> { diff --git a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll index 5f4e6283b57a6..885bc805d6552 100644 --- a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll +++ b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll @@ -28,15 +28,15 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $2112, %rsp # imm = 0x840 -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx @@ -51,12 +51,11 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) -; CHECK-NEXT: movl $2, %eax -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tileloadd 64(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: tilemovrow %eax, %tmm1, %zmm0 -; CHECK-NEXT: tileloadd 1088(%rsp,%rcx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: tilemovrow %eax, %tmm0, %zmm1 +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0 +; 
CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: addq $2112, %rsp # imm = 0x840 ; CHECK-NEXT: popq %rbx @@ -83,9 +82,8 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; IPRA-NEXT: movl $buf+1024, %eax ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 ; IPRA-NEXT: callq foo -; IPRA-NEXT: movl $2, %eax -; IPRA-NEXT: tilemovrow %eax, %tmm1, %zmm0 -; IPRA-NEXT: tilemovrow %eax, %tmm0, %zmm1 +; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0 +; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1 ; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; IPRA-NEXT: addq $72, %rsp ; IPRA-NEXT: tilerelease @@ -140,9 +138,8 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; O0-NEXT: movl $2, %esi ; O0-NEXT: movw $8, %cx -; O0-NEXT: tilemovrow %esi, %tmm0, %zmm0 +; O0-NEXT: tilemovrow $2, %tmm0, %zmm0 ; O0-NEXT: movl $64, %esi ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; O0-NEXT: movw $8, %cx @@ -151,9 +148,8 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; O0-NEXT: movl $2, %edx ; O0-NEXT: movw $8, %cx -; O0-NEXT: tilemovrow %edx, %tmm0, %zmm1 +; O0-NEXT: tilemovrow $2, %tmm0, %zmm1 ; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; O0-NEXT: movq %rbp, %rsp ; O0-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll index a36f93e34a983..62cafc5a43c7b 100644 --- a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll @@ -13,10 +13,9 @@ define <16 x float> @test_tcvtrowd2ps(i32 %A) { define <16 x float> @test_tcvtrowd2psi() { ; CHECK-LABEL: test_tcvtrowd2psi: ; CHECK: # 
%bb.0: -; CHECK-NEXT: movl $127, %eax # encoding: [0xb8,0x7f,0x00,0x00,0x00] -; CHECK-NEXT: tcvtrowd2ps %eax, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x4a,0xc1] +; CHECK-NEXT: tcvtrowd2ps $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x07,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 127) + %ret = call <16 x float> @llvm.x86.tcvtrowd2psi(i8 1, i32 127) ret <16 x float> %ret } declare <16 x float> @llvm.x86.tcvtrowd2ps(i8 %A, i32 %B) @@ -35,7 +34,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16hi() { ; CHECK: # %bb.0: ; CHECK-NEXT: tcvtrowps2bf16h $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x07,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 1, i32 127) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16hi(i8 1, i32 127) ret <32 x bfloat> %ret } declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 %A, i32 %B) @@ -54,7 +53,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16li() { ; CHECK: # %bb.0: ; CHECK-NEXT: tcvtrowps2bf16l $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x77,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 1, i32 127) + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2bf16li(i8 1, i32 127) ret <32 x bfloat> %ret } declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 %A, i32 %B) @@ -73,7 +72,7 @@ define <32 x half> @test_tcvtrowps2phhi() { ; CHECK: # %bb.0: ; CHECK-NEXT: tcvtrowps2phh $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7c,0x48,0x07,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 127) + %ret = call <32 x half> @llvm.x86.tcvtrowps2phhi(i8 1, i32 127) ret <32 x half> %ret } declare <32 x half> @llvm.x86.tcvtrowps2phh(i8 %A, i32 %B) @@ -92,7 +91,7 @@ define <32 x half> @test_tcvtrowps2phli() { ; CHECK: # %bb.0: ; CHECK-NEXT: tcvtrowps2phl $127, %tmm1, %zmm0 # encoding: 
[0x62,0xf3,0x7f,0x48,0x77,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 127) + %ret = call <32 x half> @llvm.x86.tcvtrowps2phli(i8 1, i32 127) ret <32 x half> %ret } declare <32 x half> @llvm.x86.tcvtrowps2phl(i8 %A, i32 %B) @@ -109,10 +108,9 @@ define <16 x i32> @test_tilemovrow(i32 %A) { define <16 x i32> @test_tilemovrowi() { ; CHECK-LABEL: test_tilemovrowi: ; CHECK: # %bb.0: -; CHECK-NEXT: movl $127, %eax # encoding: [0xb8,0x7f,0x00,0x00,0x00] -; CHECK-NEXT: tilemovrow %eax, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x4a,0xc1] +; CHECK-NEXT: tilemovrow $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x07,0xc1,0x7f] ; CHECK-NEXT: retq # encoding: [0xc3] - %ret = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 127) + %ret = call <16 x i32> @llvm.x86.tilemovrowi(i8 1, i32 127) ret <16 x i32> %ret } declare <16 x i32> @llvm.x86.tilemovrow(i8 %A, i32 %B) diff --git a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll index 1ec6d1cf7bfe9..dc8252ae7aca1 100644 --- a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll +++ b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll @@ -14,18 +14,17 @@ define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) { ; CHECK-NEXT: movw $8, %ax ; CHECK-NEXT: tileloadd (%rsi,%rcx), %tmm0 ; CHECK-NEXT: tcvtrowd2ps %edx, %tmm0, %zmm0 -; CHECK-NEXT: movl $16, %esi -; CHECK-NEXT: tcvtrowd2ps %esi, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowd2ps $16, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2bf16h %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2bf16h %esi, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16h $16, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2bf16l %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2bf16l %esi, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2bf16l $16, %tmm0, %zmm0 ; CHECK-NEXT: tcvtrowps2phh %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2phh %esi, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phh $16, %tmm0, %zmm0 ; CHECK-NEXT: 
tcvtrowps2phl %edx, %tmm0, %zmm0 -; CHECK-NEXT: tcvtrowps2phl %esi, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phl $16, %tmm0, %zmm0 ; CHECK-NEXT: tilemovrow %edx, %tmm0, %zmm0 -; CHECK-NEXT: tilemovrow %esi, %tmm0, %zmm0 +; CHECK-NEXT: tilemovrow $16, %tmm0, %zmm0 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rcx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
