This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGf8b9035aae44: [X86] Support amx-int8 intrinsic. (authored by 
LiuChen3).

Changed prior to commit:
  https://reviews.llvm.org/D97259?vs=325687&id=325706#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97259/new/

https://reviews.llvm.org/D97259

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86LowerAMXType.cpp
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -19,6 +19,9 @@
 ; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
 ; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
 ; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tdpbsud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tdpbusd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tdpbuud %tmm2, %tmm1, %tmm0
 ; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    vzeroupper
@@ -26,8 +29,11 @@
   %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
   %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
   %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
-  %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
-  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d)
+  %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+  %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
+  %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
+  %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
 
   ret void
 }
@@ -35,4 +41,7 @@
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -884,6 +884,9 @@
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBSUDV:
+  case X86::PTDPBUSDV:
+  case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
     MachineOperand &MO1 = MI->getOperand(1);
     MachineOperand &MO2 = MI->getOperand(2);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===================================================================
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -155,6 +155,9 @@
     llvm_unreachable("Unexpected machine instruction on tile");
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBSUDV:
+  case X86::PTDPBUSDV:
+  case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
     MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
     MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
@@ -249,6 +252,9 @@
   case X86::PTILELOADDV:
   case X86::PTILESTOREDV:
   case X86::PTDPBSSDV:
+  case X86::PTDPBSUDV:
+  case X86::PTDPBUSDV:
+  case X86::PTDPBUUDV:
   case X86::PTILEZEROV:
     return true;
   }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===================================================================
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -67,7 +67,10 @@
   }
   // a * b + c
   // The shape depends on which operand.
-  case Intrinsic::x86_tdpbssd_internal: {
+  case Intrinsic::x86_tdpbssd_internal:
+  case Intrinsic::x86_tdpbsud_internal:
+  case Intrinsic::x86_tdpbusd_internal:
+  case Intrinsic::x86_tdpbuud_internal: {
     switch (OpNo) {
     case 3:
       Row = II->getArgOperand(0);
Index: llvm/lib/Target/X86/X86InstrAMX.td
===================================================================
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -92,10 +92,20 @@
     }
 
     // Pseduo instruction for RA.
-    let Constraints = "$src4 = $dst" in
-    def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
-                            GR16:$src2, GR16:$src3, TILE:$src4,
-                            TILE:$src5, TILE:$src6), []>;
+    let Constraints = "$src4 = $dst" in {
+      def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+                              GR16:$src2, GR16:$src3, TILE:$src4,
+                              TILE:$src5, TILE:$src6), []>;
+      def PTDPBSUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+                              GR16:$src2, GR16:$src3, TILE:$src4,
+                              TILE:$src5, TILE:$src6), []>;
+      def PTDPBUSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+                              GR16:$src2, GR16:$src3, TILE:$src4,
+                              TILE:$src5, TILE:$src6), []>;
+      def PTDPBUUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+                              GR16:$src2, GR16:$src3, TILE:$src4,
+                              TILE:$src5, TILE:$src6), []>;
+    }
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4621,11 +4621,22 @@
       ReplaceNode(Node, CNode);
       return;
     }
-    case Intrinsic::x86_tdpbssd_internal: {
+
+    case Intrinsic::x86_tdpbssd_internal:
+    case Intrinsic::x86_tdpbsud_internal:
+    case Intrinsic::x86_tdpbusd_internal:
+    case Intrinsic::x86_tdpbuud_internal: {
       if (!Subtarget->hasAMXTILE())
         break;
       SDValue Chain = Node->getOperand(0);
-      unsigned Opc = X86::PTDPBSSDV;
+      unsigned Opc;
+      switch (IntNo) {
+      case Intrinsic::x86_tdpbssd_internal: Opc = X86::PTDPBSSDV; break;
+      case Intrinsic::x86_tdpbsud_internal: Opc = X86::PTDPBSUDV; break;
+      case Intrinsic::x86_tdpbusd_internal: Opc = X86::PTDPBUSDV; break;
+      case Intrinsic::x86_tdpbuud_internal: Opc = X86::PTDPBUUDV; break;
+      default: llvm_unreachable("Impossible intrinsic");
+      }
       SDValue Ops[] = {Node->getOperand(2),
                        Node->getOperand(3),
                        Node->getOperand(4),
Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp
===================================================================
--- llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -467,11 +467,22 @@
     MI.setDesc(TII->get(X86::TILELOADD));
     return true;
   }
-  case X86::PTDPBSSDV: {
+  case X86::PTDPBSSDV:
+  case X86::PTDPBSUDV:
+  case X86::PTDPBUSDV:
+  case X86::PTDPBUUDV: {
     MI.untieRegOperand(4);
     for (unsigned i = 3; i > 0; --i)
       MI.RemoveOperand(i);
-    MI.setDesc(TII->get(X86::TDPBSSD));
+    unsigned Opc;
+    switch (Opcode) {
+    case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break;
+    case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break;
+    case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break;
+    case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break;
+    default: llvm_unreachable("Impossible Opcode!");
+    }
+    MI.setDesc(TII->get(Opc));
     MI.tieOperands(0, 1);
     return true;
   }
Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5053,6 +5053,24 @@
                         [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
                          llvm_x86amx_ty, llvm_x86amx_ty,
                          llvm_x86amx_ty], []>;
+  def int_x86_tdpbsud_internal :
+              GCCBuiltin<"__builtin_ia32_tdpbsud_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+                         llvm_x86amx_ty, llvm_x86amx_ty,
+                         llvm_x86amx_ty], []>;
+  def int_x86_tdpbusd_internal :
+              GCCBuiltin<"__builtin_ia32_tdpbusd_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+                         llvm_x86amx_ty, llvm_x86amx_ty,
+                         llvm_x86amx_ty], []>;
+  def int_x86_tdpbuud_internal :
+              GCCBuiltin<"__builtin_ia32_tdpbuud_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+                         llvm_x86amx_ty, llvm_x86amx_ty,
+                         llvm_x86amx_ty], []>;
   def int_x86_tilestored64_internal :
               GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
               Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
Index: clang/test/CodeGen/X86/amx_api.c
===================================================================
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -46,6 +46,27 @@
   __tile_dpbssd(&c, a, b);
 }
 
+void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbsud
+  //CHECK: call x86_amx @llvm.x86.tdpbsud.internal
+  //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_dpbsud(&c, a, b);
+}
+
+void test_tile_dpbusd(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbusd
+  //CHECK: call x86_amx @llvm.x86.tdpbusd.internal
+  //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_dpbusd(&c, a, b);
+}
+
+void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_dpbuud
+  //CHECK: call x86_amx @llvm.x86.tdpbuud.internal
+  //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_dpbuud(&c, a, b);
+}
+
 void test_tile_stored(__tile1024i c) {
   //CHECK-LABEL: @test_tile_stored
   //CHECK: {{%.*}} = bitcast <256 x i32> {{%.*}} to x86_amx
Index: clang/lib/Headers/amxintrin.h
===================================================================
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -238,6 +238,24 @@
   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
 }
 
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
+}
+
 static __inline__ void __DEFAULT_FN_ATTRS_INT8
 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
                       __SIZE_TYPE__ stride, _tile1024i tile) {
@@ -264,6 +282,27 @@
                                     src1.tile, src2.tile);
 }
 
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
+                          __tile1024i src2) {
+  dst->tile = _tile_dpbsud_internal(src1.row, src2.col, src1.col, dst->tile,
+                                    src1.tile, src2.tile);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbusd(__tile1024i *dst, __tile1024i src1,
+                          __tile1024i src2) {
+  dst->tile = _tile_dpbusd_internal(src1.row, src2.col, src1.col, dst->tile,
+                                    src1.tile, src2.tile);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbuud(__tile1024i *dst, __tile1024i src1,
+                          __tile1024i src2) {
+  dst->tile = _tile_dpbuud_internal(src1.row, src2.col, src1.col, dst->tile,
+                                    src1.tile, src2.tile);
+}
+
 __DEFAULT_FN_ATTRS_TILE
 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
Index: clang/include/clang/Basic/BuiltinsX86_64.def
===================================================================
--- clang/include/clang/Basic/BuiltinsX86_64.def
+++ clang/include/clang/Basic/BuiltinsX86_64.def
@@ -103,6 +103,9 @@
 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbuud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 // AMX
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to