tra created this revision.
tra added a reviewer: jlebar.
Herald added subscribers: hiraditya, sanjoy, jholewinski.
The new instructions were added added for sm_70+ GPUs in CUDA-9.1.
https://reviews.llvm.org/D45068
Files:
clang/include/clang/Basic/BuiltinsNVPTX.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Driver/ToolChains/Cuda.cpp
clang/test/CodeGen/builtins-nvptx-sm_70.cu
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTX.td
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/wmma.py
Index: llvm/test/CodeGen/NVPTX/wmma.py
===================================================================
--- llvm/test/CodeGen/NVPTX/wmma.py
+++ llvm/test/CodeGen/NVPTX/wmma.py
@@ -2,7 +2,7 @@
# generates correct instructions for them.
# RUN: python %s > %t.ll
-# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %t.ll
+# RUN: llc < %t.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 | FileCheck %t.ll
from itertools import product
from string import Template
@@ -36,33 +36,36 @@
check_f16_4 = "{{%s}}" % ", *".join(["%hh[0-9]+"] * 4)
check_f32_8 = "{{%s}}" % ", *".join(["%f[0-9]+"] * 8)
+known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]
+
def gen_wmma_load_tests():
load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) {
-; CHECK ${instruction}
+; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}]${stride_pattern}
%v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
ret ${ret_ty} %v0;
}
; CHECK-LABEL: .func{{.*}}test_${function}_o(
define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) {
-; CHECK ${instruction}
+; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern}
%src1 = getelementptr i8, i8 ${as}* %src, i32 128;
%v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1 ${extra_args});
ret ${ret_ty} %v0;
}
"""
intrinsic_template = "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}"
- instruction_template = "wmma.load.${abc}.sync.${geom}.${layout}${space}.${itype}"
+ instruction_template = "wmma.load.${abc}.sync.${layout}.${geom}${space}.${itype}"
- for abc, layout, space, stride, itype in product(
+ for geom, abc, layout, space, stride, itype in product(
+ known_geoms,
"abc",
["row","col"],
["",".shared",".global"],
@@ -77,7 +80,7 @@
"itype" : itype,
"pspace" : get_pspace(space),
"as" : "addrspace(%d)" % get_aspace(space),
- "geom" : "m16n16k16",
+ "geom" : geom,
}
if itype == "f32" and abc != "c":
@@ -112,27 +115,28 @@
; CHECK-LABEL: .func {{.*}}test_${function}(
define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) {
-; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}}
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}
; CHECK: {${check_args}}
; CHECK: ${stride_pattern}
call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args});
ret void
}
; CHECK-LABEL: .func{{.*}}test_${function}_o(
define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) {
-; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}}+128]
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128]
; CHECK: ${check_args}
; CHECK: ${stride_pattern}
%src1 = getelementptr i8, i8 ${as}* %src, i32 128;
call void @${intrinsic}(i8 ${as}* %src1, ${args}${extra_args});
ret void
}
"""
intrinsic_template = "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}"
- instruction_template = "wmma.store.${abc}.sync.${geom}.${layout}${space}.${itype}"
+ instruction_template = "wmma.store.${abc}.sync.${layout}.${geom}${space}.${itype}"
- for abc, layout, space, stride, itype in product(
+ for geom, abc, layout, space, stride, itype in product(
+ known_geoms,
"d",
["row","col"],
["",".shared",".global"],
@@ -147,7 +151,7 @@
"itype" : itype,
"pspace" : get_pspace(space),
"as" : "addrspace(%d)" % get_aspace(space),
- "geom" : "m16n16k16",
+ "geom" : geom,
}
test_params = params
@@ -174,20 +178,21 @@
; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(
${args}) {
-; CHECK ${instruction} {{.*}}[%rd{{[0-9+]}}
-; CHECK ${check_d}
-; CHECK ${check_ab}
-; CHECK ${check_ab}
-; CHECK ${check_c}
+; CHECK: ${instruction}
+; CHECK-NEXT: ${check_d}
+; CHECK-NEXT: ${check_ab}
+; CHECK-NEXT: ${check_ab}
+; CHECK-NEXT: ${check_c}
%r = call ${ret_ty} @${intrinsic}(
${args});
ret ${ret_ty} %r;
}
"""
intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${dtype}.${ctype}${satf}"
instruction_template = "wmma.mma.sync.${alayout}.${blayout}.${geom}.${dtype}.${ctype}${satf}"
- for alayout, blayout, ctype, dtype, satf in product(
+ for geom, alayout, blayout, ctype, dtype, satf in product(
+ known_geoms,
["row","col"],
["row","col"],
["f16", "f32"],
@@ -200,7 +205,7 @@
"ctype" : ctype,
"dtype" : dtype,
"satf" : satf,
- "geom" : "m16n16k16",
+ "geom" : geom,
}
test_params = params
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7378,7 +7378,11 @@
class WMMA_LOAD_GALSTOS<string Geometry, string Abc, string Layout,
string Space, string Type, NVPTXRegClass regclass,
DAGOperand SrcOp, bit WithStride>
- : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> {
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
// Pattern (created by WMMA_LOAD_INTR_HELPER below) that matches the intrinsic
// for this function.
PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA_"
@@ -7420,10 +7424,10 @@
let InOperandList = Ins;
let AsmString = "wmma.load."
# Abc
- # ".sync."
- # Layout
- # ".m16n16k16"
- # Space
+ # ".sync"
+ # "." # Layout
+ # "." # Geometry
+ # Space
# "." # Type # " \t"
# !if(!eq(Abc#Type, "cf16"),
"{{$r0, $r1, $r2, $r3}}",
@@ -7512,15 +7516,21 @@
defm _load_c_f32: WMMA_LOAD_GAT<Geometry, "c", "f32", Float32Regs>;
}
+defm INT_WMMA_m32n8k16: WMMA_LOAD_G<"m32n8k16">;
defm INT_WMMA_m16n16k16: WMMA_LOAD_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_LOAD_G<"m8n32k16">;
//
// wmma.store.d.sync.[row|col].m16n16k16[|.global|.shared].[f16|f32]
//
class WMMA_STORE_D_GLSTSO<string Geometry, string Layout, string Space,
string Type, NVPTXRegClass regclass,
bit WithStride, DAGOperand DstOp>
- : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> {
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
PatFrag IntrMatcher = !cast<PatFrag>("INT_WMMA"
# "_" # Geometry # "_store_d"
# "_" # Type
@@ -7641,19 +7651,21 @@
defm _store_d_f32: WMMA_STORE_D_GT<Geometry, "f32", Float32Regs>;
}
-// multiclass WMMA_STORE_D {
-// defm _m16n16k16: WMMA_STORE_D_G<"m16n16k16">;
-// }
-
+defm INT_WMMA_m32n8k16: WMMA_STORE_D_G<"m32n8k16">;
defm INT_WMMA_m16n16k16: WMMA_STORE_D_G<"m16n16k16">;
+defm INT_WMMA_m8n32k16: WMMA_STORE_D_G<"m8n32k16">;
// WMMA.MMA
class WMMA_MMA_GABDCS<string Geometry, string ALayout, string BLayout,
string DType, NVPTXRegClass d_reg,
string CType, NVPTXRegClass c_reg,
NVPTXRegClass ab_reg,
string Satfinite = "">
- : EmptyNVPTXInst, Requires<[hasPTX60, hasSM70]> {
+ : EmptyNVPTXInst,
+ Requires<[!if(!eq(Geometry, "m16n16k16"),
+ hasPTX60,
+ hasPTX61),
+ hasSM70]> {
Intrinsic Intr = !cast<Intrinsic>("int_nvvm_wmma_"
# Geometry
# "_mma"
@@ -7686,7 +7698,7 @@
let AsmString = "wmma.mma.sync."
# ALayout
# "." # BLayout
- # ".m16n16k16"
+ # "." # Geometry
# "." # DType
# "." # CType
# Satfinite # "\n\t\t"
@@ -7734,4 +7746,6 @@
defm _row: WMMA_MMA_GA<Geometry, "row">;
}
+defm INT_WMMA_MMA_m32n8k16 : WMMA_MMA_G<"m32n8k16">;
defm INT_WMMA_MMA_m16n16k16 : WMMA_MMA_G<"m16n16k16">;
+defm INT_WMMA_MMA_m8n32k16 : WMMA_MMA_G<"m8n32k16">;
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -142,6 +142,7 @@
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
+def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3330,7 +3330,23 @@
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3343,7 +3359,15 @@
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3356,7 +3380,15 @@
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
@@ -3369,7 +3401,15 @@
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v4f16;
Info.ptrVal = I.getArgOperand(0);
@@ -3382,7 +3422,15 @@
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
- case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
+ case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::v8f32;
Info.ptrVal = I.getArgOperand(0);
Index: llvm/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.td
+++ llvm/lib/Target/NVPTX/NVPTX.td
@@ -52,6 +52,8 @@
"Target SM 6.2">;
def SM70 : SubtargetFeature<"sm_70", "SmVersion", "70",
"Target SM 7.0">;
+def SM72 : SubtargetFeature<"sm_72", "SmVersion", "72",
+ "Target SM 7.2">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -68,6 +70,8 @@
"Use PTX version 5.0">;
def PTX60 : SubtargetFeature<"ptx60", "PTXVersion", "60",
"Use PTX version 6.0">;
+def PTX61 : SubtargetFeature<"ptx61", "PTXVersion", "61",
+ "Use PTX version 6.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -89,6 +93,7 @@
def : Proc<"sm_61", [SM61, PTX50]>;
def : Proc<"sm_62", [SM62, PTX50]>;
def : Proc<"sm_70", [SM70, PTX60]>;
+def : Proc<"sm_72", [SM72, PTX61]>;
def NVPTXInstrInfo : InstrInfo {
}
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -3920,7 +3920,9 @@
}
multiclass NVVM_WMMA_LD {
+ defm _m32n8k16_load: NVVM_WMMA_LD_G<"m32n8k16">;
defm _m16n16k16_load: NVVM_WMMA_LD_G<"m16n16k16">;
+ defm _m8n32k16_load: NVVM_WMMA_LD_G<"m8n32k16">;
}
defm int_nvvm_wmma: NVVM_WMMA_LD;
@@ -3947,7 +3949,7 @@
# !if(WithStride, ".stride", "")
# "." # Type>;
-multiclass NVVM_WMMA_STD_GLT<string Geometry, string Layout,
+multiclass NVVM_WMMA_STD_GLT<string Geometry, string Layout,
string Type, LLVMType regty> {
def _stride: NVVM_WMMA_STD_GLSTS<Geometry, Layout, Type, regty, 1>;
def NAME: NVVM_WMMA_STD_GLSTS<Geometry, Layout, Type, regty, 0>;
@@ -3963,7 +3965,9 @@
}
multiclass NVVM_WMMA_STD {
+ defm _m32n8k16_store: NVVM_WMMA_STD_G<"m32n8k16">;
defm _m16n16k16_store: NVVM_WMMA_STD_G<"m16n16k16">;
+ defm _m8n32k16_store: NVVM_WMMA_STD_G<"m8n32k16">;
}
defm int_nvvm_wmma: NVVM_WMMA_STD;
@@ -4033,7 +4037,9 @@
}
multiclass NVVM_WMMA_MMA {
+ defm _m32n8k16_mma : NVVM_WMMA_MMA_G<"m32n8k16">;
defm _m16n16k16_mma : NVVM_WMMA_MMA_G<"m16n16k16">;
+ defm _m8n32k16_mma : NVVM_WMMA_MMA_G<"m8n32k16">;
}
defm int_nvvm_wmma : NVVM_WMMA_MMA;
Index: clang/test/CodeGen/builtins-nvptx-sm_70.cu
===================================================================
--- clang/test/CodeGen/builtins-nvptx-sm_70.cu
+++ clang/test/CodeGen/builtins-nvptx-sm_70.cu
@@ -1,9 +1,16 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
// RUN: -fcuda-is-device -target-feature +ptx60 \
// RUN: -S -emit-llvm -o - -x cuda %s \
-// RUN: | FileCheck -check-prefix=CHECK %s
+// RUN: | FileCheck -check-prefix=CHECK_M16 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
+// RUN: -fcuda-is-device -target-feature +ptx61 -DPTX61 \
+// RUN: -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
-// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-sm_70 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown \
+// RUN: -target-cpu sm_70 -target-feature +ptx60 \
+// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-ptx61 %s
#if !defined(CUDA_VERSION)
#define __device__ __attribute__((device))
@@ -18,149 +25,443 @@
// that encounters an error, so -verify will not be able to find errors in
// subsequent functions.
-// CHECK-LABEL: nvvm_wmma
-__device__ void nvvm_wmma(int *src, int *dst,
- float *fsrc, float *fdst,
- int ldm) {
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
+// CHECK-LABEL: nvvm_wmma_m16n16k16
+__device__ void nvvm_wmma_m16n16k16(int *src, int *dst,
+ float *fsrc, float *fdst,
+ int ldm) {
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_a(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_a(dst, src+1, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_b(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_b(dst, src+2, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite
+ // pre-sm_70-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70,ptx60}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
}
+
+#ifdef PTX61
+// CHECK-LABEL: nvvm_wmma_m32n8k16
+__device__ void nvvm_wmma_m32n8k16(int *src, int *dst,
+ float *fsrc, float *fdst,
+ int ldm) {
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_a(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_a(dst, src+1, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_b(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_b(dst, src+2, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+
+
+ // m8n32k16 variants.
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_a(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_a(dst, src+1, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_b(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_b(dst, src+2, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite
+ // pre-ptx61-error@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70,ptx61}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+}
+#endif
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -570,17 +570,19 @@
CC1Args.push_back("-mlink-cuda-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
- if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
+ // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM
+ // defaults to. Use PTX4.2 by default, which is the PTX version that came with
+ // CUDA-7.0.
+ const char *PtxFeature = "+ptx42";
+ if (CudaInstallation.version() >= CudaVersion::CUDA_91) {
+ // CUDA-9 uses new instructions that are only available in PTX6.1
+ PtxFeature = "+ptx61";
+ } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
// CUDA-9 uses new instructions that are only available in PTX6.0
- CC1Args.push_back("-target-feature");
- CC1Args.push_back("+ptx60");
- } else {
- // Libdevice in CUDA-7.0 requires PTX version that's more recent
- // than LLVM defaults to. Use PTX4.2 which is the PTX version that
- // came with CUDA-7.0.
- CC1Args.push_back("-target-feature");
- CC1Args.push_back("+ptx42");
+ PtxFeature = "+ptx60";
}
+ CC1Args.push_back("-target-feature");
+ CC1Args.push_back(PtxFeature);
if (DeviceOffloadingKind == Action::OFK_OpenMP) {
SmallVector<StringRef, 8> LibraryPaths;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -10525,7 +10525,15 @@
case NVPTX::BI__hmma_m16n16k16_ld_a:
case NVPTX::BI__hmma_m16n16k16_ld_b:
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
- case NVPTX::BI__hmma_m16n16k16_ld_c_f32: {
+ case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
+ case NVPTX::BI__hmma_m32n8k16_ld_a:
+ case NVPTX::BI__hmma_m32n8k16_ld_b:
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+ case NVPTX::BI__hmma_m8n32k16_ld_a:
+ case NVPTX::BI__hmma_m8n32k16_ld_b:
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Value *Src = EmitScalarExpr(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -10556,6 +10564,46 @@
: Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
NumResults = 8;
break;
+ case NVPTX::BI__hmma_m32n8k16_ld_a:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_b:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_a:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_b:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
+ NumResults = 8;
+ break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
@@ -10574,7 +10622,11 @@
}
case NVPTX::BI__hmma_m16n16k16_st_c_f16:
- case NVPTX::BI__hmma_m16n16k16_st_c_f32: {
+ case NVPTX::BI__hmma_m16n16k16_st_c_f32:
+ case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+ case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+ case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
Value *Dst = EmitScalarExpr(E->getArg(0));
Address Src = EmitPointerWithAlignment(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -10596,6 +10648,24 @@
IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
: Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
break;
+ case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
+ break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
@@ -10618,7 +10688,15 @@
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
- case NVPTX::BI__hmma_m16n16k16_mma_f16f32: {
+ case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Address SrcA = EmitPointerWithAlignment(E->getArg(1));
Address SrcB = EmitPointerWithAlignment(E->getArg(2));
@@ -10635,15 +10713,15 @@
bool Satf = SatfArg.getSExtValue();
// clang-format off
-#define MMA_VARIANTS(type) {{ \
- Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_row_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_##type##_satfinite, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_col_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type, \
- Intrinsic::nvvm_wmma_m16n16k16_mma_col_col_##type##_satfinite \
+#define MMA_VARIANTS(geom, type) {{ \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \
}}
// clang-format on
@@ -10657,22 +10735,62 @@
unsigned NumEltsD;
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
NumEltsC = 4;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
NumEltsC = 4;
NumEltsD = 8;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
NumEltsC = 8;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
+ NumEltsC = 8;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
+ NumEltsC = 4;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
+ NumEltsC = 4;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
+ NumEltsC = 8;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
+ NumEltsC = 8;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
+ NumEltsC = 4;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
+ NumEltsC = 4;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
+ NumEltsC = 8;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
NumEltsC = 8;
NumEltsD = 8;
break;
Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -692,17 +692,41 @@
BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "")
// Builtins to support WMMA instructions on sm_70
-TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "ptx60")
-
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "ptx60")
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx60")
+
+TARGET_BUILTIN(__hmma_m32n8k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx61")
+
+TARGET_BUILTIN(__hmma_m8n32k16_ld_a, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_ld_b, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f16, "vi*iC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f32, "vf*fC*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16, "vi*i*UiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*f*UiIi", "", "sm_70,ptx61")
+
+TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx60")
+TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx60")
+
+TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx61")
+
+TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "sm_70,ptx61")
+TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "sm_70,ptx61")
#undef BUILTIN
#undef TARGET_BUILTIN
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits