https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/89796
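For anyone who wants to exercise the new target locally: going by the RUN
lines in the added tests, the cc1 invocations boil down to roughly the
following (illustrative only, with %s standing for one of the new test
files):

  clang -cc1 -triple spirv64-amd-amdhsa -x hip -fcuda-is-device -emit-llvm %s -o -
  clang -cc1 -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -x hip -fcuda-is-device -emit-llvm %s -o -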
>From 662f160418c704f45e57e751168903d774b74303 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Tue, 23 Apr 2024 17:41:25 +0100 Subject: [PATCH 1/3] Add initial support for AMDGCN flavoured SPIRV. --- clang/lib/Basic/Targets.cpp | 6 +- clang/lib/Basic/Targets/SPIR.cpp | 288 +++++++++++++++++ clang/lib/Basic/Targets/SPIR.h | 51 +++ clang/lib/CodeGen/CGBuiltin.cpp | 7 + clang/test/CodeGen/target-data.c | 4 + .../test/CodeGenCUDA/builtins-spirv-amdgcn.cu | 294 ++++++++++++++++++ ...tins-unsafe-atomics-spirv-amdgcn-gfx90a.cu | 31 ++ clang/test/CodeGenCUDA/long-double.cu | 4 + clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu | 129 ++++++++ .../test/CodeGenCXX/spirv-amdgcn-float16.cpp | 38 +++ clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp | 27 ++ .../spirv-amdgcn-dpp-const-fold.hip | 46 +++ clang/test/CodeGenHIP/spirv-amdgcn-half.hip | 15 + .../predefined-macros-no-warnings.c | 1 + clang/test/Preprocessor/predefined-macros.c | 10 + ...in-spirv-amdgcn-atomic-inc-dec-failure.cpp | 25 ++ .../Sema/inline-asm-validate-spirv-amdgcn.cl | 111 +++++++ clang/test/SemaCUDA/allow-int128.cu | 3 + clang/test/SemaCUDA/amdgpu-f128.cu | 1 + clang/test/SemaCUDA/float16.cu | 1 + clang/test/SemaCUDA/fp16-arg-return.cu | 1 + .../test/SemaCUDA/spirv-amdgcn-atomic-ops.cu | 86 +++++ 22 files changed, 1178 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu create mode 100644 clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu create mode 100644 clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu create mode 100644 clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp create mode 100644 clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp create mode 100644 clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip create mode 100644 clang/test/CodeGenHIP/spirv-amdgcn-half.hip create mode 100644 clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp create mode 100644 clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl create mode 100644 clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index e3283510c6aac7..04a13e3385d1f6 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -673,8 +673,12 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple, } case llvm::Triple::spirv64: { if (os != llvm::Triple::UnknownOS || - Triple.getEnvironment() != llvm::Triple::UnknownEnvironment) + Triple.getEnvironment() != llvm::Triple::UnknownEnvironment) { + if (os == llvm::Triple::OSType::AMDHSA) + return std::make_unique<SPIRV64AMDGCNTargetInfo>(Triple, Opts); + return nullptr; + } return std::make_unique<SPIRV64TargetInfo>(Triple, Opts); } case llvm::Triple::wasm32: diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index dc920177d3a910..d7d232ac9484f8 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -12,6 +12,8 @@ #include "SPIR.h" #include "Targets.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" using namespace clang; using namespace clang::targets; @@ -54,3 +56,289 @@ void SPIRV64TargetInfo::getTargetDefines(const LangOptions &Opts, BaseSPIRVTargetInfo::getTargetDefines(Opts, Builder); DefineStd(Builder, "SPIRV64", Opts); } + +static constexpr Builtin::Info BuiltinInfo[] = { +#define BUILTIN(ID, TYPE, ATTRS) \ + {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, +#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ + {#ID, TYPE, ATTRS, FEATURE, 
HeaderDesc::NO_HEADER, ALL_LANGUAGES}, +#include "clang/Basic/BuiltinsAMDGPU.def" +}; + +namespace { +const char *AMDGPUGCCRegNames[] = { + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35", + "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44", + "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53", + "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62", + "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", + "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80", + "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", + "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", + "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", + "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116", + "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125", + "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134", + "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143", + "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152", + "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161", + "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170", + "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", + "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188", + "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197", + "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206", + "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215", + "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224", + "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233", + "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242", + "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255", "s0", "s1", "s2", "s3", "s4", + "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", + "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", + "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", + "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", "s40", + "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", + "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", + "s59", "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", + "s68", "s69", "s70", "s71", "s72", "s73", "s74", "s75", "s76", + "s77", "s78", "s79", "s80", "s81", "s82", "s83", "s84", "s85", + "s86", "s87", "s88", "s89", "s90", "s91", "s92", "s93", "s94", + "s95", "s96", "s97", "s98", "s99", "s100", "s101", "s102", "s103", + "s104", "s105", "s106", "s107", "s108", "s109", "s110", "s111", "s112", + "s113", "s114", "s115", "s116", "s117", "s118", "s119", "s120", "s121", + "s122", "s123", "s124", "s125", "s126", "s127", "exec", "vcc", "scc", + "m0", "flat_scratch", "exec_lo", "exec_hi", "vcc_lo", "vcc_hi", + "flat_scratch_lo", "flat_scratch_hi", + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", + "a9", "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", + "a18", "a19", "a20", "a21", "a22", "a23", "a24", "a25", "a26", + "a27", "a28", "a29", "a30", "a31", "a32", "a33", "a34", "a35", + "a36", "a37", "a38", "a39", "a40", "a41", "a42", "a43", "a44", + "a45", "a46", "a47", "a48", 
"a49", "a50", "a51", "a52", "a53", + "a54", "a55", "a56", "a57", "a58", "a59", "a60", "a61", "a62", + "a63", "a64", "a65", "a66", "a67", "a68", "a69", "a70", "a71", + "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", "a80", + "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", + "a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", + "a99", "a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", + "a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", "a116", + "a117", "a118", "a119", "a120", "a121", "a122", "a123", "a124", "a125", + "a126", "a127", "a128", "a129", "a130", "a131", "a132", "a133", "a134", + "a135", "a136", "a137", "a138", "a139", "a140", "a141", "a142", "a143", + "a144", "a145", "a146", "a147", "a148", "a149", "a150", "a151", "a152", + "a153", "a154", "a155", "a156", "a157", "a158", "a159", "a160", "a161", + "a162", "a163", "a164", "a165", "a166", "a167", "a168", "a169", "a170", + "a171", "a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", + "a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", "a188", + "a189", "a190", "a191", "a192", "a193", "a194", "a195", "a196", "a197", + "a198", "a199", "a200", "a201", "a202", "a203", "a204", "a205", "a206", + "a207", "a208", "a209", "a210", "a211", "a212", "a213", "a214", "a215", + "a216", "a217", "a218", "a219", "a220", "a221", "a222", "a223", "a224", + "a225", "a226", "a227", "a228", "a229", "a230", "a231", "a232", "a233", + "a234", "a235", "a236", "a237", "a238", "a239", "a240", "a241", "a242", + "a243", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", + "a252", "a253", "a254", "a255" +}; + +} // anonymous namespace + +ArrayRef<const char *> SPIRV64AMDGCNTargetInfo::getGCCRegNames() const { + return llvm::ArrayRef(AMDGPUGCCRegNames); +} + +bool SPIRV64AMDGCNTargetInfo::initFeatureMap( + llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef, + const std::vector<std::string> &FeatureVec) const { + // This represents the union of all AMDGCN features. 
+ Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; + Features["atomic-fadd-rtn-insts"] = true; + Features["ci-insts"] = true; + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot3-insts"] = true; + Features["dot4-insts"] = true; + Features["dot5-insts"] = true; + Features["dot7-insts"] = true; + Features["dot8-insts"] = true; + Features["dot9-insts"] = true; + Features["dot10-insts"] = true; + Features["dot11-insts"] = true; + Features["dl-insts"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx90a-insts"] = true; + Features["gfx940-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["gfx11-insts"] = true; + Features["gfx12-insts"] = true; + Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; + Features["s-memrealtime"] = true; + Features["s-memtime-inst"] = true; + Features["gws"] = true; + Features["fp8-insts"] = true; + Features["fp8-conversion-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["mai-insts"] = true; + + return TargetInfo::initFeatureMap(Features, Diags, {}, FeatureVec); +} + +bool SPIRV64AMDGCNTargetInfo::validateAsmConstraint( + const char *&Name, TargetInfo::ConstraintInfo &Info) const { + // This is a 1:1 copy of AMDGPUTargetInfo::validateAsmConstraint() + static const ::llvm::StringSet<> SpecialRegs({ + "exec", "vcc", "flat_scratch", "m0", "scc", "tba", "tma", + "flat_scratch_lo", "flat_scratch_hi", "vcc_lo", "vcc_hi", "exec_lo", + "exec_hi", "tma_lo", "tma_hi", "tba_lo", "tba_hi", + }); + + switch (*Name) { + case 'I': + Info.setRequiresImmediate(-16, 64); + return true; + case 'J': + Info.setRequiresImmediate(-32768, 32767); + return true; + case 'A': + case 'B': + case 'C': + Info.setRequiresImmediate(); + return true; + default: + break; + } + + StringRef S(Name); + + if (S == "DA" || S == "DB") { + Name++; + Info.setRequiresImmediate(); + return true; + } + + bool HasLeftParen = S.consume_front("{"); + if (S.empty()) + return false; + if (S.front() != 'v' && S.front() != 's' && S.front() != 'a') { + if (!HasLeftParen) + return false; + auto E = S.find('}'); + if (!SpecialRegs.count(S.substr(0, E))) + return false; + S = S.drop_front(E + 1); + if (!S.empty()) + return false; + // Found {S} where S is a special register. + Info.setAllowsRegister(); + Name = S.data() - 1; + return true; + } + S = S.drop_front(); + if (!HasLeftParen) { + if (!S.empty()) + return false; + // Found s, v or a. + Info.setAllowsRegister(); + Name = S.data() - 1; + return true; + } + bool HasLeftBracket = S.consume_front("["); + unsigned long long N; + if (S.empty() || consumeUnsignedInteger(S, 10, N)) + return false; + if (S.consume_front(":")) { + if (!HasLeftBracket) + return false; + unsigned long long M; + if (consumeUnsignedInteger(S, 10, M) || N >= M) + return false; + } + if (HasLeftBracket) { + if (!S.consume_front("]")) + return false; + } + if (!S.consume_front("}")) + return false; + if (!S.empty()) + return false; + // Found {vn}, {sn}, {an}, {v[n]}, {s[n]}, {a[n]}, {v[n:m]}, {s[n:m]} + // or {a[n:m]}. 
+ Info.setAllowsRegister(); + Name = S.data() - 1; + return true; +} + +std::string +SPIRV64AMDGCNTargetInfo::convertConstraint(const char *&Constraint) const { + // This is a 1:1 copy of AMDGPUTargetInfo::convertConstraint() + StringRef S(Constraint); + if (S == "DA" || S == "DB") { + return std::string("^") + std::string(Constraint++, 2); + } + + const char *Begin = Constraint; + TargetInfo::ConstraintInfo Info("", ""); + if (validateAsmConstraint(Constraint, Info)) + return std::string(Begin).substr(0, Constraint - Begin + 1); + + Constraint = Begin; + return std::string(1, *Constraint); +} + +ArrayRef<Builtin::Info> SPIRV64AMDGCNTargetInfo::getTargetBuiltins() const { + return llvm::ArrayRef(BuiltinInfo, + clang::AMDGPU::LastTSBuiltin - Builtin::FirstTSBuiltin); +} + +void SPIRV64AMDGCNTargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + BaseSPIRVTargetInfo::getTargetDefines(Opts, Builder); + DefineStd(Builder, "SPIRV64", Opts); + + Builder.defineMacro("__AMD__"); + Builder.defineMacro("__AMDGPU__"); + Builder.defineMacro("__AMDGCN__"); +} + +void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { + // This is a 1:1 copy of AMDGPUTargetInfo::setAuxTarget() + assert(HalfFormat == Aux->HalfFormat); + assert(FloatFormat == Aux->FloatFormat); + assert(DoubleFormat == Aux->DoubleFormat); + + // On x86_64 long double is 80-bit extended precision format, which is + // not supported by AMDGPU. 128-bit floating point format is also not + // supported by AMDGPU. Therefore keep its own format for these two types. + auto SaveLongDoubleFormat = LongDoubleFormat; + auto SaveFloat128Format = Float128Format; + auto SaveLongDoubleWidth = LongDoubleWidth; + auto SaveLongDoubleAlign = LongDoubleAlign; + copyAuxTarget(Aux); + LongDoubleFormat = SaveLongDoubleFormat; + Float128Format = SaveFloat128Format; + LongDoubleWidth = SaveLongDoubleWidth; + LongDoubleAlign = SaveLongDoubleAlign; + // For certain builtin types support on the host target, claim they are + // supported to pass the compilation of the host code during the device-side + // compilation. + // FIXME: As the side effect, we also accept `__float128` uses in the device + // code. To reject these builtin types supported in the host target but not in + // the device target, one approach would support `device_builtin` attribute + // so that we could tell the device builtin types from the host ones. This + // also solves the different representations of the same builtin type, such + // as `size_t` in the MSVC environment. 
+ if (Aux->hasFloat128Type()) { + HasFloat128 = true; + Float128Format = DoubleFormat; + } +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 44265445ff004b..6b605979c9ab1d 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -364,6 +364,57 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo { MacroBuilder &Builder) const override; }; +class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo + : public BaseSPIRVTargetInfo { +public: + SPIRV64AMDGCNTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) + : BaseSPIRVTargetInfo(Triple, Opts) { + assert(Triple.getArch() == llvm::Triple::spirv64 && + "Invalid architecture for 64-bit AMDGCN SPIR-V."); + assert(Triple.getVendor() == llvm::Triple::VendorType::AMD && + "64-bit AMDGCN SPIR-V target must use AMD vendor"); + assert(getTriple().getOS() == llvm::Triple::OSType::AMDHSA && + "64-bit AMDGCN SPIR-V target must use AMDHSA OS"); + assert(getTriple().getEnvironment() == llvm::Triple::UnknownEnvironment && + "64-bit SPIR-V target must use unknown environment type"); + PointerWidth = PointerAlign = 64; + SizeType = TargetInfo::UnsignedLong; + PtrDiffType = IntPtrType = TargetInfo::SignedLong; + + resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0"); + + BFloat16Width = BFloat16Align = 16; + BFloat16Format = &llvm::APFloat::BFloat(); + + HasLegalHalfType = true; + HasFloat16 = true; + HalfArgsAndReturns = true; + } + + bool hasBFloat16Type() const override { return true; } + + ArrayRef<const char *> getGCCRegNames() const override; + + bool initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, + StringRef, + const std::vector<std::string> &) const override; + + bool validateAsmConstraint(const char *&Name, + TargetInfo::ConstraintInfo &Info) const override; + + std::string convertConstraint(const char *&Constraint) const override; + + ArrayRef<Builtin::Info> getTargetBuiltins() const override; + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override; + + void setAuxTarget(const TargetInfo *Aux) override; + + bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } +}; + } // namespace targets } // namespace clang #endif // LLVM_CLANG_LIB_BASIC_TARGETS_SPIR_H diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7e5f2edfc732cc..db64b0a436a095 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6083,6 +6083,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, StringRef Prefix = llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch()); if (!Prefix.empty()) { + if (Prefix == "spv" && + getTarget().getTriple().getOS() == llvm::Triple::OSType::AMDHSA) + Prefix = "amdgcn"; IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name); // NOTE we don't need to perform a compatibility flag check here since the // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the @@ -6254,6 +6257,10 @@ static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF, case llvm::Triple::riscv32: case llvm::Triple::riscv64: return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue); + case llvm::Triple::spirv64: + if (CGF->getTarget().getTriple().getOS() != llvm::Triple::OSType::AMDHSA) + return nullptr; + return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E); default: return nullptr; } diff --git 
a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index c184f314f68f80..1d40b8fe46063d 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -268,3 +268,7 @@
 // RUN: %clang_cc1 -triple ve -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=VE
 // VE: target datalayout = "e-m:e-i64:64-n32:64-S128-v64:64:64-v128:64:64-v256:64:64-v512:64:64-v1024:64:64-v2048:64:64-v4096:64:64-v8192:64:64-v16384:64:64"
+
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -o - -emit-llvm %s | \
+// RUN: FileCheck %s -check-prefix=AMDGPUSPIRV64
+// AMDGPUSPIRV64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0"
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
new file mode 100644
index 00000000000000..8dbb8c538ddc16
--- /dev/null
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -0,0 +1,294 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
+// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \
+// RUN: -o - | FileCheck %s
+
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
+// RUN: -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
+// RUN: -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @_Z16use_dispatch_ptrPi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void use_dispatch_ptr(int* out) {
+  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
+  *out = *dispatch_ptr;
+}
+
+// CHECK-LABEL: @_Z13use_queue_ptrPi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] =
addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr() +// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +__global__ void use_queue_ptr(int* out) { + const int* queue_ptr = (const int*)__builtin_amdgcn_queue_ptr(); + *out = *queue_ptr; +} + +// CHECK-LABEL: @_Z19use_implicitarg_ptrPi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +__global__ void use_implicitarg_ptr(int* out) { + const int* implicitarg_ptr = (const int*)__builtin_amdgcn_implicitarg_ptr(); + *out = *implicitarg_ptr; +} + +__global__ + // + void +// CHECK-LABEL: @_Z12test_ds_fmaxf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false) +// CHECK-NEXT: store 
volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4 +// CHECK-NEXT: ret void +// + test_ds_fmax(float src) { + __shared__ float shared; + volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false); +} + +// CHECK-LABEL: @_Z12test_ds_faddf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false) +// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4 +// CHECK-NEXT: ret void +// +__global__ void test_ds_fadd(float src) { + __shared__ float shared; + volatile float x = __builtin_amdgcn_ds_faddf(&shared, src, 0, 0, false); +} + +// CHECK-LABEL: @_Z12test_ds_fminfPf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHARED:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4 +// CHECK-NEXT: ret void +// +__global__ void test_ds_fmin(float src, float *shared) { + volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false); +} + +#if 0 // FIXME: returning a pointer to AS4 explicitly is wrong for AMDGPU SPIRV +// +__device__ void test_ret_builtin_nondef_addrspace() { + void *x = __builtin_amdgcn_dispatch_ptr(); +} +#endif + +// CHECK-LABEL: @_Z6endpgmv( +// CHECK-NEXT: entry: +// CHECK-NEXT: call addrspace(4) void @llvm.amdgcn.endpgm() +// CHECK-NEXT: ret void +// +__global__ void endpgm() { + __builtin_amdgcn_endpgm(); +} + +// Check the 64 bit argument is correctly passed to the intrinsic without truncation or assertion. 
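+// (The 30+5 immediate in the call below should simply constant-fold to the
+// i32 35 seen in the @llvm.amdgcn.icmp.i64.i64 CHECK line.)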
+ +// CHECK-LABEL: @_Z14test_uicmp_i64Pyyy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[TMP4]], align 8 +// CHECK-NEXT: ret void +// +__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b) +{ + *out = __builtin_amdgcn_uicmpl(a, b, 30+5); +} + +// Check the 64 bit return value is correctly returned without truncation or assertion. + +// CHECK-LABEL: @_Z14test_s_memtimePy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8 +// CHECK-NEXT: ret void +// +__global__ void test_s_memtime(unsigned long long* out) +{ + *out = __builtin_amdgcn_s_memtime(); +} + +// Check a generic pointer can be passed as a shared pointer and a generic pointer. 
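+// As the CHECK lines show, the generic (addrspace(4)) pointer is addrspacecast
+// to the LDS (addrspace(3)) pointer the ds intrinsic expects, while func()
+// receives it unchanged.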
+__device__ void func(float *x); + +// CHECK-LABEL: @_Z17test_ds_fmin_funcfPf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHARED:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false) +// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 +// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: ret void +// +__global__ void test_ds_fmin_func(float src, float *__restrict shared) { + volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false); + func(shared); +} + +// CHECK-LABEL: @_Z14test_is_sharedPf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr +// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP2]]) +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8 +// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1 +// CHECK-NEXT: ret void +// +__global__ void test_is_shared(float *x){ + bool ret = __builtin_amdgcn_is_shared(x); +} + +// CHECK-LABEL: @_Z15test_is_privatePi( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr +// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP2]]) +// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8 +// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1 +// CHECK-NEXT: ret void +// +__global__ void test_is_private(int *x){ + bool ret = __builtin_amdgcn_is_private(x); +} diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu new file mode 100644 index 00000000000000..1ea1d5f454762d --- /dev/null +++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu @@ -0,0 +1,31 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \ +// RUN: -o - | FileCheck %s + +#define __device__ __attribute__((device)) +typedef __attribute__((address_space(3))) float *LP; + +// CHECK-LABEL: define spir_func void @_Z22test_ds_atomic_add_f32Pff( +// CHECK-SAME: ptr addrspace(4) noundef [[ADDR:%.*]], float noundef [[VAL:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[VAL_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[RTN:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[ADDR_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr [[VAL_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[RTN_ASCAST:%.*]] = addrspacecast ptr [[RTN]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[ADDR]], ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[VAL]], ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3) +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8 +// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +__device__ void 
test_ds_atomic_add_f32(float *addr, float val) { + float *rtn; + *rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0); +} diff --git a/clang/test/CodeGenCUDA/long-double.cu b/clang/test/CodeGenCUDA/long-double.cu index d52de972ea3da4..898afcac124b5f 100644 --- a/clang/test/CodeGenCUDA/long-double.cu +++ b/clang/test/CodeGenCUDA/long-double.cu @@ -2,6 +2,10 @@ // RUN: -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \ // RUN: -emit-llvm -o - -x hip %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \ +// RUN: -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \ +// RUN: -emit-llvm -o - -x hip %s 2>&1 | FileCheck %s + // RUN: %clang_cc1 -triple nvptx \ // RUN: -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \ // RUN: -emit-llvm -o - %s 2>&1 | FileCheck %s diff --git a/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu b/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu new file mode 100644 index 00000000000000..2a0f84d1daa758 --- /dev/null +++ b/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "spirv64-amd-amdhsa" \ +// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -emit-llvm -o - %s | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK-LABEL: @_Z8test_argPDF16bDF16b( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr [[BF16]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[OUT:%.*]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store bfloat [[TMP0]], ptr addrspace(4) [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr addrspace(4) [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(4) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_arg(__bf16 *out, __bf16 in) { + __bf16 bf16 = in; + *out = bf16; +} + +// CHECK-LABEL: @_Z9test_loadPDF16bS_( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr [[BF16]] to ptr addrspace(4) +// CHECK-NEXT: store ptr addrspace(4) [[OUT:%.*]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IN_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr addrspace(4) [[TMP0]], align 2 
+// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(4) [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(4) [[BF16_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP2]], ptr addrspace(4) [[TMP3]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_load(__bf16 *out, __bf16 *in) { + __bf16 bf16 = *in; + *out = bf16; +} + +// CHECK-LABEL: @_Z8test_retDF16b( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4) +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: ret bfloat [[TMP0]] +// +__device__ __bf16 test_ret( __bf16 in) { + return in; +} + +// CHECK-LABEL: @_Z9test_callDF16b( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4) +// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4) +// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CALL:%.*]] = call contract spir_func noundef addrspace(4) bfloat @_Z8test_retDF16b(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]] +// CHECK-NEXT: ret bfloat [[CALL]] +// +__device__ __bf16 test_call( __bf16 in) { + return test_ret(in); +} + + +// CHECK-LABEL: @_Z15test_vec_assignv( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VEC2_A:%.*]] = alloca <2 x bfloat>, align 4 +// CHECK-NEXT: [[VEC2_B:%.*]] = alloca <2 x bfloat>, align 4 +// CHECK-NEXT: [[VEC4_A:%.*]] = alloca <4 x bfloat>, align 8 +// CHECK-NEXT: [[VEC4_B:%.*]] = alloca <4 x bfloat>, align 8 +// CHECK-NEXT: [[VEC8_A:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: [[VEC8_B:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: [[VEC16_A:%.*]] = alloca <16 x bfloat>, align 32 +// CHECK-NEXT: [[VEC16_B:%.*]] = alloca <16 x bfloat>, align 32 +// CHECK-NEXT: [[VEC2_A_ASCAST:%.*]] = addrspacecast ptr [[VEC2_A]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC2_B_ASCAST:%.*]] = addrspacecast ptr [[VEC2_B]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC4_A_ASCAST:%.*]] = addrspacecast ptr [[VEC4_A]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC4_B_ASCAST:%.*]] = addrspacecast ptr [[VEC4_B]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC8_A_ASCAST:%.*]] = addrspacecast ptr [[VEC8_A]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC8_B_ASCAST:%.*]] = addrspacecast ptr [[VEC8_B]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC16_A_ASCAST:%.*]] = addrspacecast ptr [[VEC16_A]] to ptr addrspace(4) +// CHECK-NEXT: [[VEC16_B_ASCAST:%.*]] = addrspacecast ptr [[VEC16_B]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(4) [[VEC2_B_ASCAST]], align 4 +// CHECK-NEXT: store <2 x bfloat> [[TMP0]], ptr addrspace(4) [[VEC2_A_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(4) [[VEC4_B_ASCAST]], align 8 +// CHECK-NEXT: store <4 x bfloat> [[TMP1]], ptr addrspace(4) [[VEC4_A_ASCAST]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] 
= load <8 x bfloat>, ptr addrspace(4) [[VEC8_B_ASCAST]], align 16
+// CHECK-NEXT: store <8 x bfloat> [[TMP2]], ptr addrspace(4) [[VEC8_A_ASCAST]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x bfloat>, ptr addrspace(4) [[VEC16_B_ASCAST]], align 32
+// CHECK-NEXT: store <16 x bfloat> [[TMP3]], ptr addrspace(4) [[VEC16_A_ASCAST]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_vec_assign() {
+  typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2;
+  bf16_x2 vec2_a, vec2_b;
+  vec2_a = vec2_b;
+
+  typedef __attribute__((ext_vector_type(4))) __bf16 bf16_x4;
+  bf16_x4 vec4_a, vec4_b;
+  vec4_a = vec4_b;
+
+  typedef __attribute__((ext_vector_type(8))) __bf16 bf16_x8;
+  bf16_x8 vec8_a, vec8_b;
+  vec8_a = vec8_b;
+
+  typedef __attribute__((ext_vector_type(16))) __bf16 bf16_x16;
+  bf16_x16 vec16_a, vec16_b;
+  vec16_a = vec16_b;
+}
diff --git a/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
new file mode 100644
index 00000000000000..2487e0fcd4343a
--- /dev/null
+++ b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: define spir_func void @_Z1fv(
+// CHECK-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[Y:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[Z:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[ADD:%.*]] = fadd half [[TMP0]], [[TMP1]]
+// CHECK-NEXT: store half [[ADD]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[SUB:%.*]] = fsub half [[TMP2]], [[TMP3]]
+// CHECK-NEXT: store half [[SUB]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[MUL:%.*]] = fmul half [[TMP4]], [[TMP5]]
+// CHECK-NEXT: store half [[MUL]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[DIV:%.*]] = fdiv half [[TMP6]], [[TMP7]]
+// CHECK-NEXT: store half [[DIV]], ptr [[Z]], align 2
+// CHECK-NEXT: ret void
+//
+void f() {
+  _Float16 x, y, z;
+
+  z = x + y;
+
+  z = x - y;
+
+  z = x * y;
+
+  z = x / y;
+}
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp b/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp
new file mode 100644
index 00000000000000..8226a109d8b8d9
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp
@@ -0,0 +1,27 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -aux-triple x86_64-pc-windows-msvc -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// Unlike OpenCL, HIP depends on the C++ interpretation of "unsigned long", which
+// is 64 bits long on Linux and 32 bits long on Windows. The return type of the
+// ballot intrinsic needs to be a 64 bit integer on both platforms. This test
+// cross-compiles to Windows to confirm that the return type is indeed 64 bits
+// on Windows.
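+// (That is what the noundef i64 in the CHECK-LABEL line below pins down.)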
+
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: define spir_func noundef i64 @_Z3fooi(
+// CHECK-SAME: i32 noundef [[P:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr [[P_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: store i32 [[P]], ptr addrspace(4) [[P_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[P_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.ballot.i64(i1 [[TOBOOL]])
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
+__device__ unsigned long long foo(int p) {
+  return __builtin_amdgcn_ballot_w64(p);
+}
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
new file mode 100644
index 00000000000000..2b785200e8eeab
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -fcuda-is-device -emit-llvm %s \
+// RUN:   -o - | FileCheck %s
+
+constexpr static int OpCtrl()
+{
+  return 15 + 1;
+}
+
+constexpr static int RowMask()
+{
+  return 3 + 1;
+}
+
+constexpr static int BankMask()
+{
+  return 2 + 1;
+}
+
+constexpr static bool BoundCtrl()
+{
+  return true & false;
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BoundCtrl());
+}
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-half.hip b/clang/test/CodeGenHIP/spirv-amdgcn-half.hip
new file mode 100644
index 00000000000000..2caf766d943b11
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-half.hip
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: @_Z2d0DF16_
+// CHECK: fpext
+__device__ float d0(_Float16 x) {
+  return x;
+}
+
+// CHECK-LABEL: @_Z2d1f
+// CHECK: fptrunc
+__device__ _Float16 d1(float x) {
+  return x;
+}
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index e0617f8de4da38..722e3e77214b64 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -173,6 +173,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv64
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv64-amd-amdhsa
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32-wasi
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32-emscripten
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index c4a9672f0814aa..7f036bff401ca0 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -236,6 +236,16 @@
 // CHECK-SPIRV64-DAG: #define __SPIRV64__ 1
 // CHECK-SPIRV64-NOT: #define __SPIRV32__ 1
 
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -triple spirv64-amd-amdhsa \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SPIRV64-AMDGCN
+// CHECK-SPIRV64-AMDGCN-DAG: #define __IMAGE_SUPPORT__ 1
+// CHECK-SPIRV64-AMDGCN-DAG: #define __SPIRV__ 1
+// CHECK-SPIRV64-AMDGCN-DAG: #define __SPIRV64__ 1
+// CHECK-SPIRV64-AMDGCN-DAG: #define __AMD__ 1
+// CHECK-SPIRV64-AMDGCN-DAG: #define __AMDGCN__ 1
+// CHECK-SPIRV64-AMDGCN-DAG: #define __AMDGPU__ 1
+// CHECK-SPIRV64-AMDGCN-NOT: #define __SPIRV32__ 1
+
 // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP
 // CHECK-HIP: #define __HIPCC__ 1
diff --git a/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp b/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp
new file mode 100644
index 00000000000000..2b8fac72847d6a
--- /dev/null
+++ b/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -x hip -fcuda-is-device -o - \
+// RUN:   -triple=spirv64-amd-amdhsa -fsyntax-only \
+// RUN:   -verify=dev
+// RUN: %clang_cc1 %s -x hip -triple x86_64 -o - \
+// RUN:   -aux-triple spirv64-amd-amdhsa -fsyntax-only \
+// RUN:   -verify=host
+
+// dev-no-diagnostics
+
+void test_host() {
+  __UINT32_TYPE__ val32;
+  __UINT64_TYPE__ val64;
+
+  // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_inc32' in __host__ function}}
+  val32 = __builtin_amdgcn_atomic_inc32(&val32, val32, __ATOMIC_SEQ_CST, "");
+
+  // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_inc64' in __host__ function}}
+  val64 = __builtin_amdgcn_atomic_inc64(&val64, val64, __ATOMIC_SEQ_CST, "");
+
+  // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_dec32' in __host__ function}}
+  val32 = __builtin_amdgcn_atomic_dec32(&val32, val32, __ATOMIC_SEQ_CST, "");
+
+  // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_dec64' in __host__ function}}
+  val64 = __builtin_amdgcn_atomic_dec64(&val64, val64, __ATOMIC_SEQ_CST, "");
+}
diff --git a/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl b/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl
new file mode 100644
index 00000000000000..0fb1b5f3672265
--- /dev/null
+++ b/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl
@@ -0,0 +1,111 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -fsyntax-only -verify %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+kernel void test () {
+
+  int sgpr = 0, vgpr = 0, imm = 0;
+
+  // sgpr constraints
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "s" (imm) : );
+
+  __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec}" (imm) : );
+  __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exe" (imm) : ); // expected-error {{invalid input constraint '{exe' in asm}}
+  __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec" (imm) : ); // expected-error {{invalid input constraint '{exec' in asm}}
+  __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec}a" (imm) : ); // expected-error {{invalid input constraint '{exec}a' in asm}}
+
+  // vgpr constraints
+  __asm__ ("v_mov_b32 %0, %1" : "=v" (vgpr) : "v" (imm) : );
+
+  // 'I' constraint (an immediate integer in the range -16 to 64)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (imm) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (-16) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (64) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (-17) : ); // expected-error {{value '-17' out of range for constraint 'I'}}
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (65) : ); // expected-error {{value '65' out of range for constraint 'I'}}
+
+  // 'J' constraint (an immediate 16-bit signed integer)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (imm) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (-32768) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (32767) : );
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (-32769) : ); // expected-error {{value '-32769' out of range for constraint 'J'}}
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (32768) : ); // expected-error {{value '32768' out of range for constraint 'J'}}
+
+  // 'A' constraint (an immediate constant that can be inlined)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "A" (imm) : );
+
+  // 'B' constraint (an immediate 32-bit signed integer)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "B" (imm) : );
+
+  // 'C' constraint (an immediate 32-bit unsigned integer or 'A' constraint)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "C" (imm) : );
+
+  // 'DA' constraint (an immediate 64-bit constant that can be split into two 'A' constants)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "DA" (imm) : );
+
+  // 'DB' constraint (an immediate 64-bit constant that can be split into two 'B' constants)
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "DB" (imm) : );
+
+}
+
+__kernel void
+test_float(const __global float *a, const __global float *b, __global float *c, unsigned i)
+{
+  float ai = a[i];
+  float bi = b[i];
+  float ci;
+
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{v2}"(ai), "{v3}"(bi) : );
+  __asm("v_add_f32_e32 v1, v2, v3" : ""(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "="(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '=' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={a}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={a}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1a}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1a}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={va}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={va}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1}a"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1}a' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "=v1}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '=v1}' in asm}}
+
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v[1]}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : );
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v[1}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[1}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v[1]"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[1]' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v[a]}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[a]}' in asm}}
+
+  __asm("v_add_f32_e32 v1, v2, v3" : "=v"(ci) : "v"(ai), "v"(bi) : );
+  __asm("v_add_f32_e32 v1, v2, v3" : "=v1"(ci) : "v2"(ai), "v3"(bi) : ); // expected-error {{invalid output constraint '=v1' in asm}}
+
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{a}"(ai), "{v3}"(bi) : ); // expected-error {{invalid input constraint '{a}' in asm}}
+  __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{v2}"(ai), "{a}"(bi) : ); // expected-error {{invalid input constraint '{a}' in asm}}
+  c[i] = ci;
+}
+
+__kernel void
+test_double(const __global double *a, const __global double *b, __global double *c, unsigned i)
+{
+  double ai = a[i];
+  double bi = b[i];
+  double ci;
+
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : );
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "=v{[1:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '=v{[1:2]}' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]a}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:2]a}' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]}a"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:2]}a' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:]}' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[:2]}' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:2]' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[1:2}' in asm}}
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[2:1]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); // expected-error {{invalid output constraint '={v[2:1]}' in asm}}
+
+  __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "=v[1:2]"(ci) : "v[3:4]"(ai), "v[5:6]"(bi) : ); // expected-error {{invalid output constraint '=v[1:2]' in asm}}
+
+  c[i] = ci;
+}
+
+void test_long(int arg0) {
+  long v15_16;
+  __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(arg0));
+}
diff --git a/clang/test/SemaCUDA/allow-int128.cu b/clang/test/SemaCUDA/allow-int128.cu
index eb7b7e7f52862b..af3e8c2453ad18 100644
--- a/clang/test/SemaCUDA/allow-int128.cu
+++ b/clang/test/SemaCUDA/allow-int128.cu
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN:   -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   -fcuda-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu \
+// RUN:   -fcuda-is-device -verify -fsyntax-only %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN:   -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   -fcuda-is-device -verify -fsyntax-only %s
diff --git a/clang/test/SemaCUDA/amdgpu-f128.cu b/clang/test/SemaCUDA/amdgpu-f128.cu
index 9a0212cdb93cff..1f5a6553dcc4fd 100644
--- a/clang/test/SemaCUDA/amdgpu-f128.cu
+++ b/clang/test/SemaCUDA/amdgpu-f128.cu
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 typedef __float128 f128_t;
diff --git a/clang/test/SemaCUDA/float16.cu b/clang/test/SemaCUDA/float16.cu
index bb5ed606438491..9c7faef284fee7 100644
--- a/clang/test/SemaCUDA/float16.cu
+++ b/clang/test/SemaCUDA/float16.cu
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple spirv64-amd-amdhsa -verify %s
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple nvptx64 -verify %s
 // expected-no-diagnostics
 #include "Inputs/cuda.h"
diff --git a/clang/test/SemaCUDA/fp16-arg-return.cu b/clang/test/SemaCUDA/fp16-arg-return.cu
index 23a9613b18b284..cc73dc4751d329 100644
--- a/clang/test/SemaCUDA/fp16-arg-return.cu
+++ b/clang/test/SemaCUDA/fp16-arg-return.cu
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -emit-llvm -o - -triple spirv64-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s
 
 // expected-no-diagnostics
 
diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
new file mode 100644
index 00000000000000..ea1f24670ff9a8
--- /dev/null
+++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
@@ -0,0 +1,86 @@
+// RUN: %clang_cc1 -x hip -std=c++11 -triple spirv64-amd-amdhsa -fcuda-is-device -verify -fsyntax-only %s
+
+#include "Inputs/cuda.h"
+
+__device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) {
+  int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}}
+  val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
+  val = __hip_atomic_load(0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}}
+  val = __hip_atomic_load(pi32, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pi32, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pi32, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
+  val = __hip_atomic_load(pu32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pll, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(pull, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(fp, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  val = __hip_atomic_load(dbl, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  return val;
+}
+
+__device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl,
+                                     int i32, unsigned int u32, long long i64, unsigned long long u64, float f32, double f64) {
+  __hip_atomic_store(0); // expected-error {{too few arguments to function call, expected 4, have 1}}
+  __hip_atomic_store(0, 0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 4, have 5}}
+  __hip_atomic_store(0, 0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}}
+  __hip_atomic_store(pi32, 0, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, 0, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, 0, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pu32, u32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pll, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pull, u64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(fp, f32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(dbl, f64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, u32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pi32, u64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(pll, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(fp, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(fp, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(dbl, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  __hip_atomic_store(dbl, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  return 0;
+}
+
+__device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) {
+  bool flag = __hip_atomic_compare_exchange_weak(0); // expected-error {{too few arguments to function call, expected 6, have 1}}
+  flag = __hip_atomic_compare_exchange_weak(0, 0, 0, 0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 6, have 7}}
+  flag = __hip_atomic_compare_exchange_weak(0, 0, 0, 0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}}
+  flag = __hip_atomic_compare_exchange_weak(ptr, 0, 0, 0, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}}, expected-warning {{null passed to a callee that requires a non-null argument}}
+  flag = __hip_atomic_compare_exchange_weak(ptr, 0, 0, 0, 0, __HIP_MEMORY_SCOPE_SYSTEM); // expected-warning {{null passed to a callee that requires a non-null argument}}
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning {{failure memory order argument to atomic operation is invalid}}
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
+  return flag;
+}

>From 393ce66f98e8f1e96384b0028ad13dddf99a1f30 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.vo...@amd.com>
Date: Tue, 23 Apr 2024 18:22:59 +0100
Subject: [PATCH 2/3] Fix formatting.

---
 clang/lib/Basic/Targets/SPIR.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 6b605979c9ab1d..d68b6dcb1340f2 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -365,7 +365,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo {
 };
 
 class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo
-  : public BaseSPIRVTargetInfo {
+    : public BaseSPIRVTargetInfo {
 public:
   SPIRV64AMDGCNTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
       : BaseSPIRVTargetInfo(Triple, Opts) {

>From 98db8f7ad89d1239096d1c4f8695a1557bfe4429 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.vo...@amd.com>
Date: Thu, 25 Apr 2024 02:58:55 +0100
Subject: [PATCH 3/3] Use `fillAMDGPUFeatureMap` instead of copy-pasta.

---
 clang/lib/Basic/Targets/SPIR.cpp       | 39 ++-----------------------
 llvm/lib/TargetParser/TargetParser.cpp | 40 +++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index d7d232ac9484f8..edce728be40a20 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -14,6 +14,7 @@
 #include "Targets.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/TargetParser/TargetParser.h"
 
 using namespace clang;
 using namespace clang::targets;
@@ -152,43 +153,7 @@ ArrayRef<const char *> SPIRV64AMDGCNTargetInfo::getGCCRegNames() const {
 bool SPIRV64AMDGCNTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef,
     const std::vector<std::string> &FeatureVec) const {
-  // This represents the union of all AMDGCN features.
-  Features["atomic-ds-pk-add-16-insts"] = true;
-  Features["atomic-flat-pk-add-16-insts"] = true;
-  Features["atomic-buffer-global-pk-add-f16-insts"] = true;
-  Features["atomic-global-pk-add-bf16-inst"] = true;
-  Features["atomic-fadd-rtn-insts"] = true;
-  Features["ci-insts"] = true;
-  Features["dot1-insts"] = true;
-  Features["dot2-insts"] = true;
-  Features["dot3-insts"] = true;
-  Features["dot4-insts"] = true;
-  Features["dot5-insts"] = true;
-  Features["dot7-insts"] = true;
-  Features["dot8-insts"] = true;
-  Features["dot9-insts"] = true;
-  Features["dot10-insts"] = true;
-  Features["dot11-insts"] = true;
-  Features["dl-insts"] = true;
-  Features["16-bit-insts"] = true;
-  Features["dpp"] = true;
-  Features["gfx8-insts"] = true;
-  Features["gfx9-insts"] = true;
-  Features["gfx90a-insts"] = true;
-  Features["gfx940-insts"] = true;
-  Features["gfx10-insts"] = true;
-  Features["gfx10-3-insts"] = true;
-  Features["gfx11-insts"] = true;
-  Features["gfx12-insts"] = true;
-  Features["image-insts"] = true;
-  Features["fp8-conversion-insts"] = true;
-  Features["s-memrealtime"] = true;
-  Features["s-memtime-inst"] = true;
-  Features["gws"] = true;
-  Features["fp8-insts"] = true;
-  Features["fp8-conversion-insts"] = true;
-  Features["atomic-ds-pk-add-16-insts"] = true;
-  Features["mai-insts"] = true;
+  llvm::AMDGPU::fillAMDGPUFeatureMap({}, getTriple(), Features);
 
   return TargetInfo::initFeatureMap(Features, Diags, {}, FeatureVec);
 }
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 0d784a79e5bac6..50757b74606f13 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -309,7 +309,45 @@ StringRef AMDGPU::getCanonicalArchName(const Triple &T, StringRef Arch) {
 void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
                                   StringMap<bool> &Features) {
   // XXX - What does the member GPU mean if device name string passed here?
-  if (T.isAMDGCN()) {
+  if (T.isSPIRV() && T.getOS() == Triple::OSType::AMDHSA) {
+    // AMDGCN SPIRV must support the union of all AMDGCN features.
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["atomic-flat-pk-add-16-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-global-pk-add-bf16-inst"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["ci-insts"] = true;
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot3-insts"] = true;
+    Features["dot4-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dot9-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dot11-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx90a-insts"] = true;
+    Features["gfx940-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["gfx12-insts"] = true;
+    Features["image-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["s-memrealtime"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["fp8-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["mai-insts"] = true;
+  } else if (T.isAMDGCN()) {
     switch (parseArchAMDGCN(GPU)) {
     case GK_GFX1201:
     case GK_GFX1200:
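For anyone who wants to try the new target end to end, here is a minimal usage sketch. It is illustrative only and not part of the patch; the file name, function name, and exact invocation are hypothetical, while the triple, the predefined macros, and the builtin are the ones exercised by the tests above.

// example.hip - hypothetical sketch, assuming a clang build with this series applied
#define __device__ __attribute__((device))

#if defined(__SPIRV64__) && defined(__AMDGCN__)
// The AMDGCN builtins remain usable under the AMDGCN-flavoured SPIR-V
// target; the ballot codegen test above checks the same lowering.
__device__ unsigned long long active_lanes(int p) {
  return __builtin_amdgcn_ballot_w64(p != 0);
}
#endif

Compiled along the lines of the RUN directives in the new tests, e.g. %clang_cc1 -triple spirv64-amd-amdhsa -x hip -fcuda-is-device -emit-llvm, this should produce a call to llvm.amdgcn.ballot.i64 in the emitted IR.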