https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/94422
>From 708374e03f1bf70006f2472f19edad1bd621e2d6 Mon Sep 17 00:00:00 2001 From: Alex MacLean <amacl...@nvidia.com> Date: Mon, 3 Jun 2024 16:46:36 +0000 Subject: [PATCH 1/4] [NVPTX] Revamp NVVMIntrRange pass --- clang/test/CodeGenCUDA/cuda-builtin-vars.cu | 24 +-- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 32 ++-- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 6 +- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 58 ++++-- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 16 +- llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 177 ++++++++++--------- llvm/test/CodeGen/NVPTX/intr-range.ll | 60 +++++++ llvm/test/CodeGen/NVPTX/intrinsic-old.ll | 43 ++--- 8 files changed, 249 insertions(+), 167 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/intr-range.ll diff --git a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu index ba5e5f13ebe70..dba0a76af21dd 100644 --- a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu +++ b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu @@ -6,21 +6,21 @@ __attribute__((global)) void kernel(int *out) { int i = 0; - out[i++] = threadIdx.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() - out[i++] = threadIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() - out[i++] = threadIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z() + out[i++] = threadIdx.x; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.x() + out[i++] = threadIdx.y; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.y() + out[i++] = threadIdx.z; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.z() - out[i++] = blockIdx.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - out[i++] = blockIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - out[i++] = blockIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + out[i++] = blockIdx.x; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + out[i++] = blockIdx.y; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + out[i++] = blockIdx.z; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - out[i++] = blockDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - out[i++] = blockDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y() - out[i++] = blockDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + out[i++] = blockDim.x; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + out[i++] = blockDim.y; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + out[i++] = blockDim.z; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.z() - out[i++] = gridDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() - out[i++] = gridDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() - out[i++] = gridDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() + out[i++] = gridDim.x; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + out[i++] = gridDim.y; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + out[i++] = gridDim.z; // CHECK: call noundef {{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() out[i++] = warpSize; // CHECK: store i32 32, diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index f63697916d902..82770f8660850 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -542,30 +542,24 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. - unsigned Reqntidx, Reqntidy, Reqntidz; - Reqntidx = Reqntidy = Reqntidz = 1; - bool ReqSpecified = false; - ReqSpecified |= getReqNTIDx(F, Reqntidx); - ReqSpecified |= getReqNTIDy(F, Reqntidy); - ReqSpecified |= getReqNTIDz(F, Reqntidz); + std::optional<unsigned> Reqntidx = getReqNTIDx(F); + std::optional<unsigned> Reqntidy = getReqNTIDy(F); + std::optional<unsigned> Reqntidz = getReqNTIDz(F); - if (ReqSpecified) - O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz - << "\n"; + if (Reqntidx || Reqntidy || Reqntidz) + O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1) + << ", " << Reqntidz.value_or(1) << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. - unsigned Maxntidx, Maxntidy, Maxntidz; - Maxntidx = Maxntidy = Maxntidz = 1; - bool MaxSpecified = false; - MaxSpecified |= getMaxNTIDx(F, Maxntidx); - MaxSpecified |= getMaxNTIDy(F, Maxntidy); - MaxSpecified |= getMaxNTIDz(F, Maxntidz); - - if (MaxSpecified) - O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz - << "\n"; + std::optional<unsigned> Maxntidx = getMaxNTIDx(F); + std::optional<unsigned> Maxntidy = getMaxNTIDy(F); + std::optional<unsigned> Maxntidz = getMaxNTIDz(F); + + if (Maxntidx || Maxntidy || Maxntidz) + O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1) + << ", " << Maxntidz.value_or(1) << "\n"; unsigned Mincta = 0; if (getMinCTASm(F, Mincta)) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 4dc3cea4bd8e7..657decb3308b3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -233,9 +233,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); - // FIXME: NVVMIntrRangePass is causing numerical discrepancies, - // investigate and re-enable. - // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); + // Note: NVVMIntrRangePass was causing numerical discrepancies at one + // point, if issues crop up, consider disabling. + FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 013afe916e86c..4305bbc99c969 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -128,6 +128,15 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, return true; } +static std::optional<unsigned> +findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) { + unsigned RetVal; + bool Found = findOneNVVMAnnotation(&GV, PropName, RetVal); + if (Found) + return RetVal; + return std::nullopt; +} + bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { auto &AC = getAnnotationCache(); @@ -252,32 +261,57 @@ std::string getSamplerName(const Value &val) { return std::string(val.getName()); } -bool getMaxNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "maxntidx", x); +std::optional<unsigned> getMaxNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidx"); } -bool getMaxNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "maxntidy", y); +std::optional<unsigned> getMaxNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidy"); } -bool getMaxNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "maxntidz", z); +std::optional<unsigned> getMaxNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidz"); +} + +std::optional<unsigned> getMaxNTID(const Function &F) { + // Note: The semantics here are a bit strange. The PTX ISA states the + // following (11.4.2. Performance-Tuning Directives: .maxntid): + // + // Note that this directive guarantees that the total number of threads does + // not exceed the maximum, but does not guarantee that the limit in any + // particular dimension is not exceeded. + std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F); + std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F); + std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F); + if (MaxNTIDx || MaxNTIDy || MaxNTIDz) + return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1); + return std::nullopt; } bool getMaxClusterRank(const Function &F, unsigned &x) { return findOneNVVMAnnotation(&F, "maxclusterrank", x); } -bool getReqNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "reqntidx", x); +std::optional<unsigned> getReqNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidx"); +} + +std::optional<unsigned> getReqNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidy"); } -bool getReqNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "reqntidy", y); +std::optional<unsigned> getReqNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidz"); } -bool getReqNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "reqntidz", z); +std::optional<unsigned> getReqNTID(const Function &F) { + // Note: The semantics here are a bit strange. See getMaxNTID. + std::optional<unsigned> ReqNTIDx = getReqNTIDx(F); + std::optional<unsigned> ReqNTIDy = getReqNTIDy(F); + std::optional<unsigned> ReqNTIDz = getReqNTIDz(F); + if (ReqNTIDx || ReqNTIDy || ReqNTIDz) + return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1); + return std::nullopt; } bool getMinCTASm(const Function &F, unsigned &x) { diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 2872db9fa2131..e020bc0f02e96 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -48,13 +48,15 @@ std::string getTextureName(const Value &); std::string getSurfaceName(const Value &); std::string getSamplerName(const Value &); -bool getMaxNTIDx(const Function &, unsigned &); -bool getMaxNTIDy(const Function &, unsigned &); -bool getMaxNTIDz(const Function &, unsigned &); - -bool getReqNTIDx(const Function &, unsigned &); -bool getReqNTIDy(const Function &, unsigned &); -bool getReqNTIDz(const Function &, unsigned &); +std::optional<unsigned> getMaxNTIDx(const Function &); +std::optional<unsigned> getMaxNTIDy(const Function &); +std::optional<unsigned> getMaxNTIDz(const Function &); +std::optional<unsigned> getMaxNTID(const Function &F); + +std::optional<unsigned> getReqNTIDx(const Function &); +std::optional<unsigned> getReqNTIDy(const Function &); +std::optional<unsigned> getReqNTIDz(const Function &); +std::optional<unsigned> getReqNTID(const Function &); bool getMaxClusterRank(const Function &, unsigned &); bool getMinCTASm(const Function &, unsigned &); diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 5381646434eb8..a7ab350c40a87 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -1,4 +1,4 @@ -//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===// +//===- NVVMIntrRange.cpp - Set range attributes for NVVM intrinsics -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,19 +6,21 @@ // //===----------------------------------------------------------------------===// // -// This pass adds appropriate !range metadata for calls to NVVM +// This pass adds appropriate range attributes for calls to NVVM // intrinsics that return a limited range of values. // //===----------------------------------------------------------------------===// #include "NVPTX.h" -#include "llvm/IR/Constants.h" +#include "NVPTXUtilities.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" +#include <cstdint> using namespace llvm; @@ -26,25 +28,24 @@ using namespace llvm; namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } -// Add !range metadata based on limits of given SM variant. +// Add range attributes based on limits of given SM variant. static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20), cl::Hidden, cl::desc("SM variant")); namespace { class NVVMIntrRange : public FunctionPass { - private: - unsigned SmVersion; + unsigned SmVersion; - public: - static char ID; - NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} - NVVMIntrRange(unsigned int SmVersion) - : FunctionPass(ID), SmVersion(SmVersion) { +public: + static char ID; + NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} + NVVMIntrRange(unsigned int SmVersion) + : FunctionPass(ID), SmVersion(SmVersion) { - initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); - } + initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &) override; + bool runOnFunction(Function &) override; }; } @@ -58,17 +59,17 @@ INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", // Adds the passed-in [Low,High) range information as metadata to the // passed-in call instruction. -static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) { - // This call already has range metadata, nothing to do. - if (C->getMetadata(LLVMContext::MD_range)) +static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) { + if (II->getMetadata(LLVMContext::MD_range)) return false; - LLVMContext &Context = C->getParent()->getContext(); - IntegerType *Int32Ty = Type::getInt32Ty(Context); - Metadata *LowAndHigh[] = { - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)), - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))}; - C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh)); + const uint64_t BitWidth = II->getType()->getIntegerBitWidth(); + ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High)); + + if (auto CurrentRange = II->getRange()) + Range = Range.intersectWith(CurrentRange.value()); + + II->addRangeRetAttr(Range); return true; } @@ -76,9 +77,13 @@ static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { struct { unsigned x, y, z; } MaxBlockSize, MaxGridSize; - MaxBlockSize.x = 1024; - MaxBlockSize.y = 1024; - MaxBlockSize.z = 64; + + const unsigned MetadataNTID = getReqNTID(F).value_or( + getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max())); + + MaxBlockSize.x = std::min(1024u, MetadataNTID); + MaxBlockSize.y = std::min(1024u, MetadataNTID); + MaxBlockSize.z = std::min(64u, MetadataNTID); MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff; MaxGridSize.y = 0xffff; @@ -87,69 +92,67 @@ static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { // Go through the calls in this function. bool Changed = false; for (Instruction &I : instructions(F)) { - CallInst *Call = dyn_cast<CallInst>(&I); - if (!Call) + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) continue; - if (Function *Callee = Call->getCalledFunction()) { - switch (Callee->getIntrinsicID()) { - // Index within block - case Intrinsic::nvvm_read_ptx_sreg_tid_x: - Changed |= addRangeMetadata(0, MaxBlockSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_y: - Changed |= addRangeMetadata(0, MaxBlockSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_z: - Changed |= addRangeMetadata(0, MaxBlockSize.z, Call); - break; - - // Block size - case Intrinsic::nvvm_read_ptx_sreg_ntid_x: - Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_y: - Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_z: - Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call); - break; - - // Index within grid - case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: - Changed |= addRangeMetadata(0, MaxGridSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: - Changed |= addRangeMetadata(0, MaxGridSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: - Changed |= addRangeMetadata(0, MaxGridSize.z, Call); - break; - - // Grid size - case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: - Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: - Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: - Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call); - break; - - // warp size is constant 32. - case Intrinsic::nvvm_read_ptx_sreg_warpsize: - Changed |= addRangeMetadata(32, 32+1, Call); - break; - - // Lane ID is [0..warpsize) - case Intrinsic::nvvm_read_ptx_sreg_laneid: - Changed |= addRangeMetadata(0, 32, Call); - break; - - default: - break; - } + switch (II->getIntrinsicID()) { + // Index within block + case Intrinsic::nvvm_read_ptx_sreg_tid_x: + Changed |= addRangeAttr(0, MaxBlockSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_y: + Changed |= addRangeAttr(0, MaxBlockSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_z: + Changed |= addRangeAttr(0, MaxBlockSize.z, II); + break; + + // Block size + case Intrinsic::nvvm_read_ptx_sreg_ntid_x: + Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_y: + Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_z: + Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II); + break; + + // Index within grid + case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: + Changed |= addRangeAttr(0, MaxGridSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: + Changed |= addRangeAttr(0, MaxGridSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: + Changed |= addRangeAttr(0, MaxGridSize.z, II); + break; + + // Grid size + case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: + Changed |= addRangeAttr(1, MaxGridSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: + Changed |= addRangeAttr(1, MaxGridSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: + Changed |= addRangeAttr(1, MaxGridSize.z + 1, II); + break; + + // warp size is constant 32. + case Intrinsic::nvvm_read_ptx_sreg_warpsize: + Changed |= addRangeAttr(32, 32 + 1, II); + break; + + // Lane ID is [0..warpsize) + case Intrinsic::nvvm_read_ptx_sreg_laneid: + Changed |= addRangeAttr(0, 32, II); + break; + + default: + break; } } diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll new file mode 100644 index 0000000000000..3fd1672759903 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/intr-range.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5 +; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s + +define i32 @test_maxntid() { +; CHECK-LABEL: define i32 @test_maxntid( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret i32 [[TMP5]] +; + %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %4 = add i32 %1, %2 + %5 = add i32 %4, %3 + ret i32 %5 +} + +define i32 @test_reqntid() { +; CHECK-LABEL: define i32 @test_reqntid( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP5]] +; + %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %4 = add i32 %1, %2 + %5 = add i32 %4, %3 + ret i32 %5 +} + +;; A case like this could occur if a function with the sreg intrinsic was +;; inlined into a kernel where the tid metadata is present, ensure the range is +;; updated. +define i32 @test_inlined() { +; CHECK-LABEL: define i32 @test_inlined( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + +!nvvm.annotations = !{!0, !1, !2} +!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3} +!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20} +!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4} diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index 3930e6d774183..a53e538241e31 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -15,7 +15,7 @@ define ptx_device i32 @test_tid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]] +; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ret i32 %x @@ -23,7 +23,7 @@ define ptx_device i32 @test_tid_x() { define ptx_device i32 @test_tid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.y(), !range ![[BLK_IDX_XY]] +; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() ret i32 %x @@ -31,7 +31,7 @@ define ptx_device i32 @test_tid_y() { define ptx_device i32 @test_tid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.z(), !range ![[BLK_IDX_Z:[0-9]+]] +; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() ret i32 %x @@ -46,7 +46,7 @@ define ptx_device i32 @test_tid_w() { define ptx_device i32 @test_ntid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range ![[BLK_SIZE_XY:[0-9]+]] +; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ret i32 %x @@ -54,7 +54,7 @@ define ptx_device i32 @test_ntid_x() { define ptx_device i32 @test_ntid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y(), !range ![[BLK_SIZE_XY]] +; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ret i32 %x @@ -62,7 +62,7 @@ define ptx_device i32 @test_ntid_y() { define ptx_device i32 @test_ntid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z(), !range ![[BLK_SIZE_Z:[0-9]+]] +; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ret i32 %x @@ -77,7 +77,7 @@ define ptx_device i32 @test_ntid_w() { define ptx_device i32 @test_laneid() { ; CHECK: mov.u32 %r{{[0-9]+}}, %laneid; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.laneid(), !range ![[LANEID:[0-9]+]] +; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.laneid() ret i32 %x @@ -85,7 +85,7 @@ define ptx_device i32 @test_laneid() { define ptx_device i32 @test_warpsize() { ; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range ![[WARPSIZE:[0-9]+]] +; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() ret i32 %x @@ -107,7 +107,7 @@ define ptx_device i32 @test_nwarpid() { define ptx_device i32 @test_ctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range ![[GRID_IDX_YZ:[0-9]+]] +; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ret i32 %x @@ -115,7 +115,7 @@ define ptx_device i32 @test_ctaid_y() { define ptx_device i32 @test_ctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !range ![[GRID_IDX_YZ]] +; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ret i32 %x @@ -123,8 +123,8 @@ define ptx_device i32 @test_ctaid_z() { define ptx_device i32 @test_ctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x; -; RANGE_30: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_X:[0-9]+]] -; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_YZ]] +; RANGE_30: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; RANGE_20: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ret i32 %x @@ -139,7 +139,7 @@ define ptx_device i32 @test_ctaid_w() { define ptx_device i32 @test_nctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !range ![[GRID_SIZE_YZ:[0-9]+]] +; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ret i32 %x @@ -147,7 +147,7 @@ define ptx_device i32 @test_nctaid_y() { define ptx_device i32 @test_nctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z(), !range ![[GRID_SIZE_YZ]] +; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ret i32 %x @@ -155,8 +155,8 @@ define ptx_device i32 @test_nctaid_z() { define ptx_device i32 @test_nctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; -; RANGE_30: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_X:[0-9]+]] -; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_YZ]] +; RANGE_30: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; RANGE_20: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ret i32 %x @@ -327,14 +327,3 @@ declare void @llvm.nvvm.bar.sync(i32 %i) !0 = !{i32 0, i32 19} ; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19} -; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024} -; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024} -; RANGE-DAG: ![[BLK_IDX_Z]] = !{i32 0, i32 64} -; RANGE-DAG: ![[BLK_SIZE_XY]] = !{i32 1, i32 1025} -; RANGE-DAG: ![[BLK_SIZE_Z]] = !{i32 1, i32 65} -; RANGE-DAG: ![[LANEID]] = !{i32 0, i32 32} -; RANGE-DAG: ![[WARPSIZE]] = !{i32 32, i32 33} -; RANGE_30-DAG: ![[GRID_IDX_X]] = !{i32 0, i32 2147483647} -; RANGE-DAG: ![[GRID_IDX_YZ]] = !{i32 0, i32 65535} -; RANGE_30-DAG: ![[GRID_SIZE_X]] = !{i32 1, i32 -2147483648} -; RANGE-DAG: ![[GRID_SIZE_YZ]] = !{i32 1, i32 65536} >From b8081628cf40008cd1bccf702afabcb8f52bcca0 Mon Sep 17 00:00:00 2001 From: Alex MacLean <amacl...@nvidia.com> Date: Wed, 5 Jun 2024 21:57:55 +0000 Subject: [PATCH 2/4] Remove SmVersion from IntrRange pass --- llvm/lib/Target/NVPTX/NVPTX.h | 7 +---- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 28 ++++++-------------- llvm/test/CodeGen/NVPTX/intrinsic-old.ll | 18 ++++++------- 4 files changed, 18 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 5eefab59a6ab5..b0cb24c63c3ce 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -40,7 +40,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMLegacyPass(); ModulePass *createNVPTXCtorDtorLoweringLegacyPass(); -FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); +FunctionPass *createNVVMIntrRangePass(); FunctionPass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); @@ -53,12 +53,7 @@ MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> { - NVVMIntrRangePass(); - NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - -private: - unsigned SmVersion; }; struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 657decb3308b3..b60a1d747af79 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -235,7 +235,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks( FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); // Note: NVVMIntrRangePass was causing numerical discrepancies at one // point, if issues crop up, consider disabling. - FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); + FPM.addPass(NVVMIntrRangePass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index a7ab350c40a87..9afce99ce79f1 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -28,30 +28,22 @@ using namespace llvm; namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } -// Add range attributes based on limits of given SM variant. -static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20), - cl::Hidden, cl::desc("SM variant")); - namespace { class NVVMIntrRange : public FunctionPass { unsigned SmVersion; public: static char ID; - NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} - NVVMIntrRange(unsigned int SmVersion) - : FunctionPass(ID), SmVersion(SmVersion) { + NVVMIntrRange() : FunctionPass(ID) { initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &) override; }; -} +} // namespace -FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) { - return new NVVMIntrRange(SmVersion); -} +FunctionPass *llvm::createNVVMIntrRangePass() { return new NVVMIntrRange(); } char NVVMIntrRange::ID = 0; INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", @@ -73,7 +65,7 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) { return true; } -static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { +static bool runNVVMIntrRange(Function &F) { struct { unsigned x, y, z; } MaxBlockSize, MaxGridSize; @@ -85,7 +77,7 @@ static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { MaxBlockSize.y = std::min(1024u, MetadataNTID); MaxBlockSize.z = std::min(64u, MetadataNTID); - MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff; + MaxGridSize.x = 0x7fffffff; MaxGridSize.y = 0xffff; MaxGridSize.z = 0xffff; @@ -159,14 +151,10 @@ static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { return Changed; } -bool NVVMIntrRange::runOnFunction(Function &F) { - return runNVVMIntrRange(F, SmVersion); -} - -NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {} +bool NVVMIntrRange::runOnFunction(Function &F) { return runNVVMIntrRange(F); } PreservedAnalyses NVVMIntrRangePass::run(Function &F, FunctionAnalysisManager &AM) { - return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + return runNVVMIntrRange(F) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index a53e538241e31..64c4ea493580d 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s +; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s +; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s +; RUN: -passes=nvvm-intr-range \ +; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s +; RUN: -passes=nvvm-intr-range \ +; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @@ -123,8 +123,7 @@ define ptx_device i32 @test_ctaid_z() { define ptx_device i32 @test_ctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x; -; RANGE_30: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; RANGE_20: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ret i32 %x @@ -155,8 +154,7 @@ define ptx_device i32 @test_nctaid_z() { define ptx_device i32 @test_nctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; -; RANGE_30: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() -; RANGE_20: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ret i32 %x >From d27648cb75b54e7da1af30fd743a55f90b02f3bc Mon Sep 17 00:00:00 2001 From: Alex MacLean <amacl...@nvidia.com> Date: Wed, 5 Jun 2024 22:05:05 +0000 Subject: [PATCH 3/4] address comments --- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 3 +- llvm/test/CodeGen/NVPTX/intr-range.ll | 58 ++++++++++++++++++------ 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 4305bbc99c969..3a536db1c9727 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -131,8 +131,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, static std::optional<unsigned> findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) { unsigned RetVal; - bool Found = findOneNVVMAnnotation(&GV, PropName, RetVal); - if (Found) + if (findOneNVVMAnnotation(&GV, PropName, RetVal)) return RetVal; return std::nullopt; } diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll index 3fd1672759903..2f3e08a039f52 100644 --- a/llvm/test/CodeGen/NVPTX/intr-range.ll +++ b/llvm/test/CodeGen/NVPTX/intr-range.ll @@ -5,35 +5,59 @@ define i32 @test_maxntid() { ; CHECK-LABEL: define i32 @test_maxntid( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP11:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP10]], [[TMP6]] ; CHECK-NEXT: ret i32 [[TMP5]] ; %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() - %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() - %4 = add i32 %1, %2 - %5 = add i32 %4, %3 - ret i32 %5 + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + %10 = add i32 %9, %5 + %11 = add i32 %10, %6 + ret i32 %11 } define i32 @test_reqntid() { ; CHECK-LABEL: define i32 @test_reqntid( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[TMP3]] -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() - %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() - %4 = add i32 %1, %2 - %5 = add i32 %4, %3 + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + %10 = add i32 %9, %5 + %11 = add i32 %10, %6 ret i32 %5 } @@ -50,9 +74,13 @@ define i32 @test_inlined() { ret i32 %1 } -declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() + +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() !nvvm.annotations = !{!0, !1, !2} !0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3} >From 1aeadbac9652ceb8b6c123840d053936e534d911 Mon Sep 17 00:00:00 2001 From: Alex MacLean <amacl...@nvidia.com> Date: Wed, 5 Jun 2024 22:19:04 +0000 Subject: [PATCH 4/4] address comments - remove redundant RUN lines --- llvm/test/CodeGen/NVPTX/intrinsic-old.ll | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index 64c4ea493580d..85f7817f08dc4 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -2,14 +2,6 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ ; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits