Author: Yonah Goldberg Date: 2025-04-10T18:33:37-07:00 New Revision: 701d726ef09ea89909df9bd2fdc63c63758fe8d6
URL: https://github.com/llvm/llvm-project/commit/701d726ef09ea89909df9bd2fdc63c63758fe8d6 DIFF: https://github.com/llvm/llvm-project/commit/701d726ef09ea89909df9bd2fdc63c63758fe8d6.diff LOG: [NVPTX] Improve NVVMReflect Efficiency (#134416) The NVVMReflect pass simply replaces calls to nvvm-reflect functions with the appropriate constant, either the architecture number, or nvvm-reflect-ftz, found in the module's metadata. The implementation is inefficient and does this by traversing through all instructions to find calls. The common case is that you never call nvvm-reflect, so this traversal is costly. This PR: - Updates the pass so that it finds the reflect functions by name, and then traverses through their uses to find the calls directly. - Adds a line (245) to make sure the dead nvvm-reflect definitions are erased. - Adds the ability to set reflect values via command line Added: llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll Modified: clang/test/CodeGen/builtins-nvptx.c llvm/lib/Target/NVPTX/NVPTX.h llvm/lib/Target/NVPTX/NVPTXPassRegistry.def llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp llvm/lib/Target/NVPTX/NVVMReflect.cpp Removed: ################################################################################ diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 71b29849618b6..e2c159aac903f 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1,37 +1,37 @@ // REQUIRES: nvptx-registered-target // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \ // RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s // ### The last run to check with the highest SM and PTX version available // ### to make sure target builtins are still accepted. // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \ -// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s #define __device__ __attribute__((device)) @@ -61,6 +61,7 @@ __device__ bool reflect() { unsigned x = __nvvm_reflect("__CUDA_ARCH"); return x >= 700; + } __device__ int read_ntid() { diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index ff983e52179af..98e77ca80b8d5 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -43,7 +43,7 @@ ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMLegacyPass(); ModulePass *createNVPTXCtorDtorLoweringLegacyPass(); FunctionPass *createNVVMIntrRangePass(); -FunctionPass *createNVVMReflectPass(unsigned int SmVersion); +ModulePass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); @@ -55,6 +55,7 @@ MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); MachineFunctionPass *createNVPTXForwardParamsPass(); +void initializeNVVMReflectLegacyPassPass(PassRegistry &); void initializeGenericToNVVMLegacyPassPass(PassRegistry &); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); @@ -78,9 +79,9 @@ struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> { }; struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> { - NVVMReflectPass(); + NVVMReflectPass() : SmVersion(0) {} NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM); private: unsigned SmVersion; diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def index 34c79b8f77bae..1c813c2c51f70 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def +++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def @@ -18,6 +18,7 @@ #endif MODULE_PASS("generic-to-nvvm", GenericToNVVMPass()) MODULE_PASS("nvptx-lower-ctor-dtor", NVPTXCtorDtorLoweringPass()) +MODULE_PASS("nvvm-reflect", NVVMReflectPass()) #undef MODULE_PASS #ifndef FUNCTION_ANALYSIS @@ -36,7 +37,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA()) #define FUNCTION_PASS(NAME, CREATE_PASS) #endif FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass()) -FUNCTION_PASS("nvvm-reflect", NVVMReflectPass()) FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass()) FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this)); #undef FUNCTION_PASS diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8a25256ea1e4a..a4c3b43aec9f2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -95,7 +95,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { PassRegistry &PR = *PassRegistry::getPassRegistry(); // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. - initializeNVVMReflectPass(PR); + initializeNVVMReflectLegacyPassPass(PR); initializeNVVMIntrRangePass(PR); initializeGenericToNVVMLegacyPassPass(PR); initializeNVPTXAllocaHoistingPass(PR); @@ -240,11 +240,12 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineStartEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { - FunctionPassManager FPM; // We do not want to fold out calls to nvvm.reflect early if the user // has not provided a target architecture just yet. if (Subtarget.hasTargetName()) - FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); + PM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); + + FunctionPassManager FPM; // Note: NVVMIntrRangePass was causing numerical discrepancies at one // point, if issues crop up, consider disabling. FPM.addPass(NVVMIntrRangePass()); diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 2809ec2303f99..7273b30e4ae2e 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -20,12 +20,12 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -39,186 +39,201 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include <algorithm> #define NVVM_REFLECT_FUNCTION "__nvvm_reflect" #define NVVM_REFLECT_OCL_FUNCTION "__nvvm_reflect_ocl" +// Argument of reflect call to retrive arch number +#define CUDA_ARCH_NAME "__CUDA_ARCH" +// Argument of reflect call to retrive ftz mode +#define CUDA_FTZ_NAME "__CUDA_FTZ" +// Name of module metadata where ftz mode is stored +#define CUDA_FTZ_MODULE_NAME "nvvm-reflect-ftz" using namespace llvm; -#define DEBUG_TYPE "nvptx-reflect" +#define DEBUG_TYPE "nvvm-reflect" + +namespace llvm { +void initializeNVVMReflectLegacyPassPass(PassRegistry &); +} // namespace llvm namespace { -class NVVMReflect : public FunctionPass { +class NVVMReflect { + // Map from reflect function call arguments to the value to replace the call + // with. Should include __CUDA_FTZ and __CUDA_ARCH values. + StringMap<unsigned> ReflectMap; + bool handleReflectFunction(Module &M, StringRef ReflectName); + void populateReflectMap(Module &M); + void foldReflectCall(CallInst *Call, Constant *NewValue); + public: - static char ID; - unsigned int SmVersion; - NVVMReflect() : NVVMReflect(0) {} - explicit NVVMReflect(unsigned int Sm) : FunctionPass(ID), SmVersion(Sm) {} + // __CUDA_FTZ is assigned in `runOnModule` by checking nvvm-reflect-ftz module + // metadata. + explicit NVVMReflect(unsigned SmVersion) + : ReflectMap({{CUDA_ARCH_NAME, SmVersion * 10}}) {} + bool runOnModule(Module &M); +}; - bool runOnFunction(Function &) override; +class NVVMReflectLegacyPass : public ModulePass { + NVVMReflect Impl; + +public: + static char ID; + NVVMReflectLegacyPass(unsigned SmVersion) : ModulePass(ID), Impl(SmVersion) {} + bool runOnModule(Module &M) override; }; } // namespace -FunctionPass *llvm::createNVVMReflectPass(unsigned int SmVersion) { - return new NVVMReflect(SmVersion); +ModulePass *llvm::createNVVMReflectPass(unsigned SmVersion) { + return new NVVMReflectLegacyPass(SmVersion); } static cl::opt<bool> NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden, cl::desc("NVVM reflection, enabled by default")); -char NVVMReflect::ID = 0; -INITIALIZE_PASS(NVVMReflect, "nvvm-reflect", +char NVVMReflectLegacyPass::ID = 0; +INITIALIZE_PASS(NVVMReflectLegacyPass, "nvvm-reflect", "Replace occurrences of __nvvm_reflect() calls with 0/1", false, false) -static bool runNVVMReflect(Function &F, unsigned SmVersion) { - if (!NVVMReflectEnabled) - return false; - - if (F.getName() == NVVM_REFLECT_FUNCTION || - F.getName() == NVVM_REFLECT_OCL_FUNCTION) { - assert(F.isDeclaration() && "_reflect function should not have a body"); - assert(F.getReturnType()->isIntegerTy() && - "_reflect's return type should be integer"); - return false; +// Allow users to specify additional key/value pairs to reflect. These key/value +// pairs are the last to be added to the ReflectMap, and therefore will take +// precedence over initial values (i.e. __CUDA_FTZ from module medadata and +// __CUDA_ARCH from SmVersion). +static cl::list<std::string> ReflectList( + "nvvm-reflect-add", cl::value_desc("name=<int>"), cl::Hidden, + cl::desc("A key=value pair. Replace __nvvm_reflect(name) with value."), + cl::ValueRequired); + +// Set the ReflectMap with, first, the value of __CUDA_FTZ from module metadata, +// and then the key/value pairs from the command line. +void NVVMReflect::populateReflectMap(Module &M) { + if (auto *Flag = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag(CUDA_FTZ_MODULE_NAME))) + ReflectMap[CUDA_FTZ_NAME] = Flag->getSExtValue(); + + for (auto &Option : ReflectList) { + LLVM_DEBUG(dbgs() << "ReflectOption : " << Option << "\n"); + StringRef OptionRef(Option); + auto [Name, Val] = OptionRef.split('='); + if (Name.empty()) + report_fatal_error(Twine("Empty name in nvvm-reflect-add option '") + + Option + "'"); + if (Val.empty()) + report_fatal_error(Twine("Missing value in nvvm-reflect-add option '") + + Option + "'"); + unsigned ValInt; + if (!to_integer(Val.trim(), ValInt, 10)) + report_fatal_error( + Twine("integer value expected in nvvm-reflect-add option '") + + Option + "'"); + ReflectMap[Name] = ValInt; } +} - SmallVector<Instruction *, 4> ToRemove; - SmallVector<Instruction *, 4> ToSimplify; - - // Go through the calls in this function. Each call to __nvvm_reflect or - // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument. - // First validate that. If the c-string corresponding to the ConstantArray can - // be found successfully, see if it can be found in VarMap. If so, replace the - // uses of CallInst with the value found in VarMap. If not, replace the use - // with value 0. - - // The IR for __nvvm_reflect calls diff ers between CUDA versions. - // - // CUDA 6.5 and earlier uses this sequence: - // %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8 - // (i8 addrspace(4)* getelementptr inbounds - // ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) - // %reflect = tail call i32 @__nvvm_reflect(i8* %ptr) - // - // The value returned by Sym->getOperand(0) is a Constant with a - // ConstantDataSequential operand which can be converted to string and used - // for lookup. - // - // CUDA 7.0 does it slightly diff erently: - // %reflect = call i32 @__nvvm_reflect(i8* addrspacecast - // (i8 addrspace(1)* getelementptr inbounds - // ([8 x i8], [8 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) - // - // In this case, we get a Constant with a GlobalVariable operand and we need - // to dig deeper to find its initializer with the string we'll use for lookup. - for (Instruction &I : instructions(F)) { - CallInst *Call = dyn_cast<CallInst>(&I); +/// Process a reflect function by finding all its calls and replacing them with +/// appropriate constant values. For __CUDA_FTZ, uses the module flag value. +/// For __CUDA_ARCH, uses SmVersion * 10. For all other strings, uses 0. +bool NVVMReflect::handleReflectFunction(Module &M, StringRef ReflectName) { + Function *F = M.getFunction(ReflectName); + if (!F) + return false; + assert(F->isDeclaration() && "_reflect function should not have a body"); + assert(F->getReturnType()->isIntegerTy() && + "_reflect's return type should be integer"); + + const bool Changed = F->getNumUses() > 0; + for (User *U : make_early_inc_range(F->users())) { + // Reflect function calls look like: + // @arch = private unnamed_addr addrspace(1) constant [12 x i8] + // c"__CUDA_ARCH\00" call i32 @__nvvm_reflect(ptr addrspacecast (ptr + // addrspace(1) @arch to ptr)) We need to extract the string argument from + // the call (i.e. "__CUDA_ARCH") + auto *Call = dyn_cast<CallInst>(U); if (!Call) - continue; - Function *Callee = Call->getCalledFunction(); - if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION && - Callee->getName() != NVVM_REFLECT_OCL_FUNCTION && - Callee->getIntrinsicID() != Intrinsic::nvvm_reflect)) - continue; - - // FIXME: Improve error handling here and elsewhere in this pass. - assert(Call->getNumOperands() == 2 && - "Wrong number of operands to __nvvm_reflect function"); - - // In cuda 6.5 and earlier, we will have an extra constant-to-generic - // conversion of the string. - const Value *Str = Call->getArgOperand(0); - if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) { - // FIXME: Add assertions about ConvCall. - Str = ConvCall->getArgOperand(0); - } - // Pre opaque pointers we have a constant expression wrapping the constant - // string. - Str = Str->stripPointerCasts(); - assert(isa<Constant>(Str) && - "Format of __nvvm_reflect function not recognized"); - - const Value *Operand = cast<Constant>(Str)->getOperand(0); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) { - // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's - // initializer. - assert(GV->hasInitializer() && - "Format of _reflect function not recognized"); - const Constant *Initializer = GV->getInitializer(); - Operand = Initializer; - } - - assert(isa<ConstantDataSequential>(Operand) && - "Format of _reflect function not recognized"); - assert(cast<ConstantDataSequential>(Operand)->isCString() && - "Format of _reflect function not recognized"); - - StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString(); - ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1); - LLVM_DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n"); - - int ReflectVal = 0; // The default value is 0 - if (ReflectArg == "__CUDA_FTZ") { - // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag. Our - // choice here must be kept in sync with AutoUpgrade, which uses the same - // technique to detect whether ftz is enabled. - if (auto *Flag = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag("nvvm-reflect-ftz"))) - ReflectVal = Flag->getSExtValue(); - } else if (ReflectArg == "__CUDA_ARCH") { - ReflectVal = SmVersion * 10; - } - - // If the immediate user is a simple comparison we want to simplify it. - for (User *U : Call->users()) - if (Instruction *I = dyn_cast<Instruction>(U)) - ToSimplify.push_back(I); - - Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); - ToRemove.push_back(Call); + report_fatal_error( + "__nvvm_reflect can only be used in a call instruction"); + if (Call->getNumOperands() != 2) + report_fatal_error("__nvvm_reflect requires exactly one argument"); + + auto *GlobalStr = + dyn_cast<Constant>(Call->getArgOperand(0)->stripPointerCasts()); + if (!GlobalStr) + report_fatal_error("__nvvm_reflect argument must be a constant string"); + + auto *ConstantStr = + dyn_cast<ConstantDataSequential>(GlobalStr->getOperand(0)); + if (!ConstantStr) + report_fatal_error("__nvvm_reflect argument must be a string constant"); + if (!ConstantStr->isCString()) + report_fatal_error( + "__nvvm_reflect argument must be a null-terminated string"); + + StringRef ReflectArg = ConstantStr->getAsString().drop_back(); + if (ReflectArg.empty()) + report_fatal_error("__nvvm_reflect argument cannot be empty"); + // Now that we have extracted the string argument, we can look it up in the + // ReflectMap + unsigned ReflectVal = 0; // The default value is 0 + if (ReflectMap.contains(ReflectArg)) + ReflectVal = ReflectMap[ReflectArg]; + + LLVM_DEBUG(dbgs() << "Replacing call of reflect function " << F->getName() + << "(" << ReflectArg << ") with value " << ReflectVal + << "\n"); + auto *NewValue = ConstantInt::get(Call->getType(), ReflectVal); + foldReflectCall(Call, NewValue); + Call->eraseFromParent(); } - // The code guarded by __nvvm_reflect may be invalid for the target machine. - // Traverse the use-def chain, continually simplifying constant expressions - // until we find a terminator that we can then remove. - while (!ToSimplify.empty()) { - Instruction *I = ToSimplify.pop_back_val(); - if (Constant *C = ConstantFoldInstruction(I, F.getDataLayout())) { - for (User *U : I->users()) - if (Instruction *I = dyn_cast<Instruction>(U)) - ToSimplify.push_back(I); - - I->replaceAllUsesWith(C); - if (isInstructionTriviallyDead(I)) { - ToRemove.push_back(I); - } + // Remove the __nvvm_reflect function from the module + F->eraseFromParent(); + return Changed; +} + +void NVVMReflect::foldReflectCall(CallInst *Call, Constant *NewValue) { + SmallVector<Instruction *, 8> Worklist; + // Replace an instruction with a constant and add all users of the instruction + // to the worklist + auto ReplaceInstructionWithConst = [&](Instruction *I, Constant *C) { + for (auto *U : I->users()) + if (auto *UI = dyn_cast<Instruction>(U)) + Worklist.push_back(UI); + I->replaceAllUsesWith(C); + }; + + ReplaceInstructionWithConst(Call, NewValue); + + auto &DL = Call->getModule()->getDataLayout(); + while (!Worklist.empty()) { + auto *I = Worklist.pop_back_val(); + if (auto *C = ConstantFoldInstruction(I, DL)) { + ReplaceInstructionWithConst(I, C); + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); } else if (I->isTerminator()) { ConstantFoldTerminator(I->getParent()); } } - - // Removing via isInstructionTriviallyDead may add duplicates to the ToRemove - // array. Filter out the duplicates before starting to erase from parent. - std::sort(ToRemove.begin(), ToRemove.end()); - auto NewLastIter = llvm::unique(ToRemove); - ToRemove.erase(NewLastIter, ToRemove.end()); - - for (Instruction *I : ToRemove) - I->eraseFromParent(); - - return ToRemove.size() > 0; } -bool NVVMReflect::runOnFunction(Function &F) { - return runNVVMReflect(F, SmVersion); +bool NVVMReflect::runOnModule(Module &M) { + if (!NVVMReflectEnabled) + return false; + populateReflectMap(M); + bool Changed = true; + Changed |= handleReflectFunction(M, NVVM_REFLECT_FUNCTION); + Changed |= handleReflectFunction(M, NVVM_REFLECT_OCL_FUNCTION); + Changed |= + handleReflectFunction(M, Intrinsic::getName(Intrinsic::nvvm_reflect)); + return Changed; } -NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {} +bool NVVMReflectLegacyPass::runOnModule(Module &M) { + return Impl.runOnModule(M); +} -PreservedAnalyses NVVMReflectPass::run(Function &F, - FunctionAnalysisManager &AM) { - return runNVVMReflect(F, SmVersion) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); +PreservedAnalyses NVVMReflectPass::run(Module &M, ModuleAnalysisManager &AM) { + return NVVMReflect(SmVersion).runOnModule(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll new file mode 100644 index 0000000000000..0706882236d86 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-options.ll @@ -0,0 +1,54 @@ +; Test the NVVM reflect pass functionality: verifying that reflect calls are replaced with +; appropriate values based on command-line options. Verify that we can handle custom reflect arguments +; that aren't __CUDA_ARCH or __CUDA_FTZ. If that argument is given a value on the command-line, +; the reflect call should be replaced with that value. Otherwise, the reflect call should be replaced with 0. + +; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \ +; RUN: -nvvm-reflect-add __CUDA_FTZ=1 -nvvm-reflect-add __CUDA_ARCH=350 %s -S \ +; RUN: | FileCheck %s --check-prefixes=COMMON,FTZ1,ARCH350,CUSTOM-ABSENT +; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \ +; RUN: -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %s -S \ +; RUN: | FileCheck %s --check-prefixes=COMMON,FTZ0,ARCH520,CUSTOM-ABSENT +; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \ +; RUN: -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 \ +; RUN: -nvvm-reflect-add __CUSTOM_VALUE=42 %s -S \ +; RUN: | FileCheck %s --check-prefixes=COMMON,CUSTOM-PRESENT + +; To ensure that command line options override module options, create a copy of this test file +; with module options appended and rerun some tests. + +; RUN: cat %s > %t.options +; RUN: echo '!llvm.module.flags = !{!0}' >> %t.options +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.options +; RUN: opt -passes=nvvm-reflect -mtriple=nvptx-nvidia-cuda \ +; RUN: -nvvm-reflect-add __CUDA_FTZ=0 -nvvm-reflect-add __CUDA_ARCH=520 %t.options -S \ +; RUN: | FileCheck %s --check-prefixes=COMMON,FTZ0,ARCH520 + +declare i32 @__nvvm_reflect(ptr) +@ftz = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" +@arch = private unnamed_addr addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00" +@custom = private unnamed_addr addrspace(1) constant [15 x i8] c"__CUSTOM_VALUE\00" + +; COMMON-LABEL: define i32 @test_ftz() +; FTZ1: ret i32 1 +; FTZ0: ret i32 0 +define i32 @test_ftz() { + %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @ftz to ptr)) + ret i32 %1 +} + +; COMMON-LABEL: define i32 @test_arch() +; ARCH350: ret i32 350 +; ARCH520: ret i32 520 +define i32 @test_arch() { + %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @arch to ptr)) + ret i32 %1 +} + +; COMMON-LABEL: define i32 @test_custom() +; CUSTOM-ABSENT: ret i32 0 +; CUSTOM-PRESENT: ret i32 42 +define i32 @test_custom() { + %1 = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @custom to ptr)) + ret i32 %1 +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits