Author: David Rivera Date: 2026-04-14T11:27:20-04:00 New Revision: 2b49a90b82ec5cf753f1ebafddb9790660c1aaa9
URL: https://github.com/llvm/llvm-project/commit/2b49a90b82ec5cf753f1ebafddb9790660c1aaa9 DIFF: https://github.com/llvm/llvm-project/commit/2b49a90b82ec5cf753f1ebafddb9790660c1aaa9.diff LOG: [CIR][CUDA] Handle CUDA module constructor and destructor emission. (#188673) Added: Modified: clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp clang/test/CIR/CodeGenCUDA/device-stub.cu Removed: ################################################################################ diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index c30c3ef4fa3af..19b14efff1c65 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -12,6 +12,7 @@ #include "mlir/IR/IRMapping.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Mangle.h" +#include "clang/Basic/Cuda.h" #include "clang/Basic/Module.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" @@ -28,10 +29,13 @@ #include "clang/CIR/MissingFeatures.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" #include <memory> +#include <optional> using namespace mlir; using namespace cir; @@ -122,6 +126,7 @@ struct LoweringPreparePass /// Build the CUDA module constructor that registers the fat binary /// with the CUDA runtime. void buildCUDAModuleCtor(); + std::optional<FuncOp> buildCUDAModuleDtor(); /// Handle static local variable initialization with guard variables. void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp); @@ -1815,6 +1820,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { CIRBaseBuilderTy builder(getContext()); builder.setInsertionPointToStart(mlirModule.getBody()); + Type voidTy = builder.getVoidTy(); PointerType voidPtrTy = builder.getVoidPtrTy(); PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy); IntType intTy = builder.getSIntNTy(32); @@ -1880,8 +1886,123 @@ void LoweringPreparePass::buildCUDAModuleCtor() { gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy)); gpuBinHandle.setPrivate(); - // TODO: ctor/dtor/register_globals - assert(!cir::MissingFeatures::globalRegistration()); + // Declare this function: + // void **__{cuda|hip}RegisterFatBinary(void *); + + std::string regFuncName = + addUnderscoredPrefix(cudaPrefix, "RegisterFatBinary"); + FuncType regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy); + cir::FuncOp regFunc = + buildRuntimeFunction(builder, regFuncName, loc, regFuncType); + + std::string moduleCtorName = addUnderscoredPrefix(cudaPrefix, "_module_ctor"); + cir::FuncOp moduleCtor = buildRuntimeFunction( + builder, moduleCtorName, loc, FuncType::get({}, voidTy), + GlobalLinkageKind::InternalLinkage); + + globalCtorList.emplace_back(moduleCtorName, + cir::GlobalCtorAttr::getDefaultPriority()); + builder.setInsertionPointToStart(moduleCtor.addEntryBlock()); + assert(!cir::MissingFeatures::opGlobalCtorPriority()); + if (isHIP) { + llvm_unreachable("HIP Module Constructor Support"); + } else if (!astCtx->getLangOpts().GPURelocatableDeviceCode) { + + // --- Create CUDA CTOR-DTOR --- + // Register binary with CUDA runtime. This is substantially diff erent in + // default mode vs. separate compilation. + // Corresponding code: + // gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper); + mlir::Value wrapper = builder.createGetGlobal(fatbinWrapper); + mlir::Value fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy); + cir::CallOp gpuBinaryHandleCall = + builder.createCallOp(loc, regFunc, fatbinVoidPtr); + mlir::Value gpuBinaryHandle = gpuBinaryHandleCall.getResult(); + // Store the value back to the global `__cuda_gpubin_handle`. + mlir::Value gpuBinaryHandleGlobal = builder.createGetGlobal(gpuBinHandle); + builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal); + + // TODO: Generate __cuda_register_globals and emit a call. + assert(!cir::MissingFeatures::globalRegistration()); + + // From CUDA 10.1 onwards, we must call this function to end registration: + // void __cudaRegisterFatBinaryEnd(void **fatbinHandle); + // This is CUDA-specific, so no need to use `addUnderscoredPrefix`. + if (clang::CudaFeatureEnabled( + astCtx->getTargetInfo().getSDKVersion(), + clang::CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(mlirModule.getBody()); + FuncOp endFunc = + buildRuntimeFunction(globalBuilder, "__cudaRegisterFatBinaryEnd", loc, + FuncType::get({voidPtrPtrTy}, voidTy)); + builder.createCallOp(loc, endFunc, gpuBinaryHandle); + } + } else + llvm_unreachable("GPU RDC NYI"); + + // Create destructor and register it with atexit() the way NVCC does it. Doing + // it during regular destructor phase worked in CUDA before 9.2 but results in + // double-free in 9.2. + if (std::optional<FuncOp> dtor = buildCUDAModuleDtor()) { + + // extern "C" int atexit(void (*f)(void)); + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(mlirModule.getBody()); + FuncOp atexit = buildRuntimeFunction( + globalBuilder, "atexit", loc, + FuncType::get(PointerType::get(dtor->getFunctionType()), intTy)); + mlir::Value dtorFunc = GetGlobalOp::create( + builder, loc, PointerType::get(dtor->getFunctionType()), + mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr())); + builder.createCallOp(loc, atexit, dtorFunc); + } + cir::ReturnOp::create(builder, loc); +} + +std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() { + if (!mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName())) + return {}; + + llvm::StringRef prefix = getCUDAPrefix(astCtx); + + VoidType voidTy = VoidType::get(&getContext()); + PointerType voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); + + mlir::Location loc = mlirModule.getLoc(); + + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(mlirModule.getBody()); + + // define: void __cudaUnregisterFatBinary(void ** handle); + std::string unregisterFuncName = + addUnderscoredPrefix(prefix, "UnregisterFatBinary"); + FuncOp unregisterFunc = buildRuntimeFunction( + builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy)); + + // void __cuda_module_dtor(); + // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be + // put into globalDtorList. If it were a real dtor, then it would cause + // double free above CUDA 9.2. The way to use it is to manually call + // atexit() at end of module ctor. + std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor"); + FuncOp dtor = + buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy), + GlobalLinkageKind::InternalLinkage); + + builder.setInsertionPointToStart(dtor.addEntryBlock()); + + // For dtor, we only need to call: + // __cudaUnregisterFatBinary(__cuda_gpubin_handle); + + std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle"); + GlobalOp gpubinGlobal = cast<GlobalOp>(mlirModule.lookupSymbol(gpubinName)); + mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal); + mlir::Value gpubin = builder.createLoad(loc, gpubinAddress); + builder.createCallOp(loc, unregisterFunc, gpubin); + ReturnOp::create(builder, loc); + + return dtor; } void LoweringPreparePass::runOnOperation() { diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu index 2e9deaee9b225..4562bf1523141 100644 --- a/clang/test/CIR/CodeGenCUDA/device-stub.cu +++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu @@ -21,6 +21,22 @@ __global__ void kernelfunc(int i, int j, int k) {} void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } +// Check module constructor is registered in module attributes. +// CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>] + +// Check runtime function declarations (appear before dtor in output). +// CIR: cir.func private @atexit(!cir.ptr<!cir.func<()>>) -> !s32i +// CIR: cir.func private @__cudaUnregisterFatBinary(!cir.ptr<!cir.ptr<!void>>) + +// Check the module destructor body: load handle and call UnregisterFatBinary. +// CIR: cir.func internal private @__cuda_module_dtor() +// CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle +// CIR-NEXT: %[[HANDLE:.*]] = cir.load %[[HANDLE_ADDR]] +// CIR-NEXT: cir.call @__cudaUnregisterFatBinary(%[[HANDLE]]) +// CIR-NEXT: cir.return + +// CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>) + // CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = #cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> {alignment = 8 : i64, section = ".nv_fatbin"} // Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }, with section. @@ -34,9 +50,34 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // Check the GPU binary handle global. // CIR: cir.global "private" internal @__cuda_gpubin_handle = #cir.ptr<null> : !cir.ptr<!cir.ptr<!void>> +// CIR: cir.func private @__cudaRegisterFatBinary(!cir.ptr<!void>) -> !cir.ptr<!cir.ptr<!void>> + +// Check the module constructor body: register fatbin, store handle, +// call RegisterFatBinaryEnd (CUDA >= 10.1), then register dtor with atexit. +// CIR: cir.func internal private @__cuda_module_ctor() +// CIR-NEXT: %[[WRAPPER:.*]] = cir.get_global @__cuda_fatbin_wrapper +// CIR-NEXT: %[[VOID_PTR:.*]] = cir.cast bitcast %[[WRAPPER]] +// CIR-NEXT: %[[RET:.*]] = cir.call @__cudaRegisterFatBinary(%[[VOID_PTR]]) +// CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle +// CIR-NEXT: cir.store %[[RET]], %[[HANDLE_ADDR]] +// CIR-NEXT: cir.call @__cudaRegisterFatBinaryEnd(%[[RET]]) +// CIR-NEXT: %[[DTOR_PTR:.*]] = cir.get_global @__cuda_module_dtor +// CIR-NEXT: {{.*}} = cir.call @atexit(%[[DTOR_PTR]]) +// CIR-NEXT: cir.return + // OGCG: constant [25 x i8] c"GPU binary would be here.", section ".nv_fatbin", align 8 // OGCG: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { i32 1180844977, i32 1, ptr @{{.*}}, ptr null }, section ".nvFatBinSegment" // OGCG: @__cuda_gpubin_handle = internal global ptr null +// OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor + +// OGCG: define internal void @__cuda_module_ctor +// OGCG: call{{.*}}__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) +// OGCG: store ptr %{{.*}}, ptr @__cuda_gpubin_handle +// OGCG: call i32 @atexit(ptr @__cuda_module_dtor) + +// OGCG: define internal void @__cuda_module_dtor +// OGCG: load ptr, ptr @__cuda_gpubin_handle +// OGCG: call void @__cudaUnregisterFatBinary // No GPU binary — no registration infrastructure at all. // NOGPUBIN-NOT: fatbin _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
