https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/188926
>From 2adde40164d6026cb60e7af9e1d8bda7694636bb Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Wed, 25 Mar 2026 23:14:58 -0400 Subject: [PATCH 1/6] Avoid copies from `std::string` --- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 19b14efff1c65..ef9c4556b9412 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Mangle.h" #include "clang/Basic/Cuda.h" @@ -1750,9 +1751,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext *astCtx) { return "cuda"; } -static std::string addUnderscoredPrefix(llvm::StringRef prefix, +static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix, llvm::StringRef name) { - return ("__" + prefix + name).str(); + return ("__" + prefix + name).getSingleStringRef(); } /// Creates a global constructor function for the module: @@ -1839,7 +1840,8 @@ void LoweringPreparePass::buildCUDAModuleCtor() { // Create the fatbin string constant with GPU binary contents. auto fatbinType = ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size()); - std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str"); + llvm::StringRef fatbinStrName = + addUnderscoredPrefix(cudaPrefix, "_fatbin_str"); GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, fatbinType, /*isConstant=*/true, {}, GlobalLinkageKind::PrivateLinkage); @@ -1854,7 +1856,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { auto fatbinWrapperType = RecordType::get( &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct); - std::string fatbinWrapperName = + llvm::StringRef fatbinWrapperName = addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper"); GlobalOp fatbinWrapper = GlobalOp::create( builder, loc, fatbinWrapperName, fatbinWrapperType, @@ -1877,7 +1879,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { {magicInit, versionInit, fatbinInit, unusedInit}))); // Create the GPU binary handle global variable. - std::string gpubinHandleName = + llvm::StringRef gpubinHandleName = addUnderscoredPrefix(cudaPrefix, "_gpubin_handle"); GlobalOp gpuBinHandle = GlobalOp::create( >From dc5ee921e0505aef340ec9edc8beb33542443336 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Fri, 27 Mar 2026 03:38:57 -0400 Subject: [PATCH 2/6] fix twine crashes --- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index ef9c4556b9412..9e581b089f946 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -1751,9 +1751,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext *astCtx) { return "cuda"; } -static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix, +static std::string addUnderscoredPrefix(llvm::StringRef prefix, llvm::StringRef name) { - return ("__" + prefix + name).getSingleStringRef(); + return ("__" + prefix + name).str(); } /// Creates a global constructor function for the module: @@ -1840,8 +1840,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { // Create the fatbin string constant with GPU binary contents. auto fatbinType = ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size()); - llvm::StringRef fatbinStrName = - addUnderscoredPrefix(cudaPrefix, "_fatbin_str"); + std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str"); GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, fatbinType, /*isConstant=*/true, {}, GlobalLinkageKind::PrivateLinkage); @@ -1856,7 +1855,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { auto fatbinWrapperType = RecordType::get( &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct); - llvm::StringRef fatbinWrapperName = + std::string fatbinWrapperName = addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper"); GlobalOp fatbinWrapper = GlobalOp::create( builder, loc, fatbinWrapperName, fatbinWrapperType, @@ -1879,7 +1878,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { {magicInit, versionInit, fatbinInit, unusedInit}))); // Create the GPU binary handle global variable. - llvm::StringRef gpubinHandleName = + std::string gpubinHandleName = addUnderscoredPrefix(cudaPrefix, "_gpubin_handle"); GlobalOp gpuBinHandle = GlobalOp::create( >From ccbc1bc53478814c8f7af46746cc8d7952e47e8f Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Thu, 2 Apr 2026 13:45:13 -0400 Subject: [PATCH 3/6] fix fmt --- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 9e581b089f946..19b14efff1c65 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -10,7 +10,6 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/IRMapping.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Mangle.h" #include "clang/Basic/Cuda.h" >From de686986fa84a02bb08be4a913443f39f0a1817c Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Wed, 25 Mar 2026 22:29:47 -0400 Subject: [PATCH 4/6] [CIR][CUDA] Handle CUDA module constructor and destructor emission. --- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 19b14efff1c65..2b38f9cbecfba 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Value.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Mangle.h" #include "clang/Basic/Cuda.h" >From df4d636b90dc39b83d23a39e8d3e643f096594a9 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Fri, 27 Mar 2026 04:40:08 -0400 Subject: [PATCH 5/6] [CIR][CUDA] Do Runtime Kernel Registration --- .../Dialect/Transforms/LoweringPrepare.cpp | 121 +++++++++++++++++- clang/test/CIR/CodeGenCUDA/device-stub.cu | 30 ++++- 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 2b38f9cbecfba..de7ab647c177c 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Location.h" #include "mlir/IR/Value.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Mangle.h" @@ -31,6 +32,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/IR/Instructions.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -128,6 +130,9 @@ struct LoweringPreparePass /// with the CUDA runtime. void buildCUDAModuleCtor(); std::optional<FuncOp> buildCUDAModuleDtor(); + std::optional<FuncOp> buildCUDARegisterGlobals(); + void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); /// Handle static local variable initialization with guard variables. void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp); @@ -1923,8 +1928,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() { mlir::Value gpuBinaryHandleGlobal = builder.createGetGlobal(gpuBinHandle); builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal); - // TODO: Generate __cuda_register_globals and emit a call. - assert(!cir::MissingFeatures::globalRegistration()); + // --- Generate __cuda_register_globals and call it --- + std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals(); + if (regGlobal) { + builder.createCallOp(loc, *regGlobal, gpuBinaryHandle); + } // From CUDA 10.1 onwards, we must call this function to end registration: // void __cudaRegisterFatBinaryEnd(void **fatbinHandle); @@ -2006,6 +2014,115 @@ std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() { return dtor; } +std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() { + // There is nothing to register. + if (cudaKernelMap.empty()) + return {}; + + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(mlirModule.getBody()); + + mlir::Location loc = mlirModule.getLoc(); + llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + + // Create the function: + // void __cuda_register_globals(void **fatbinHandle) + std::string regGlobalFuncName = + addUnderscoredPrefix(cudaPrefix, "_register_globals"); + auto regGlobalFuncTy = FuncType::get({voidPtrPtrTy}, voidTy); + FuncOp regGlobalFunc = + buildRuntimeFunction(builder, regGlobalFuncName, loc, regGlobalFuncTy, + /*linkage=*/GlobalLinkageKind::InternalLinkage); + builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); + + buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); + // TODO: Handle shadow registration + assert(!cir::MissingFeatures::globalRegistration()); + + ReturnOp::create(builder, loc); + return regGlobalFunc; +} + +void LoweringPreparePass::buildCUDARegisterGlobalFunctions( + cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc) { + mlir::Location loc = mlirModule.getLoc(); + llvm::StringRef cudaPrefix = getCUDAPrefix(astCtx); + cir::CIRDataLayout dataLayout(mlirModule); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + IntType intTy = builder.getSIntNTy(32); + IntType charTy = cir::IntType::get(&getContext(), astCtx->getCharWidth(), + /*isSigned=*/false); + + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(mlirModule.getBody()); + + // Declare CUDA internal functions: + // int __cudaRegisterFunction( + // void **fatbinHandle, + // const char *hostFunc, + // char *deviceFunc, + // const char *deviceName, + // int threadLimit, + // uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, + // int *wsize + // ) + // OG doesn't care about the types at all. They're treated as void*. + + FuncOp cudaRegisterFunction = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterFunction"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy}, + intTy)); + + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + auto tmpString = cir::GlobalOp::create( + globalBuilder, loc, (".str" + str).str(), strType, + /*isConstant=*/true, {}, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + cir::ConstantOp cirNullPtr = builder.getNullPtr(voidPtrTy, loc); + bool isHIP = astCtx->getLangOpts().HIP; + for (auto kernelName : cudaKernelMap.keys()) { + FuncOp deviceStub = cudaKernelMap[kernelName]; + GlobalOp deviceFuncStr = makeConstantString(kernelName); + mlir::Value deviceFunc = builder.createBitcast( + builder.createGetGlobal(deviceFuncStr), voidPtrTy); + + if (isHIP) { + llvm_unreachable("HIP kernel registration NYI"); + } else { + mlir::Value hostFunc = builder.createBitcast( + GetGlobalOp::create( + builder, loc, PointerType::get(deviceStub.getFunctionType()), + mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())), + voidPtrTy); + builder.createCallOp( + loc, cudaRegisterFunction, + {fatbinHandle, hostFunc, deviceFunc, deviceFunc, + ConstantOp::create(builder, loc, IntAttr::get(intTy, -1)), + cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr}); + } + } +} + void LoweringPreparePass::runOnOperation() { mlir::Operation *op = getOperation(); if (isa<::mlir::ModuleOp>(op)) diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu b/clang/test/CIR/CodeGenCUDA/device-stub.cu index 4562bf1523141..0f9d4d68d67ff 100644 --- a/clang/test/CIR/CodeGenCUDA/device-stub.cu +++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu @@ -24,7 +24,7 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // Check module constructor is registered in module attributes. // CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>] -// Check runtime function declarations (appear before dtor in output). +// Check runtime function declarations. // CIR: cir.func private @atexit(!cir.ptr<!cir.func<()>>) -> !s32i // CIR: cir.func private @__cudaUnregisterFatBinary(!cir.ptr<!cir.ptr<!void>>) @@ -37,6 +37,25 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>) +// Check the __cudaRegisterFunction runtime declaration: +// int __cudaRegisterFunction(void**, void*, void*, void*, int, +// void*, void*, void*, void*, void*) +// CIR: cir.func private @__cudaRegisterFunction(!cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !s32i, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>, !cir.ptr<!void>) -> !s32i + +// Check the device-side name string for kernelfunc (mangled, null-terminated). +// CIR: cir.global "private" constant cir_private @".str_Z10kernelfunciii" = #cir.const_array<"_Z10kernelfunciii", trailing_zeros> : !cir.array<!u8i x 18> + +// Check __cuda_register_globals body: one __cudaRegisterFunction call per kernel. +// CIR: cir.func internal private @__cuda_register_globals(%arg0: !cir.ptr<!cir.ptr<!void>> +// CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!void> +// CIR-NEXT: %[[STR_ADDR:.*]] = cir.get_global @".str_Z10kernelfunciii" +// CIR-NEXT: %[[DEVICE_FUNC:.*]] = cir.cast bitcast %[[STR_ADDR]] +// CIR-NEXT: %[[HOST_FUNC_RAW:.*]] = cir.get_global @{{.*}}kernelfunc{{.*}} +// CIR-NEXT: %[[HOST_FUNC:.*]] = cir.cast bitcast %[[HOST_FUNC_RAW]] +// CIR-NEXT: %[[THREAD_LIMIT:.*]] = cir.const #cir.int<-1> : !s32i +// CIR-NEXT: cir.call @__cudaRegisterFunction(%{{.*}}, %[[HOST_FUNC]], %[[DEVICE_FUNC]], %[[DEVICE_FUNC]], %[[THREAD_LIMIT]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]], %[[NULL]]) +// CIR-NEXT: cir.return + // CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = #cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> {alignment = 8 : i64, section = ".nv_fatbin"} // Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }, with section. @@ -53,13 +72,15 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // CIR: cir.func private @__cudaRegisterFatBinary(!cir.ptr<!void>) -> !cir.ptr<!cir.ptr<!void>> // Check the module constructor body: register fatbin, store handle, -// call RegisterFatBinaryEnd (CUDA >= 10.1), then register dtor with atexit. +// call __cuda_register_globals, call RegisterFatBinaryEnd (CUDA >= 10.1), +// then register dtor with atexit. // CIR: cir.func internal private @__cuda_module_ctor() // CIR-NEXT: %[[WRAPPER:.*]] = cir.get_global @__cuda_fatbin_wrapper // CIR-NEXT: %[[VOID_PTR:.*]] = cir.cast bitcast %[[WRAPPER]] // CIR-NEXT: %[[RET:.*]] = cir.call @__cudaRegisterFatBinary(%[[VOID_PTR]]) // CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle // CIR-NEXT: cir.store %[[RET]], %[[HANDLE_ADDR]] +// CIR-NEXT: cir.call @__cuda_register_globals(%[[RET]]) // CIR-NEXT: cir.call @__cudaRegisterFatBinaryEnd(%[[RET]]) // CIR-NEXT: %[[DTOR_PTR:.*]] = cir.get_global @__cuda_module_dtor // CIR-NEXT: {{.*}} = cir.call @atexit(%[[DTOR_PTR]]) @@ -70,9 +91,14 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } // OGCG: @__cuda_gpubin_handle = internal global ptr null // OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor +// OGCG: define internal void @__cuda_register_globals +// OGCG: call{{.*}}__cudaRegisterFunction(ptr %0, {{.*}}kernelfunc{{.*}}, ptr @0 +// OGCG: ret void + // OGCG: define internal void @__cuda_module_ctor // OGCG: call{{.*}}__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) // OGCG: store ptr %{{.*}}, ptr @__cuda_gpubin_handle +// OGCG-NEXT: call void @__cuda_register_globals // OGCG: call i32 @atexit(ptr @__cuda_module_dtor) // OGCG: define internal void @__cuda_module_dtor >From 1a81469a70f6587ebe8f0bea58c7ab86ae6e49f9 Mon Sep 17 00:00:00 2001 From: David Rivera <[email protected]> Date: Tue, 14 Apr 2026 15:56:16 -0400 Subject: [PATCH 6/6] address comment --- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index de7ab647c177c..9015fe483a7df 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -1929,8 +1929,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal); // --- Generate __cuda_register_globals and call it --- - std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals(); - if (regGlobal) { + if (std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals()) { builder.createCallOp(loc, *regGlobal, gpuBinaryHandle); } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
