https://github.com/RiverDave updated 
https://github.com/llvm/llvm-project/pull/188673

>From 7b45b6ec4d62aa5ed8ef92d1c8a84562b96ab147 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Wed, 25 Mar 2026 23:14:58 -0400
Subject: [PATCH 1/8] Avoid copies from `std::string`

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index c30c3ef4fa3af..25290079a5ccc 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Module.h"
@@ -1745,9 +1746,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext 
*astCtx) {
   return "cuda";
 }
 
-static std::string addUnderscoredPrefix(llvm::StringRef prefix,
+static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).str();
+  return ("__" + prefix + name).getSingleStringRef();
 }
 
 /// Creates a global constructor function for the module:
@@ -1833,7 +1834,8 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  llvm::StringRef fatbinStrName =
+      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, 
fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1848,7 +1850,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  std::string fatbinWrapperName =
+  llvm::StringRef fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1871,7 +1873,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  std::string gpubinHandleName =
+  llvm::StringRef gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From d3c1e6cd0327e0a7962c3fcec30f0f2750635555 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Fri, 27 Mar 2026 03:38:57 -0400
Subject: [PATCH 2/8] fix twine crashes

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 25290079a5ccc..3d2ef77b4f4e9 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1746,9 +1746,9 @@ static llvm::StringRef getCUDAPrefix(clang::ASTContext 
*astCtx) {
   return "cuda";
 }
 
-static llvm::StringRef addUnderscoredPrefix(llvm::StringRef prefix,
+static std::string addUnderscoredPrefix(llvm::StringRef prefix,
                                         llvm::StringRef name) {
-  return ("__" + prefix + name).getSingleStringRef();
+  return ("__" + prefix + name).str();
 }
 
 /// Creates a global constructor function for the module:
@@ -1834,8 +1834,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   // Create the fatbin string constant with GPU binary contents.
   auto fatbinType =
       ArrayType::get(&getContext(), charTy, gpuBinary->getBuffer().size());
-  llvm::StringRef fatbinStrName =
-      addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
+  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = GlobalOp::create(builder, loc, fatbinStrName, 
fatbinType,
                                         /*isConstant=*/true, {},
                                         GlobalLinkageKind::PrivateLinkage);
@@ -1850,7 +1849,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   auto fatbinWrapperType = RecordType::get(
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy},
       /*packed=*/false, /*padded=*/false, RecordType::RecordKind::Struct);
-  llvm::StringRef fatbinWrapperName =
+  std::string fatbinWrapperName =
       addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = GlobalOp::create(
       builder, loc, fatbinWrapperName, fatbinWrapperType,
@@ -1873,7 +1872,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                            {magicInit, versionInit, fatbinInit, unusedInit})));
 
   // Create the GPU binary handle global variable.
-  llvm::StringRef gpubinHandleName =
+  std::string gpubinHandleName =
       addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
 
   GlobalOp gpuBinHandle = GlobalOp::create(

>From 13baf135c161fef5b4b9792d1cae67e70cd543db Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 13:45:13 -0400
Subject: [PATCH 3/8] fix fmt

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 3d2ef77b4f4e9..8d5fe42d41ba3 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -11,6 +11,7 @@
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/IRMapping.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/Module.h"

>From 5f7f992f8f28ebdfeb292a1f7030b95b5e215806 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 17:31:27 -0400
Subject: [PATCH 4/8] Fix missing include for memoryBuffer on linux ci

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 8d5fe42d41ba3..3829d4e021962 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -30,6 +30,7 @@
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 

>From 562f89bcf003165878837e1e2e8bedb3f67569bd Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 18:49:30 -0400
Subject: [PATCH 5/8] Use vfs from ast context to get gpubinary

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 3829d4e021962..8d5fe42d41ba3 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -30,7 +30,6 @@
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 

>From 5d46d7fd106ae1016ab4f1f34b1f22861417d4bf Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Wed, 25 Mar 2026 22:29:47 -0400
Subject: [PATCH 6/8] [CIR][CUDA] Handle CUDA module constructor and destructor
 emission.

---
 .../Dialect/Transforms/LoweringPrepare.cpp    | 123 +++++++++++++++++-
 clang/test/CIR/CodeGenCUDA/device-stub.cu     |  41 ++++++
 2 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 8d5fe42d41ba3..45b11ba1af607 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -14,6 +14,7 @@
 #include "mlir/IR/IRMapping.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Mangle.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Specifiers.h"
@@ -30,10 +31,13 @@
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
 #include <memory>
+#include <optional>
 
 using namespace mlir;
 using namespace cir;
@@ -124,6 +128,7 @@ struct LoweringPreparePass
   /// Build the CUDA module constructor that registers the fat binary
   /// with the CUDA runtime.
   void buildCUDAModuleCtor();
+  std::optional<FuncOp> buildCUDAModuleDtor();
 
   /// Handle static local variable initialization with guard variables.
   void handleStaticLocal(cir::GlobalOp globalOp, cir::GetGlobalOp getGlobalOp);
@@ -1882,8 +1887,122 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   gpuBinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
   gpuBinHandle.setPrivate();
 
-  // TODO: ctor/dtor/register_globals
-  assert(!cir::MissingFeatures::globalRegistration());
+  // Declare this function:
+  //    void **__{cuda|hip}RegisterFatBinary(void *);
+
+  std::string regFuncName =
+      addUnderscoredPrefix(cudaPrefix, "RegisterFatBinary");
+  FuncType regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy);
+  cir::FuncOp regFunc =
+      buildRuntimeFunction(builder, regFuncName, loc, regFuncType);
+
+  std::string moduleCtorName = addUnderscoredPrefix(cudaPrefix, 
"_module_ctor");
+  cir::FuncOp moduleCtor = buildRuntimeFunction(
+      builder, moduleCtorName, loc, FuncType::get({}, voidTy),
+      GlobalLinkageKind::InternalLinkage);
+
+  globalCtorList.emplace_back(moduleCtorName,
+                              cir::GlobalCtorAttr::getDefaultPriority());
+  builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
+  assert(!cir::MissingFeatures::opGlobalCtorPriority());
+  if (isHIP) {
+    llvm_unreachable("HIP Module Constructor Support");
+  } else if (!astCtx->getLangOpts().GPURelocatableDeviceCode) {
+
+    // --- Create CUDA CTOR-DTOR ---
+    // Register binary with CUDA runtime. This is substantially different in
+    // default mode vs. separate compilation.
+    // Corresponding code:
+    //     gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper);
+    mlir::Value wrapper = builder.createGetGlobal(fatbinWrapper);
+    mlir::Value fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
+    cir::CallOp gpuBinaryHandleCall =
+        builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+    mlir::Value gpuBinaryHandle = gpuBinaryHandleCall.getResult();
+    // Store the value back to the global `__cuda_gpubin_handle`.
+    mlir::Value gpuBinaryHandleGlobal = builder.createGetGlobal(gpuBinHandle);
+    builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
+
+    // TODO: Generate __cuda_register_globals and emit a call.
+    assert(!cir::MissingFeatures::globalRegistration());
+
+    // From CUDA 10.1 onwards, we must call this function to end registration:
+    //      void __cudaRegisterFatBinaryEnd(void **fatbinHandle);
+    // This is CUDA-specific, so no need to use `addUnderscoredPrefix`.
+    if (clang::CudaFeatureEnabled(
+            astCtx->getTargetInfo().getSDKVersion(),
+            clang::CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
+      cir::CIRBaseBuilderTy globalBuilder(getContext());
+      globalBuilder.setInsertionPointToStart(mlirModule.getBody());
+      FuncOp endFunc =
+          buildRuntimeFunction(globalBuilder, "__cudaRegisterFatBinaryEnd", 
loc,
+                               FuncType::get({voidPtrPtrTy}, voidTy));
+      builder.createCallOp(loc, endFunc, gpuBinaryHandle);
+    }
+  }
+
+  // Create destructor and register it with atexit() the way NVCC does it. 
Doing
+  // it during regular destructor phase worked in CUDA before 9.2 but results 
in
+  // double-free in 9.2.
+  if (std::optional<FuncOp> dtor = buildCUDAModuleDtor()) {
+
+    // extern "C" int atexit(void (*f)(void));
+    cir::CIRBaseBuilderTy globalBuilder(getContext());
+    globalBuilder.setInsertionPointToStart(mlirModule.getBody());
+    FuncOp atexit = buildRuntimeFunction(
+        globalBuilder, "atexit", loc,
+        FuncType::get(PointerType::get(dtor->getFunctionType()), intTy));
+    mlir::Value dtorFunc = GetGlobalOp::create(
+        builder, loc, PointerType::get(dtor->getFunctionType()),
+        mlir::FlatSymbolRefAttr::get(dtor->getSymNameAttr()));
+    builder.createCallOp(loc, atexit, dtorFunc);
+  }
+  cir::ReturnOp::create(builder, loc);
+}
+
+std::optional<FuncOp> LoweringPreparePass::buildCUDAModuleDtor() {
+  if (!mlirModule->getAttr(CIRDialect::getCUDABinaryHandleAttrName()))
+    return {};
+
+  llvm::StringRef prefix = getCUDAPrefix(astCtx);
+
+  VoidType voidTy = VoidType::get(&getContext());
+  PointerType voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));
+
+  mlir::Location loc = mlirModule.getLoc();
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(mlirModule.getBody());
+
+  // define: void __cudaUnregisterFatBinary(void ** handle);
+  std::string unregisterFuncName =
+      addUnderscoredPrefix(prefix, "UnregisterFatBinary");
+  FuncOp unregisterFunc = buildRuntimeFunction(
+      builder, unregisterFuncName, loc, FuncType::get({voidPtrPtrTy}, voidTy));
+
+  // void __cuda_module_dtor();
+  // Despite the name, OG doesn't treat it as a destructor, so it shouldn't be
+  // put into globalDtorList. If it were a real dtor, then it would cause
+  // double free above CUDA 9.2. The way to use it is to manually call
+  // atexit() at end of module ctor.
+  std::string dtorName = addUnderscoredPrefix(prefix, "_module_dtor");
+  FuncOp dtor =
+      buildRuntimeFunction(builder, dtorName, loc, FuncType::get({}, voidTy),
+                           GlobalLinkageKind::InternalLinkage);
+
+  builder.setInsertionPointToStart(dtor.addEntryBlock());
+
+  // For dtor, we only need to call:
+  //    __cudaUnregisterFatBinary(__cuda_gpubin_handle);
+
+  std::string gpubinName = addUnderscoredPrefix(prefix, "_gpubin_handle");
+  GlobalOp gpubinGlobal = cast<GlobalOp>(mlirModule.lookupSymbol(gpubinName));
+  mlir::Value gpubinAddress = builder.createGetGlobal(gpubinGlobal);
+  mlir::Value gpubin = builder.createLoad(loc, gpubinAddress);
+  builder.createCallOp(loc, unregisterFunc, gpubin);
+  ReturnOp::create(builder, loc);
+
+  return dtor;
 }
 
 void LoweringPreparePass::runOnOperation() {
diff --git a/clang/test/CIR/CodeGenCUDA/device-stub.cu 
b/clang/test/CIR/CodeGenCUDA/device-stub.cu
index 2e9deaee9b225..4562bf1523141 100644
--- a/clang/test/CIR/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CIR/CodeGenCUDA/device-stub.cu
@@ -21,6 +21,22 @@ __global__ void kernelfunc(int i, int j, int k) {}
 
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 
+// Check module constructor is registered in module attributes.
+// CIR: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", 65535>]
+
+// Check runtime function declarations (appear before dtor in output).
+// CIR: cir.func private @atexit(!cir.ptr<!cir.func<()>>) -> !s32i
+// CIR: cir.func private @__cudaUnregisterFatBinary(!cir.ptr<!cir.ptr<!void>>)
+
+// Check the module destructor body: load handle and call UnregisterFatBinary.
+// CIR: cir.func internal private @__cuda_module_dtor()
+// CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle
+// CIR-NEXT: %[[HANDLE:.*]] = cir.load %[[HANDLE_ADDR]]
+// CIR-NEXT: cir.call @__cudaUnregisterFatBinary(%[[HANDLE]])
+// CIR-NEXT: cir.return
+
+// CIR: cir.func private @__cudaRegisterFatBinaryEnd(!cir.ptr<!cir.ptr<!void>>)
+
 // CIR: cir.global "private" constant cir_private @__cuda_fatbin_str = 
#cir.const_array<"GPU binary would be here."> : !cir.array<!u8i x 25> 
{alignment = 8 : i64, section = ".nv_fatbin"}
 
 // Check the fatbin wrapper struct: { magic, version, ptr to fatbin, null }, 
with section.
@@ -34,9 +50,34 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // Check the GPU binary handle global.
 // CIR: cir.global "private" internal @__cuda_gpubin_handle = #cir.ptr<null> : 
!cir.ptr<!cir.ptr<!void>>
 
+// CIR: cir.func private @__cudaRegisterFatBinary(!cir.ptr<!void>) -> 
!cir.ptr<!cir.ptr<!void>>
+
+// Check the module constructor body: register fatbin, store handle,
+// call RegisterFatBinaryEnd (CUDA >= 10.1), then register dtor with atexit.
+// CIR: cir.func internal private @__cuda_module_ctor()
+// CIR-NEXT: %[[WRAPPER:.*]] = cir.get_global @__cuda_fatbin_wrapper
+// CIR-NEXT: %[[VOID_PTR:.*]] = cir.cast bitcast %[[WRAPPER]]
+// CIR-NEXT: %[[RET:.*]] = cir.call @__cudaRegisterFatBinary(%[[VOID_PTR]])
+// CIR-NEXT: %[[HANDLE_ADDR:.*]] = cir.get_global @__cuda_gpubin_handle
+// CIR-NEXT: cir.store %[[RET]], %[[HANDLE_ADDR]]
+// CIR-NEXT: cir.call @__cudaRegisterFatBinaryEnd(%[[RET]])
+// CIR-NEXT: %[[DTOR_PTR:.*]] = cir.get_global @__cuda_module_dtor
+// CIR-NEXT: {{.*}} = cir.call @atexit(%[[DTOR_PTR]])
+// CIR-NEXT: cir.return
+
 // OGCG: constant [25 x i8] c"GPU binary would be here.", section 
".nv_fatbin", align 8
 // OGCG: @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } { 
i32 1180844977, i32 1, ptr @{{.*}}, ptr null }, section ".nvFatBinSegment"
 // OGCG: @__cuda_gpubin_handle = internal global ptr null
+// OGCG: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+
+// OGCG: define internal void @__cuda_module_ctor
+// OGCG: call{{.*}}__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// OGCG: store ptr %{{.*}}, ptr @__cuda_gpubin_handle
+// OGCG: call i32 @atexit(ptr @__cuda_module_dtor)
+
+// OGCG: define internal void @__cuda_module_dtor
+// OGCG: load ptr, ptr @__cuda_gpubin_handle
+// OGCG: call void @__cudaUnregisterFatBinary
 
 // No GPU binary — no registration infrastructure at all.
 // NOGPUBIN-NOT: fatbin

>From 525dd8c1a12ac971881357bc5ca4f132f41531e5 Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Sun, 29 Mar 2026 14:04:44 -0400
Subject: [PATCH 7/8] unreachable on RDC compilation

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 45b11ba1af607..d141a3fa8ae1a 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1939,7 +1939,8 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
                                FuncType::get({voidPtrPtrTy}, voidTy));
       builder.createCallOp(loc, endFunc, gpuBinaryHandle);
     }
-  }
+  } else
+    llvm_unreachable("GPU RDC NYI");
 
   // Create destructor and register it with atexit() the way NVCC does it. 
Doing
   // it during regular destructor phase worked in CUDA before 9.2 but results 
in

>From f11b4374e91647683b6642bb89cc92cbfa0d53ad Mon Sep 17 00:00:00 2001
From: David Rivera <[email protected]>
Date: Thu, 2 Apr 2026 18:01:00 -0400
Subject: [PATCH 8/8] fix undefined void ty

---
 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp 
b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index d141a3fa8ae1a..fcdf0dc8d3218 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -1822,6 +1822,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointToStart(mlirModule.getBody());
 
+  Type voidTy = builder.getVoidTy();
   PointerType voidPtrTy = builder.getVoidPtrTy();
   PointerType voidPtrPtrTy = builder.getPointerTo(voidPtrTy);
   IntType intTy = builder.getSIntNTy(32);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to