yaxunl created this revision.
yaxunl added reviewers: tra, rjmccall.

HIP generates one fat binary for all devices after linking. However, for each 
compilation
unit a ctor function is emitted which register the same fat binary. Measures 
need to be
taken to make sure the fat binary is only registered once.

Currently each ctor function calls `__hipRegisterFatBinary` and stores the 
returned value
to `_gpubin_handle`. This patch changes the linkage of `_gpubin_handle` to be 
linkonce
so that they are shared between LLVM modules. Then this patch adds check of 
value of
`_gpubin_handle` to make sure `__hipRegisterFatBinary` is only called once. The 
code
is equivalent to

  void *_gpubin_handle;
  void ctor() {
    if (_gpubin_handle == 0) {
      _gpubin_handle = __hipRegisterFatBinary(...);
    }
    // register kernels and variables.
  }

The patch also does similar change to dtors so that `__hipUnregisterFatBinary`
is called once.


https://reviews.llvm.org/D49083

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===================================================================
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -19,7 +19,7 @@
 // RUN:   | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,HIP
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
@@ -79,11 +79,11 @@
 // CUDA-SAME: section ".nvFatBinSegment"
 // HIP-SAME: section ".hipFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
-// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * Alias to global symbol containing the NVModuleID.
@@ -120,10 +120,18 @@
 // ALL: define internal void @__[[PREFIX]]_module_ctor
 
 // In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
+// HIP only register fat binary once.
+// HIP: load i8**, i8*** @__hip_gpubin_handle
+// HIP-NEXT: icmp eq i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
 // NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 //   .. stores return value in __[[PREFIX]]_gpubin_handle
 // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 //   .. and then calls __[[PREFIX]]_register_globals
+// HIP-NEXT: br label %exit
+// HIP: exit:
+// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
 // NORDC-NEXT: call void @__[[PREFIX]]_register_globals
 // * In separate mode we also register a destructor.
 // NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
@@ -136,7 +144,14 @@
 // Test that we've created destructor.
 // NORDC: define internal void @__[[PREFIX]]_module_dtor
 // NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: icmp ne i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
+// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle
+// HIP-NEXT: br label %exit
+// HIP: exit:
 
 // There should be no __[[PREFIX]]_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -427,9 +427,42 @@
       /*constant*/ true);
   FatbinWrapper->setSection(FatbinSectionName);
 
-  // Register binary with CUDA/HIP runtime. This is substantially different in
-  // default mode vs. separate compilation!
-  if (!RelocatableDeviceCode) {
+  // There is only one HIP fat binary per linked module, however there are
+  // multiple constructor functions. Make sure the fat binary is registered
+  // only once.
+  if (IsHIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
+    GpuBinaryHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
+        llvm::GlobalValue::LinkOnceAnyLinkage,
+        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+        addUnderscoredPrefixToName("_gpubin_handle"));
+    auto HandleValue =
+        CtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
+    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
+    llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
+    CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
+    CtorBuilder.SetInsertPoint(IfBlock);
+    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
+    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+        RegisterFatbinFunc,
+        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                   CGM.getPointerAlign());
+    CtorBuilder.CreateBr(ExitBlock);
+    CtorBuilder.SetInsertPoint(ExitBlock);
+    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
+    if (RegisterGlobalsFunc) {
+      auto HandleValue =
+          CtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
+      CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
+    }
+  } else if (!RelocatableDeviceCode) {
+    // Register binary with CUDA/HIP runtime. This is substantially different in
+    // default mode vs. separate compilation!
     // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
         RegisterFatbinFunc,
@@ -520,8 +553,26 @@
 
   auto HandleValue =
       DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-
+  // There is only one HIP fat binary per linked module, however there are
+  // multiple destructor functions. Make sure the fat binary is unregistered
+  // only once.
+  if (CGM.getLangOpts().HIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
+    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
+    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
+    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
+    DtorBuilder.SetInsertPoint(IfBlock);
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+    DtorBuilder.CreateAlignedStore(Zero, GpuBinaryHandle,
+                                   CGM.getPointerAlign());
+    DtorBuilder.CreateBr(ExitBlock);
+    DtorBuilder.SetInsertPoint(ExitBlock);
+  } else {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+  }
   DtorBuilder.CreateRetVoid();
   return ModuleDtorFunc;
 }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to