yaxunl created this revision.
yaxunl added a reviewer: tra.
yaxunl requested review of this revision.

Currently clang does not emit device template variables
instantiated only in host functions, however, nvcc is
able to do that:

https://godbolt.org/z/fneEfferY

This patch fixes this issue by refactoring and extending
the existing mechanism for emitting static device
var ODR-used by host only. Basically clang records
device variables ODR-used by host code and force
them to be emitted in device compilation. The existing
mechanism makes sure these device variables ODR-used
by host code are added to llvm.compiler-used, therefore
they are guaranteed not to be deleted.


https://reviews.llvm.org/D102270

Files:
  clang/include/clang/AST/ASTContext.h
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Sema/SemaExpr.cpp
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/host-used-device-var.cu

Index: clang/test/CodeGenCUDA/host-used-device-var.cu
===================================================================
--- clang/test/CodeGenCUDA/host-used-device-var.cu
+++ clang/test/CodeGenCUDA/host-used-device-var.cu
@@ -1,47 +1,95 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
-// RUN:   -std=c++11 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \
-// RUN:   | FileCheck %s
+// RUN:   -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \
+// RUN:   | FileCheck -check-prefix=DEV %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \
+// RUN:   -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s
+
+// Negative tests.
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
+// RUN:   -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \
+// RUN:   | FileCheck -check-prefix=DEV-NEG %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \
+// RUN:   -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST-NEG %s
 
 #include "Inputs/cuda.h"
 
 // Check device variables used by neither host nor device functioins are not kept.
 
-// CHECK-NOT: @v1
+// DEV-NEG-NOT: @v1
 __device__ int v1;
 
-// CHECK-NOT: @v2
+// DEV-NEG-NOT: @v2
 __constant__ int v2;
 
-// CHECK-NOT: @_ZL2v3
+// DEV-NEG-NOT: @_ZL2v3
 static __device__ int v3;
 
 // Check device variables used by host functions are kept.
 
-// CHECK-DAG: @u1
+// DEV-DAG: @u1
 __device__ int u1;
 
-// CHECK-DAG: @u2
+// DEV-DAG: @u2
 __constant__ int u2;
 
 // Check host-used static device var is in llvm.compiler.used.
-// CHECK-DAG: @_ZL2u3
+// DEV-DAG: @_ZL2u3
 static __device__ int u3;
 
 // Check device-used static device var is emitted but is not in llvm.compiler.used.
-// CHECK-DAG: @_ZL2u4
+// DEV-DAG: @_ZL2u4
 static __device__ int u4;
 
 // Check device variables with used attribute are always kept.
-// CHECK-DAG: @u5
+// DEV-DAG: @u5
 __device__ __attribute__((used)) int u5;
 
-int fun1() {
-  return u1 + u2 + u3;
+// Test external device variable ODR-used by host code is not emitted or registered.
+// DEV-NEG-NOT: @ext_var
+extern __device__ int ext_var;
+
+// DEV-DAG: @inline_var = linkonce_odr addrspace(1) externally_initialized global i32 0
+__device__ inline int inline_var;
+
+template<typename T>
+using func_t = T (*) (T, T);
+
+template <typename T>
+__device__ T add_func (T x, T y)
+{
+  return x + y;
+}
+
+// DEV-DAG: @_Z10p_add_funcIiE = linkonce_odr addrspace(1) externally_initialized global i32 (i32, i32)* @_Z8add_funcIiET_S0_S0_
+template <typename T>
+__device__ func_t<T> p_add_func = add_func<T>;
+
+void use(func_t<int> p);
+void use(int *p);
+
+void fun1() {
+  use(&u1);
+  use(&u2);
+  use(&u3);
+  use(&ext_var);
+  use(&inline_var);
+  use(p_add_func<int>);
 }
 
 __global__ void kern1(int **x) {
   *x = &u4;
 }
+
 // Check the exact list of variables to ensure @_ZL2u4 is not among them.
-// CHECK: @llvm.compiler.used = {{[^@]*}} @_ZL2u3 {{[^@]*}} @u1 {{[^@]*}} @u2 {{[^@]*}} @u5
+// DEV: @llvm.compiler.used = {{[^@]*}} @_Z10p_add_funcIiE {{[^@]*}} @_ZL2u3 {{[^@]*}} @inline_var {{[^@]*}} @u1 {{[^@]*}} @u2 {{[^@]*}} @u5
+
+// HOST-DAG: hipRegisterVar{{.*}}@u1
+// HOST-DAG: hipRegisterVar{{.*}}@u2
+// HOST-DAG: hipRegisterVar{{.*}}@_ZL2u3
+// HOST-DAG: hipRegisterVar{{.*}}@u5
+// HOST-DAG: hipRegisterVar{{.*}}@inline_var
+// HOST-DAG: hipRegisterVar{{.*}}@_Z10p_add_funcIiE
+// HOST-NEG-NOT: hipRegisterVar{{.*}}@ext_var
+// HOST-NEG-NOT: hipRegisterVar{{.*}}@_ZL2u4
Index: clang/test/CodeGenCUDA/device-stub.cu
===================================================================
--- clang/test/CodeGenCUDA/device-stub.cu
+++ clang/test/CodeGenCUDA/device-stub.cu
@@ -107,9 +107,14 @@
 #if __cplusplus > 201402L
 // NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
 // RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}}
+// NORDC17-NOT: @inline_var2 =
+// RDC17-NOT: @inline_var2 =
 // NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
 // RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}}
+// Check inline variable ODR-used by host is emitted on host and registered.
 __device__ inline int inline_var = 3;
+// Check inline variable not ODR-used by host is not emitted on host or registered.
+__device__ inline int inline_var2 = 5;
 struct C {
   __device__ static constexpr int member_inline_var = 4;
 };
@@ -126,10 +131,17 @@
   p = &ext_host_var;
 #if __cplusplus > 201402L
   p = &inline_var;
+  decltype(inline_var2) tmp;
   p = &C::member_inline_var;
 #endif
 }
 
+__device__ void device_use() {
+#if __cplusplus > 201402L
+  const int *p = &inline_var2;
+#endif
+}
+
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the device-side kernel name to be passed to
 //   __hipRegisterFunction/__cudaRegisterFunction.
@@ -212,7 +224,8 @@
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0
-// LNX_17-NOT: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var
+// LNX_17-DAG: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var
+// LNX_17-NOT: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var2
 // ALL: ret void
 
 // Test that we've built a constructor.
Index: clang/lib/Sema/SemaExpr.cpp
===================================================================
--- clang/lib/Sema/SemaExpr.cpp
+++ clang/lib/Sema/SemaExpr.cpp
@@ -17143,26 +17143,26 @@
       return false;
     };
     if (Var && Var->hasGlobalStorage()) {
-      if (SemaRef.LangOpts.CUDAIsDevice && !IsEmittedOnDeviceSide(Var)) {
+      if (!IsEmittedOnDeviceSide(Var)) {
         // Diagnose ODR-use of host global variables in device functions.
         // Reference of device global variables in host functions is allowed
         // through shadow variables therefore it is not diagnosed.
-        SemaRef.targetDiag(Loc, diag::err_ref_bad_target)
-            << /*host*/ 2 << /*variable*/ 1 << Var << Target;
-      } else if (Target == Sema::CFT_Host || Target == Sema::CFT_HostDevice) {
-        // Record a CUDA/HIP static device/constant variable if it is referenced
+        if (SemaRef.LangOpts.CUDAIsDevice)
+          SemaRef.targetDiag(Loc, diag::err_ref_bad_target)
+              << /*host*/ 2 << /*variable*/ 1 << Var << Target;
+      } else if ((Target == Sema::CFT_Host || Target == Sema::CFT_HostDevice) &&
+                 !Var->hasExternalStorage()) {
+        // Record a CUDA/HIP device side variable if it is ODR-used
         // by host code. This is done conservatively, when the variable is
         // referenced in any of the following contexts:
         //   - a non-function context
         //   - a host function
         //   - a host device function
-        // This also requires the reference of the static device/constant
-        // variable by host code to be visible in the device compilation for the
-        // compiler to be able to externalize the static device/constant
-        // variable.
-        if (SemaRef.getASTContext().mayExternalizeStaticVar(Var))
-          SemaRef.getASTContext().CUDAStaticDeviceVarReferencedByHost.insert(
-              Var);
+        // This makes the ODR-use of the device side variable by host code to
+        // be visible in the device compilation for the compiler to be able to
+        // emit template variables instantiated by host code only and to
+        // externalize the static device side variable ODR-used by host code.
+        SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var);
       }
     }
   }
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -2362,8 +2362,8 @@
   }
 
   // Emit CUDA/HIP static device variables referenced by host code only.
-  if (getLangOpts().CUDA)
-    for (auto V : getContext().CUDAStaticDeviceVarReferencedByHost)
+  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
+    for (auto V : getContext().CUDADeviceVarODRUsedByHost)
       DeferredDeclsToEmit.push_back(V);
 
   // Stop if we're out of both deferred vtables and deferred declarations.
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -1015,10 +1015,14 @@
     // Don't register a C++17 inline variable. The local symbol can be
     // discarded and referencing a discarded local symbol from outside the
     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
-    // TODO: Reject __device__ constexpr and __device__ inline in Sema.
+    //
     // HIP managed variables need to be always recorded in device and host
     // compilations for transformation.
+    //
+    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
+    // added to llvm.compiler-used, therefore they are safe to be registered.
     if ((!D->hasExternalStorage() && !D->isInline()) ||
+        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
         D->hasAttr<HIPManagedAttr>()) {
       registerDeviceVar(D, GV, !D->hasDefinition(),
                         D->hasAttr<CUDAConstantAttr>());
Index: clang/lib/AST/ASTContext.cpp
===================================================================
--- clang/lib/AST/ASTContext.cpp
+++ clang/lib/AST/ASTContext.cpp
@@ -11630,7 +11630,7 @@
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
   return mayExternalizeStaticVar(D) &&
          (D->hasAttr<HIPManagedAttr>() ||
-          CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
+          CUDADeviceVarODRUsedByHost.count(cast<VarDecl>(D)));
 }
 
 StringRef ASTContext::getCUIDHash() const {
Index: clang/include/clang/AST/ASTContext.h
===================================================================
--- clang/include/clang/AST/ASTContext.h
+++ clang/include/clang/AST/ASTContext.h
@@ -1058,8 +1058,8 @@
   // Implicitly-declared type 'struct _GUID'.
   mutable TagDecl *MSGuidTagDecl = nullptr;
 
-  /// Keep track of CUDA/HIP static device variables referenced by host code.
-  llvm::DenseSet<const VarDecl *> CUDAStaticDeviceVarReferencedByHost;
+  /// Keep track of CUDA/HIP device-side variables ODR-used by host code.
+  llvm::DenseSet<const VarDecl *> CUDADeviceVarODRUsedByHost;
 
   ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
              SelectorTable &sels, Builtin::Context &builtins);
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to