[PATCH] D78655: [CUDA][HIP] Let non-caputuring lambda be host device

Yaxun Liu via Phabricator via cfe-commits Fri, 19 Jun 2020 14:11:00 -0700

yaxunl updated this revision to Diff 272171.
yaxunl retitled this revision from "[HIP] Add -fhip-lambda-host-device" to 
"[CUDA][HIP] Let non-caputuring lambda be host device".
yaxunl edited the summary of this revision.
yaxunl added a comment.


Revised by Richard's comments.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D78655/new/

https://reviews.llvm.org/D78655

Files:
  clang/include/clang/Sema/Sema.h
  clang/lib/Sema/SemaCUDA.cpp
  clang/lib/Sema/SemaLambda.cpp
  clang/test/CodeGenCUDA/lambda.cu
  clang/test/SemaCUDA/Inputs/cuda.h
  clang/test/SemaCUDA/lambda.cu

Index: clang/test/SemaCUDA/lambda.cu
===================================================================
--- /dev/null
+++ clang/test/SemaCUDA/lambda.cu
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+// file-scope lambda is implicitly host device function.
+auto global_lambda = [] () { return 123; };
+
+template<class F>
+__global__ void kernel(F f) { f(); }
+// expected-error@-1 6{{no matching function for call to object of type}}
+
+constexpr __host__ __device__ void hd();
+
+int main(void) {
+  auto lambda_kernel = [&]__global__(){};
+  // expected-error@-1 {{kernel function 'operator()' must be a free function or static member function}}
+
+  int b;
+  kernel<<<1,1>>>(global_lambda);
+
+  kernel<<<1,1>>>([](){ hd(); });
+
+  kernel<<<1,1>>>([=](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([b](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([&]()constexpr{ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([&](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([=, &b](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([&, b](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([](){
+      auto f = [&]{ hd(); };
+      f();
+  });
+
+  return 0;
+}
Index: clang/test/SemaCUDA/Inputs/cuda.h
===================================================================
--- clang/test/SemaCUDA/Inputs/cuda.h
+++ clang/test/SemaCUDA/Inputs/cuda.h
@@ -17,6 +17,19 @@
   __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };
 
+#ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
 typedef struct cudaStream *cudaStream_t;
 typedef enum cudaError {} cudaError_t;
 
@@ -29,6 +42,7 @@
 extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
+#endif
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
Index: clang/test/CodeGenCUDA/lambda.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/lambda.cu
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN:   -triple x86_64-linux-gnu \
+// RUN:   | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN:   -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// Device side kernel name.
+// HOST: @[[KERN_CAPTURE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_capturevEUlvE_EvT_\00"
+// HOST: @[[KERN_RESOLVE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_resolvevEUlvE_EvT_\00"
+
+// Check functions emitted for test_capture in host compilation.
+// Check lambda is not emitted in host compilation.
+// HOST-LABEL: define void @_Z12test_capturev
+// HOST:  call void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_
+// HOST-LABEL: define internal void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_
+// HOST:  call void @_Z16__device_stub__gIZ12test_capturevEUlvE_EvT_
+// HOST-NOT: define{{.*}}@_ZZ4mainENKUlvE_clEv
+
+// Check functions emitted for test_resolve in host compilation.
+// Check host version of template function 'overloaded' is emitted and called
+// by the lambda function.
+// HOST-LABEL: define void @_Z12test_resolvev
+// HOST:  call void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_()
+// HOST-LABEL: define internal void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_
+// HOST:  call void @_Z16__device_stub__gIZ12test_resolvevEUlvE_EvT_
+// HOST:  call void @_ZZ12test_resolvevENKUlvE_clEv
+// HOST-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv
+// HOST:  call i32 @_Z10overloadedIiET_v
+// HOST-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v
+// HOST:  ret i32 2
+
+// Check kernel is registered with correct device side kernel name.
+// HOST: @__hipRegisterFunction({{.*}}@[[KERN_CAPTURE]]
+// HOST: @__hipRegisterFunction({{.*}}@[[KERN_RESOLVE]]
+
+// DEV: @a = addrspace(1) externally_initialized global i32 0
+
+// Check functions emitted for test_capture in device compilation.
+// Check lambda is emitted in device compilation and accessing device variable.
+// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_capturevEUlvE_EvT_
+// DEV:  call void @_ZZ12test_capturevENKUlvE_clEv
+// DEV-LABEL: define internal void @_ZZ12test_capturevENKUlvE_clEv
+// DEV:  store i32 1, i32* addrspacecast (i32 addrspace(1)* @a to i32*)
+
+// Check functions emitted for test_resolve in device compilation.
+// Check device version of template function 'overloaded' is emitted and called
+// by the lambda function.
+// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_
+// DEV:  call void @_ZZ12test_resolvevENKUlvE_clEv
+// DEV-LABE: define internal void @_ZZ12test_resolvevENKUlvE_clEv
+// DEV:  call i32 @_Z10overloadedIiET_v
+// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v
+// DEV:  ret i32 1
+
+__device__ int a;
+
+template<class T>
+__device__ T overloaded() { return 1; }
+
+template<class T>
+__host__ T overloaded() { return 2; }
+
+template<class F>
+__global__ void g(F f) { f(); }
+
+template<class F>
+void test_capture_helper(F f) { g<<<1,1>>>(f); }
+
+template<class F>
+void test_resolve_helper(F f) { g<<<1,1>>>(f); f(); }
+
+// Test capture of device variable in lambda function.
+void test_capture(void) {
+  test_capture_helper([](){ a = 1;});
+}
+
+// Test resolving host/device function in lambda function.
+// Callee should resolve to correct host/device function based on where
+// the lambda function is called, not where it is defined.
+void test_resolve(void) {
+  test_resolve_helper([](){ overloaded<int>();});
+}
Index: clang/lib/Sema/SemaLambda.cpp
===================================================================
--- clang/lib/Sema/SemaLambda.cpp
+++ clang/lib/Sema/SemaLambda.cpp
@@ -993,7 +993,7 @@
   // CUDA lambdas get implicit attributes based on the scope in which they're
   // declared.
   if (getLangOpts().CUDA)
-    CUDASetLambdaAttrs(Method);
+    CUDASetLambdaAttrs(Method, Intro);
 
   // Number the lambda for linkage purposes if necessary.
   handleLambdaNumbering(Class, Method);
Index: clang/lib/Sema/SemaCUDA.cpp
===================================================================
--- clang/lib/Sema/SemaCUDA.cpp
+++ clang/lib/Sema/SemaCUDA.cpp
@@ -746,13 +746,16 @@
          DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack;
 }
 
-void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
+void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   if (Method->hasAttr<CUDAHostAttr>() || Method->hasAttr<CUDADeviceAttr>())
     return;
   FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
-  if (!CurFn)
+  if (!CurFn || (LI.Default == LCD_None && LI.Captures.size() == 0)) {
+    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+    Method->addAttr(CUDAHostAttr::CreateImplicit(Context));
     return;
+  }
   CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
   if (Target == CFT_Global || Target == CFT_Device) {
     Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -11768,7 +11768,7 @@
   /// CUDA lambdas declared inside __device__ or __global__ functions inherit
   /// the __device__ attribute.  Similarly, lambdas inside __host__ __device__
   /// functions become __host__ __device__ themselves.
-  void CUDASetLambdaAttrs(CXXMethodDecl *Method);
+  void CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI);
 
   /// Finds a function in \p Matches with highest calling priority
   /// from \p Caller context and erases all functions with lower

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D78655: [CUDA][HIP] Let non-caputuring lambda be host device

Reply via email to