Author: abataev
Date: Tue Mar 20 08:41:05 2018
New Revision: 327990

URL: http://llvm.org/viewvc/llvm-project?rev=327990&view=rev
Log:
[OPENMP, NVPTX] Codegen for target distribute parallel combined constructs
in generic mode.

Fixed codegen for distribute parallel combined constructs. We have to pass
and read the shared lower and upper bounds from the distribute region in
the inner parallel region. This patch handles generic mode.
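For illustration (a hypothetical sketch, not part of this commit), this is the kind of source pattern the fix targets: a combined `target teams distribute parallel for` in generic mode, where the bounds of each team's `distribute` chunk must be readable from the inner `parallel for` region. The function and variable names below are invented.

// Sketch: the distribute part assigns each team a chunk [LB, UB] of the
// iteration space; the inner parallel for schedules iterations within that
// chunk, so it must receive LB/UB from the distribute region. In generic
// mode they travel through the runtime's shared-variables list
// (__kmpc_begin_sharing_variables / __kmpc_get_shared_variables), as the
// CHECK lines of the new test below verify.
void saxpy(int n, float a, float *x, float *y) {
#pragma omp target teams distribute parallel for map(to: x[0:n]) map(tofrom: y[0:n])
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}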
Added:
    cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
Modified:
    cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
    cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp

Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=327990&r1=327989&r2=327990&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Tue Mar 20 08:41:05 2018
@@ -1444,7 +1444,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericPa
   for (llvm::Value *V : CapturedVars) {
     Address Dst = Bld.CreateConstInBoundsGEP(
         SharedArgListAddress, Idx, CGF.getPointerSize());
-    llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
+    llvm::Value *PtrV;
+    if (V->getType()->isIntegerTy())
+      PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
+    else
+      PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
     CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                           Ctx.getPointerType(Ctx.VoidPtrTy));
     ++Idx;
@@ -2963,22 +2967,56 @@ llvm::Function *CGOpenMPRuntimeNVPTX::cr

   // Retrieve the shared variables from the list of references returned
   // by the runtime. Pass the variables to the outlined function.
+  Address SharedArgListAddress = Address::invalid();
+  if (CS.capture_size() > 0 ||
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    SharedArgListAddress = CGF.EmitLoadOfPointer(
+        GlobalArgs, CGF.getContext()
+                        .getPointerType(CGF.getContext().getPointerType(
+                            CGF.getContext().VoidPtrTy))
+                        .castAs<PointerType>());
+  }
+  unsigned Idx = 0;
+  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                             CGF.getPointerSize());
+    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *LB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
+    Args.emplace_back(LB);
+    ++Idx;
+    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                     CGF.getPointerSize());
+    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *UB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
+    Args.emplace_back(UB);
+    ++Idx;
+  }
   if (CS.capture_size() > 0) {
     ASTContext &CGFContext = CGF.getContext();
-    Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
-        CGFContext
-            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
-            .castAs<PointerType>());
     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
       QualType ElemTy = CurField->getType();
-      Address Src = Bld.CreateConstInBoundsGEP(
-          SharedArgListAddress, I, CGF.getPointerSize());
-      Address TypedAddress = Bld.CreateBitCast(
+      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
+                                               CGF.getPointerSize());
+      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
           Src,
           CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                               /*Volatile=*/false,
                                               CGFContext.getPointerType(ElemTy),
                                               CI->getLocation());
+      if (CI->capturesVariableByCopy()) {
+        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
+                              CI->getLocation());
+      }
       Args.emplace_back(Arg);
     }
   }

Modified: cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp?rev=327990&r1=327989&r2=327990&view=diff
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp (original)
+++ cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Tue Mar 20 08:41:05 2018
@@ -22,7 +22,7 @@ tx ftemplate(int n) {
   tx a[N];
   short aa[N];
   tx b[10];
-  tx c[M][M]; 
+  tx c[M][M];
   tx f = n;
   tx l;
   int k;
@@ -47,7 +47,7 @@ tx ftemplate(int n) {
   for(int i = 0; i < M; i++) {
     for(int j = 0; j < M; j++) {
       k = M;
-      c[i][j] = i+j*f+k;
+      c[i][j] = i + j * f + k;
     }
   }

Added: cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp?rev=327990&view=auto
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp (added)
+++ cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp Tue Mar 20 08:41:05 2018
@@ -0,0 +1,98 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int a;
+
+int foo(int *a);
+
+int main(int argc, char **argv) {
+#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+  for (int i = 0; i < argc; ++i)
+    a = foo(&i) + foo(&a) + foo(&argc);
+  return 0;
+}
+
+// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker()
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]])
+
+// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void @__kmpc_kernel_init(
+// CHECK: call void @__kmpc_data_sharing_init_stack()
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4)
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8*
+// CHECK: store i8* [[LB]], i8** [[BUF_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8*
+// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8*
+// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]],
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8*
+// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]],
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: br label
+
+// CHECK: call void @__kmpc_serialized_parallel(%ident_t* @
+// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** %
+// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]])
+// CHECK: call void @__kmpc_end_serialized_parallel(%ident_t* @
+// CHECK: br label %
+
+
+// CHECK: call void @__kmpc_for_static_fini(%ident_t* @
+
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @llvm.nvvm.barrier0()
+
+// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}})
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_TY:%.+]]*
+// CHECK: [[I:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[ARGC_VAL:%.+]] = load i32, i32* %
+// CHECK: [[ARGC:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32 [[ARGC_VAL]], i32* [[ARGC]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 [[FOO:@.+foo.+]](i32* [[I]])
+// CHECK: call i32 [[FOO]](i32* %{{.+}})
+// CHECK: call i32 [[FOO]](i32* [[ARGC]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+
+// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32)
+// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]])
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0
+// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}*
+// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}*
+// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32*
+// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]],
+// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32**
+// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]],
+// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]])
+// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]])
+// CHECK: ret void
+
+#endif

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits