Author: abataev
Date: Tue Mar 20 08:41:05 2018
New Revision: 327990

URL: http://llvm.org/viewvc/llvm-project?rev=327990&view=rev
Log:
[OPENMP, NVPTX] Codegen for target distribute parallel combined constructs
in generic mode.

Fixed codegen for distribute parallel combined constructs. We have to pass
and read the shared lower and upper bounds from the distribute region in
the inner parallel region. This patch handles generic mode.
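For illustration (a hypothetical sketch, not part of this commit), this is the kind of source pattern the fix targets: a combined `target teams distribute parallel for` in generic mode, where the bounds of each team's `distribute` chunk must be readable from the inner `parallel for` region. The function and variable names below are invented.

// Sketch: the distribute part assigns each team a chunk [LB, UB] of the
// iteration space; the inner parallel for schedules iterations within that
// chunk, so it must receive LB/UB from the distribute region. In generic
// mode they travel through the runtime's shared-variables list
// (__kmpc_begin_sharing_variables / __kmpc_get_shared_variables), as the
// CHECK lines of the new test below verify.
void saxpy(int n, float a, float *x, float *y) {
#pragma omp target teams distribute parallel for map(to: x[0:n]) map(tofrom: y[0:n])
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}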
Added:
    cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
Modified:
    cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
    cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp

Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=327990&r1=327989&r2=327990&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Tue Mar 20 08:41:05 2018
@@ -1444,7 +1444,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericPa
   for (llvm::Value *V : CapturedVars) {
     Address Dst = Bld.CreateConstInBoundsGEP(
         SharedArgListAddress, Idx, CGF.getPointerSize());
-    llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
+    llvm::Value *PtrV;
+    if (V->getType()->isIntegerTy())
+      PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
+    else
+      PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
     CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                           Ctx.getPointerType(Ctx.VoidPtrTy));
     ++Idx;
@@ -2963,22 +2967,56 @@ llvm::Function *CGOpenMPRuntimeNVPTX::cr

   // Retrieve the shared variables from the list of references returned
   // by the runtime. Pass the variables to the outlined function.
+  Address SharedArgListAddress = Address::invalid();
+  if (CS.capture_size() > 0 ||
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    SharedArgListAddress = CGF.EmitLoadOfPointer(
+        GlobalArgs, CGF.getContext()
+                        .getPointerType(CGF.getContext().getPointerType(
+                            CGF.getContext().VoidPtrTy))
+                        .castAs<PointerType>());
+  }
+  unsigned Idx = 0;
+  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                             CGF.getPointerSize());
+    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *LB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
+    Args.emplace_back(LB);
+    ++Idx;
+    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                     CGF.getPointerSize());
+    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *UB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
+    Args.emplace_back(UB);
+    ++Idx;
+  }
   if (CS.capture_size() > 0) {
     ASTContext &CGFContext = CGF.getContext();
-    Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
-        CGFContext
-            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
-            .castAs<PointerType>());
     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
       QualType ElemTy = CurField->getType();
-      Address Src = Bld.CreateConstInBoundsGEP(
-          SharedArgListAddress, I, CGF.getPointerSize());
-      Address TypedAddress = Bld.CreateBitCast(
+      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
+                                               CGF.getPointerSize());
+      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
           Src,
           CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                               /*Volatile=*/false,
                                               CGFContext.getPointerType(ElemTy),
                                               CI->getLocation());
+      if (CI->capturesVariableByCopy()) {
+        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
+                              CI->getLocation());
+      }
       Args.emplace_back(Arg);
     }
   }

Modified: cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp?rev=327990&r1=327989&r2=327990&view=diff
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp (original)
+++ cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Tue Mar 20 08:41:05 2018
@@ -22,7 +22,7 @@ tx ftemplate(int n) {
   tx a[N];
   short aa[N];
   tx b[10];
-  tx c[M][M]; 
+  tx c[M][M];
   tx f = n;
   tx l;
   int k;
@@ -47,7 +47,7 @@ tx ftemplate(int n) {
   for(int i = 0; i < M; i++) {
     for(int j = 0; j < M; j++) {
       k = M;
-      c[i][j] = i+j*f+k;
+      c[i][j] = i + j * f + k;
     }
   }

Added: cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp?rev=327990&view=auto
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp (added)
+++ cfe/trunk/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp Tue Mar 20 08:41:05 2018
@@ -0,0 +1,98 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int a;
+
+int foo(int *a);
+
+int main(int argc, char **argv) {
+#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+  for (int i = 0; i < argc; ++i)
+    a = foo(&i) + foo(&a) + foo(&argc);
+  return 0;
+}
+
+// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker()
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]])
+
+// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void @__kmpc_kernel_init(
+// CHECK: call void @__kmpc_data_sharing_init_stack()
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4)
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8*
+// CHECK: store i8* [[LB]], i8** [[BUF_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8*
+// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8*
+// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]],
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8*
+// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]],
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: br label
+
+// CHECK: call void @__kmpc_serialized_parallel(%ident_t* @
+// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** %
+// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]])
+// CHECK: call void @__kmpc_end_serialized_parallel(%ident_t* @
+// CHECK: br label %
+
+
+// CHECK: call void @__kmpc_for_static_fini(%ident_t* @
+
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @llvm.nvvm.barrier0()
+
+// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}})
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_TY:%.+]]*
+// CHECK: [[I:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[ARGC_VAL:%.+]] = load i32, i32* %
+// CHECK: [[ARGC:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32 [[ARGC_VAL]], i32* [[ARGC]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 [[FOO:@.+foo.+]](i32* [[I]])
+// CHECK: call i32 [[FOO]](i32* %{{.+}})
+// CHECK: call i32 [[FOO]](i32* [[ARGC]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+
+// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32)
+// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]])
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0
+// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}*
+// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}*
+// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32*
+// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]],
+// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32**
+// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]],
+// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]])
+// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]])
+// CHECK: ret void
+
+#endif

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits