gtbercea created this revision.
gtbercea added reviewers: ABataev, caomhin.
Herald added subscribers: cfe-commits, guansong, jholewinski.
For the OpenMP NVPTX toolchain, choose default schedules that ensure coalesced
memory accesses on the GPU when in SPMD mode. This significantly improves the
performance of offloaded target code.
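Concretely, in SPMD mode the new defaults behave as if a chunked static
schedule had been written explicitly. A minimal sketch for illustration only
(not part of the patch; the exact chunk values are computed by the codegen
changes below):

  // With this patch, in SPMD mode the combined directive below behaves
  // roughly as if dist_schedule(static, <threads-in-team>) and
  // schedule(static, 1) had been specified: consecutive threads access
  // consecutive elements of 'a', so global memory accesses coalesce.
  #pragma omp target teams distribute parallel for map(tofrom: a[0:n])
  for (int i = 0; i < n; ++i)
    a[i] += 1; // thread t runs iterations t, t + num_threads, ...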
Repository:
rC Clang
https://reviews.llvm.org/D52434
Files:
lib/CodeGen/CGOpenMPRuntime.cpp
lib/CodeGen/CGOpenMPRuntime.h
lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
lib/CodeGen/CGOpenMPRuntimeNVPTX.h
lib/CodeGen/CGStmtOpenMP.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -34,7 +34,7 @@
l = i;
}
- #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
+ #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
for(int i = 0; i < n; i++) {
aa[i] += 1;
}
@@ -81,44 +81,44 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL2:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL3:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
// CHECK: {{call|invoke}} void [[OUTL4:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -24,7 +24,7 @@
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @
// CHECK: call void @__kmpc_spmd_kernel_init(
-// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_distribute_default_init_4(
// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}})
// CHECK: br label %
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -36,7 +36,7 @@
l = i;
}
- #pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
+#pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
for(int i = 0; i < n; i++) {
aa[i] += 1;
}
@@ -86,44 +86,44 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL2:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL3:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
// CHECK: {{call|invoke}} void [[OUTL4:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -2300,7 +2300,9 @@
// Detect the loop schedule kind and chunk.
llvm::Value *Chunk = nullptr;
OpenMPScheduleTy ScheduleKind;
- if (const auto *C = S.getSingleClause<OMPScheduleClause>()) {
+ const auto *C = S.getSingleClause<OMPScheduleClause>();
+ if (C) {
+ // If the schedule clause is present.
ScheduleKind.Schedule = C->getScheduleKind();
ScheduleKind.M1 = C->getFirstScheduleModifier();
ScheduleKind.M2 = C->getSecondScheduleModifier();
@@ -2310,7 +2312,13 @@
S.getIterationVariable()->getType(),
S.getBeginLoc());
}
+ } else {
+ // When the schedule clause is absent, choose sensible defaults.
+ CGM.getOpenMPRuntime().chooseDefaultSchedule(&ScheduleKind.Schedule);
+ Chunk = CGM.getOpenMPRuntime().getDefaultChunkValue(
+ *this, S, ScheduleKind.Schedule);
}
+
const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
// OpenMP 4.5, 2.7.1 Loop Construct, Description.
@@ -3326,6 +3334,7 @@
S.getBeginLoc());
}
}
+
const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -340,6 +340,35 @@
///
void functionFinished(CodeGenFunction &CGF) override;
+ /// For CUDA, to ensure coalescing, the default schedule is chunked.
+ /// This returns false in the default case to reflect that.
+ ///
+ bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
+ bool Chunked) const override;
+
+ /// Gets the default chunk size.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP schedule clause kind.
+ llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPScheduleClauseKind ScheduleKind) const override;
+
+ /// Choose a default value for the schedule clause.
+ void chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const override;
+
+ /// Returns the runtime function used to initialize the default
+ /// distribute schedule.
+ llvm::Constant *createDistributeDefaultInitFunction(unsigned IVSize,
+ bool IVSigned);
+
+ /// Emits a device-specific call to the distribute static init runtime function.
+ void emitDistributeStaticInit(
+ CodeGenFunction &CGF, SourceLocation Loc,
+ OpenMPDistScheduleClauseKind SchedKind,
+ const CGOpenMPRuntime::StaticRTInput &Values) override;
+
private:
/// Track the execution mode when codegening directives within a target
/// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4019,3 +4019,97 @@
FunctionGlobalizedDecls.erase(CGF.CurFn);
CGOpenMPRuntime::functionFinished(CGF);
}
+
+bool CGOpenMPRuntimeNVPTX::isStaticNonchunked(
+ OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+ // For OMPC_DIST_SCHEDULE_unknown we change the default to
+ // dist_schedule(static, <number of threads>). Since the new default is
+ // chunked we need to return false.
+ if (ScheduleKind == OMPC_DIST_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ return false;
+ return CGOpenMPRuntime::isStaticNonchunked(ScheduleKind, Chunked);
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPScheduleClauseKind ScheduleKind) const {
+ // For NVPTX, the default schedule for parallel for uses a chunk size of 1
+ // for coalescing purposes.
+ if (ScheduleKind == OMPC_SCHEDULE_static &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ return CGF.Builder.getIntN(CGM.getDataLayout().getTypeAllocSizeInBits(
+ CGF.ConvertType(S.getIterationVariable()->getType())), 1);
+ return CGOpenMPRuntime::getDefaultChunkValue(CGF, S, ScheduleKind);
+}
+
+void CGOpenMPRuntimeNVPTX::chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const {
+ if (*ScheduleKind == OMPC_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ *ScheduleKind = OMPC_SCHEDULE_static;
+}
+
+llvm::Constant *CGOpenMPRuntimeNVPTX::createDistributeDefaultInitFunction(
+ unsigned IVSize, bool IVSigned) {
+ assert((IVSize == 32 || IVSize == 64) &&
+ "IV size is not compatible with the omp runtime");
+ StringRef Name = IVSize == 32 ? (IVSigned ? "__kmpc_distribute_default_init_4"
+ : "__kmpc_distribute_default_init_4u")
+ : (IVSigned ? "__kmpc_distribute_default_init_8"
+ : "__kmpc_distribute_default_init_8u");
+ llvm::Type *ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
+ auto *PtrTy = llvm::PointerType::getUnqual(ITy);
+ llvm::Type *TypeParams[] = {
+ getIdentTyPointerTy(), // loc
+ CGM.Int32Ty, // tid
+ CGM.Int32Ty, // schedtype
+ llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+ PtrTy, // p_lower
+ PtrTy, // p_upper
+ PtrTy, // p_stride
+ ITy, // incr
+ ITy // chunk
+ };
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+ return CGM.CreateRuntimeFunction(FnTy, Name);
+}
+
+void CGOpenMPRuntimeNVPTX::emitDistributeStaticInit(
+ CodeGenFunction &CGF, SourceLocation Loc,
+ OpenMPDistScheduleClauseKind SchedKind,
+ const CGOpenMPRuntime::StaticRTInput &Values) {
+
+ // When using the default schedule in SPMD mode, more efficient code
+ // can be emitted.
+ if (SchedKind == OMPC_DIST_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
+ // Use a smart default: split the loop into chunks of size equal to
+ // the number of threads in the team so that only one iteration per
+ // thread is allocated.
+ if (!CGF.HaveInsertPoint())
+ return;
+
+ llvm::Value *UpdatedLocation =
+ CGOpenMPRuntime::emitUpdateLocation(CGF, Loc);
+ llvm::Constant *DistributeDefaultInitFunction =
+ createDistributeDefaultInitFunction(Values.IVSize, Values.IVSigned);
+
+ llvm::Value *Args[] = {
+ UpdatedLocation, // (not used)
+ getThreadID(CGF, Loc), // (not used)
+ CGF.Builder.getInt32(1), // Schedule type (not used)
+ Values.IL.getPointer(), // &isLastIter (not used)
+ Values.LB.getPointer(), // &LB
+ Values.UB.getPointer(), // &UB
+ Values.ST.getPointer(), // &Stride
+ CGF.Builder.getIntN(Values.IVSize, 1), // Incr (not used)
+ CGF.Builder.getIntN(Values.IVSize, 1) // Chunk (not used)
+ };
+ CGF.EmitRuntimeCall(DistributeDefaultInitFunction, Args);
+
+ return;
+ }
+
+ CGOpenMPRuntime::emitDistributeStaticInit(CGF, Loc, SchedKind, Values);
+}
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -1494,6 +1494,30 @@
const VarDecl *NativeParam,
const VarDecl *TargetParam) const;
+ /// Gets the default chunk size.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP schedule clause kind.
+ virtual llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPScheduleClauseKind ScheduleKind) const;
+
+ /// Gets the default chunk size for the dist_schedule clause.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP dist_schedule clause kind.
+ virtual llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPDistScheduleClauseKind ScheduleKind) const;
+
+ /// Choose a default value for the schedule clause.
+ virtual void chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const;
+
+ /// Choose a default value for the dist_schedule clause.
+ virtual void chooseDefaultSchedule(
+ OpenMPDistScheduleClauseKind *ScheduleKind) const;
+
/// Emits call of the outlined function with the provided arguments,
/// translating these arguments to correct target-specific arguments.
virtual void
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8947,6 +8947,26 @@
return Address::invalid();
}
+void CGOpenMPRuntime::chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const {}
+
+void CGOpenMPRuntime::chooseDefaultSchedule(
+ OpenMPDistScheduleClauseKind *ScheduleKind) const {}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPScheduleClauseKind ScheduleKind) const {
+ return nullptr;
+}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPDistScheduleClauseKind ScheduleKind) const {
+ return nullptr;
+}
+
llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
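For context, the codegen above targets new device runtime entry points
(__kmpc_distribute_default_init_{4,4u,8,8u}) whose implementation is not part
of this patch. A sketch of the expected declaration, derived from the
TypeParams array in createDistributeDefaultInitFunction and assuming the usual
libomptarget-nvptx conventions (the kmp_Ident name is an assumption):

  // Hypothetical device runtime declaration matching the emitted call.
  // The schedtype, plastiter, incr, and chunk arguments are passed but
  // marked "(not used)" by the codegen above.
  EXTERN void __kmpc_distribute_default_init_4(
      kmp_Ident *loc, int32_t global_tid, int32_t schedtype,
      int32_t *plastiter, int32_t *plower, int32_t *pupper,
      int32_t *pstride, int32_t incr, int32_t chunk);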