gtbercea created this revision.
gtbercea added reviewers: ABataev, caomhin.
Herald added subscribers: cfe-commits, guansong, jholewinski.
For the OpenMP NVPTX toolchain, choose default schedules that ensure coalesced
memory accesses on the GPU when in SPMD mode. This significantly improves the
performance of offloaded target code.
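Concretely, in SPMD mode the new defaults behave as if a chunked static
schedule had been written explicitly. A minimal sketch for illustration only
(not part of the patch; the exact chunk values are computed by the codegen
changes below):

  // With this patch, in SPMD mode the combined directive below behaves
  // roughly as if dist_schedule(static, <threads-in-team>) and
  // schedule(static, 1) had been specified: consecutive threads access
  // consecutive elements of 'a', so global memory accesses coalesce.
  #pragma omp target teams distribute parallel for map(tofrom: a[0:n])
  for (int i = 0; i < n; ++i)
    a[i] += 1; // thread t runs iterations t, t + num_threads, ...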
Repository:
rC Clang
https://reviews.llvm.org/D52434
Files:
lib/CodeGen/CGOpenMPRuntime.cpp
lib/CodeGen/CGOpenMPRuntime.h
lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
lib/CodeGen/CGOpenMPRuntimeNVPTX.h
lib/CodeGen/CGStmtOpenMP.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -34,7 +34,7 @@
l = i;
}
- #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
+ #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
for(int i = 0; i < n; i++) {
aa[i] += 1;
}
@@ -81,44 +81,44 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL2:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL3:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
// CHECK: {{call|invoke}} void [[OUTL4:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -24,7 +24,7 @@
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @
// CHECK: call void @__kmpc_spmd_kernel_init(
-// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_distribute_default_init_4(
// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}})
// CHECK: br label %
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -36,7 +36,7 @@
l = i;
}
- #pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
+#pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
for(int i = 0; i < n; i++) {
aa[i] += 1;
}
@@ -86,44 +86,44 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL2:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
// CHECK: {{call|invoke}} void [[OUTL3:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
// CHECK: {{call|invoke}} void [[OUTL4:@.+]](
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: call void @__kmpc_spmd_kernel_deinit()
// CHECK: ret void
// CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -2300,7 +2300,9 @@
// Detect the loop schedule kind and chunk.
llvm::Value *Chunk = nullptr;
OpenMPScheduleTy ScheduleKind;
- if (const auto *C = S.getSingleClause<OMPScheduleClause>()) {
+ const auto *C = S.getSingleClause<OMPScheduleClause>();
+ if (C) {
+ // If the schedule clause is present.
ScheduleKind.Schedule = C->getScheduleKind();
ScheduleKind.M1 = C->getFirstScheduleModifier();
ScheduleKind.M2 = C->getSecondScheduleModifier();
@@ -2310,7 +2312,13 @@
S.getIterationVariable()->getType(),
S.getBeginLoc());
}
+ } else {
+ // When the schedule clause is absent, choose sensible defaults.
+ CGM.getOpenMPRuntime().chooseDefaultSchedule(&ScheduleKind.Schedule);
+ Chunk = CGM.getOpenMPRuntime().getDefaultChunkValue(
+ *this, S, ScheduleKind.Schedule);
}
+
const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
// OpenMP 4.5, 2.7.1 Loop Construct, Description.
@@ -3326,6 +3334,7 @@
S.getBeginLoc());
}
}
+
const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -340,6 +340,35 @@
///
void functionFinished(CodeGenFunction &CGF) override;
+ /// For CUDA, to ensure coalescing, the default schedule is chunked.
+ /// This returns false in the default case to reflect that.
+ ///
+ bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
+ bool Chunked) const override;
+
+ /// Gets the default chunk size.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP schedule clause kind.
+ llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPScheduleClauseKind ScheduleKind) const override;
+
+ /// Choose a default value for the schedule clause.
+ void chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const override;
+
+ /// Returns the runtime function used to initialize the default
+ /// distribute schedule.
+ llvm::Constant *createDistributeDefaultInitFunction(unsigned IVSize,
+ bool IVSigned);
+
+ /// Emits a device-specific call to the distribute static init runtime function.
+ void emitDistributeStaticInit(
+ CodeGenFunction &CGF, SourceLocation Loc,
+ OpenMPDistScheduleClauseKind SchedKind,
+ const CGOpenMPRuntime::StaticRTInput &Values) override;
+
private:
/// Track the execution mode when codegening directives within a target
/// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4019,3 +4019,97 @@
FunctionGlobalizedDecls.erase(CGF.CurFn);
CGOpenMPRuntime::functionFinished(CGF);
}
+
+bool CGOpenMPRuntimeNVPTX::isStaticNonchunked(
+ OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+ // For OMPC_DIST_SCHEDULE_unknown we change the default to
+ // dist_schedule(static, <number of threads>). Since the new default is
+ // chunked we need to return false.
+ if (ScheduleKind == OMPC_DIST_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ return false;
+ return CGOpenMPRuntime::isStaticNonchunked(ScheduleKind, Chunked);
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPScheduleClauseKind ScheduleKind) const {
+ // For NVPTX, the default schedule for parallel for uses a chunk size of 1
+ // for coalescing purposes.
+ if (ScheduleKind == OMPC_SCHEDULE_static &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ return CGF.Builder.getIntN(CGM.getDataLayout().getTypeAllocSizeInBits(
+ CGF.ConvertType(S.getIterationVariable()->getType())), 1);
+ return CGOpenMPRuntime::getDefaultChunkValue(CGF, S, ScheduleKind);
+}
+
+void CGOpenMPRuntimeNVPTX::chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const {
+ if (*ScheduleKind == OMPC_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+ *ScheduleKind = OMPC_SCHEDULE_static;
+}
+
+llvm::Constant *CGOpenMPRuntimeNVPTX::createDistributeDefaultInitFunction(
+ unsigned IVSize, bool IVSigned) {
+ assert((IVSize == 32 || IVSize == 64) &&
+ "IV size is not compatible with the omp runtime");
+ StringRef Name = IVSize == 32 ? (IVSigned ? "__kmpc_distribute_default_init_4"
+ : "__kmpc_distribute_default_init_4u")
+ : (IVSigned ? "__kmpc_distribute_default_init_8"
+ : "__kmpc_distribute_default_init_8u");
+ llvm::Type *ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
+ auto *PtrTy = llvm::PointerType::getUnqual(ITy);
+ llvm::Type *TypeParams[] = {
+ getIdentTyPointerTy(), // loc
+ CGM.Int32Ty, // tid
+ CGM.Int32Ty, // schedtype
+ llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+ PtrTy, // p_lower
+ PtrTy, // p_upper
+ PtrTy, // p_stride
+ ITy, // incr
+ ITy // chunk
+ };
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+ return CGM.CreateRuntimeFunction(FnTy, Name);
+}
+
+void CGOpenMPRuntimeNVPTX::emitDistributeStaticInit(
+ CodeGenFunction &CGF, SourceLocation Loc,
+ OpenMPDistScheduleClauseKind SchedKind,
+ const CGOpenMPRuntime::StaticRTInput &Values) {
+
+ // When using the default schedule in SPMD mode, more efficient code
+ // can be emitted.
+ if (SchedKind == OMPC_DIST_SCHEDULE_unknown &&
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
+ // Use a smart default: split the loop into chunks of size equal to
+ // the number of threads in the team so that only one iteration per
+ // thread is allocated.
+ if (!CGF.HaveInsertPoint())
+ return;
+
+ llvm::Value *UpdatedLocation =
+ CGOpenMPRuntime::emitUpdateLocation(CGF, Loc);
+ llvm::Constant *DistributeDefaultInitFunction =
+ createDistributeDefaultInitFunction(Values.IVSize, Values.IVSigned);
+
+ llvm::Value *Args[] = {
+ UpdatedLocation, // (not used)
+ getThreadID(CGF, Loc), // (not used)
+ CGF.Builder.getInt32(1), // Schedule type (not used)
+ Values.IL.getPointer(), // &isLastIter (not used)
+ Values.LB.getPointer(), // &LB
+ Values.UB.getPointer(), // &UB
+ Values.ST.getPointer(), // &Stride
+ CGF.Builder.getIntN(Values.IVSize, 1), // Incr (not used)
+ CGF.Builder.getIntN(Values.IVSize, 1) // Chunk (not used)
+ };
+ CGF.EmitRuntimeCall(DistributeDefaultInitFunction, Args);
+
+ return;
+ }
+
+ CGOpenMPRuntime::emitDistributeStaticInit(CGF, Loc, SchedKind, Values);
+}
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -1494,6 +1494,30 @@
const VarDecl *NativeParam,
const VarDecl *TargetParam) const;
+ /// Gets the default chunk size.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP schedule clause kind.
+ virtual llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPScheduleClauseKind ScheduleKind) const;
+
+ /// Gets the default chunk size for the dist_schedule clause.
+ /// \param CGF Current code generation function.
+ /// \param S Loop directive.
+ /// \param ScheduleKind OpenMP dist_schedule clause kind.
+ virtual llvm::Value *getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S,
+ OpenMPDistScheduleClauseKind ScheduleKind) const;
+
+ /// Choose a default value for the schedule clause.
+ virtual void chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const;
+
+ /// Choose a default value for the dist_schedule clause.
+ virtual void chooseDefaultSchedule(
+ OpenMPDistScheduleClauseKind *ScheduleKind) const;
+
/// Emits call of the outlined function with the provided arguments,
/// translating these arguments to correct target-specific arguments.
virtual void
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8947,6 +8947,26 @@
return Address::invalid();
}
+void CGOpenMPRuntime::chooseDefaultSchedule(
+ OpenMPScheduleClauseKind *ScheduleKind) const {}
+
+void CGOpenMPRuntime::chooseDefaultSchedule(
+ OpenMPDistScheduleClauseKind *ScheduleKind) const {}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPScheduleClauseKind ScheduleKind) const {
+ return nullptr;
+}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(CodeGenFunction &CGF,
+ const OMPLoopDirective &S, OpenMPDistScheduleClauseKind ScheduleKind) const {
+ return nullptr;
+}
+
llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
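For context, the codegen above targets new device runtime entry points
(__kmpc_distribute_default_init_{4,4u,8,8u}) whose implementation is not part
of this patch. A sketch of the expected declaration, derived from the
TypeParams array in createDistributeDefaultInitFunction and assuming the usual
libomptarget-nvptx conventions (the kmp_Ident name is an assumption):

  // Hypothetical device runtime declaration matching the emitted call.
  // The schedtype, plastiter, incr, and chunk arguments are passed but
  // marked "(not used)" by the codegen above.
  EXTERN void __kmpc_distribute_default_init_4(
      kmp_Ident *loc, int32_t global_tid, int32_t schedtype,
      int32_t *plastiter, int32_t *plower, int32_t *pupper,
      int32_t *pstride, int32_t incr, int32_t chunk);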