saiislam created this revision.
saiislam added reviewers: ABataev, jdoerfert, JonChesterfield.
Herald added subscribers: cfe-commits, guansong, yaxunl, jvesely, jholewinski.
Herald added a project: clang.
saiislam requested review of this revision.
Herald added a subscriber: sstefan1.

Provide support for amdgcn-specific global variables and attributes.
Generalize the allocation of several common global variables and provide
specialized implementations for nvptx and amdgcn.
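
For reference, the per-kernel descriptor this patch emits for amdgcn kernels
corresponds to the __tgt_attribute_struct record built in CGOpenMPRuntime.cpp.
A rough C sketch of its layout (field names here are illustrative, the record
fields themselves are unnamed; meanings are taken from the comments in the
patch):

  struct __tgt_attribute_struct {
    int16_t Version;          // descriptor version, currently 2
    int16_t StructSize;       // size of the struct in bytes, currently 9
    int16_t WGSize;           // compile-time workgroup size, 0 if unknown
    int8_t  Mode;             // 0 => SPMD, 1 => generic
    int8_t  HostServices;     // 1 => kernel uses host services
    int8_t  MaxParallelLevel; // maximum nesting depth of parallel regions
  };

Each target region gets one such <kernel>_kern_desc global with weak linkage,
kept alive via llvm.compiler.used, presumably so the device runtime can read
these properties when launching the kernel.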


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D86097

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  clang/test/OpenMP/amdgcn_target_codegen.cpp

Index: clang/test/OpenMP/amdgcn_target_codegen.cpp
===================================================================
--- clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,22 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
 
@@ -40,4 +56,40 @@
   return arr[0];
 }
 
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
 #endif
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,34 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate the global variable for KernelStaticGlobalized.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Emit a target-specific SPMD kernel.
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) override;
+  /// Emit a target-specific non-SPMD kernel.
+  virtual void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
 };
 
 } // CodeGen namespace.
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (object with the
+  // different size are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,52 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,9 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+  /// Linkage type of the StaticRD global variable.
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +102,6 @@
                           uint64_t Size, int32_t Flags,
                           llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -211,6 +184,64 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate the global variable for KernelStaticGlobalized.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1396,11 +1387,11 @@
 // warps participate in parallel work.
 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                      bool Mode) {
-  auto *GVMode =
-      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
-                               llvm::GlobalValue::WeakAnyLinkage,
-                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
-                               Twine(Name, "_exec_mode"));
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty,
+      /*isConstant=*/true, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
+      Twine(Name, "_exec_mode"));
   CGM.addCompilerUsedGlobal(GVMode);
 }
 
@@ -1888,12 +1879,11 @@
 
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
@@ -3231,6 +3221,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3237,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5090,7 @@
 }
 
 void CGOpenMPRuntimeGPU::clear() {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
@@ -5152,9 +5139,6 @@
           llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
     }
     // Allocate SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore this code as sson as nvlink is fixed.
     if (!SharedStaticRD->field_empty()) {
       llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
       QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5155,7 @@
     if (!SharedStaticRD->field_empty()) {
       QualType StaticTy = C.getRecordType(SharedStaticRD);
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
+      auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5176,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV =
+          new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+                                   /*isConstant=*/false, RT.StaticRDLinkage,
+                                   llvm::Constant::getNullValue(LLVMArr2Ty),
+                                   "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5213,7 @@
     // Restore CommonLinkage as soon as nvlink is fixed.
     auto *GV = new llvm::GlobalVariable(
         CGM.getModule(), LLVMReductionsBufferTy,
-        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+        /*isConstant=*/false, RT.StaticRDLinkage,
         llvm::Constant::getNullValue(LLVMReductionsBufferTy),
         "_openmp_teams_reductions_buffer_$_");
     KernelTeamsReductionPtr->setInitializer(
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -25,6 +25,10 @@
 class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
 
 public:
+  /// Current and maximum nesting levels of parallel regions.
+  int ParallelLevel = 0;
+  int MaxParallelLevel = 0;
+
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
   /// Get the GPU warp size.
@@ -35,6 +39,46 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate the global variable KernelStaticGlobalized, a pointer in LDS
+  /// (shared) memory to the globalized memory buffer for the given kernel.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Emit a target-specific SPMD kernel.
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) override;
+
+  /// Emit a target-specific non-SPMD kernel.
+  virtual void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable recording the flat workgroup size used
+  /// for this kernel.
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate the <kernel>_wg_size and <kernel>_kern_desc globals and set the
+  /// amdgpu-flat-work-group-size attribute on the outlined kernel.
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
 };
 
 } // namespace CodeGen
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,133 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVWGSize = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized=*/false);
+  CGM.addCompilerUsedGlobal(GVWGSize);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only handle exactly one of thread_limit or num_threads, not both.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the thread limit is a compile-time constant > 0.
+    if (CompileTimeThreadLimit > 0) {
+      // Add the warp size for generic kernels, matching runtime dispatch.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+      FlatAttrEmitted = true;
+    }
+  } // end of amdgcn teams or parallel directive
+
+  // Emit amdgpu-flat-work-group-size if it was not emitted above.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntime::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                           IsGeneric,
+                                           1, // Uses HostServices
+                                           MaxParallelLevel);
+  // Reset it to zero for any subsequent kernel.
+  MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -495,6 +495,8 @@
   QualType TgtOffloadEntryQTy;
+  /// Type of the __tgt_attribute_struct kernel descriptor.
+  QualType TgtAttributeStructQTy;
   /// Entity that registers the offloading constants that were emitted so
   /// far.
   class OffloadEntriesInfoManagerTy {
     CodeGenModule &CGM;
 
@@ -1755,6 +1757,14 @@
                            llvm::FunctionCallee OutlinedFn,
                            ArrayRef<llvm::Value *> Args = llvm::None) const;
 
+  /// Returns the __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit the __tgt_attribute_struct kernel descriptor for a kernel.
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
+
   /// Emits OpenMP-specific function prolog.
   /// Required for device constructs.
   virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D);
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3335,6 +3335,47 @@
   }
 }
 
+/// Emit the __tgt_attribute_struct kernel descriptor for a kernel.
+void CGOpenMPRuntime::emitStructureKernelDesc(CodeGenModule &CGM,
+                                              StringRef Name, int16_t WG_Size,
+                                              int8_t Mode, int8_t HostServices,
+                                              int8_t MaxParallelLevel) {
+
+  // Field initializers for the __tgt_attribute_struct kernel descriptor.
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+  llvm::GlobalVariable *KernelDesc = createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Name + Twine("_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(KernelDesc);
+}
+
+// Create the __tgt_attribute_struct type.
+QualType CGOpenMPRuntime::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  if (TgtAttributeStructQTy.isNull()) {
+    QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+    QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Struct Size in bytes.
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // Mode
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // HostServices
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
+
 QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
   // Make sure the type of the entry is already created. This is the type we
   // have to create: