saiislam updated this revision to Diff 298377.
saiislam added a comment.

1. Removed unnecessary formatting of untouched code.
2. Encapsulated the addFieldToRecordDecl() and createGlobalStruct() helpers in a class and made them static (this changed all call sites).
3. Marked most of the member methods of CGOpenMPRuntimeAMDGCN as private (the equivalent change for the NVPTX class is still pending).
4. Fixed the memory leak (the PrePostActionTy returned by getPrePostActionTy() is now owned by a std::unique_ptr at the call site).
5. Marked the appropriate member variables as protected in CGOpenMPRuntimeGPU.
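To illustrate item 4: the leak came from the new virtual factory getPrePostActionTy(), which returns a heap-allocated action. Below is a minimal, self-contained C++ sketch of the ownership pattern now used at the call site — the types are simplified stand-ins, not the exact code from this diff:

  #include <memory>

  // Simplified stand-ins for the CodeGen types; only the ownership
  // pattern mirrors the patch.
  struct PrePostActionTy {
    virtual ~PrePostActionTy() = default; // virtual dtor so the base
                                          // pointer deletes the derived action
    virtual void Enter() = 0;
    virtual void Exit() = 0;
  };

  struct NVPTXPrePostActionTy final : PrePostActionTy {
    void Enter() override { /* save and set IsInParallelRegion */ }
    void Exit() override { /* restore the previous value */ }
  };

  // The target-specific factory returns a raw owning pointer...
  PrePostActionTy *getPrePostActionTy() { return new NVPTXPrePostActionTy(); }

  void emitParallelOutlinedFunction() {
    // ...and the single caller takes ownership immediately, so the action
    // is destroyed when region emission finishes (previously it leaked).
    std::unique_ptr<PrePostActionTy> Action(getPrePostActionTy());
    Action->Enter();
    // ... emit the outlined region ...
    Action->Exit();
  }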
Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86097/new/

https://reviews.llvm.org/D86097

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  clang/test/OpenMP/amdgcn_target_codegen.cpp
Index: clang/test/OpenMP/amdgcn_target_codegen.cpp
===================================================================
--- clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,29 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 3 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4:.+]]_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5:.+]]_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6:.+]]_wg_size = weak addrspace(1) constant i16 1024
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 1024, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
 
@@ -40,4 +63,65 @@
   return arr[0];
 }
 
+int test_amdgcn_target_max_parallel_level() {
+  // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_max_parallel_level
+  int arr[N];
+
+#pragma omp target parallel for
+  for (int i = 0; i < N; i++)
+#pragma omp parallel for
+    for (int j = 0; j < N; j++)
+#pragma omp parallel for
+      for (int k = 0; k < N; k++)
+        for (int l = 0; l < N; l++)
+#pragma omp parallel for
+          for (int m = 0; m < N; m++)
+            arr[m] = 0;
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
 #endif
+
+// CHECK: !0 = !{i32 0, i32 [[ARG1:[0-9]+]], i32 [[ARG2:[0-9]+]], !"_Z37test_amdgcn_target_max_parallel_levelv", i32 [[LINE3]], i32 2}
+// CHECK: !1 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z30test_amdgcn_target_tid_threadsv", i32 [[LINE1]], i32 0}
+// CHECK: !2 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z35test_amdgcn_target_tid_threads_simdv", i32 [[LINE2]], i32 1}
+// CHECK: !3 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z38test_amdgcn_target_attributes_non_spmdv", i32 [[LINE5]], i32 4}
+// CHECK: !4 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z34test_amdgcn_target_attributes_spmdv", i32 [[LINE4]], i32 3}
+// CHECK: !5 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z49test_amdgcn_target_attributes_max_work_group_sizev", i32 [[LINE6]], i32 5}
\ No newline at end of file
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,54 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *allocateTransferMediumGlobal(CodeGenModule &CGM,
+                                                     llvm::ArrayType *Ty,
+                                                     StringRef Name) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostAction
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// NVPTX specific class for PrePostActionTy
+  class NVPTXPrePostActionTy final : public PrePostActionTy {
+    bool &IsInParallelRegion;
+    bool PrevIsInParallelRegion;
+
+  public:
+    NVPTXPrePostActionTy(bool &IsInParallelRegion)
+        : IsInParallelRegion(IsInParallelRegion) {}
+    void Enter(CodeGenFunction &CGF) override {
+      PrevIsInParallelRegion = IsInParallelRegion;
+      IsInParallelRegion = true;
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      IsInParallelRegion = PrevIsInParallelRegion;
+    }
+  };
 };
 
 } // CodeGen namespace.
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (object with the
+  // different size are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,56 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
+
+PrePostActionTy *CGOpenMPRuntimeNVPTX::getPrePostActionTy() {
+  return new NVPTXPrePostActionTy(IsInParallelRegion);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,14 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+protected:
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
+
+  /// Linkage type of StaticRD Global variable
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +107,6 @@
                           uint64_t Size, int32_t Flags,
                           llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -194,6 +172,36 @@
   /// Full/Lightweight runtime mode. Used for better optimization.
   unsigned getDefaultLocationReserved2Flags() const override;
 
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
 public:
   explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
   void clear() override;
@@ -211,6 +219,39 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate global variable for TransferMedium
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate global variable for SharedStaticRD
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  /// Get target specific PrePostAction
+  virtual PrePostActionTy *getPrePostActionTy() = 0;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -424,8 +465,6 @@
   /// true if currently emitting code for target/teams/distribute region, false
   /// - otherwise.
   bool IsInTTDRegion = false;
-  /// true if we're definitely in the parallel region.
-  bool IsInParallelRegion = false;
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1888,12 +1879,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
@@ -1963,22 +1953,9 @@
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    bool &IsInParallelRegion;
-    bool PrevIsInParallelRegion;
-
-  public:
-    NVPTXPrePostActionTy(bool &IsInParallelRegion)
-        : IsInParallelRegion(IsInParallelRegion) {}
-    void Enter(CodeGenFunction &CGF) override {
-      PrevIsInParallelRegion = IsInParallelRegion;
-      IsInParallelRegion = true;
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      IsInParallelRegion = PrevIsInParallelRegion;
-    }
-  } Action(IsInParallelRegion);
-  CodeGen.setAction(Action);
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+  std::unique_ptr<PrePostActionTy> Action(RT.getPrePostActionTy());
+  CodeGen.setAction(*Action);
   bool PrevIsInTTDRegion = IsInTTDRegion;
   IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
@@ -3231,6 +3208,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3224,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5077,7 @@
 }
 
 void CGOpenMPRuntimeGPU::clear() {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
@@ -5152,9 +5126,6 @@
             llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
       }
       // Allocate SharedMemorySize buffer for the shared memory.
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore this code as sson as nvlink is fixed.
       if (!SharedStaticRD->field_empty()) {
         llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
         QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5142,7 @@
     if (!SharedStaticRD->field_empty()) {
       QualType StaticTy = C.getRecordType(SharedStaticRD);
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
+      auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5163,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV =
+          new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+                                   /*isConstant=*/false, RT.StaticRDLinkage,
+                                   llvm::Constant::getNullValue(LLVMArr2Ty),
+                                   "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5200,7 @@
     // Restore CommonLinkage as soon as nvlink is fixed.
     auto *GV = new llvm::GlobalVariable(
         CGM.getModule(), LLVMReductionsBufferTy,
-        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+        /*isConstant=*/false, RT.StaticRDLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
     KernelTeamsReductionPtr->setInitializer(
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -27,6 +27,16 @@
 public:
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
+private:
+  /// Current nesting level of parallel region
+  int ParallelLevel = 0;
+
+  /// Maximum nesting level of parallel region
+  int MaxParallelLevel = 0;
+
+  /// Struct to store kernel descriptors
+  QualType TgtAttributeStructQTy;
+
   /// Get the GPU warp size.
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
 
@@ -35,6 +45,72 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Get global variable KernelStaticGlobalized which is a shared pointer for
+  /// the global memory in the global memory buffer used for the given kernel
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostActionTy
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable to indicate the flat-work-group-size
+  /// for this region. Values are [256..1024].
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct.
+  /// Also generate appropriate value of attribute amdgpu-flat-work-group-size
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
+
+  /// Returns __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit structure descriptor for a kernel
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
+
+  class AMDGCNPrePostActionTy final : public PrePostActionTy {
+    int &ParallelLevel;
+    int &MaxParallelLevel;
+
+  public:
+    AMDGCNPrePostActionTy(int &ParallelLevel, int &MaxParallelLevel)
+        : ParallelLevel(ParallelLevel), MaxParallelLevel(MaxParallelLevel) {}
+    void Enter(CodeGenFunction &CGF) override {
+      // Count the number of nested parallels.
+      if (ParallelLevel > MaxParallelLevel)
+        MaxParallelLevel = ParallelLevel;
+      ParallelLevel++;
+    }
+    void Exit(CodeGenFunction &CGF) override { ParallelLevel--; }
+  };
 };
 
 } // namespace CodeGen
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,176 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized*/ false);
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used; can't do it for both.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the clause is a compile-time constant > 0.
+    if (CompileTimeThreadLimit > 0) {
+      // Add the WarpSize to generic, to reflect what runtime dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+    }
+    FlatAttrEmitted = true;
+  } // end of amdgcn teams or parallel directive
+
+  // Emit amdgpu-flat-work-group-size if not emitted already.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                                 IsGeneric,
+                                                 1, // Uses HostServices
+                                                 MaxParallelLevel);
+  // Reset it to zero for any subsequent kernel.
+  MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
+
+PrePostActionTy *CGOpenMPRuntimeAMDGCN::getPrePostActionTy() {
+  return new AMDGCNPrePostActionTy(ParallelLevel, MaxParallelLevel);
+}
+
+/// Emit structure descriptor for a kernel
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices, int8_t MaxParallelLevel) {
+
+  // Create all device images
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+  llvm::GlobalVariable *AttrImages = clang::CodeGen::CodeGenUtil::createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Name + Twine("_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create Tgt Attribute Struct type.
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD,
+        KmpInt16Ty); // Struct Size in bytes.
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // Mode
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // HostServices
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -2476,6 +2476,20 @@
   }
 };
 
+/// To encapsulate helper methods to be used by target specific specializations
+/// of CGOpenMPRuntimeGPU.
+class CodeGenUtil {
+public:
+  static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                         QualType FieldTy);
+
+  template <class... As>
+  static llvm::GlobalVariable *createGlobalStruct(CodeGenModule &CGM, QualType Ty,
+                                                  bool IsConstant,
+                                                  ArrayRef<llvm::Constant *> Data,
+                                                  const Twine &Name, As &&... Args);
+};
+
 } // namespace CodeGen
 } // namespace clang
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                           AlignmentSource::Decl);
 }
 
-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,11 +1341,21 @@
   }
 }
 
+FieldDecl *clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                                             QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
-                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
-                   As &&... Args) {
+llvm::GlobalVariable *clang::CodeGen::CodeGenUtil::createGlobalStruct(
+    CodeGenModule &CGM, QualType Ty, bool IsConstant,
+    ArrayRef<llvm::Constant *> Data, const Twine &Name, As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
   const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(RD);
   ConstantInitBuilder CIBuilder(CGM);
@@ -3072,7 +3071,7 @@
       llvm::ConstantInt::get(CGM.Int32Ty, Flags),
       llvm::ConstantInt::get(CGM.Int32Ty, 0)};
   std::string EntryName = getName({"omp_offloading", "entry", ""});
-  llvm::GlobalVariable *Entry = createGlobalStruct(
+  llvm::GlobalVariable *Entry = CodeGenUtil::createGlobalStruct(
       CGM, getTgtOffloadEntryQTy(), /*IsConstant=*/true, Data,
       Twine(EntryName).concat(Name), llvm::GlobalValue::WeakAnyLinkage);
 
@@ -3350,12 +3349,12 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("__tgt_offload_entry");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
-  addFieldToRecordDecl(C, RD, C.getSizeType());
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
   RD->completeDefinition();
   RD->addAttr(PackedAttr::CreateImplicit(C));
@@ -3399,7 +3398,7 @@
     if (Pair.second.isLocalPrivate() &&
         VD->getType()->isLValueReferenceType())
       Type = C.getPointerType(Type);
-    FieldDecl *FD = addFieldToRecordDecl(C, RD, Type);
+    FieldDecl *FD = CodeGenUtil::addFieldToRecordDecl(C, RD, Type);
     if (VD->hasAttrs()) {
       for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
            E(VD->getAttrs().end());
@@ -3433,27 +3432,27 @@
   // };
   RecordDecl *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
   UD->startDefinition();
-  addFieldToRecordDecl(C, UD, KmpInt32Ty);
-  addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
   UD->completeDefinition();
   QualType KmpCmplrdataTy = C.getRecordType(UD);
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
   if (isOpenMPTaskLoopDirective(Kind)) {
     QualType KmpUInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
     QualType KmpInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt32Ty);
-    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
   }
   RD->completeDefinition();
   return RD;
@@ -3469,9 +3468,9 @@
   // };
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t_with_privates");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, KmpTaskTQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpTaskTQTy);
   if (const RecordDecl *PrivateRD = createPrivatesRecordDecl(CGM, Privates))
-    addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
   RD->completeDefinition();
   return RD;
 }
@@ -4100,9 +4099,9 @@
     RecordDecl *KmpAffinityInfoRD =
         C.buildImplicitRecord("kmp_task_affinity_info_t");
     KmpAffinityInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
     KmpAffinityInfoRD->completeDefinition();
     KmpTaskAffinityInfoTy = C.getRecordType(KmpAffinityInfoRD);
   }
@@ -4536,9 +4535,9 @@
   if (KmpDependInfoTy.isNull()) {
     RecordDecl *KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
     KmpDependInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
    KmpDependInfoRD->completeDefinition();
    KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
   }
@@ -5985,13 +5984,13 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("kmp_taskred_input_t");
   RD->startDefinition();
-  const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *OrigFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType());
-  const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FlagsFD = addFieldToRecordDecl(
+  const FieldDecl *SharedFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *OrigFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *SizeFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  const FieldDecl *InitFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FiniFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *CombFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FlagsFD = CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false));
   RD->completeDefinition();
   QualType RDType = C.getRecordType(RD);
@@ -11104,9 +11103,9 @@
     //  };
     RD = C.buildImplicitRecord("kmp_dim");
     RD->startDefinition();
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
     RD->completeDefinition();
     KmpDimTy = C.getRecordType(RD);
   } else {
@@ -11536,8 +11535,8 @@
   if (VI == I->getSecond().end()) {
     RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional");
     RD->startDefinition();
-    VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
-    FiredField = addFieldToRecordDecl(C, RD, C.CharTy);
+    VDField = CodeGenUtil::addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
+    FiredField = CodeGenUtil::addFieldToRecordDecl(C, RD, C.CharTy);
     RD->completeDefinition();
     NewType = C.getRecordType(RD);
     Address Addr = CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName());