saiislam updated this revision to Diff 287513.
saiislam added a comment.

1. Moved the amdgcn-specific functions to CGOpenMPRuntimeAMDGCN.cpp
2. Removed a tautological condition
3. Corrected the casing of local variables
4. Restored the original formatting
5. Changed the declarations of the emit-kernel methods back to private
6. Added an amdgcn-specific PrePostActionTy implementation and its
   corresponding test cases
7. Replaced hard-coded line numbers in the new test cases with regexes
8. Other small code corrections
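
For context on item 6: a RegionCodeGenTy action brackets the emission of each
parallel region with Enter/Exit callbacks, and the amdgcn action uses those
callbacks to record the deepest nesting it sees. A rough standalone sketch of
the counting logic (illustration only, not part of the patch):

  int ParallelLevel = 0, MaxParallelLevel = 0;

  void Enter() {                    // a parallel region starts
    if (ParallelLevel > MaxParallelLevel)
      MaxParallelLevel = ParallelLevel;
    ++ParallelLevel;
  }
  void Exit() { --ParallelLevel; }  // the region ends

Emitting the four nested parallel directives in
test_amdgcn_target_max_parallel_level therefore leaves MaxParallelLevel == 3,
which generateMetaData() writes into the kernel descriptor and then resets for
the next kernel.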


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86097/new/

https://reviews.llvm.org/D86097

Files:
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGOpenMPRuntime.h
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  clang/test/OpenMP/amdgcn_target_codegen.cpp

Index: clang/test/OpenMP/amdgcn_target_codegen.cpp
===================================================================
--- clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,29 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 3 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4:.+]]_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5:.+]]_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6:.+]]_wg_size = weak addrspace(1) constant i16 1024
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 1024, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
 
@@ -40,4 +63,65 @@
   return arr[0];
 }
 
+int test_amdgcn_target_max_parallel_level() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_max_parallel_level
+  int arr[N];
+
+  #pragma omp target parallel for
+  for (int i = 0; i < N; i++)
+    #pragma omp parallel for
+    for (int j = 0; j < N; j++)
+      #pragma omp parallel for
+      for (int k = 0; k < N; k++)
+        for (int l = 0; l < N; l++)
+          #pragma omp parallel for
+          for (int m = 0; m < N; m++)
+            arr[m] = 0;
+
+  return arr[0];
+}
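+// The four nested parallel directives above are entered at levels 0 through
+// 3, so the kernel descriptor records MaxParallelLevel == 3 (the final "i8 3"
+// in the kern_desc CHECK line for LINE3 above).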
+
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
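+// thread_limit(10) on a generic (non-SPMD) kernel has the warp size (64 on
+// amdgcn) added to it, mirroring what runtime dispatch does; hence the 74,74
+// flat work-group size checked above.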
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
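+// thread_limit(1500) exceeds the device maximum after the warp-size bump, so
+// the work-group size is clamped to 1024.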
+
 #endif
+
+// CHECK: !0 = !{i32 0, i32 [[ARG1:[0-9]+]], i32 [[ARG2:[0-9]+]], !"_Z37test_amdgcn_target_max_parallel_levelv", i32 [[LINE3]], i32 2}
+// CHECK: !1 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z30test_amdgcn_target_tid_threadsv", i32 [[LINE1]], i32 0}
+// CHECK: !2 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z35test_amdgcn_target_tid_threads_simdv", i32 [[LINE2]], i32 1}
+// CHECK: !3 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z38test_amdgcn_target_attributes_non_spmdv", i32 [[LINE5]], i32 4}
+// CHECK: !4 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z34test_amdgcn_target_attributes_spmdv", i32 [[LINE4]], i32 3}
+// CHECK: !5 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z49test_amdgcn_target_attributes_max_work_group_sizev", i32 [[LINE6]], i32 5}
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,55 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef Name) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Create the NVPTX-specific PrePostActionTy. The caller takes ownership
+  /// of the returned object.
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName,
+                             llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID,
+                             bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
+};
+
+/// NVPTX specific class for PrePostActionTy
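+/// Saves and restores the enclosing IsInParallelRegion flag around the
+/// emission of a parallel region body.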
+class NVPTXPrePostActionTy : public PrePostActionTy {
+  bool &IsInParallelRegion;
+  bool PrevIsInParallelRegion;
+
+public:
+  NVPTXPrePostActionTy(bool &IsInParallelRegion)
+      : IsInParallelRegion(IsInParallelRegion) {}
+  void Enter(CodeGenFunction &CGF) override {
+    PrevIsInParallelRegion = IsInParallelRegion;
+    IsInParallelRegion = true;
+  }
+  void Exit(CodeGenFunction &CGF) override {
+    IsInParallelRegion = PrevIsInParallelRegion;
+  }
 };
 
 } // CodeGen namespace.
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (objects with
+  // different sizes are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,56 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
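+// The three allocate* overrides below recreate the globals exactly as they
+// were previously built inline in CGOpenMPRuntimeGPU.cpp; only the AMDGCN
+// overrides change linkage and initializers.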
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
+
+PrePostActionTy *CGOpenMPRuntimeNVPTX::getPrePostActionTy() {
+  return new NVPTXPrePostActionTy(IsInParallelRegion);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,9 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+  /// Linkage used for the static globalized record variables; set by each
+  /// target-specific runtime constructor (NVPTX, AMDGCN).
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +102,6 @@
                           uint64_t Size, int32_t Flags,
                           llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -194,6 +167,36 @@
   /// Full/Lightweight runtime mode. Used for better optimization.
   unsigned getDefaultLocationReserved2Flags() const override;
 
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
 public:
   explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
   void clear() override;
@@ -211,6 +214,39 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate global variable for TransferMedium
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate global variable for SharedStaticRD
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  /// Create a target-specific PrePostActionTy. The caller takes ownership of
+  /// the returned object.
+  virtual PrePostActionTy *getPrePostActionTy() = 0;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -375,7 +411,10 @@
     /// Generic data-sharing mode.
     Generic,
   };
-
+
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
+
   /// Cleans up references to the objects in finished function.
   ///
   void functionFinished(CodeGenFunction &CGF) override;
@@ -424,8 +463,6 @@
   /// true if currently emitting code for target/teams/distribute region, false
   /// - otherwise.
   bool IsInTTDRegion = false;
-  /// true if we're definitely in the parallel region.
-  bool IsInParallelRegion = false;
 
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    KernelStaticGlobalized = allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    KernelStaticGlobalized = allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1888,12 +1879,11 @@
 
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
@@ -1963,22 +1953,9 @@
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    bool &IsInParallelRegion;
-    bool PrevIsInParallelRegion;
-
-  public:
-    NVPTXPrePostActionTy(bool &IsInParallelRegion)
-        : IsInParallelRegion(IsInParallelRegion) {}
-    void Enter(CodeGenFunction &CGF) override {
-      PrevIsInParallelRegion = IsInParallelRegion;
-      IsInParallelRegion = true;
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      IsInParallelRegion = PrevIsInParallelRegion;
-    }
-  } Action(IsInParallelRegion);
-  CodeGen.setAction(Action);
+  // The returned action is heap-allocated; own it here so that it stays
+  // alive until the region emission completes.
+  std::unique_ptr<PrePostActionTy> Action(getPrePostActionTy());
+  CodeGen.setAction(*Action);
   bool PrevIsInTTDRegion = IsInTTDRegion;
   IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
@@ -3231,6 +3208,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3224,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5152,9 +5126,6 @@
           llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
     }
     // Allocate SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore this code as sson as nvlink is fixed.
     if (!SharedStaticRD->field_empty()) {
       llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
       QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5142,7 @@
     if (!SharedStaticRD->field_empty()) {
       QualType StaticTy = C.getRecordType(SharedStaticRD);
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
+      auto *GV = allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5163,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV = new llvm::GlobalVariable(
+          CGM.getModule(), LLVMArr2Ty,
+          /*isConstant=*/false, StaticRDLinkage,
+          llvm::Constant::getNullValue(LLVMArr2Ty),
+          "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5200,7 @@
     // Restore CommonLinkage as soon as nvlink is fixed.
     auto *GV = new llvm::GlobalVariable(
         CGM.getModule(), LLVMReductionsBufferTy,
-        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+        /*isConstant=*/false, StaticRDLinkage,
         llvm::Constant::getNullValue(LLVMReductionsBufferTy),
         "_openmp_teams_reductions_buffer_$_");
     KernelTeamsReductionPtr->setInitializer(
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -25,6 +25,11 @@
 class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
 
 public:
+  /// Current nesting depth of parallel regions while emitting a kernel.
+  int ParallelLevel = 0;
+  /// Deepest parallel nesting observed for the kernel being emitted.
+  int MaxParallelLevel = 0;
+  /// Cached __tgt_attribute_struct type; built lazily by
+  /// getTgtAttributeStructQTy().
+  QualType TgtAttributeStructQTy;
+
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
   /// Get the GPU warp size.
@@ -35,8 +40,79 @@
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate the KernelStaticGlobalized variable: a pointer in shared (LDS)
+  /// memory to the global memory buffer used by the given kernel.
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Create the AMDGCN-specific PrePostActionTy. The caller takes ownership
+  /// of the returned object.
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName,
+                             llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID,
+                             bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable (<kernel>_wg_size) recording the flat
+  /// work-group size chosen for this kernel; values are capped at 1024.
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate the <kernel>_wg_size and <kernel>_kern_desc globals and set an
+  /// appropriate amdgpu-flat-work-group-size attribute on the kernel.
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
+
+  /// Returns __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit structure descriptor for a kernel
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
 };
 
+/// AMDGCN specific PrePostActionTy implementation
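+/// Enter/Exit bracket the emission of each parallel region: Enter records
+/// the deepest nesting seen so far in MaxParallelLevel, which
+/// generateMetaData() later writes into the kernel descriptor.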
+class AMDGCNPrePostActionTy : public PrePostActionTy {
+  int &ParallelLevel;
+  int &MaxParallelLevel;
+
+public:
+  AMDGCNPrePostActionTy(int &ParallelLevel, int &MaxParallelLevel)
+      : ParallelLevel(ParallelLevel), MaxParallelLevel(MaxParallelLevel) {}
+  void Enter(CodeGenFunction &CGF) override {
+    // Count the number of nested parallels.
+    if (ParallelLevel > MaxParallelLevel)
+      MaxParallelLevel = ParallelLevel;
+    ParallelLevel++;
+  }
+  void Exit(CodeGenFunction &CGF) override { ParallelLevel--; }
+};
+
 } // namespace CodeGen
 } // namespace clang
 
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,176 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
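+// LDS (addrspace(3)) variables cannot carry a meaningful compile-time
+// initializer on amdgcn, so the shared globals below are created with an
+// undef initializer and marked externally_initialized.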
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
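+// The <kernel>_wg_size global is kept alive via compiler.used so that the
+// target runtime can read the chosen work-group size at launch time
+// (assumption based on how the companion kern_desc global is consumed).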
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized*/ false);
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads can be handled here; skip the
+    // case where both clauses are present.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the thread limit is a compile-time constant > 0.
+    if (CompileTimeThreadLimit > 0) {
+      // Add the WarpSize to generic, to reflect what runtime dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+      FlatAttrEmitted = true;
+    }
+  } // end of amdgcn teams or parallel directive
+
+  // Emit the default amdgpu-flat-work-group-size if none was emitted above.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for the runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  emitStructureKernelDesc(CGM, KernDescName, FlatAttr, IsGeneric,
+                          /*HostServices=*/1, MaxParallelLevel);
+  // Reset it to zero for any subsequent kernel.
+  MaxParallelLevel = 0;
+}
+
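+// Both wrappers run the target-independent kernel emission first and then
+// attach the amdgcn-specific metadata (the flat work-group size attribute
+// and the _wg_size/_kern_desc globals).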
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
+
+PrePostActionTy *CGOpenMPRuntimeAMDGCN::getPrePostActionTy() {
+  return new AMDGCNPrePostActionTy(ParallelLevel, MaxParallelLevel);
+}
+
+/// Emit the __tgt_attribute_struct kernel descriptor for a kernel.
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices, int8_t MaxParallelLevel) {
+  // Fill in the descriptor fields.
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+  llvm::GlobalVariable *KernDesc = createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Name + Twine("_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(KernDesc);
+}
+
+// Create Tgt Attribute Struct type.
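+// For reference, the equivalent C layout (9 bytes, matching the size field
+// emitted in emitStructureKernelDesc) is roughly:
+//   struct __tgt_attribute_struct {
+//     int16_t Version;
+//     int16_t StructSize;       // in bytes
+//     int16_t WGSize;
+//     int8_t  Mode;             // 0 => SPMD, 1 => generic
+//     int8_t  HostServices;
+//     int8_t  MaxParallelLevel;
+//   };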
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Struct size in bytes
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // Mode
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // HostServices
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -495,6 +495,7 @@
   QualType TgtOffloadEntryQTy;
   /// Entity that registers the offloading constants that were emitted so
   /// far.
   class OffloadEntriesInfoManagerTy {
     CodeGenModule &CGM;
 
@@ -681,7 +682,6 @@
         OffloadDeviceGlobalVarEntryInfoActTy;
     void actOnDeviceGlobalVarEntriesInfo(
         const OffloadDeviceGlobalVarEntryInfoActTy &Action);
   private:
     // Storage for target region entries kind. The storage is to be indexed by
     // file ID, device ID, parent function name and line number.
@@ -2476,6 +2476,18 @@
   }
 };
 
+/// Declarations of functions visible in the clang::CodeGen namespace, for
+/// use by target-specific specializations of CGOpenMPRuntimeGPU.
+
+FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                QualType FieldTy);
+
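+/// Note: createGlobalStruct is defined in CGOpenMPRuntime.cpp, so callers in
+/// other translation units rely on an instantiation with matching argument
+/// types already being emitted there.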
+template <class... As>
+llvm::GlobalVariable *
+createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
+                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
+                   As &&... Args);
+
 } // namespace CodeGen
 } // namespace clang
 
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                             AlignmentSource::Decl);
 }
 
-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,9 +1341,20 @@
   }
 }
 
+FieldDecl *clang::CodeGen::addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                                QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
+llvm::GlobalVariable *
+clang::CodeGen::createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
                    ArrayRef<llvm::Constant *> Data, const Twine &Name,
                    As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
