ABataev created this revision.
ABataev added a reviewer: jdoerfert.
Herald added subscribers: jansvoboda11, dexonsmith, dang, guansong, yaxunl.
ABataev requested review of this revision.
Herald added a subscriber: sstefan1.
Herald added a project: clang.

Added options -f-[no]openmp-cuda-const-firstprivate to control the
address space of the the corresponding global variable, created for the
target firstprivate variable. Disabled by default.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D99689

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/CGOpenMPRuntime.cpp
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/Driver/openmp-offload-gpu.c
  clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp

Index: clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
===================================================================
--- clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
+++ clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -1,8 +1,10 @@
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-NONCONST
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-CONST
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-NONCONST
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-CONST
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -17,7 +19,8 @@
 // TCHECK-DAG:  [[TTII:%.+]] = type { i32, i32 }
 // TCHECK-DAG:  [[S1:%.+]] = type { double }
 
-// TCHECK: @__omp_offloading_firstprivate__{{.+}}_e_l27 = internal addrspace(4) global [[TTII]] zeroinitializer
+// TCHECK-NONCONST-NOT: @__omp_offloading_firstprivate__{{.+}}_e_l30
+// TCHECK-CONST:    @__omp_offloading_firstprivate__{{.+}}_e_l30 = internal addrspace(4) global [[TTII]] zeroinitializer
 int foo(int n, double *ptr) {
   int a = 0;
   short aa = 0;
@@ -36,7 +39,8 @@
   // TCHECK:  define {{.*}}void @__omp_offloading_{{.+}}([10 x float] addrspace(1)* noalias [[B_IN:%.+]], i{{[0-9]+}} [[A_IN:%.+]], [[TTII]]* noalias [[E_IN:%.+]])
   // TCHECK-NOT: alloca [[TTII]],
   // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
-  // TCHECK-NOT: alloca [[TTII]],
+  // TCHECK-NONCONST: alloca [[TTII]],
+  // TCHECK-CONST-NOT: alloca [[TTII]],
   // TCHECK-NOT: alloca i{{[0-9]+}},
   // TCHECK-64:  call void @llvm.dbg.declare(metadata [10 x float] addrspace(1)** %{{.+}}, metadata !{{[0-9]+}}, metadata !DIExpression())
   // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
Index: clang/test/Driver/openmp-offload-gpu.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu.c
+++ clang/test/Driver/openmp-offload-gpu.c
@@ -281,6 +281,26 @@
 // RUN:   | FileCheck -check-prefix=NO_CUDA_MODE %s
 // NO_CUDA_MODE-NOT: "-{{fno-|f}}openmp-cuda-mode"
 
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_CONST_FP %s
+// CUDA_CONST_FP: clang{{.*}}"-cc1"{{.*}}"-triple" "{{nvptx64-nvidia-cuda|amdgcn-amd-amdhsa}}"
+// CUDA_CONST_FP-SAME: "-fopenmp-cuda-const-firstprivate"
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_CONST_FP %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_CONST_FP %s
+// NO_CUDA_CONST_FP-NOT: "-{{fno-|f}}openmp-cuda-const-firstprivate"
+
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime 2>&1 \
 // RUN:   | FileCheck -check-prefix=FULL_RUNTIME %s
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-force-full-runtime -fopenmp-cuda-force-full-runtime 2>&1 \
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -3448,6 +3448,9 @@
   if (Opts.OpenMPCUDAMode)
     GenerateArg(Args, OPT_fopenmp_cuda_mode, SA);
 
+  if (Opts.OpenMPCUDAConstFirstprivate)
+    GenerateArg(Args, OPT_fopenmp_cuda_const_firstprivate, SA);
+
   if (Opts.OpenMPCUDATargetParallel)
     GenerateArg(Args, OPT_fopenmp_cuda_parallel_target_regions, SA);
 
@@ -3843,6 +3846,9 @@
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
   Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
                         Args.hasArg(options::OPT_fopenmp_cuda_mode);
+  Opts.OpenMPCUDAConstFirstprivate =
+      Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
+      Args.hasArg(options::OPT_fopenmp_cuda_const_firstprivate);
 
   // Set CUDA support for parallel execution of target regions for OpenMP target
   // NVPTX/AMDGCN if specified in options.
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -5571,6 +5571,13 @@
                        options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
         CmdArgs.push_back("-fopenmp-cuda-mode");
 
+      // When in OpenMP offloading mode with NVPTX target, forward
+      // cuda-const-firstprivate flag
+      if (Args.hasFlag(options::OPT_fopenmp_cuda_const_firstprivate,
+                       options::OPT_fno_openmp_cuda_const_firstprivate,
+                       /*Default=*/false))
+        CmdArgs.push_back("-fopenmp-cuda-const-firstprivate");
+
       // When in OpenMP offloading mode with NVPTX target, forward
       // cuda-parallel-target-regions flag
       if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions,
Index: clang/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -775,6 +775,7 @@
     return false;
   bool DeviceConstTarget =
       getLangOpts().OpenMPIsDevice &&
+      getLangOpts().OpenMPCUDAConstFirstprivate &&
       isOpenMPTargetExecutionDirective(D.getDirectiveKind());
   bool FirstprivateIsLastprivate = false;
   llvm::DenseMap<const VarDecl *, OpenMPLastprivateModifier> Lastprivates;
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9131,7 +9131,8 @@
       CombinedInfo.Types.push_back(getMapModifiersForPrivateClauses(CI));
       const VarDecl *VD = CI.getCapturedVar();
       auto I = FirstPrivateDecls.find(VD);
-      if (I != FirstPrivateDecls.end() &&
+      if (CGF.getLangOpts().OpenMPCUDAConstFirstprivate &&
+          I != FirstPrivateDecls.end() &&
           VD->getType().isConstant(CGF.getContext())) {
         llvm::Constant *Addr =
             CGF.CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF, VD);
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -2265,6 +2265,12 @@
 def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group<f_Group>,
   Flags<[NoArgumentUnused, HelpHidden]>,
   HelpText<"Support only serial execution of target regions on Cuda-based devices.">;
+def fopenmp_cuda_const_firstprivate : Flag<["-"], "fopenmp-cuda-const-firstprivate">, Group<f_Group>,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
+  HelpText<"Allow placement of firstprivate const variables in the constant address space.">;
+def fno_openmp_cuda_const_firstprivate : Flag<["-"], "fno-openmp-cuda-const-firstprivate">, Group<f_Group>,
+  Flags<[NoArgumentUnused, HelpHidden]>,
+  HelpText<"Do not allow placement of firstprivate const variables in the constant address space.">;
 def static_openmp: Flag<["-"], "static-openmp">,
   HelpText<"Use the static host OpenMP runtime while linking.">;
 def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -237,6 +237,7 @@
 LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
 LANGOPT(OpenMPOptimisticCollapse  , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
 LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.")
+LANGOPT(OpenMPCUDAConstFirstprivate, 1, 0, "Place firstprivate const variables in the constant address space.")
 LANGOPT(RenderScript      , 1, 0, "RenderScript")
 
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to