[PATCH] D124721: [OpenMP] Allow compiling multiple target architectures with OpenMP

Joseph Huber via Phabricator via cfe-commits Sat, 30 Apr 2022 16:04:46 -0700

jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, JonChesterfield, yaxunl, saiislam, 
tianshilei1992, tra.
Herald added subscribers: kerbowa, guansong, jvesely.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1, MaskRay.
Herald added a project: clang.


This patch adds support for OpenMP to use the `--offload-arch` and
`--no-offload-arch` options. Traditionally, OpenMP has only supported
compiling for a single architecture via the `-Xopenmp-target` option.
Now we can pass in a bound architecture and use that if given, otherwise
we default to the value of the `-march` option as before.

Note that this only applies the basic support, the OpenMP target runtime
does not yet know how to choose between multiple architectures.
Additionally other parts of the offloading toolchain (e.g. LTO) require
the `-march` option, these should be worked out later.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124721

Files:
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/test/Driver/amdgpu-openmp-toolchain-new.c
  clang/test/Driver/openmp-offload-gpu-new.c

Index: clang/test/Driver/openmp-offload-gpu-new.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu-new.c
+++ clang/test/Driver/openmp-offload-gpu-new.c
@@ -10,6 +10,10 @@
 // RUN:          -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \
 // RUN:          --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc %s 2>&1 \
 // RUN:   | FileCheck %s
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:          --offload-arch=sm_52 \
+// RUN:          --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc %s 2>&1 \
+// RUN:   | FileCheck %s
 
 // verify the tools invocations
 // CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c"
@@ -40,6 +44,15 @@
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ]]"], output: "[[HOST_OBJ:.*]]"
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
 
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --offload-arch=sm_52 --offload-arch=sm_70 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-ARCH-BINDINGS
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC_SM_52:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_BC_SM_52]]"], output: "[[DEVICE_OBJ_SM_52:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC_SM_70:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_BC_SM_70]]"], output: "[[DEVICE_OBJ_SM_70:.*]]"
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ_SM_52]]", "[[DEVICE_OBJ_SM_70]]"], output: "[[HOST_OBJ:.*]]"
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
 // CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"{{.*}}"-emit-llvm"
 
Index: clang/test/Driver/amdgpu-openmp-toolchain-new.c
===================================================================
--- clang/test/Driver/amdgpu-openmp-toolchain-new.c
+++ clang/test/Driver/amdgpu-openmp-toolchain-new.c
@@ -3,6 +3,9 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
 // RUN:          -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
 // RUN:   | FileCheck %s
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
+// RUN:          --offload-arch=gfx906 --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN:   | FileCheck %s
 
 // verify the tools invocations
 // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c"
@@ -34,6 +37,7 @@
 // CHECK-NOGPULIB-NOT: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgpu-gfx803.bc"{{.*}}
 
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa --offload-arch=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]"
 // CHECK-BINDINGS: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC:.*]]"
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_BC]]"], output: "[[HOST_OBJ:.*]]"
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -847,10 +847,10 @@
       if (!llvm::is_contained(*DAL, A))
         DAL->append(A);
 
-    StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
-    if (Arch.empty())
+    if (!DAL->hasArg(options::OPT_march_EQ))
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
-                        CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
+                        !BoundArch.empty() ? BoundArch
+                                           : CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
 
     return DAL;
   }
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -307,9 +307,10 @@
       if (!llvm::is_contained(*DAL, A))
         DAL->append(A);
 
-    std::string Arch = DAL->getLastArgValue(options::OPT_march_EQ).str();
-    if (Arch.empty()) {
-      checkSystemForAMDGPU(Args, *this, Arch);
+    if (!DAL->hasArg(options::OPT_march_EQ)) {
+      std::string Arch = BoundArch.str();
+      if (BoundArch.empty())
+        checkSystemForAMDGPU(Args, *this, Arch);
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);
     }
 
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -4215,15 +4215,18 @@
 static StringRef getCanonicalArchString(Compilation &C,
                                         llvm::opt::DerivedArgList &Args,
                                         StringRef ArchStr,
-                                        Action::OffloadKind Kind) {
-  if (Kind == Action::OFK_Cuda) {
+                                        Action::OffloadKind Kind,
+                                        const ToolChain *TC) {
+  if (Kind == Action::OFK_Cuda ||
+      (Kind == Action::OFK_OpenMP && TC->getTriple().isNVPTX())) {
     CudaArch Arch = StringToCudaArch(ArchStr);
     if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
       C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
       return StringRef();
     }
     return Args.MakeArgStringRef(CudaArchToString(Arch));
-  } else if (Kind == Action::OFK_HIP) {
+  } else if (Kind == Action::OFK_HIP ||
+             (Kind == Action::OFK_OpenMP && TC->getTriple().isAMDGPU())) {
     llvm::StringMap<bool> Features;
     // getHIPOffloadTargetTriple() is known to return valid value as it has
     // been called successfully in the CreateOffloadingDeviceToolChains().
@@ -4259,11 +4262,7 @@
 /// architctures we return a set containing only the empty string.
 static llvm::DenseSet<StringRef>
 getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
-                Action::OffloadKind Kind) {
-
-  // If this is OpenMP offloading we don't use a bound architecture.
-  if (Kind == Action::OFK_OpenMP)
-    return llvm::DenseSet<StringRef>{StringRef()};
+                Action::OffloadKind Kind, const ToolChain *TC) {
 
   // --offload and --offload-arch options are mutually exclusive.
   if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
@@ -4279,12 +4278,12 @@
   llvm::DenseSet<StringRef> Archs;
   for (auto &Arg : Args) {
     if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
-      Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+      Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind, TC));
     } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) {
       if (Arg->getValue() == StringRef("all"))
         Archs.clear();
       else
-        Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+        Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind, TC));
     }
   }
 
@@ -4300,6 +4299,11 @@
       Archs.insert(CudaArchToString(CudaArch::CudaDefault));
     else if (Kind == Action::OFK_HIP)
       Archs.insert(CudaArchToString(CudaArch::HIPDefault));
+    else if (Kind == Action::OFK_OpenMP)
+      Archs.insert(StringRef());
+  } else {
+    Args.ClaimAllArgs(options::OPT_offload_arch_EQ);
+    Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ);
   }
 
   return Archs;
@@ -4345,7 +4349,7 @@
     // Get the product of all bound architectures and toolchains.
     SmallVector<std::pair<const ToolChain *, StringRef>> TCAndArchs;
     for (const ToolChain *TC : ToolChains)
-      for (StringRef Arch : getOffloadArchs(C, Args, Kind))
+      for (StringRef Arch : getOffloadArchs(C, Args, Kind, TC))
         TCAndArchs.push_back(std::make_pair(TC, Arch));
 
     for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
@@ -4374,9 +4378,9 @@
           HostAction->setCannotBeCollapsedWithNextDependentAction();
           OffloadAction::HostDependence HDep(
               *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-              /*BoundArch=*/nullptr, Kind);
+              TCAndArch->second.data(), Kind);
           OffloadAction::DeviceDependences DDep;
-          DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind);
+          DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
           A = C.MakeAction<OffloadAction>(HDep, DDep);
         } else if (isa<AssembleJobAction>(A) && Kind == Action::OFK_Cuda) {
           // The Cuda toolchain uses fatbinary as the linker phase to bundle the

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D124721: [OpenMP] Allow compiling multiple target architectures with OpenMP

Reply via email to