saiislam created this revision.
saiislam added reviewers: jdoerfert, yaxunl, JonChesterfield, RaviNarayanaswamy.
Herald added subscribers: kerbowa, pengfei, guansong, nhaehnle, jvesely.
saiislam requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

Multiple offloading targets can now be specified on the command
line. A toolchain instance is created for each unique
combination of target triple and target GPU. The device runtime has
been modified to support binaries containing multiple images,
each for a different target.
The data structure "__tgt_image_info", defined in
"llvm-project/openmp/libomptarget/include/omptarget.h", is used
to pass the requirements of each image. For example, a GPU name such
as gfx906 or sm_35 is a requirement of the image; it is recorded
by clang-offload-wrapper and read by the device RTL.

Example:

  clang  -O2  -target x86_64-pc-linux-gnu -fopenmp \
    -fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa \
    -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 \
    -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 \
   helloworld.c -o helloworld


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D106870

Files:
  clang/include/clang/Basic/DiagnosticDriverKinds.td
  clang/include/clang/Driver/ToolChain.h
  clang/lib/Driver/Action.cpp
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/lib/Driver/ToolChains/Cuda.h
  clang/test/Driver/amdgpu-openmp-system-arch-fail.c
  clang/test/Driver/amdgpu-openmp-toolchain.c
  clang/test/Driver/hip-rdc-device-only.hip
  clang/test/Driver/hip-toolchain-rdc-separate.hip
  clang/test/Driver/openmp-offload-multi.c
  clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
  openmp/libomptarget/include/omptarget.h
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/interface.cpp
  openmp/libomptarget/src/rtl.cpp

Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -20,6 +20,7 @@
 #include <dlfcn.h>
 #include <mutex>
 #include <string>
+#include <sys/stat.h>
 
 // List of all plugins that can support offloading.
 static const char *RTLNames[] = {
@@ -288,18 +289,131 @@
      flags, RequiresFlags);
 }
 
+/// Query runtime capabilities of this system by calling offload-arch -c.
+/// offload_arch_output_buffer is persistent storage filled in by
+/// __tgt_get_active_offload_env.
+static void
+__tgt_get_active_offload_env(__tgt_active_offload_env *active_env,
+                             char *offload_arch_output_buffer,
+                             size_t offload_arch_output_buffer_size) {
+  void *handle = dlopen("libomptarget.so", RTLD_NOW);
+  if (!handle)
+    DP("dlopen() failed: %s\n", dlerror());
+  char *libomptarget_dir_name = new char[PATH_MAX];
+  if (dlinfo(handle, RTLD_DI_ORIGIN, libomptarget_dir_name) == -1)
+    DP("RTLD_DI_ORIGIN failed: %s\n", dlerror());
+  std::string cmd_bin;
+  cmd_bin.assign(libomptarget_dir_name).append("/../bin/amdgpu-arch");
+  struct stat stat_buffer;
+  if (stat(cmd_bin.c_str(), &stat_buffer)) {
+    DP("Missing offload-arch command at %s \n", cmd_bin.c_str());
+  } else {
+    // Add option to print capabilities of current system
+    // cmd_bin.append(" -c");
+    FILE *stream = popen(cmd_bin.c_str(), "r");
+    while (fgets(offload_arch_output_buffer, offload_arch_output_buffer_size,
+                 stream) != NULL)
+      ;
+    pclose(stream);
+    active_env->capabilities = offload_arch_output_buffer;
+    size_t slen = strlen(active_env->capabilities);
+    offload_arch_output_buffer[slen - 1] =
+        '\0'; // terminate string before line feed
+    offload_arch_output_buffer +=
+        slen; // To store next value in offload_arch_output_buffer, not likely
+  }
+  delete[] libomptarget_dir_name;
+}
+
+std::vector<std::string> _splitstrings(char *input, const char *sep) {
+  std::vector<std::string> split_strings;
+  std::string s(input);
+  std::string delimiter(sep);
+  size_t pos = 0;
+  while ((pos = s.find(delimiter)) != std::string::npos) {
+    if (pos != 0)
+      split_strings.push_back(s.substr(0, pos));
+    s.erase(0, pos + delimiter.length());
+  }
+  if (s.length() > 1)
+    split_strings.push_back(s.substr(0, s.length()));
+  return split_strings;
+}
+
+static bool _ImageIsCompatibleWithEnv(__tgt_image_info *img_info,
+                                      __tgt_active_offload_env *active_env) {
+  // get_image_info will return null if no image information was registered.
+  // If no image information, assume application built with old compiler and
+  // check each image.
+  if (!img_info)
+    return true;
+
+  // Each runtime requirement for the compiled image is stored in
+  // the img_info->requirements string and is separated by __ .
+  // Each runtime capability obtained from "offload-arch -c" is stored in
+  // active_env->capabilities and is separated by spaces.
+  // If every requirement has a matching capability, then the image
+  // is compatible with active environment
+
+  std::vector<std::string> reqs = _splitstrings(img_info->requirements, "__");
+  std::vector<std::string> caps = _splitstrings(active_env->capabilities, " ");
+
+  bool is_compatible = true;
+  for (auto req : reqs) {
+    bool missing_capability = true;
+    for (auto capability : caps)
+      if (capability == req)
+        missing_capability = false;
+    if (missing_capability) {
+      DP("Image requires %s but runtime capability %s is missing.\n",
+         img_info->requirements, req.c_str());
+      is_compatible = false;
+    }
+  }
+  return is_compatible;
+}
+
+#define MAX_CAPS_STR_SIZE 1024
 void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
+
+  // Get the current active offload environment
+  __tgt_active_offload_env offload_env;
+  // Need a buffer to hold results of offload-arch -c command
+  size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE;
+  char *offload_arch_output_buffer =
+      (char *)malloc(offload_arch_output_buffer_size);
+  __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer,
+                               offload_arch_output_buffer_size);
+
+  bool requires_usm = (bool)(RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY);
+  bool has_xnack = (std::string(offload_env.capabilities).find("xnack+") !=
+                    std::string::npos);
+  bool is_amd = (std::string(offload_env.capabilities).find("gfx") == 0);
+  if (is_amd && requires_usm && !has_xnack) {
+    fprintf(stderr, "WARNING: USM SET WITHOUT XNACK ENABLED.\n");
+    fprintf(stderr, "         THIS WILL BECOME FATAL ERROR IN FUTURE.\n");
+  }
+#if 0
+    FATAL_MESSAGE0(1, "'#pragma omp requires unified_shared_memory' requires "
+                      "environment with xnack+ capability!");
+#endif
+
+  RTLInfoTy *FoundRTL = NULL;
   PM->RTLsMtx.lock();
   // Register the images with the RTLs that understand them, if any.
   for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
     // Obtain the image.
     __tgt_device_image *img = &desc->DeviceImages[i];
 
-    RTLInfoTy *FoundRTL = NULL;
-
+    // Get corresponding image info requirements and check with runtime
+    __tgt_image_info *img_info = __tgt_get_image_info(i);
+    if (!_ImageIsCompatibleWithEnv(img_info, &offload_env))
+      continue;
+    FoundRTL = NULL;
     // Scan the RTLs that have associated images until we find one that supports
     // the current image.
     for (auto &R : AllRTLs) {
+
       if (!R.is_valid_binary(img)) {
         DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
            DPxPTR(img->ImageStart), R.RTLName.c_str());
@@ -368,7 +482,41 @@
   }
   PM->RTLsMtx.unlock();
 
+  if (!FoundRTL) {
+    if (PM->TargetOffloadPolicy == tgt_mandatory)
+      fprintf(stderr, "ERROR:\
+	Runtime capabilities do NOT meet any offload image requirements\n\
+	and the OMP_TARGET_OFFLOAD policy is mandatory.  Terminating!\n\
+	Runtime capabilities : %s\n",
+              offload_env.capabilities);
+    else if (PM->TargetOffloadPolicy == tgt_disabled)
+      fprintf(stderr, "WARNING: Offloading is disabled.\n");
+    else
+      fprintf(
+          stderr,
+          "WARNING: Runtime capabilities do NOT meet any image requirements.\n\
+	 So device offloading is now disabled.\n\
+	Runtime capabilities : %s\n",
+          offload_env.capabilities);
+    if (PM->TargetOffloadPolicy != tgt_disabled) {
+      for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+        __tgt_image_info *img_info = __tgt_get_image_info(i);
+        if (img_info)
+          fprintf(stderr, "\
+	  Image %d requirements : %s\n",
+                  i, img_info->requirements);
+        else
+          fprintf(stderr, "\
+	  Image %d has no requirements. Could be from older compiler\n",
+                  i);
+      }
+    }
+    if (PM->TargetOffloadPolicy == tgt_mandatory)
+      exit(1);
+  }
+
   DP("Done registering entries!\n");
+  free(offload_arch_output_buffer);
 }
 
 void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
Index: openmp/libomptarget/src/interface.cpp
===================================================================
--- openmp/libomptarget/src/interface.cpp
+++ openmp/libomptarget/src/interface.cpp
@@ -43,6 +43,30 @@
   PM->RTLs.RegisterLib(desc);
 }
 
+static __tgt_image_info **__tgt_AllImageInfos;
+static int __tgt_num_registered_images = 0;
+EXTERN void __tgt_register_image_info(__tgt_image_info *imageInfo) {
+
+  DP(" register_image_info image %d of %d  requirements:%s VERSION:%d\n",
+     imageInfo->image_number, imageInfo->number_images, imageInfo->requirements,
+     imageInfo->version);
+
+  if (!__tgt_AllImageInfos)
+    __tgt_AllImageInfos = (__tgt_image_info **)malloc(
+        sizeof(__tgt_image_info *) * imageInfo->number_images);
+  __tgt_AllImageInfos[imageInfo->image_number] = imageInfo;
+  __tgt_num_registered_images = imageInfo->number_images;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Return pointer to image information if it was registered
+EXTERN __tgt_image_info *__tgt_get_image_info(unsigned image_number) {
+  if (__tgt_num_registered_images)
+    return __tgt_AllImageInfos[image_number];
+  else
+    return nullptr;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
@@ -55,6 +79,10 @@
       }
     }
   }
+  if (__tgt_num_registered_images) {
+    free(__tgt_AllImageInfos);
+    __tgt_num_registered_images = 0;
+  }
 }
 
 /// creates host-to-target data mapping, stores it in the
Index: openmp/libomptarget/src/exports
===================================================================
--- openmp/libomptarget/src/exports
+++ openmp/libomptarget/src/exports
@@ -2,6 +2,7 @@
   global:
     __tgt_register_requires;
     __tgt_register_lib;
+    __tgt_register_image_info;
     __tgt_unregister_lib;
     __tgt_target_data_begin;
     __tgt_target_data_end;
Index: openmp/libomptarget/include/omptarget.h
===================================================================
--- openmp/libomptarget/include/omptarget.h
+++ openmp/libomptarget/include/omptarget.h
@@ -120,6 +120,44 @@
   __tgt_offload_entry *HostEntriesEnd;   // End of table (non inclusive)
 };
 
+/// __tgt_image_info:
+///
+/// The information in this struct is provided in clang-offload-wrapper
+/// as a call to __tgt_register_image_info for each image in the library
+/// of images also created by clang-offload-wrapper.
+/// __tgt_register_image_info is called for each image BEFORE the single
+/// call to __tgt_register_lib so that image information is available
+/// before they are loaded.  clang-offload-wrapper gets this image information
+/// from command line arguments provided by the clang driver when it creates
+/// the call to the __clang-offload-wrapper command.
+/// This architecture allows the binary image (pointed to by ImageStart and
+/// ImageEnd in __tgt_device_image) to remain architecture independent.
+/// That is, the architecture independent part of the libomptarget runtime
+/// does not need to peer inside the image to determine if it is loadable
+/// even though in most cases the image is an elf object.
+/// There is one __tgt_image_info for each __tgt_device_image. For backward
+/// compatibility, no changes are allowed to either __tgt_device_image or
+/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that
+/// the runtime is being used on a binary created by an old version of
+/// the compiler.
+///
+struct __tgt_image_info {
+  int32_t version;           // The version of this struct
+  int32_t image_number;      // Image number in image library starting from 0
+  int32_t number_images;     // Number of images, used for initial allocation
+  char *requirements;        // e.g. sm_30, sm_70, gfx906, includes features
+  char *compile_opts;        // reserved for future use
+};
+
+/// __tgt_active_offload_env
+///
+/// This structure is created by __tgt_get_active_offload_env and is used
+/// to determine compatibility of the images with the current environment
+/// that is "in play".
+struct __tgt_active_offload_env {
+  char *capabilities; // string returned by offload-arch -c
+};
+
 /// This struct contains the offload entries identified by the target runtime
 struct __tgt_target_table {
   __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
@@ -210,6 +248,13 @@
 /// adds a target shared library to the target execution image
 void __tgt_register_lib(__tgt_bin_desc *desc);
 
+/// adds an image information struct, called for each image
+void __tgt_register_image_info(__tgt_image_info *imageInfo);
+
+/// gets pointer to image information for specified image number
+/// Returns nullptr for apps built with old version of compiler
+__tgt_image_info *__tgt_get_image_info(uint32_t image_num);
+
 /// removes a target shared library from the target execution image
 void __tgt_unregister_lib(__tgt_bin_desc *desc);
 
Index: clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
===================================================================
--- clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
+++ clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
@@ -60,6 +60,11 @@
            cl::desc("Target triple for the output module"),
            cl::value_desc("triple"), cl::cat(ClangOffloadWrapperCategory));
 
+static cl::list<std::string>
+    OffloadArchs("requirements", cl::desc("requirements contains offload-arch"),
+                 cl::value_desc("requirements"),
+                 cl::cat(ClangOffloadWrapperCategory));
+
 namespace {
 
 class BinaryWrapper {
@@ -69,6 +74,7 @@
   StructType *EntryTy = nullptr;
   StructType *ImageTy = nullptr;
   StructType *DescTy = nullptr;
+  StructType *ImageInfoTy = nullptr;
 
 private:
   IntegerType *getSizeTTy() {
@@ -134,6 +140,27 @@
     return PointerType::getUnqual(getBinDescTy());
   }
 
+  // This matches the runtime struct definition of __tgt_image_info
+  // declared in openmp/libomptarget/include/omptarget.h:
+  // struct __tgt_image_info {
+  //   int32_t version;
+  //   int32_t image_number;
+  //   int32_t number_images;
+  //   char* requirements;
+  //   char* compile_opts;
+  // };
+  StructType *getImageInfoTy() {
+    if (!ImageInfoTy)
+      ImageInfoTy = StructType::create(
+          "__tgt_image_info", Type::getInt32Ty(C), Type::getInt32Ty(C),
+          Type::getInt32Ty(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C));
+    return ImageInfoTy;
+  }
+
+  PointerType *getImageInfoPtrTy() {
+    return PointerType::getUnqual(getImageInfoTy());
+  }
+
   /// Creates binary descriptor for the given device images. Binary descriptor
   /// is an object that is passed to the offloading runtime at program startup
   /// and it describes all device images available in the executable or shared
@@ -245,7 +272,9 @@
                               ".omp_offloading.descriptor");
   }
 
-  void createRegisterFunction(GlobalVariable *BinDesc) {
+  void createRegisterFunction(GlobalVariable *BinDesc,
+                              ArrayRef<ArrayRef<char>> Requirements) {
+
     auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
     auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
                                   ".omp_offloading.descriptor_reg", &M);
@@ -259,6 +288,47 @@
 
     // Construct function body
     IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
+
+    // Create calls to __tgt_register_image_info for each image
+    auto *NullPtr = llvm::ConstantPointerNull::get(Builder.getInt8PtrTy());
+    auto *Zero = ConstantInt::get(getSizeTTy(), 0u);
+    auto *RegInfoFuncTy =
+        FunctionType::get(Type::getVoidTy(C), getImageInfoPtrTy(), false);
+    FunctionCallee RegInfoFuncC =
+        M.getOrInsertFunction("__tgt_register_image_info", RegInfoFuncTy);
+    unsigned int img_count = 0;
+    for (ArrayRef<char> Requirement : Requirements) {
+      Constant *RequirementV = ConstantDataArray::get(C, Requirement);
+      auto *GV =
+          new GlobalVariable(M, RequirementV->getType(), /*isConstant*/ true,
+                             GlobalValue::InternalLinkage, RequirementV,
+                             Twine("__offload_arch_" + Twine(img_count)));
+      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+      // store value of these variables (i.e. offload archs) into a custom
+      // section which will be used by "offload-arch -f". It won't be
+      // removed during binary stripping.
+      GV->setSection(".offload_arch_list");
+
+      auto *RequirementVPtr =
+          ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero);
+      RequirementVPtr =
+          ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C));
+      auto *InfoInit = ConstantStruct::get(
+          getImageInfoTy(), ConstantInt::get(Type::getInt32Ty(C), 1),
+          ConstantInt::get(Type::getInt32Ty(C), img_count),
+          ConstantInt::get(Type::getInt32Ty(C), (uint32_t)Requirements.size()),
+          RequirementVPtr,
+          NullPtr // TODO: capture target-compile-opts from clang driver
+      );
+      auto *ImageInfoGV = new GlobalVariable(
+          M, InfoInit->getType(),
+          /*isConstant*/ true, GlobalValue::InternalLinkage, InfoInit,
+          Twine(".offload_image_info_" + Twine(img_count++)));
+      ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+      Builder.CreateCall(RegInfoFuncC, ImageInfoGV);
+    }
+
     Builder.CreateCall(RegFuncC, BinDesc);
     Builder.CreateRetVoid();
 
@@ -298,10 +368,11 @@
     M.setTargetTriple(Target);
   }
 
-  const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries) {
+  const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries,
+                             ArrayRef<ArrayRef<char>> Requirements) {
     GlobalVariable *Desc = createBinDesc(Binaries);
     assert(Desc && "no binary descriptor");
-    createRegisterFunction(Desc);
+    createRegisterFunction(Desc, Requirements);
     createUnregisterFunction(Desc);
     return M;
   }
@@ -363,10 +434,20 @@
     return 1;
   }
 
+  SmallVector<ArrayRef<char>, 4u> Requirements;
+  Requirements.reserve(OffloadArchs.size());
+  for (unsigned i = 0; i != OffloadArchs.size(); ++i) {
+    OffloadArchs[i].append("\0");
+    Requirements.emplace_back(OffloadArchs[i].data(),
+                              OffloadArchs[i].size() + 1);
+  }
+
   // Create a wrapper for device binaries and write its bitcode to the file.
-  WriteBitcodeToFile(BinaryWrapper(Target).wrapBinaries(
-                         makeArrayRef(Images.data(), Images.size())),
-                     Out.os());
+  WriteBitcodeToFile(
+      BinaryWrapper(Target).wrapBinaries(
+          makeArrayRef(Images.data(), Images.size()),
+          makeArrayRef(Requirements.data(), Requirements.size())),
+      Out.os());
   if (Out.os().has_error()) {
     reportError(createFileError(Output, Out.os().error()));
     return 1;
Index: clang/test/Driver/openmp-offload-multi.c
===================================================================
--- /dev/null
+++ clang/test/Driver/openmp-offload-multi.c
@@ -0,0 +1,34 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+//
+// Legacy mode (-fopenmp-targets,-Xopenmp-target,-march) tests for
+// multi arch compilation
+//
+// RUN:   %clang -### -target x86_64-linux-gnu -fopenmp\
+// RUN:   -fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa \
+// RUN:   -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 \
+// RUN:   -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 \
+// RUN:   %s 2>&1 | FileCheck %s
+
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-x" "c"{{.*}}
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "[[HOSTOBJ:.*.o]]" "-x" "ir"{{.*}}
+
+// compilation for offload target 1 : gfx906
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx906" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c
+// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc"
+// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o"
+// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX906OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o"
+
+// compilation for offload target 2 : gfx908
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx908" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c
+// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc"
+// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx908" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o"
+// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX908OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o"
+
+// Combining device images for offload targets
+// CHECK: clang-offload-wrapper"{{.*}}" "-o" "[[COMBINEDIR:.*.bc]]" "--requirements=gfx906" "[[GFX906OUT]]" "--requirements=gfx908" "[[GFX908OUT]]"
+
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa"{{.*}}"-o" "[[COMBINEDOBJ:.*.o]]" "-x" "ir" "[[COMBINEDIR]]"
+// CHECK: ld.lld"{{.*}}" "-o" "a.out{{.*}}[[HOSTOBJ]]" "[[COMBINEDOBJ]]{{.*}}" "-lomp{{.*}}-lomptarget"
Index: clang/test/Driver/hip-toolchain-rdc-separate.hip
===================================================================
--- clang/test/Driver/hip-toolchain-rdc-separate.hip
+++ clang/test/Driver/hip-toolchain-rdc-separate.hip
@@ -44,7 +44,7 @@
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-outputs=[[A_O:.*a.o]]" "-inputs=[[A_BC1]],[[A_BC2]],[[A_OBJ_HOST]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -79,7 +79,7 @@
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-outputs=[[B_O:.*b.o]]" "-inputs=[[B_BC1]],[[B_BC2]],[[B_OBJ_HOST]]"
 
 // RUN: touch %T/a.o
@@ -91,22 +91,22 @@
 // RUN: 2>&1 | FileCheck -check-prefix=LINK %s
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[A_O]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs=[[B_O]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]"
 // LINK: "-unbundle" "-allow-missing-bundles"
 
Index: clang/test/Driver/hip-rdc-device-only.hip
===================================================================
--- clang/test/Driver/hip-rdc-device-only.hip
+++ clang/test/Driver/hip-rdc-device-only.hip
@@ -82,7 +82,7 @@
 // COMMON-SAME: {{.*}} {{".*a.cu"}}
 
 // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
-// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -112,7 +112,7 @@
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
 // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
-// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
 
 // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -142,7 +142,7 @@
 // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]]
 
 // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
-// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll"
 
 // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -172,7 +172,7 @@
 // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]]
 
 // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
-// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll"
 
 // FAIL: error: cannot specify -o when generating multiple output files
Index: clang/test/Driver/amdgpu-openmp-toolchain.c
===================================================================
--- clang/test/Driver/amdgpu-openmp-toolchain.c
+++ clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -10,9 +10,9 @@
 // CHECK: llvm-link{{.*}}"-o" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc"
 // CHECK: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o"
 // CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}.out" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o"
-// CHECK: clang-offload-wrapper{{.*}}"-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a-{{.*}}.bc" {{.*}}amdgpu-openmp-toolchain-{{.*}}.out"
-// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
-// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"
+// CHECK: clang-offload-wrapper{{.*}}" "-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a_{{.*}}.bc" "--requirements=gfx906" "{{.*}}amdgpu-openmp-toolchain-{{.*}}.out"
+// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a_{{.*}}.o" "-x" "ir" "{{.*}}a_{{.*}}.bc"
+// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a_{{.*}}.o" "-lomp" "-lomptarget"
 
 // RUN:   %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PHASES %s
@@ -26,14 +26,12 @@
 // CHECK-PHASES: 6: preprocessor, {5}, cpp-output, (device-openmp)
 // CHECK-PHASES: 7: compiler, {6}, ir, (device-openmp)
 // CHECK-PHASES: 8: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa)" {7}, ir
-// CHECK-PHASES: 9: backend, {8}, assembler, (device-openmp)
-// CHECK-PHASES: 10: assembler, {9}, object, (device-openmp)
-// CHECK-PHASES: 11: linker, {10}, image, (device-openmp)
-// CHECK-PHASES: 12: offload, "device-openmp (amdgcn-amd-amdhsa)" {11}, image
-// CHECK-PHASES: 13: clang-offload-wrapper, {12}, ir, (host-openmp)
-// CHECK-PHASES: 14: backend, {13}, assembler, (host-openmp)
-// CHECK-PHASES: 15: assembler, {14}, object, (host-openmp)
-// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)
+// CHECK-PHASES: 9: linker, {8}, image, (device-openmp)
+// CHECK-PHASES: 10: offload, "device-openmp (amdgcn-amd-amdhsa)" {9}, image
+// CHECK-PHASES: 11: clang-offload-wrapper, {10}, ir, (host-openmp)
+// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp)
+// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp)
+// CHECK-PHASES: 14: linker, {4, 13}, image, (host-openmp)
 
 // handling of --libomptarget-amdgcn-bc-path
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
@@ -73,4 +71,4 @@
 // CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"
 
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
-// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
+// CHECK-EMIT-LLVM-IR: clang{{.*}}" "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
Index: clang/test/Driver/amdgpu-openmp-system-arch-fail.c
===================================================================
--- clang/test/Driver/amdgpu-openmp-system-arch-fail.c
+++ clang/test/Driver/amdgpu-openmp-system-arch-fail.c
@@ -15,14 +15,9 @@
 // case when amdgpu_arch returns nothing or fails
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture{{.*}}Exited with error code 1. Consider passing it via --march
-
-// case when amdgpu_arch returns multiple gpus but all are different
-// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_different %s 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=MULTIPLE-OUTPUT-ERROR
-// MULTIPLE-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture: Multiple AMD GPUs found with different archs. Consider passing it via --march
+// NO-OUTPUT-ERROR: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march=
 
 // case when amdgpu_arch does not return anything with successful execution
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=EMPTY-OUTPUT
-// EMPTY-OUTPUT: error: Cannot determine AMDGPU architecture: No AMD GPU detected in the system. Consider passing it via --march
+// EMPTY-OUTPUT: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march=
Index: clang/lib/Driver/ToolChains/Cuda.h
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.h
+++ clang/lib/Driver/ToolChains/Cuda.h
@@ -134,6 +134,10 @@
                 const ToolChain &HostTC, const llvm::opt::ArgList &Args,
                 const Action::OffloadKind OK);
 
+  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                const Action::OffloadKind OK, const std::string OffloadArch);
+
   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
   }
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -404,6 +404,8 @@
   // flag or the default value.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
     GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
+    if (GPUArchName.empty())
+      GPUArchName = TC.getOffloadArch();
     assert(!GPUArchName.empty() && "Must have an architecture passed in.");
   } else
     GPUArchName = JA.getOffloadingArch();
@@ -597,6 +599,9 @@
 
   StringRef GPUArch =
       Args.getLastArgValue(options::OPT_march_EQ);
+  if (GPUArch.empty())
+    GPUArch = getToolChain().getOffloadArch();
+
   assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");
 
   CmdArgs.push_back("-arch");
@@ -659,6 +664,22 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args,
+                             const Action::OffloadKind OK,
+                             const std::string OffloadArch)
+    : ToolChain(D, Triple, Args), HostTC(HostTC),
+      CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
+  if (CudaInstallation.isValid()) {
+    CudaInstallation.WarnIfUnsupportedVersion();
+    getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
+  }
+  // Look up binaries in the driver directory; this is used to
+  // discover the clang-offload-bundler executable.
+  getProgramPaths().push_back(getDriver().Dir);
+  setOffloadArch(OffloadArch);
+}
+
 std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
   // Only object files are changed, for example assembly files keep their .s
   // extensions. CUDA also continues to use .o as they don't use nvlink but
@@ -680,6 +701,8 @@
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
   StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+  if (GpuArch.empty())
+    GpuArch = getOffloadArch();
   assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
   assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
           DeviceOffloadingKind == Action::OFK_Cuda) &&
@@ -844,6 +867,8 @@
     }
 
     StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
+    if (Arch.empty())
+      Arch = getOffloadArch();
     if (Arch.empty())
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                         CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -6659,20 +6659,32 @@
   }
 
   // For all the host OpenMP offloading compile jobs we need to pass the targets
-  // information using -fopenmp-targets= option.
+  // information using `-fopenmp-targets=` option.
   if (JA.isHostOffloading(Action::OFK_OpenMP)) {
     SmallString<128> TargetInfo("-fopenmp-targets=");
 
     Arg *Tgts = Args.getLastArg(options::OPT_fopenmp_targets_EQ);
-    assert(Tgts && Tgts->getNumValues() &&
-           "OpenMP offloading has to have targets specified.");
-    for (unsigned i = 0; i < Tgts->getNumValues(); ++i) {
-      if (i)
-        TargetInfo += ',';
-      // We need to get the string from the triple because it may be not exactly
-      // the same as the one we get directly from the arguments.
-      llvm::Triple T(Tgts->getValue(i));
-      TargetInfo += T.getTriple();
+    // Device toolchains are the fallback source for the list of triples.
+    auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
+
+    if (Tgts && Tgts->getNumValues()) {
+      for (unsigned i = 0; i < Tgts->getNumValues(); ++i) {
+        if (i)
+          TargetInfo += ',';
+        // We need to get the string from the triple because it may be not
+        // exactly the same as the one we get directly from the arguments.
+        llvm::Triple T(Tgts->getValue(i));
+        TargetInfo += T.getTriple();
+      }
+    } else if (OpenMPTCRange.first != OpenMPTCRange.second) {
+      for (auto TI = OpenMPTCRange.first; TI != OpenMPTCRange.second; ++TI) {
+        if (TI != OpenMPTCRange.first)
+          TargetInfo += ',';
+        TargetInfo += TI->second->getTriple().str();
+      }
+    } else {
+      assert(false && "OpenMP offloading requires target devices, use "
+                      "`-fopenmp-targets=`");
+    }
     CmdArgs.push_back(Args.MakeArgString(TargetInfo.str()));
   }
@@ -7668,18 +7680,17 @@
       });
     }
     Triples += Action::GetOffloadKindName(CurKind);
-    Triples += "-";
-    std::string NormalizedTriple = CurTC->getTriple().normalize();
-    Triples += NormalizedTriple;
-
-    if (CurDep->getOffloadingArch() != nullptr) {
-      // If OffloadArch is present it can only appear as the 6th hypen
-      // sepearated field of Bundle Entry ID. So, pad required number of
-      // hyphens in Triple.
-      for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--)
-        Triples += "-";
+    Triples += '-';
+    Triples += CurTC->getTriple().normalize();
+    if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) &&
+        CurDep->getOffloadingArch()) {
+      Triples += '-';
       Triples += CurDep->getOffloadingArch();
     }
+    if (CurKind == Action::OFK_OpenMP && !CurTC->getOffloadArch().empty()) {
+      Triples += '-';
+      Triples += CurTC->getOffloadArch();
+    }
   }
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));
 
@@ -7711,7 +7722,7 @@
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::None(),
       TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
-      CmdArgs, None, Output));
+      CmdArgs, Inputs, Output));
 }
 
 void OffloadBundler::ConstructJobMultipleOutputs(
@@ -7746,20 +7757,21 @@
       Triples += ',';
 
     auto &Dep = DepInfo[I];
-    Triples += Action::GetOffloadKindName(Dep.DependentOffloadKind);
-    Triples += "-";
-    std::string NormalizedTriple =
-        Dep.DependentToolChain->getTriple().normalize();
-    Triples += NormalizedTriple;
-
-    if (!Dep.DependentBoundArch.empty()) {
-      // If OffloadArch is present it can only appear as the 6th hypen
-      // sepearated field of Bundle Entry ID. So, pad required number of
-      // hyphens in Triple.
-      for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--)
-        Triples += "-";
+    auto OffloadKind = Dep.DependentOffloadKind;
+    Triples += Action::GetOffloadKindName(OffloadKind);
+    Triples += '-';
+    Triples += Dep.DependentToolChain->getTriple().normalize();
+    if ((Dep.DependentOffloadKind == Action::OFK_HIP ||
+         Dep.DependentOffloadKind == Action::OFK_Cuda) &&
+        !Dep.DependentBoundArch.empty()) {
+      Triples += '-';
       Triples += Dep.DependentBoundArch;
     }
+    if (OffloadKind == Action::OFK_OpenMP &&
+        !Dep.DependentToolChain->getOffloadArch().empty()) {
+      Triples += '-';
+      Triples += Dep.DependentToolChain->getOffloadArch();
+    }
   }
 
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));
@@ -7805,9 +7817,30 @@
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
-  // Add inputs.
+  auto TCs = C.getOffloadToolChains<Action::OFK_OpenMP>();
+
+  // Add runtime requirements on each image which includes the offload-arch
+  auto II = TCs.first;
   for (const InputInfo &I : Inputs) {
     assert(I.isFilename() && "Invalid input.");
+    if (I.getAction()) {
+      // Inputs with an action are paired, in order, with the OpenMP toolchains.
+      assert(II != TCs.second && "More images than OpenMP toolchains.");
+      auto TC = (II++)->second;
+      std::string requirements("--requirements=");
+      requirements.append(TC->getOffloadArch());
+      // targetid could have user specified features such as :xnack-:sramecc+
+      // so replace ":" with "__" in requirements used for
+      // clang-offload-wrapper.
+      size_t start_pos = 0;
+      while ((start_pos = requirements.find(":", start_pos)) !=
+             std::string::npos) {
+        requirements.replace(start_pos, 1, "__");
+        start_pos += 2;
+      }
+      // FIXME: Add other architecture requirements here
+      CmdArgs.push_back(Args.MakeArgString(requirements.c_str()));
+    }
     CmdArgs.push_back(I.getFilename());
   }
 
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -68,6 +68,10 @@
                         const ToolChain &HostTC,
                         const llvm::opt::ArgList &Args);
 
+  AMDGPUOpenMPToolChain(const Driver &D, const llvm::Triple &Triple,
+                        const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                        const std::string OffloadArch);
+
   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
   }
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -166,7 +166,7 @@
   const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC =
       static_cast<const toolchains::AMDGPUOpenMPToolChain &>(TC);
 
-  std::string GPUArch = Args.getLastArgValue(options::OPT_march_EQ).str();
+  std::string GPUArch = AMDGPUOpenMPTC.getOffloadArch();
   if (GPUArch.empty()) {
     if (!checkSystemForAMDGPU(Args, AMDGPUOpenMPTC, GPUArch))
       return;
@@ -202,12 +202,21 @@
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(const Driver &D,
+                                             const llvm::Triple &Triple,
+                                             const ToolChain &HostTC,
+                                             const ArgList &Args,
+                                             const std::string OffloadArch)
+    : ROCMToolChain(D, Triple, Args), HostTC(HostTC) {
+  getProgramPaths().push_back(getDriver().Dir);
+  setOffloadArch(OffloadArch);
+}
+
 void AMDGPUOpenMPToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
-
-  std::string GPUArch = DriverArgs.getLastArgValue(options::OPT_march_EQ).str();
+  std::string GPUArch = getOffloadArch();
   if (GPUArch.empty()) {
     if (!checkSystemForAMDGPU(DriverArgs, *this, GPUArch))
       return;
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -678,6 +678,38 @@
   return RT;
 }
 
+/// Collect the offload targets requested through
+/// `-Xopenmp-target=<triple> -march=<arch>`. Every valid pair is inserted
+/// into \p OffloadArchs as "<normalized-triple>^<arch>", the key used to
+/// create one device toolchain per triple/arch combination.
+/// \returns false after emitting a diagnostic if an arch is not recognized.
+static bool GetTargetInfoFromMArch(Compilation &C,
+                                   std::set<std::string> &OffloadArchs) {
+  for (Arg *A : C.getInputArgs()) {
+    if (!A->getOption().matches(options::OPT_Xopenmp_target_EQ))
+      continue;
+    A->claim();
+    for (StringRef VStr : A->getValues()) {
+      if (!VStr.startswith("-march=") && !VStr.startswith("--march="))
+        continue;
+      StringRef OpenMPTargetArch = VStr.split('=').second;
+      if (StringToCudaArch(OpenMPTargetArch) == CudaArch::UNKNOWN) {
+        C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch)
+            << OpenMPTargetArch;
+        C.setContainsError();
+        return false;
+      }
+      // The first value of -Xopenmp-target=<triple> is the target triple.
+      StringRef OpenMPTargetTriple = A->getValue(0);
+      if (OpenMPTargetTriple.empty())
+        continue;
+      OffloadArchs.insert(llvm::Triple(OpenMPTargetTriple).normalize() + "^" +
+                          OpenMPTargetArch.str());
+    }
+  }
+  return true;
+}
+
 void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                                               InputList &Inputs) {
 
@@ -729,17 +761,58 @@
           *this, HIPTriple, *HostTC, C.getInputArgs());
     }
     C.addOffloadDeviceToolChain(HIPTC.get(), OFK);
-  }
+  } else {
+    //
+    // OpenMP
+    //
 
-  //
-  // OpenMP
-  //
-  // We need to generate an OpenMP toolchain if the user specified targets with
-  // the -fopenmp-targets option.
-  if (Arg *OpenMPTargets =
-          C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
-    if (OpenMPTargets->getNumValues()) {
-      // We expect that -fopenmp-targets is always used in conjunction with the
+    std::set<std::string> OffloadArchs;
+
+    if (Arg *OpenMPTargets =
+            C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
+
+      if (!OpenMPTargets->getNumValues()) {
+        Diag(clang::diag::warn_drv_empty_joined_argument)
+            << OpenMPTargets->getAsString(C.getInputArgs());
+        return;
+      }
+
+      // First, handle errors in command line for OpenMP target offload
+      bool is_host_offloading =
+          (OpenMPTargets->getNumValues() == 1) &&
+          StringRef(OpenMPTargets->getValue())
+              .startswith_insensitive(
+                  C.getSingleOffloadToolChain<Action::OFK_Host>()
+                      ->getTriple()
+                      .getArchName());
+      if (!is_host_offloading) {
+        // Ensure the last -Xopenmp-target option provides a GPU -march
+        if (Arg *XOpenMPTargets =
+                C.getInputArgs().getLastArg(options::OPT_Xopenmp_target_EQ)) {
+          bool has_valid_march = false;
+          for (auto *V : XOpenMPTargets->getValues())
+            if (StringRef(V).startswith("-march=") ||
+                StringRef(V).startswith("--march="))
+              has_valid_march = true;
+          if (!has_valid_march) {
+            Diag(diag::err_drv_missing_Xopenmptarget_or_march);
+            return;
+          }
+        } else {
+          Diag(diag::err_drv_missing_Xopenmptarget_or_march);
+          return;
+        }
+      }
+
+      // Process legacy options -fopenmp-targets, -Xopenmp-target and -march
+      auto status = GetTargetInfoFromMArch(C, OffloadArchs);
+      if (!status)
+        return;
+    }
+
+    if (!OffloadArchs.empty()) {
+
+      // We expect that an offload target is always used in conjunction with
       // option -fopenmp specifying a valid runtime with offloading support,
       // i.e. libomp or libiomp.
       bool HasValidOpenMPRuntime = C.getInputArgs().hasFlag(
@@ -750,61 +823,65 @@
         HasValidOpenMPRuntime =
             OpenMPKind == OMPRT_OMP || OpenMPKind == OMPRT_IOMP5;
       }
+      if (!HasValidOpenMPRuntime) {
+        Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets);
+        return;
+      }
 
-      if (HasValidOpenMPRuntime) {
-        llvm::StringMap<const char *> FoundNormalizedTriples;
-        for (const char *Val : OpenMPTargets->getValues()) {
-          llvm::Triple TT(Val);
-          std::string NormalizedName = TT.normalize();
-
-          // Make sure we don't have a duplicate triple.
-          auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
-          if (Duplicate != FoundNormalizedTriples.end()) {
-            Diag(clang::diag::warn_drv_omp_offload_target_duplicate)
-                << Val << Duplicate->second;
-            continue;
-          }
+      llvm::StringMap<const char *> FoundNormalizedTriples;
+      for (auto &Target : OffloadArchs) {
+        size_t Loc = Target.find('^');
+        std::string TripleStr = Target.substr(0, Loc);
+        std::string OpenMPTargetArch = Target.substr(Loc + 1);
+        llvm::Triple TT(TripleStr);
+        std::string NormalizedName = Target;
+
+        // Make sure we don't have a duplicate triple/arch key.
+        auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
+        if (Duplicate != FoundNormalizedTriples.end()) {
+          Diag(clang::diag::warn_drv_omp_offload_target_duplicate)
+              << NormalizedName << Duplicate->second;
+          continue;
+        }
+
+        // Remember this key for later iterations. Store the set element's
+        // buffer: NormalizedName is a local copy that dies every iteration.
+        FoundNormalizedTriples[NormalizedName] = Target.c_str();
 
-          // Store the current triple so that we can check for duplicates in the
-          // following iterations.
-          FoundNormalizedTriples[NormalizedName] = Val;
-
-          // If the specified target is invalid, emit a diagnostic.
-          if (TT.getArch() == llvm::Triple::UnknownArch)
-            Diag(clang::diag::err_drv_invalid_omp_target) << Val;
-          else {
-            const ToolChain *TC;
-            // Device toolchains have to be selected differently. They pair host
-            // and device in their implementation.
-            if (TT.isNVPTX() || TT.isAMDGCN()) {
-              const ToolChain *HostTC =
-                  C.getSingleOffloadToolChain<Action::OFK_Host>();
-              assert(HostTC && "Host toolchain should be always defined.");
-              auto &DeviceTC =
-                  ToolChains[TT.str() + "/" + HostTC->getTriple().normalize()];
-              if (!DeviceTC) {
-                if (TT.isNVPTX())
-                  DeviceTC = std::make_unique<toolchains::CudaToolChain>(
-                      *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP);
-                else if (TT.isAMDGCN())
-                  DeviceTC =
-                      std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
-                          *this, TT, *HostTC, C.getInputArgs());
-                else
-                  assert(DeviceTC && "Device toolchain not defined.");
-              }
-
-              TC = DeviceTC.get();
-            } else
-              TC = &getToolChain(C.getInputArgs(), TT);
-            C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP);
+        // If the specified target is invalid, emit a diagnostic.
+        if (TT.getArch() == llvm::Triple::UnknownArch) {
+          Diag(clang::diag::err_drv_invalid_omp_target) << NormalizedName;
+          return;
+        }
+
+        const ToolChain *TC;
+        // Device toolchains have to be selected differently. They pair host
+        // and device in their implementation.
+        if (TT.isNVPTX() || TT.isAMDGCN()) {
+          const ToolChain *HostTC =
+              C.getSingleOffloadToolChain<Action::OFK_Host>();
+          assert(HostTC && "Host toolchain should be always defined.");
+          auto &DeviceTC = ToolChains[NormalizedName + "/" +
+                                      HostTC->getTriple().normalize()];
+          if (!DeviceTC) {
+            if (TT.isNVPTX())
+              DeviceTC = std::make_unique<toolchains::CudaToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP,
+                  OpenMPTargetArch);
+            else if (TT.isAMDGCN())
+              DeviceTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs(), OpenMPTargetArch);
+            else
+              assert(DeviceTC && "Device toolchain not defined.");
           }
+          TC = DeviceTC.get();
+        } else {
+          TC = &getToolChain(C.getInputArgs(), TT);
         }
-      } else
-        Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets);
-    } else
-      Diag(clang::diag::warn_drv_empty_joined_argument)
-          << OpenMPTargets->getAsString(C.getInputArgs());
+        // Register an offload toolchain for each unique triple/arch pair.
+        C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP);
+      } // end foreach openmp target
+    }   // end has openmp offload targets
   }
 
   //
@@ -2406,6 +2483,19 @@
       ABRT_Ignore_Host,
     };
 
+    /// ID to identify each device compilation. For CUDA it is simply the
+    /// GPU arch string. For HIP it is either the GPU arch string or GPU
+    /// arch string plus feature strings delimited by a plus sign, e.g.
+    /// gfx906+xnack.
+    struct TargetID {
+      /// Target ID string which is persistent throughout the compilation.
+      const char *ID;
+      TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); }
+      TargetID(const char *ID) : ID(ID) {}
+      operator const char *() { return ID; }
+      operator StringRef() { return StringRef(ID); }
+    };
+
   protected:
     /// Compilation associated with this builder.
     Compilation &C;
@@ -2487,18 +2577,6 @@
     bool EmitLLVM = false;
     bool EmitAsm = false;
 
-    /// ID to identify each device compilation. For CUDA it is simply the
-    /// GPU arch string. For HIP it is either the GPU arch string or GPU
-    /// arch string plus feature strings delimited by a plus sign, e.g.
-    /// gfx906+xnack.
-    struct TargetID {
-      /// Target ID string which is persistent throughout the compilation.
-      const char *ID;
-      TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); }
-      TargetID(const char *ID) : ID(ID) {}
-      operator const char *() { return ID; }
-      operator StringRef() { return StringRef(ID); }
-    };
     /// List of GPU architectures to use in this compilation.
     SmallVector<TargetID, 4> GpuArchList;
 
@@ -3121,6 +3199,12 @@
     /// The OpenMP actions for the current input.
     ActionList OpenMPDeviceActions;
 
+    bool CompileHostOnly = false;
+    bool CompileDeviceOnly = false;
+
+    /// List of GPU architectures to use in this compilation.
+    SmallVector<TargetID, 4> GpuArchList;
+
     /// The linker inputs obtained for each toolchain.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
 
@@ -3154,14 +3238,26 @@
         // We passed the device action as a host dependence, so we don't need to
         // do anything else with them.
         OpenMPDeviceActions.clear();
-        return ABRT_Success;
+        // Device-only compilation reports ABRT_Ignore_Host to the caller.
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
       }
 
+      bool LastActionIsCompile = false;
       // By default, we produce an action for each device arch.
-      for (Action *&A : OpenMPDeviceActions)
-        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
-
-      return ABRT_Success;
+      for (unsigned I = 0; I < ToolChains.size(); ++I) {
+        Action *&A = OpenMPDeviceActions[I];
+        // AMDGPU does not support linking of object files, so we skip
+        // assemble and backend actions to produce LLVM IR.
+        if (ToolChains[I]->getTriple().isAMDGCN() &&
+            (CurPhase == phases::Assemble || CurPhase == phases::Backend))
+          continue;
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
+                                               Action::OFK_OpenMP);
+        LastActionIsCompile =
+            (A->getKind() == Action::ActionClass::CompileJobClass);
+      }
+      return (CompileDeviceOnly && LastActionIsCompile) ? ABRT_Ignore_Host
+                                                        : ABRT_Success;
     }
 
     ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
@@ -3169,9 +3265,15 @@
       // If this is an input action replicate it for each OpenMP toolchain.
       if (auto *IA = dyn_cast<InputAction>(HostAction)) {
         OpenMPDeviceActions.clear();
-        for (unsigned I = 0; I < ToolChains.size(); ++I)
-          OpenMPDeviceActions.push_back(
-              C.MakeAction<InputAction>(IA->getInputArg(), IA->getType()));
+        // Only process input actions for files that have extensions
+        std::string FileName = IA->getInputArg().getAsString(Args);
+        if (!llvm::sys::path::has_extension(FileName)) {
+          return ABRT_Inactive;
+        }
+        for (unsigned I = 0; I < ToolChains.size(); ++I) {
+          OpenMPDeviceActions.push_back(C.MakeAction<InputAction>(
+              IA->getInputArg(), IA->getType(), GpuArchList[I].ID));
+        }
         return ABRT_Success;
       }
 
@@ -3191,8 +3293,9 @@
           return ABRT_Inactive;
         for (unsigned I = 0; I < ToolChains.size(); ++I) {
           OpenMPDeviceActions.push_back(UA);
-          UA->registerDependentActionInfo(
-              ToolChains[I], /*BoundArch=*/StringRef(), Action::OFK_OpenMP);
+          UA->registerDependentActionInfo(ToolChains[I],
+                                          /*BoundArch=*/GpuArchList[I].ID,
+                                          Action::OFK_OpenMP);
         }
         return ABRT_Success;
       }
@@ -3209,10 +3312,11 @@
             *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
             /*BoundArch=*/nullptr, Action::OFK_OpenMP);
         auto TC = ToolChains.begin();
+        unsigned arch_count = 0;
         for (Action *&A : OpenMPDeviceActions) {
           assert(isa<CompileJobAction>(A));
           OffloadAction::DeviceDependences DDep;
-          DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+          DDep.add(*A, **TC, GpuArchList[arch_count++].ID, Action::OFK_OpenMP);
           A = C.MakeAction<OffloadAction>(HDep, DDep);
           ++TC;
         }
@@ -3228,11 +3332,13 @@
       assert(OpenMPDeviceActions.size() == ToolChains.size() &&
              "Number of OpenMP actions and toolchains do not match.");
 
+      unsigned arch_count = 0;
       // Append all device actions followed by the proper offload action.
       auto TI = ToolChains.begin();
       for (auto *A : OpenMPDeviceActions) {
         OffloadAction::DeviceDependences Dep;
-        Dep.add(*A, **TI, /*BoundArch=*/nullptr, Action::OFK_OpenMP);
+        Dep.add(*A, **TI, /*BoundArch=*/GpuArchList[arch_count++].ID,
+                Action::OFK_OpenMP);
         AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
         ++TI;
       }
@@ -3243,17 +3349,17 @@
     void appendLinkDeviceActions(ActionList &AL) override {
       assert(ToolChains.size() == DeviceLinkerInputs.size() &&
              "Toolchains and linker inputs sizes do not match.");
-
       // Append a new link action for each device.
       auto TC = ToolChains.begin();
+      unsigned arch_count = 0;
       for (auto &LI : DeviceLinkerInputs) {
         auto *DeviceLinkAction =
             C.MakeAction<LinkJobAction>(LI, types::TY_Image);
         OffloadAction::DeviceDependences DeviceLinkDeps;
-        DeviceLinkDeps.add(*DeviceLinkAction, **TC, /*BoundArch=*/nullptr,
-		        Action::OFK_OpenMP);
+        DeviceLinkDeps.add(*DeviceLinkAction, **TC,
+                           GpuArchList[arch_count++].ID, Action::OFK_OpenMP);
         AL.push_back(C.MakeAction<OffloadAction>(DeviceLinkDeps,
-            DeviceLinkAction->getType()));
+                                                 DeviceLinkAction->getType()));
         ++TC;
       }
       DeviceLinkerInputs.clear();
@@ -3270,12 +3376,21 @@
     void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {}
 
     bool initialize() override {
+      if (Arg *cu_dev_only =
+              C.getInputArgs().getLastArg(options::OPT_cuda_device_only)) {
+        cu_dev_only->claim();
+        CompileDeviceOnly = true;
+        // TODO: Check emitting IR for OpenMP when cuda-device-only is set
+      }
       // Get the OpenMP toolchains. If we don't get any, the action builder will
       // know there is nothing to do related to OpenMP offloading.
       auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
       for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE;
-           ++TI)
+           ++TI) {
+        GpuArchList.push_back(
+            TI->second->getTriple().getEnvironmentName().data());
         ToolChains.push_back(TI->second);
+      }
 
       DeviceLinkerInputs.resize(ToolChains.size());
       return false;
@@ -4593,6 +4708,7 @@
     OA->doOnEachDependence(
         /*IsHostDependence=*/BuildingForOffloadDevice,
         [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+
           OffloadDependencesInputInfo.push_back(BuildJobsForAction(
               C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
@@ -4645,25 +4761,6 @@
   if (!T)
     return InputInfo();
 
-  if (BuildingForOffloadDevice &&
-      A->getOffloadingDeviceKind() == Action::OFK_OpenMP) {
-    if (TC->getTriple().isAMDGCN()) {
-      // AMDGCN treats backend and assemble actions as no-op because
-      // linker does not support object files.
-      if (const BackendJobAction *BA = dyn_cast<BackendJobAction>(A)) {
-        return BuildJobsForAction(C, *BA->input_begin(), TC, BoundArch,
-                                  AtTopLevel, MultipleArchs, LinkingOutput,
-                                  CachedResults, TargetDeviceOffloadKind);
-      }
-
-      if (const AssembleJobAction *AA = dyn_cast<AssembleJobAction>(A)) {
-        return BuildJobsForAction(C, *AA->input_begin(), TC, BoundArch,
-                                  AtTopLevel, MultipleArchs, LinkingOutput,
-                                  CachedResults, TargetDeviceOffloadKind);
-      }
-    }
-  }
-
   // If we've collapsed action list that contained OffloadAction we
   // need to build jobs for host/device-side inputs it may have held.
   for (const auto *OA : CollapsedOffloadActions)
@@ -4747,17 +4844,23 @@
                                  UI.DependentOffloadKind == Action::OFK_HIP,
                              OffloadingPrefix),
           BaseInput);
+      if (UI.DependentOffloadKind == Action::OFK_Host &&
+          llvm::sys::path::extension(InputInfos[0].getFilename()) == ".a")
+        CurI = InputInfos[0];
       // Save the unbundling result.
       UnbundlingResults.push_back(CurI);
 
       // Get the unique string identifier for this dependence and cache the
       // result.
       StringRef Arch;
-      if (TargetDeviceOffloadKind == Action::OFK_HIP) {
+      if (TargetDeviceOffloadKind == Action::OFK_HIP ||
+          TargetDeviceOffloadKind == Action::OFK_OpenMP) {
         if (UI.DependentOffloadKind == Action::OFK_Host)
           Arch = StringRef();
-        else
+        else if (TargetDeviceOffloadKind == Action::OFK_HIP)
           Arch = UI.DependentBoundArch;
+        else if (TargetDeviceOffloadKind == Action::OFK_OpenMP)
+          Arch = UI.DependentToolChain->getOffloadArch();
       } else
         Arch = BoundArch;
 
@@ -4787,8 +4890,9 @@
         BaseInput = FinalOutput->getValue();
       else
         BaseInput = getDefaultImageName();
-      BaseInput =
-          C.getArgs().MakeArgString(std::string(BaseInput) + "-wrapper");
+      std::string BaseNm = std::string(BaseInput);
+      std::replace(BaseNm.begin(), BaseNm.end(), '.', '_');
+      BaseInput = C.getArgs().MakeArgString(BaseNm + "-wrapper");
     }
     Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
                                              AtTopLevel, MultipleArchs,
Index: clang/lib/Driver/Action.cpp
===================================================================
--- clang/lib/Driver/Action.cpp
+++ clang/lib/Driver/Action.cpp
@@ -206,11 +206,23 @@
                              const DeviceDependences &DDeps)
     : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
       DevToolChains(DDeps.getToolChains()) {
-  // We use the kinds of the host dependence for this action.
-  OffloadingArch = HDep.getBoundArch();
+  auto &OKinds = DDeps.getOffloadKinds();
+  auto &BArchs = DDeps.getBoundArchs();
+
+  // If all inputs agree on the same kind, use it also for this action.
+  if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+    OffloadingDeviceKind = OKinds.front();
+
+  // If we have a single dependency, inherit the architecture from it.
+  if (OKinds.size() == 1)
+    OffloadingArch = BArchs.front();
+  else
+    // Otherwise, fall back to the host dependence's bound architecture.
+    OffloadingArch = HDep.getBoundArch();
+
   ActiveOffloadKindMask = HDep.getOffloadKinds();
   HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
-                                             HDep.getBoundArch());
+                                             OffloadingArch);
 
   // Add device inputs and propagate info to the device actions. Do work only if
   // we have dependencies.
Index: clang/include/clang/Driver/ToolChain.h
===================================================================
--- clang/include/clang/Driver/ToolChain.h
+++ clang/include/clang/Driver/ToolChain.h
@@ -170,6 +170,9 @@
   mutable llvm::Optional<RuntimeLibType> runtimeLibType;
   mutable llvm::Optional<UnwindLibType> unwindLibType;
 
+  // OpenMP creates a toolchain for each target arch, e.g. gfx908.
+  std::string OffloadArch;
+
 protected:
   MultilibSet Multilibs;
   Multilib SelectedMultilib;
@@ -246,6 +249,12 @@
     return EffectiveTriple;
   }
 
+  const std::string &getOffloadArch() const { return OffloadArch; }
+
+  void setOffloadArch(std::string OffloadArch) {
+    this->OffloadArch = std::move(OffloadArch);
+  }
+
   path_list &getLibraryPaths() { return LibraryPaths; }
   const path_list &getLibraryPaths() const { return LibraryPaths; }
 
Index: clang/include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -121,6 +121,9 @@
   "invalid argument in '%0', only integer or 'auto' is supported">;
 def err_drv_missing_argument : Error<
   "argument to '%0' is missing (expected %1 value%s1)">;
+def err_drv_missing_Xopenmptarget_or_march: Error<
+  "The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march= .">,
+  DefaultFatal;
 def err_drv_invalid_Xarch_argument_with_args : Error<
   "invalid Xarch argument: '%0', options requiring arguments are unsupported">;
 def err_drv_Xopenmp_target_missing_triple : Error<
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to