yaxunl updated this revision to Diff 342182.
yaxunl edited the summary of this revision.
yaxunl added a comment.
Herald added a subscriber: dang.

added option -fhip-bundle-device-output


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D101630/new/

https://reviews.llvm.org/D101630

Files:
  clang/include/clang/Driver/Options.td
  clang/lib/Driver/Driver.cpp
  clang/test/Driver/clang-offload-bundler.c
  clang/test/Driver/hip-device-compile.hip
  clang/test/Driver/hip-output-file-name.hip
  clang/test/Driver/hip-phases.hip
  clang/test/Driver/hip-rdc-device-only.hip
  clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
===================================================================
--- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -117,6 +117,9 @@
 /// The index of the host input in the list of inputs.
 static unsigned HostInputIndex = ~0u;
 
+/// Whether not having host target is allowed.
+static bool AllowNoHost = false;
+
 /// Path to the current binary.
 static std::string BundlerExecutable;
 
@@ -857,9 +860,10 @@
   }
 
   // Get the file handler. We use the host buffer as reference.
-  assert(HostInputIndex != ~0u && "Host input index undefined??");
+  assert((HostInputIndex != ~0u || AllowNoHost) &&
+         "Host input index undefined??");
   Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
-      CreateFileHandler(*InputBuffers[HostInputIndex]);
+      CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]);
   if (!FileHandlerOrErr)
     return FileHandlerOrErr.takeError();
 
@@ -1126,6 +1130,7 @@
   // have exactly one host target.
   unsigned Index = 0u;
   unsigned HostTargetNum = 0u;
+  bool HIPOnly = true;
   llvm::DenseSet<StringRef> ParsedTargets;
   for (StringRef Target : TargetNames) {
     if (ParsedTargets.contains(Target)) {
@@ -1167,12 +1172,21 @@
       HostInputIndex = Index;
     }
 
+    if (Kind != "hip" && Kind != "hipv4")
+      HIPOnly = false;
+
     ++Index;
   }
 
+  // HIP uses clang-offload-bundler to bundle device-only compilation results
+  // for multiple GPU archs, therefore allow no host target if all entries
+  // are for HIP.
+  AllowNoHost = HIPOnly;
+
   // Host triple is not really needed for unbundling operation, so do not
   // treat missing host triple as error if we do unbundling.
-  if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) {
+  if ((Unbundle && HostTargetNum > 1) ||
+      (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) {
     reportError(createStringError(errc::invalid_argument,
                                   "expecting exactly one host target but got " +
                                       Twine(HostTargetNum)));
Index: clang/test/Driver/hip-rdc-device-only.hip
===================================================================
--- clang/test/Driver/hip-rdc-device-only.hip
+++ clang/test/Driver/hip-rdc-device-only.hip
@@ -56,8 +56,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // CHECK-SAME: {{.*}} {{".*a.cu"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -69,10 +69,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*a.cu"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // EMITBC-SAME: "-emit-llvm-bc"
@@ -82,8 +86,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -95,10 +99,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
@@ -125,6 +133,10 @@
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]]
 
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll"
+
 // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
@@ -150,3 +162,7 @@
 // SAVETEMP-SAME: "-emit-llvm"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]]
+
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll"
Index: clang/test/Driver/hip-phases.hip
===================================================================
--- clang/test/Driver/hip-phases.hip
+++ clang/test/Driver/hip-phases.hip
@@ -231,13 +231,14 @@
 // compilation mode.
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S -fno-hip-bundle-device-output 2>&1 \
 // RUN: | FileCheck -check-prefixes=DASM %s
 // DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
 // DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler
+// DASM-NOT: clang-offload-bundler
 // DASM-NOT: host
 
 //
@@ -282,6 +283,7 @@
 // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler
+// DASM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, )
 // DASM2-NOT: host
 
 //
@@ -312,3 +314,88 @@
 
 // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
 // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])
+
+// Test one gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -E -fno-hip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE %s
+
+// Test two gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -E 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE2 %s
+
+// Test one gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -c -emit-llvm -fno-hip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM2 %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode with bundled preprocessor expansion as input.
+//
+// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPELLVM2 %s
+
+// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE-NOT: clang-offload-bundler
+// PPE-NOT: host
+
+// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output
+// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output
+// PPE2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, )
+// PPE2-NOT: host
+
+// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-NOT: clang-offload-bundler
+// LLVM-NOT: host
+
+// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// LLVM2-NOT: host
+
+// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output
+// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output
+// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]])
+// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]])
+// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// PPELLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// PPELLVM2-NOT: host
Index: clang/test/Driver/hip-output-file-name.hip
===================================================================
--- clang/test/Driver/hip-output-file-name.hip
+++ clang/test/Driver/hip-output-file-name.hip
@@ -9,6 +9,16 @@
 // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o"
 
 // Check -E default output is "-" (stdout).
+// If there are multiple preprocessor expansion outputs clang-offload-bundler
+// is used to bundle the final output.
+
+// RUN: %clang -### -E -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
+
+// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
 // RUN: %clang -### -E -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
@@ -20,7 +30,7 @@
 
 // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
-// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
 // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
@@ -41,7 +51,7 @@
 
 // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
-// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s
+// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s
 
 // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
Index: clang/test/Driver/hip-device-compile.hip
===================================================================
--- clang/test/Driver/hip-device-compile.hip
+++ clang/test/Driver/hip-device-compile.hip
@@ -6,21 +6,21 @@
 // the output should not be bundled.
 
 // RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC %s
 
 // RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL %s
 
 // RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
Index: clang/test/Driver/clang-offload-bundler.c
===================================================================
--- clang/test/Driver/clang-offload-bundler.c
+++ clang/test/Driver/clang-offload-bundler.c
@@ -362,6 +362,21 @@
 // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu
 // CKLST2-NOT: openmp-x86_64-pc-linux-gnu
 
+//
+// Check bundling without host target is allowed for HIP.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc
+// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+//
+// NOHOST-NOT: host-
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906
+
 // Some code so that we can create a binary out of this file.
 int A = 0;
 void test_func(void) {
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -2874,6 +2874,7 @@
     /// The linker inputs obtained for each device arch.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
     bool GPUSanitize;
+    bool BundleDeviceOutput;
 
   public:
     HIPActionBuilder(Compilation &C, DerivedArgList &Args,
@@ -2882,6 +2883,9 @@
       DefaultCudaArch = CudaArch::GFX803;
       GPUSanitize = Args.hasFlag(options::OPT_fgpu_sanitize,
                                  options::OPT_fno_gpu_sanitize, false);
+      BundleDeviceOutput =
+          Args.hasFlag(options::OPT_fhip_bundle_device_output,
+                       options::OPT_fno_hip_bundle_device_output);
     }
 
     bool canUseBundlerUnbundler() const override { return true; }
@@ -3012,6 +3016,21 @@
         A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
                                                AssociatedOffloadKind);
 
+      // Bundle output files for device only compilation if there are more
+      // than one output files.
+      if (CompileDeviceOnly && CurPhase == FinalPhase && BundleDeviceOutput) {
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I],
+                   AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        CudaFatBinary =
+            C.MakeAction<OffloadBundlingJobAction>(CudaDeviceActions);
+        CudaDeviceActions.clear();
+      }
+
       return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host
                                                            : ABRT_Success;
     }
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -974,6 +974,9 @@
 defm gpu_sanitize : BoolFOption<"gpu-sanitize", EmptyKPM, DefaultFalse,
   PosFlag<SetTrue, [], "Enable sanitizer for AMDGPU target">,
   NegFlag<SetFalse>>;
+defm hip_bundle_device_output : BoolFOption<"hip-bundle-device-output", EmptyKPM, DefaultTrue,
+  PosFlag<SetTrue, []>,
+  NegFlag<SetFalse, [], "Do not bundle output files of HIP device compilation">>;
 def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>,
   HelpText<"An ID for compilation unit, which should be the same for the same "
            "compilation unit but different for different compilation units. "
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to