yaxunl updated this revision to Diff 347400.
yaxunl added a comment.

fixed option. bundle output if users specify output by -o or -E


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D101630/new/

https://reviews.llvm.org/D101630

Files:
  clang/include/clang/Driver/Options.td
  clang/lib/Driver/Driver.cpp
  clang/test/Driver/clang-offload-bundler.c
  clang/test/Driver/hip-device-compile.hip
  clang/test/Driver/hip-output-file-name.hip
  clang/test/Driver/hip-phases.hip
  clang/test/Driver/hip-rdc-device-only.hip
  clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
===================================================================
--- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -117,6 +117,9 @@
 /// The index of the host input in the list of inputs.
 static unsigned HostInputIndex = ~0u;
 
+/// Whether not having host target is allowed.
+static bool AllowNoHost = false;
+
 /// Path to the current binary.
 static std::string BundlerExecutable;
 
@@ -839,9 +842,10 @@
   }
 
   // Get the file handler. We use the host buffer as reference.
-  assert(HostInputIndex != ~0u && "Host input index undefined??");
+  assert((HostInputIndex != ~0u || AllowNoHost) &&
+         "Host input index undefined??");
   Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
-      CreateFileHandler(*InputBuffers[HostInputIndex]);
+      CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]);
   if (!FileHandlerOrErr)
     return FileHandlerOrErr.takeError();
 
@@ -1108,6 +1112,7 @@
   // have exactly one host target.
   unsigned Index = 0u;
   unsigned HostTargetNum = 0u;
+  bool HIPOnly = true;
   llvm::DenseSet<StringRef> ParsedTargets;
   for (StringRef Target : TargetNames) {
     if (ParsedTargets.contains(Target)) {
@@ -1149,12 +1154,21 @@
       HostInputIndex = Index;
     }
 
+    if (Kind != "hip" && Kind != "hipv4")
+      HIPOnly = false;
+
     ++Index;
   }
 
+  // HIP uses clang-offload-bundler to bundle device-only compilation results
+  // for multiple GPU archs, therefore allow no host target if all entries
+  // are for HIP.
+  AllowNoHost = HIPOnly;
+
   // Host triple is not really needed for unbundling operation, so do not
   // treat missing host triple as error if we do unbundling.
-  if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) {
+  if ((Unbundle && HostTargetNum > 1) ||
+      (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) {
     reportError(createStringError(errc::invalid_argument,
                                   "expecting exactly one host target but got " +
                                       Twine(HostTargetNum)));
Index: clang/test/Driver/hip-rdc-device-only.hip
===================================================================
--- clang/test/Driver/hip-rdc-device-only.hip
+++ clang/test/Driver/hip-rdc-device-only.hip
@@ -6,7 +6,7 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -c -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip -fhip-bundle-device-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s
 
 // With `-emit-llvm`, the output should be the same as the aforementioned line
@@ -16,14 +16,14 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -c -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip -fhip-bundle-device-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s
 
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip -fhip-bundle-device-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s
 
 // With `-emit-llvm`, the output should be the same as the aforementioned line
@@ -33,7 +33,7 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip -fhip-bundle-device-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s
 
 // With `-save-temps`, commane lines for each steps are dumped. For assembly
@@ -44,9 +44,17 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip -fhip-bundle-device-output \
 // RUN: 2>&1 | FileCheck -check-prefix=SAVETEMP %s
 
+// Check output one file without bundling cause error.
+
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu -o %t.s -fno-hip-bundle-device-output \
+// RUN: 2>&1 | FileCheck -check-prefix=FAIL %s
+
 // COMMON: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // EMITBC-SAME: "-emit-llvm-bc"
@@ -56,8 +64,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // CHECK-SAME: {{.*}} {{".*a.cu"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -69,10 +77,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*a.cu"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // EMITBC-SAME: "-emit-llvm-bc"
@@ -82,8 +94,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -95,10 +107,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
@@ -125,6 +141,10 @@
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]]
 
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll"
+
 // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
@@ -150,3 +170,9 @@
 // SAVETEMP-SAME: "-emit-llvm"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]]
+
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll"
+
+// FAIL: error: cannot specify -o when generating multiple output files
Index: clang/test/Driver/hip-phases.hip
===================================================================
--- clang/test/Driver/hip-phases.hip
+++ clang/test/Driver/hip-phases.hip
@@ -231,13 +231,14 @@
 // compilation mode.
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S -fno-hip-bundle-device-output 2>&1 \
 // RUN: | FileCheck -check-prefixes=DASM %s
 // DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
 // DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler
+// DASM-NOT: clang-offload-bundler
 // DASM-NOT: host
 
 //
@@ -270,8 +271,20 @@
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
 // RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S -o %t.s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-BUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S -o %t.s -fno-hip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefixes=DASM2 %s
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S -fhip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-BUNDLE %s
 // DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
 // DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
@@ -282,6 +295,8 @@
 // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler
+// DASM2-BUNDLE: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, )
+// DASM2-NOBUNDLE-NOT: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, )
 // DASM2-NOT: host
 
 //
@@ -312,3 +327,88 @@
 
 // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
 // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])
+
+// Test one gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -E -fno-hip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE %s
+
+// Test two gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -E 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE2 %s
+
+// Test one gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -c -emit-llvm -fno-hip-bundle-device-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm -o %t.bc 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM2 %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode with bundled preprocessor expansion as input.
+//
+// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm -o %t.bc 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPELLVM2 %s
+
+// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE-NOT: clang-offload-bundler
+// PPE-NOT: host
+
+// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output
+// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output
+// PPE2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, )
+// PPE2-NOT: host
+
+// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-NOT: clang-offload-bundler
+// LLVM-NOT: host
+
+// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// LLVM2-NOT: host
+
+// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output
+// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output
+// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]])
+// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]])
+// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// PPELLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// PPELLVM2-NOT: host
Index: clang/test/Driver/hip-output-file-name.hip
===================================================================
--- clang/test/Driver/hip-output-file-name.hip
+++ clang/test/Driver/hip-output-file-name.hip
@@ -9,6 +9,16 @@
 // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o"
 
 // Check -E default output is "-" (stdout).
+// If there are multiple preprocessor expansion outputs clang-offload-bundler
+// is used to bundle the final output.
+
+// RUN: %clang -### -E -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
+
+// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
 // RUN: %clang -### -E -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
@@ -20,7 +30,7 @@
 
 // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
-// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
 // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
@@ -41,7 +51,7 @@
 
 // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
-// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s
+// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s
 
 // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
Index: clang/test/Driver/hip-device-compile.hip
===================================================================
--- clang/test/Driver/hip-device-compile.hip
+++ clang/test/Driver/hip-device-compile.hip
@@ -6,21 +6,21 @@
 // the output should not be bundled.
 
 // RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC %s
 
 // RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL %s
 
 // RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 -fno-hip-bundle-device-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
Index: clang/test/Driver/clang-offload-bundler.c
===================================================================
--- clang/test/Driver/clang-offload-bundler.c
+++ clang/test/Driver/clang-offload-bundler.c
@@ -361,6 +361,21 @@
 // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu
 // CKLST2-NOT: openmp-x86_64-pc-linux-gnu
 
+//
+// Check bundling without host target is allowed for HIP.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc
+// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+//
+// NOHOST-NOT: host-
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906
+
 // Some code so that we can create a binary out of this file.
 int A = 0;
 void test_func(void) {
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -2900,6 +2900,7 @@
     /// The linker inputs obtained for each device arch.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
     bool GPUSanitize;
+    bool BundleDeviceOutput;
 
   public:
     HIPActionBuilder(Compilation &C, DerivedArgList &Args,
@@ -2908,6 +2909,12 @@
       DefaultCudaArch = CudaArch::GFX803;
       GPUSanitize = Args.hasFlag(options::OPT_fgpu_sanitize,
                                  options::OPT_fno_gpu_sanitize, false);
+      // By default, bundle device output only if -o or -E is specified.
+      // However, it will be overridden by -f[no-]bundle-device-output.
+      BundleDeviceOutput = Args.hasArg(options::OPT_o, options::OPT_E);
+      BundleDeviceOutput = Args.hasFlag(
+          options::OPT_fhip_bundle_device_output,
+          options::OPT_fno_hip_bundle_device_output, BundleDeviceOutput);
     }
 
     bool canUseBundlerUnbundler() const override { return true; }
@@ -3039,6 +3046,21 @@
         A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
                                                AssociatedOffloadKind);
 
+      // Bundle output files for device only compilation if there are more
+      // than one output files.
+      if (CompileDeviceOnly && CurPhase == FinalPhase && BundleDeviceOutput) {
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I],
+                   AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        CudaFatBinary =
+            C.MakeAction<OffloadBundlingJobAction>(CudaDeviceActions);
+        CudaDeviceActions.clear();
+      }
+
       return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host
                                                            : ABRT_Success;
     }
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -988,6 +988,10 @@
 def fgpu_sanitize : Flag<["-"], "fgpu-sanitize">, Group<f_Group>,
   HelpText<"Enable sanitizer for AMDGPU target">;
 def fno_gpu_sanitize : Flag<["-"], "fno-gpu-sanitize">, Group<f_Group>;
+def fhip_bundle_device_output : Flag<["-"], "fhip-bundle-device-output">,
+  Group<f_Group>, HelpText<"Bundle output files of HIP device compilation">;
+def fno_hip_bundle_device_output : Flag<["-"], "fno-hip-bundle-device-output">,
+  Group<f_Group>;
 def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>,
   HelpText<"An ID for compilation unit, which should be the same for the same "
            "compilation unit but different for different compilation units. "
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to