yaxunl created this revision. yaxunl added a reviewer: tra. yaxunl requested review of this revision.
When clang compiles a HIP program with -E, it produces multiple output files: one for the host and one for each GPU arch. Clang uses clang-offload-bundler to bundle them into a single output file. Currently clang does this for combined host/device compilation but not for device-only compilation. This causes an issue when there are multiple GPU archs in device-only compilation: if -o is specified, the outputs for the different GPU archs overwrite each other. This patch fixes that by bundling the output files when there are multiple GPU archs, which is consistent with the behavior for combined host/device compilation. Clang will automatically unbundle the bundled file if the preprocessor expansion output is later used as input. The same applies to -emit-llvm and -S. https://reviews.llvm.org/D101630 Files: clang/lib/Driver/Driver.cpp clang/test/Driver/clang-offload-bundler.c clang/test/Driver/hip-output-file-name.hip clang/test/Driver/hip-phases.hip clang/test/Driver/hip-rdc-device-only.hip clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
Index: clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp =================================================================== --- clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -117,6 +117,9 @@ /// The index of the host input in the list of inputs. static unsigned HostInputIndex = ~0u; +/// Whether not having host target is allowed. +static bool AllowNoHost = false; + /// Path to the current binary. static std::string BundlerExecutable; @@ -857,9 +860,10 @@ } // Get the file handler. We use the host buffer as reference. - assert(HostInputIndex != ~0u && "Host input index undefined??"); + assert((HostInputIndex != ~0u || AllowNoHost) && + "Host input index undefined??"); Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr = - CreateFileHandler(*InputBuffers[HostInputIndex]); + CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]); if (!FileHandlerOrErr) return FileHandlerOrErr.takeError(); @@ -1126,6 +1130,7 @@ // have exactly one host target. unsigned Index = 0u; unsigned HostTargetNum = 0u; + bool HIPOnly = true; llvm::DenseSet<StringRef> ParsedTargets; for (StringRef Target : TargetNames) { if (ParsedTargets.contains(Target)) { @@ -1167,12 +1172,21 @@ HostInputIndex = Index; } + if (Kind != "hip" && Kind != "hipv4") + HIPOnly = false; + ++Index; } + // HIP uses clang-offload-bundler to bundle device-only compilation results + // for multiple GPU archs, therefore allow no host target if all entries + // are for HIP. + AllowNoHost = HIPOnly; + // Host triple is not really needed for unbundling operation, so do not // treat missing host triple as error if we do unbundling. 
- if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) { + if ((Unbundle && HostTargetNum > 1) || + (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) { reportError(createStringError(errc::invalid_argument, "expecting exactly one host target but got " + Twine(HostTargetNum))); Index: clang/test/Driver/hip-rdc-device-only.hip =================================================================== --- clang/test/Driver/hip-rdc-device-only.hip +++ clang/test/Driver/hip-rdc-device-only.hip @@ -56,8 +56,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // CHECK-SAME: {{.*}} {{".*a.cu"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -69,10 +69,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*a.cu"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" // EMITBC-SAME: "-emit-llvm-bc" @@ -82,8 +86,8 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx803" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" 
"hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -95,10 +99,14 @@ // COMMON-SAME: "-fapply-global-visibility-to-externs" // COMMON-SAME: "-target-cpu" "gfx900" // COMMON-SAME: "-fgpu-rdc" -// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip" -// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip" +// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip" +// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip" // COMMON-SAME: {{.*}} {{".*b.hip"}} +// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" + // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803" @@ -125,6 +133,10 @@ // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]] +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll" + // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" // SAVETEMP-SAME: "-E" // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803" @@ -150,3 +162,7 @@ // SAVETEMP-SAME: "-emit-llvm" // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900" // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]] + +// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" +// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll" Index: 
clang/test/Driver/hip-phases.hip =================================================================== --- clang/test/Driver/hip-phases.hip +++ clang/test/Driver/hip-phases.hip @@ -238,6 +238,7 @@ // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]]) // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler +// DASM-NOT: clang-offload-bundler // DASM-NOT: host // @@ -282,6 +283,7 @@ // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]]) // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler +// DASM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, ) // DASM2-NOT: host // @@ -312,3 +314,88 @@ // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]]) // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]]) + +// Test one gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE %s + +// Test two gpu architectures up to the preprocessor expansion output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -E 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPE2 %s + +// Test one gpu architectures up to the LLVM IR output phase in device-only +// compilation mode. 
+// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode. +// +// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=LLVM2 %s + +// Test two gpu architectures up to the LLVM IR output phase in device-only +// compilation mode with bundled preprocessor expansion as input. +// +// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \ +// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ +// RUN: --cuda-device-only -c -emit-llvm 2>&1 \ +// RUN: | FileCheck -check-prefixes=PPELLVM2 %s + +// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE-NOT: clang-offload-bundler +// PPE-NOT: host + +// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output +// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output +// PPE2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, ) +// PPE2-NOT: host + +// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", 
[[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM-NOT: clang-offload-bundler +// LLVM-NOT: host + +// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]]) +// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]]) +// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// LLVM2-NOT: host + +// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output +// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output +// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]]) +// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]]) +// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir +// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]]) +// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]]) +// PPELLVM2-DAG: 
[[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir +// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, ) +// PPELLVM2-NOT: host Index: clang/test/Driver/hip-output-file-name.hip =================================================================== --- clang/test/Driver/hip-output-file-name.hip +++ clang/test/Driver/hip-output-file-name.hip @@ -9,6 +9,16 @@ // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o" // Check -E default output is "-" (stdout). +// If there are multiple preprocessor expansion outputs clang-offload-bundler +// is used to bundle the final output. + +// RUN: %clang -### -E -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s + +// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx803 %s \ +// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s // RUN: %clang -### -E -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ @@ -20,7 +30,7 @@ // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s +// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ @@ -41,7 +51,7 @@ // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s +// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \ // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \ Index: clang/test/Driver/clang-offload-bundler.c =================================================================== --- 
clang/test/Driver/clang-offload-bundler.c +++ clang/test/Driver/clang-offload-bundler.c @@ -362,6 +362,21 @@ // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu // CKLST2-NOT: openmp-x86_64-pc-linux-gnu +// +// Check bundling without host target is allowed for HIP. +// +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc +// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \ +// RUN: -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 +// +// NOHOST-NOT: host- +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900 +// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906 + // Some code so that we can create a binary out of this file. int A = 0; void test_func(void) { Index: clang/lib/Driver/Driver.cpp =================================================================== --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp @@ -3006,6 +3006,22 @@ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, AssociatedOffloadKind); + // Bundle output files for device only compilation if there are more + // than one output files. + if (CompileDeviceOnly && CurPhase == FinalPhase && + GpuArchList.size() > 1) { + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I], + AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction<OffloadAction>( + DDep, CudaDeviceActions[I]->getType()); + } + CudaFatBinary = + C.MakeAction<OffloadBundlingJobAction>(CudaDeviceActions); + CudaDeviceActions.clear(); + } + return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host : ABRT_Success; }
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits