Hahnfeld created this revision. Hahnfeld added reviewers: jlebar, tra. Herald added a subscriber: cfe-commits. Hahnfeld added a dependency: D42920: [CUDA] Fix test cuda-external-tools.cu. Hahnfeld added a reviewer: hfinkel.
As a first step, pass '-c/--compile-only' to ptxas so that it doesn't complain about references to external functions. This will successfully generate object files, but they won't work at runtime because the registration routines need to be adapted. Repository: rC Clang https://reviews.llvm.org/D42921 Files: include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Cuda.cpp lib/Frontend/CompilerInvocation.cpp test/Driver/cuda-external-tools.cu
Index: test/Driver/cuda-external-tools.cu =================================================================== --- test/Driver/cuda-external-tools.cu +++ test/Driver/cuda-external-tools.cu @@ -24,6 +24,10 @@ // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \ // RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ // RUN: -check-prefix SM20 -check-prefix OPT3 %s +// Generating relocatable device code +// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ +// RUN: -check-prefix SM20 -check-prefix RDC %s // With debugging enabled, ptxas should be run with with no ptxas optimizations. // RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \ @@ -53,15 +57,27 @@ // Regular compile targeting sm_35. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 -check-prefix SM35 %s +// Separate compilation targeting sm_35. +// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ +// RUN: -check-prefix SM35 -check-prefix RDC %s // 32-bit compile. // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH32 -check-prefix SM20 %s +// 32-bit compile when generating relocatable device code. +// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH32 \ +// RUN: -check-prefix SM20 -check-prefix RDC %s // Compile with -fintegrated-as. This should still cause us to invoke ptxas. // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \ // RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ // RUN: -check-prefix SM20 -check-prefix OPT0 %s +// Check that we still pass -c when generating relocatable device code. 
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ +// RUN: -check-prefix SM20 -check-prefix RDC %s // Check -Xcuda-ptxas and -Xcuda-fatbinary // RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \ @@ -78,6 +94,17 @@ // RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \ // RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH32 -check-prefix SM20 %s +// Check relocatable device code generation on MacOS. +// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ +// RUN: -check-prefix SM20 -check-prefix RDC %s +// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH64 \ +// RUN: -check-prefix SM35 -check-prefix RDC %s +// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix CHECK -check-prefix ARCH32 \ +// RUN: -check-prefix SM20 -check-prefix RDC %s + // Check that CLANG forwards the -v flag to PTXAS. // RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s @@ -90,6 +117,8 @@ // SM35-SAME: "-target-cpu" "sm_35" // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]" // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // Match the call to ptxas (which assembles PTX to SASS). // CHECK: ptxas @@ -111,6 +140,8 @@ // CHECK-SAME: "[[PTXFILE]]" // PTXAS-EXTRA-SAME: "-foo1" // PTXAS-EXTRA-SAME: "-foo2" +// RDC-SAME: "-c" +// CHECK-NOT: "-c" // Match the call to fatbinary (which combines all our PTX and SASS into one // blob). 
@@ -131,5 +162,7 @@ // ARCH64-SAME: "-triple" "x86_64- // ARCH32-SAME: "-triple" "i386- // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]" +// RDC-SAME: "-fcuda-rdc" +// CHECK-NOT: "-fcuda-rdc" // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v" Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2072,6 +2072,8 @@ if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; + Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc); + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -355,11 +355,17 @@ for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) CmdArgs.push_back(Args.MakeArgString(A)); - // In OpenMP we need to generate relocatable code. - if (JA.isOffloading(Action::OFK_OpenMP) && - Args.hasFlag(options::OPT_fopenmp_relocatable_target, - options::OPT_fnoopenmp_relocatable_target, - /*Default=*/ true)) + bool Relocatable = false; + if (JA.isOffloading(Action::OFK_OpenMP)) + // In OpenMP we need to generate relocatable code. 
+ Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, + options::OPT_fnoopenmp_relocatable_target, + /*Default=*/true); + else if (JA.isOffloading(Action::OFK_Cuda)) + Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, + options::OPT_fno_cuda_rdc, /*Default=*/false); + + if (Relocatable) CmdArgs.push_back("-c"); const char *Exec; @@ -540,6 +546,10 @@ if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); + + if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + false)) + CC1Args.push_back("-fcuda-rdc"); } if (DriverArgs.hasArg(options::OPT_nocudalib)) Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -4643,14 +4643,20 @@ CmdArgs.push_back(Args.MakeArgString(Flags)); } - // Host-side cuda compilation receives device-side outputs as Inputs[1...]. - // Include them with -fcuda-include-gpubinary. - if (IsCuda && Inputs.size() > 1) - for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { - CmdArgs.push_back("-fcuda-include-gpubinary"); - CmdArgs.push_back(I->getFilename()); + if (IsCuda) { + // Host-side cuda compilation receives device-side outputs as Inputs[1...]. + // Include them with -fcuda-include-gpubinary. + if (Inputs.size() > 1) { + for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) { + CmdArgs.push_back("-fcuda-include-gpubinary"); + CmdArgs.push_back(I->getFilename()); + } } + if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) + CmdArgs.push_back("-fcuda-rdc"); + } + // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path // to specify the result of the compile phase on the host, so the meaningful // device declarations can be identified. 
Also, -fopenmp-is-device is passed Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -567,6 +567,9 @@ def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">, Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">; def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">; +def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>, + HelpText<"Generate relocatable device code, also known as separate compilation mode.">; +def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; def dA : Flag<["-"], "dA">, Group<d_Group>; def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -204,6 +204,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") +LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits