[PATCH] D11280: [CUDA] Improve CUDA compilation pipeline creation.

Artem Belevich Thu, 16 Jul 2015 15:16:11 -0700

tra created this revision.
tra added reviewers: echristo, bogner, dblaikie.
tra added a subscriber: cfe-commits.


Current implementation tries to guess which Action will result in a job which 
needs to incorporate device-side GPU binaries. The guessing was attempting to 
work around the fact that multiple actions may be combined into a single 
compiler invocation. If CudaHostAction ends up being combined (and thus 
bypassed during action list traversal) no device-side actions it pointed to 
were processed. The guessing worked for most of the usual cases, but fell apart 
when external assembler was used.

This change removes the guessing and makes sure we create and pass device-side 
jobs regardless of how the jobs get combined.

* CudaHostAction is always inserted either at Compile phase or the FinalPhase 
of current compilation, whichever happens first.
* If selectToolForJob combines CudaHostAction with other actions, it passes 
info about CudaHostAction up to the caller
* When it sees that CudaHostAction got combined with other actions (and hence 
will never be passed to BuildJobsForActions), BuildJobsForActions creates 
device-side jobs the same way they would be created if CudaHostAction was 
passed to BuildJobsForActions directly.
* Added two more test cases to make sure GPU binaries are passed to correct 
jobs.

http://reviews.llvm.org/D11280

Files:
  lib/Driver/Driver.cpp
  test/Driver/cuda-options.cu

Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -6,16 +6,16 @@
 // Simple compilation case:
 // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
 // Compile device-side to PTX assembly and make sure we use it on the host side.
-// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
 // Then compile host side and incorporate device code.
 // RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
 // Make sure we don't link anything.
 // RUN:   -check-prefix CUDA-NL %s
 
 // Typical compilation + link case:
 // RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 \
 // Compile device-side to PTX assembly and make sure we use it on the host side
-// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
 // Then compile host side and incorporate device code.
 // RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
 // Then link things.
@@ -33,15 +33,15 @@
 // Verify that -cuda-no-host disables host-side compilation and linking
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
 // Compile device-side to PTX assembly
-// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
 // Make sure there are no host cmpilation or linking.
 // RUN:   -check-prefix CUDA-NH -check-prefix CUDA-NL %s
 
 // Verify that with -S we compile host and device sides to assembly
 // and incorporate device code on the host side.
 // RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
 // Compile device-side to PTX assembly
-// RUN:   | FileCheck -check-prefix CUDA-D1 \
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\
 // Then compile host side and incorporate GPU code.
 // RUN:  -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
 // Make sure we don't link anything.
@@ -51,30 +51,64 @@
 // archtecture info to device compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
 // Compile device-side to PTX assembly.
-// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// RUN:   -check-prefix CUDA-D1-SM35 \
 // Then compile host side and incorporate GPU code.
 // RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
 // Make sure we don't link anything.
 // RUN:   -check-prefix CUDA-NL %s
 
 // Verify that there is device-side compilation per --cuda-gpu-arch args
 // and that all results are included on the host side.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:        --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
 // Compile both device-sides to PTX assembly
 // RUN:   | FileCheck \
-// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
+// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1NS -check-prefix CUDA-D1-SM35 \
 // RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
 // Then compile host side and incorporate both device-side outputs
-// RUN:   -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-HNS \
+// RUN:   -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
 // Make sure we don't link anything.
 // RUN:   -check-prefix CUDA-NL %s
 
-// Match device-side compilation
+// Verify that device-side results are passed to correct tool when
+// -save-temps is used
+// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1S \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-HS -check-prefix CUDA-HS-I1 \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+// Verify that device-side results are passed to correct tool when
+// -fno-integrated-as is used
+// RUN: %clang -### -target x86_64-linux-gnu -fno-integrated-as -c %s 2>&1 \
+// Compile device-side to PTX assembly and make sure we use it on the host side.
+// RUN:   | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \
+// Then compile host side and incorporate device code.
+// RUN:   -check-prefix CUDA-H -check-prefix CUDA-HNS -check-prefix CUDA-HS-I1 \
+// RUN:   -check-prefix CUDA-H-AS \
+// Make sure we don't link anything.
+// RUN:   -check-prefix CUDA-NL %s
+
+
+// Match device-side preprocessor, and compiler phases with -save-temps
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda"
+// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
+
+// Match the job that produces PTX assembly
 // CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
 // CUDA-D1-SAME: "-fcuda-is-device"
 // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
 // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
-// CUDA-D1-SAME: "-x" "cuda"
+// CUDA-D1NS-SAME: "-x" "cuda"
+// CUDA-D1S-SAME: "-x" "ir"
 
 // Match anothe device-side compilation
 // CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
@@ -84,18 +118,28 @@
 // CUDA-D2-SAME: "-x" "cuda"
 
 // Match no device-side compilation
-// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
 // CUDA-ND-SAME-NOT: "-fcuda-is-device"
 
+// Match host-side preprocessor job with -save-temps
+// CUDA-HS: "-cc1" "-triple"
+// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-HS-SAME-NOT: "-fcuda-is-device"
+// CUDA-HS-SAME: "-x" "cuda"
+
 // Match host-side compilation
 // CUDA-H: "-cc1" "-triple"
-// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
+// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
 // CUDA-H-SAME-NOT: "-fcuda-is-device"
-// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
-// CUDA-H-SAME: "-x" "cuda"
+// CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
+// CUDA-HNS-SAME: "-x" "cuda"
+// CUDA-HS-SAME: "-x" "cuda-cpp-output"
 // CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
 // CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
 
+// Match external assembler that uses compilation output
+// CUDA-H-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
+
 // Match no GPU code inclusion.
 // CUDA-H-NI-NOT: "-fcuda-include-gpubinary"
 
@@ -105,7 +149,7 @@
 
 // Match linker
 // CUDA-L: "{{.*}}{{ld|link}}{{(.exe)?}}"
-// CUDA-L-SAME: "[[HOSTOBJ]]"
+// CUDA-L-SAME: "[[HOSTOUTPUT]]"
 
 // Match no linker
 // CUDA-NL-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -1228,11 +1228,13 @@
   }
 }
 
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input
-// action and then wraps each in CudaDeviceAction paired with appropriate GPU
-// arch name. If we're only building device-side code, each action remains
-// independent. Otherwise we pass device-side actions as inputs to a new
-// CudaHostAction which combines both host and device side actions.
+// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
+// input action and then wraps each in CudaDeviceAction paired with
+// appropriate GPU arch name. In case of partial (i.e preprocessing
+// only) or device-only compilation, each device action is added to /p
+// Actions and /p Current is released. Otherwise the function creates
+// and returns a new CudaHostAction which wraps /p Current and device
+// side actions.
 static std::unique_ptr<Action>
 buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
                  const Arg *InputArg, const types::ID InputType,
@@ -1413,22 +1415,14 @@
     }
 
     phases::ID CudaInjectionPhase;
-    if (isSaveTempsEnabled()) {
-      // All phases are done independently, inject GPU blobs during compilation
-      // phase as that's where we generate glue code to init them.
-      CudaInjectionPhase = phases::Compile;
-    } else {
-      // Assumes that clang does everything up until linking phase, so we inject
-      // cuda device actions at the last step before linking. Otherwise CUDA
-      // host action forces preprocessor into a separate invocation.
-      CudaInjectionPhase = FinalPhase;
-      if (FinalPhase == phases::Link)
-        for (auto PI = PL.begin(), PE = PL.end(); PI != PE; ++PI) {
-          auto next = PI + 1;
-          if (next != PE && *next == phases::Link)
-            CudaInjectionPhase = *PI;
-        }
-    }
+    bool InjectCuda = (InputType == types::TY_CUDA &&
+                       !Args.hasArg(options::OPT_cuda_host_only));
+    CudaInjectionPhase = FinalPhase;
+    for (auto &Phase : PL)
+      if (Phase <= FinalPhase && Phase == phases::Compile) {
+        CudaInjectionPhase = Phase;
+        break;
+      }
 
     // Build the pipeline for this file.
     std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
@@ -1456,8 +1450,7 @@
       // Otherwise construct the appropriate action.
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
 
-      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
-          !Args.hasArg(options::OPT_cuda_host_only)) {
+      if (InjectCuda && Phase == CudaInjectionPhase) {
         Current = buildCudaActions(*this, TC, Args, InputArg, InputType,
                                    std::move(Current), Actions);
         if (!Current)
@@ -1667,10 +1660,17 @@
   }
 }
 
-static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps,
+// Returns a Tool for a given JobAction.  In case the action and its
+// predecessors can be combined, updates Inputs with the inputs of the
+// first combined action. If one of the collapsed actions is a
+// CudaHostAction, updates CollapsedCHA with the pointer to it so the
+// caller can deal with extra handling such action requires.
+static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
                                     const ToolChain *TC, const JobAction *JA,
-                                    const ActionList *&Inputs) {
+                                    const ActionList *&Inputs,
+                                    const CudaHostAction *&CollapsedCHA) {
   const Tool *ToolForJob = nullptr;
+  CollapsedCHA = nullptr;
 
   // See if we should look for a compiler with an integrated assembler. We match
   // bottom up, so what we are actually looking for is an assembler job with a
@@ -1687,13 +1687,19 @@
     // checking the backend tool, check if the tool for the CompileJob
     // has an integrated assembler.
     const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
-    JobAction *CompileJA = cast<CompileJobAction>(*BackendInputs->begin());
+    // Compile job may be wrapped in CudaHostAction, extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
+    JobAction *CompileJA =
+        cast<CompileJobAction>(CHA ? *CHA->begin() : *BackendInputs->begin());
+    assert(CompileJA && "Backend job is not preceeded by compile job.");
     const Tool *Compiler = TC->SelectTool(*CompileJA);
     if (!Compiler)
       return nullptr;
     if (Compiler->hasIntegratedAssembler()) {
-      Inputs = &(*BackendInputs)[0]->getInputs();
+      Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
+      CollapsedCHA = CHA;
     }
   }
 
@@ -1703,19 +1709,19 @@
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    JobAction *CompileJA;
-    // Extract real host action, if it's a CudaHostAction.
-    if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
-      CompileJA = cast<CompileJobAction>(*CudaHA->begin());
-    else
-      CompileJA = cast<CompileJobAction>(*Inputs->begin());
-
+    // Compile job may be wrapped in CudaHostAction, extract it if
+    // that's the case and update CollapsedCHA if we combine phases.
+    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
+    JobAction *CompileJA =
+        cast<CompileJobAction>(CHA ? *CHA->begin() : *Inputs->begin());
+    assert(CompileJA && "Backend job is not preceeded by compile job.");
     const Tool *Compiler = TC->SelectTool(*CompileJA);
     if (!Compiler)
       return nullptr;
     if (!Compiler->canEmitIR() || !SaveTemps) {
-      Inputs = &(*Inputs)[0]->getInputs();
+      Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
+      CollapsedCHA = CHA;
     }
   }
 
@@ -1803,10 +1809,23 @@
   const ActionList *Inputs = &A->getInputs();
 
   const JobAction *JA = cast<JobAction>(A);
-  const Tool *T = SelectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs);
+  const CudaHostAction *CollapsedCHA = nullptr;
+  const Tool *T =
+      selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA);
   if (!T)
     return;
 
+  // If we've collapsed action list that contained CudaHostAction we
+  // need to build jobs for device-side inputs it may have held.
+  if (CollapsedCHA) {
+    InputInfo II;
+    for (const Action *DA : CollapsedCHA->getDeviceActions()) {
+      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+                         /*MultipleArchs*/ false, LinkingOutput, II);
+      CudaDeviceInputInfos.push_back(II);
+    }
+  }
+
   // Only use pipes when there is exactly one input.
   InputInfoList InputInfos;
   for (const Action *Input : *Inputs) {

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

[PATCH] D11280: [CUDA] Improve CUDA compilation pipeline creation.

Reply via email to