[PATCH] D99683: [HIP] Support ThinLTO

Yaxun Liu via Phabricator via cfe-commits Wed, 12 May 2021 13:50:27 -0700

yaxunl updated this revision to Diff 344949.
yaxunl marked an inline comment as done.
yaxunl added a comment.


Revised by Teresa's comments


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D99683/new/

https://reviews.llvm.org/D99683

Files:
  clang/include/clang/Driver/Driver.h
  clang/include/clang/Driver/Options.td
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/HIP.cpp
  clang/test/Driver/hip-options.hip
  llvm/lib/Transforms/IPO/FunctionImport.cpp
  llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
  llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
  llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
  llvm/test/Transforms/FunctionImport/funcimport.ll
  llvm/test/Transforms/FunctionImport/noinline.ll

Index: llvm/test/Transforms/FunctionImport/noinline.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/FunctionImport/noinline.ll
@@ -0,0 +1,23 @@
+; Do setup work for all below tests: generate bitcode and combined index
+; RUN: opt -module-summary %s -o %t.main.bc
+; RUN: opt -module-summary %p/Inputs/noinline.ll -o %t.inputs.noinline.bc
+; RUN: llvm-lto -thinlto -o %t.summary %t.main.bc %t.inputs.noinline.bc
+
+; Attempt the import now, ensure below that file containing noinline
+; is not imported by default but imported with -force-import-all.
+
+; RUN: opt -function-import -summary-file %t.summary.thinlto.bc %t.main.bc -S 2>&1 \
+; RUN:   | FileCheck -check-prefix=NOIMPORT %s
+; RUN: opt -function-import -force-import-all -summary-file %t.summary.thinlto.bc \
+; RUN:   %t.main.bc -S 2>&1 | FileCheck -check-prefix=IMPORT %s
+
+define i32 @main() #0 {
+entry:
+  %f = alloca i64, align 8
+  call void @foo(i64* %f)
+  ret i32 0
+}
+
+; NOIMPORT: declare void @foo(i64*)
+; IMPORT: define available_externally void @foo
+declare void @foo(i64*) #1
Index: llvm/test/Transforms/FunctionImport/funcimport.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/funcimport.ll
+++ llvm/test/Transforms/FunctionImport/funcimport.ll
@@ -1,3 +1,5 @@
+; REQUIRES: x86-registered-target
+
 ; Do setup work for all below tests: generate bitcode and combined index
 ; RUN: opt -module-summary %s -o %t.bc
 ; RUN: opt -module-summary %p/Inputs/funcimport.ll -o %t2.bc
@@ -15,6 +17,13 @@
 ; RUN: opt -function-import -enable-import-metadata  -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
 ; INSTLIM5-NOT: @staticfunc.llvm.
 
+; Test force import all
+; RUN: llvm-lto -thinlto-action=run -force-import-all %t.bc %t2.bc 2>&1 \
+; RUN:  | FileCheck %s --check-prefix=IMPORTALL
+; IMPORTALL-DAG: Error importing module: Failed to import function weakalias due to InterposableLinkage
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
 
 define i32 @main() #0 {
 entry:
Index: llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
+++ llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
@@ -11,7 +11,15 @@
 ; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=10 -import-instr-evolution-factor=0.5 -S | FileCheck %s --check-prefix=INSTLIM-PROGRESSIVE
 ; INSTLIM-PROGRESSIVE-NOT: call void @staticfunc
 
-
+; Test force import all
+; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc \
+; RUN:  -import-instr-limit=1 -force-import-all -S \
+; RUN:  | FileCheck %s --check-prefix=IMPORTALL
+; IMPORTALL-DAG: define available_externally void @globalfunc1()
+; IMPORTALL-DAG: define available_externally void @trampoline()
+; IMPORTALL-DAG: define available_externally void @largefunction()
+; IMPORTALL-DAG: define available_externally hidden void @staticfunc2.llvm.0()
+; IMPORTALL-DAG: define available_externally void @globalfunc2()
 
 declare void @globalfunc1()
 declare void @globalfunc2()
Index: llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
@@ -0,0 +1,8 @@
+define void @foo(i64* %v) #0 {
+entry:
+  %v.addr = alloca i64*, align 8
+  store i64* %v, i64** %v.addr, align 8
+  ret void
+}
+
+attributes #0 = { noinline }
\ No newline at end of file
Index: llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
+++ llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
@@ -1,3 +1,6 @@
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
 @globalvar = global i32 1, align 4
 @staticvar = internal global i32 1, align 4
 @staticconstvar = internal unnamed_addr constant [2 x i32] [i32 10, i32 20], align 4
Index: llvm/lib/Transforms/IPO/FunctionImport.cpp
===================================================================
--- llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -84,6 +84,10 @@
     "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
     cl::desc("Only import first N functions if N>=0 (default -1)"));
 
+static cl::opt<bool>
+    ForceImportAll("force-import-all", cl::init(false), cl::Hidden,
+                   cl::desc("Import functions with noinline attribute"));
+
 static cl::opt<float>
     ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
                       cl::Hidden, cl::value_desc("x"),
@@ -227,7 +231,7 @@
         }
 
         if ((Summary->instCount() > Threshold) &&
-            !Summary->fflags().AlwaysInline) {
+            !Summary->fflags().AlwaysInline && !ForceImportAll) {
           Reason = FunctionImporter::ImportFailureReason::TooLarge;
           return false;
         }
@@ -240,7 +244,7 @@
         }
 
         // Don't bother importing if we can't inline it anyway.
-        if (Summary->fflags().NoInline) {
+        if (Summary->fflags().NoInline && !ForceImportAll) {
           Reason = FunctionImporter::ImportFailureReason::NoInline;
           return false;
         }
@@ -487,17 +491,28 @@
           FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
               VI, Edge.second.getHotness(), Reason, 1);
         }
-        LLVM_DEBUG(
-            dbgs() << "ignored! No qualifying callee with summary found.\n");
-        continue;
+        if (ForceImportAll) {
+          std::string Msg = std::string("Failed to import function ") +
+                            VI.name().str() + " due to " +
+                            getFailureName(Reason);
+          auto Error = make_error<StringError>(
+              Msg, std::make_error_code(std::errc::operation_not_supported));
+          logAllUnhandledErrors(std::move(Error), errs(),
+                                "Error importing module: ");
+          break;
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "ignored! No qualifying callee with summary found.\n");
+          continue;
+        }
       }
 
       // "Resolve" the summary
       CalleeSummary = CalleeSummary->getBaseObject();
       ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
 
-      assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
-	     (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
+      assert((ResolvedCalleeSummary->fflags().AlwaysInline || ForceImportAll ||
+              (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
              "selectCallee() didn't honor the threshold");
 
       auto ExportModulePath = ResolvedCalleeSummary->modulePath();
Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -56,3 +56,17 @@
 // RUN:   --offload-arch=gfx906 -fgpu-inline-threshold=1000 %s 2>&1 | FileCheck -check-prefix=THRESH %s
 // THRESH: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-mllvm" "-inline-threshold=1000"
 // THRESH-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-inline-threshold=1000"
+
+// Check -foffload-lto=thin translated correctly.
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx906 -foffload-lto=thin %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=THINLTO %s
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx906 -fgpu-rdc -foffload-lto=thin %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=THINLTO %s
+
+// THINLTO-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-flto-unit"
+// THINLTO: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-flto=thin" "-flto-unit"
+// THINLTO: lld{{.*}}"-plugin-opt=mcpu=gfx906" "-plugin-opt=thinlto" "-plugin-opt=-force-import-all"
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -49,8 +49,8 @@
   auto &TC = getToolChain();
   auto &D = TC.getDriver();
   assert(!Inputs.empty() && "Must have at least one input.");
-  addLTOOptions(TC, Args, LldArgs, Output, Inputs[0],
-                D.getLTOMode() == LTOK_Thin);
+  bool IsThinLTO = D.getLTOMode(/*IsOffload=*/true) == LTOK_Thin;
+  addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], IsThinLTO);
 
   // Extract all the -m options
   std::vector<llvm::StringRef> Features;
@@ -66,6 +66,12 @@
   if (!Features.empty())
     LldArgs.push_back(Args.MakeArgString(MAttrString));
 
+  // ToDo: Remove this option after AMDGPU backend supports ISA-level linking.
+  // Since AMDGPU backend currently does not support ISA-level linking, all
+  // called functions need to be imported.
+  if (IsThinLTO)
+    LldArgs.push_back(Args.MakeArgString("-plugin-opt=-force-import-all"));
+
   for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
     LldArgs.push_back(
         Args.MakeArgString(Twine("-plugin-opt=") + A->getValue(0)));
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4176,6 +4176,10 @@
   bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP);
   bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
   bool IsHeaderModulePrecompile = isa<HeaderModulePrecompileJobAction>(JA);
+  bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
+                                 JA.isDeviceOffloading(Action::OFK_Host));
+  bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction);
+  auto LTOMode = D.getLTOMode(IsDeviceOffloadAction);
 
   // A header module compilation doesn't have a main input file, so invent a
   // fake one as a placeholder.
@@ -4427,13 +4431,23 @@
     if (JA.getType() == types::TY_LLVM_BC)
       CmdArgs.push_back("-emit-llvm-uselists");
 
-    // Device-side jobs do not support LTO.
-    bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
-                                   JA.isDeviceOffloading(Action::OFK_Host));
-
-    if (D.isUsingLTO() && !isDeviceOffloadAction) {
-      Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
-      CmdArgs.push_back("-flto-unit");
+    if (IsUsingLTO) {
+      if (!IsDeviceOffloadAction) {
+        Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
+        CmdArgs.push_back("-flto-unit");
+      } else if (Triple.isAMDGPU()) {
+        // Only AMDGPU supports device-side LTO
+        assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin);
+        CmdArgs.push_back(Args.MakeArgString(
+            Twine("-flto=") + (LTOMode == LTOK_Thin ? "thin" : "full")));
+        CmdArgs.push_back("-flto-unit");
+      } else {
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Args.getLastArg(options::OPT_foffload_lto,
+                               options::OPT_foffload_lto_EQ)
+                   ->getAsString(Args)
+            << Triple.getTriple();
+      }
     }
   }
 
@@ -4458,7 +4472,7 @@
 
   // Embed-bitcode option.
   // Only white-listed flags below are allowed to be embedded.
-  if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() &&
+  if (C.getDriver().embedBitcodeInObject() && !IsUsingLTO &&
       (isa<BackendJobAction>(JA) || isa<AssembleJobAction>(JA))) {
     // Add flags implied by -fembed-bitcode.
     Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
@@ -4575,7 +4589,7 @@
     return;
   }
 
-  if (C.getDriver().embedBitcodeMarkerOnly() && !C.getDriver().isUsingLTO())
+  if (C.getDriver().embedBitcodeMarkerOnly() && !IsUsingLTO)
     CmdArgs.push_back("-fembed-bitcode=marker");
 
   // We normally speed up the clang process a bit by skipping destructors at
@@ -6412,7 +6426,7 @@
   // be added so both IR can be captured.
   if ((C.getDriver().isSaveTempsEnabled() ||
        JA.isHostOffloading(Action::OFK_OpenMP)) &&
-      !(C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO()) &&
+      !(C.getDriver().embedBitcodeInObject() && !IsUsingLTO) &&
       isa<CompileJobAction>(JA))
     CmdArgs.push_back("-disable-llvm-passes");
 
@@ -6545,7 +6559,7 @@
   if (VirtualFunctionElimination) {
     // VFE requires full LTO (currently, this might be relaxed to allow ThinLTO
     // in the future).
-    if (D.getLTOMode() != LTOK_Full)
+    if (LTOMode != LTOK_Full)
       D.Diag(diag::err_drv_argument_only_allowed_with)
           << "-fvirtual-function-elimination"
           << "-flto=full";
@@ -6564,7 +6578,7 @@
   }
 
   if (WholeProgramVTables) {
-    if (!D.isUsingLTO())
+    if (!IsUsingLTO)
       D.Diag(diag::err_drv_argument_only_allowed_with)
           << "-fwhole-program-vtables"
           << "-flto";
@@ -6573,7 +6587,7 @@
 
   bool DefaultsSplitLTOUnit =
       (WholeProgramVTables || Sanitize.needsLTO()) &&
-      (D.getLTOMode() == LTOK_Full || TC.canSplitThinLTOUnit());
+      (LTOMode == LTOK_Full || TC.canSplitThinLTOUnit());
   bool SplitLTOUnit =
       Args.hasFlag(options::OPT_fsplit_lto_unit,
                    options::OPT_fno_split_lto_unit, DefaultsSplitLTOUnit);
@@ -6619,7 +6633,7 @@
      // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
      // on, we need to pass these flags as linker flags and that will be handled
      // outside of the compiler.
-     if (!D.isUsingLTO()) {
+     if (!IsUsingLTO) {
        CmdArgs.push_back("-mllvm");
        CmdArgs.push_back("-enable-order-file-instrumentation");
      }
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -594,16 +594,18 @@
 }
 
 // Parse the LTO options and record the type of LTO compilation
-// based on which -f(no-)?lto(=.*)? option occurs last.
-void Driver::setLTOMode(const llvm::opt::ArgList &Args) {
-  LTOMode = LTOK_None;
-  if (!Args.hasFlag(options::OPT_flto, options::OPT_flto_EQ,
-                    options::OPT_fno_lto, false))
-    return;
+// based on which -f(no-)?lto(=.*)? or -f(no-)?offload-lto(=.*)?
+// option occurs last.
+static llvm::Optional<driver::LTOKind>
+parseLTOMode(Driver &D, const llvm::opt::ArgList &Args, OptSpecifier OptPos,
+             OptSpecifier OptNeg, OptSpecifier OptEq) {
+  driver::LTOKind LTOMode = LTOK_None;
+  if (!Args.hasFlag(OptPos, OptEq, OptNeg, false))
+    return None;
 
   StringRef LTOName("full");
 
-  const Arg *A = Args.getLastArg(options::OPT_flto_EQ);
+  const Arg *A = Args.getLastArg(OptEq);
   if (A)
     LTOName = A->getValue();
 
@@ -614,9 +616,25 @@
 
   if (LTOMode == LTOK_Unknown) {
     assert(A);
-    Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName()
-                                                    << A->getValue();
+    D.Diag(diag::err_drv_unsupported_option_argument)
+        << A->getOption().getName() << A->getValue();
+    return None;
   }
+  return LTOMode;
+}
+
+// Parse the LTO options.
+void Driver::setLTOMode(const llvm::opt::ArgList &Args) {
+  LTOMode = LTOK_None;
+  if (auto M = parseLTOMode(*this, Args, options::OPT_flto,
+                            options::OPT_fno_lto, options::OPT_flto_EQ))
+    LTOMode = M.getValue();
+
+  OffloadLTOMode = LTOK_None;
+  if (auto M = parseLTOMode(*this, Args, options::OPT_foffload_lto,
+                            options::OPT_fno_offload_lto,
+                            options::OPT_foffload_lto_EQ))
+    OffloadLTOMode = M.getValue();
 }
 
 /// Compute the desired OpenMP runtime from the flags provided.
@@ -2930,11 +2948,12 @@
         // a fat binary containing all the code objects for different GPU's.
         // The fat binary is then an input to the host action.
         for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-          if (GPUSanitize) {
+          if (GPUSanitize || C.getDriver().isUsingLTO(/*IsOffload=*/true)) {
             // When GPU sanitizer is enabled, since we need to link in the
             // the sanitizer runtime library after the sanitize pass, we have
             // to skip the backend and assemble phases and use lld to link
-            // the bitcode.
+            // the bitcode. The same happens if users request to use LTO
+            // explicitly.
             ActionList AL;
             AL.push_back(CudaDeviceActions[I]);
             // Create a link action to link device IR with device library
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1943,6 +1943,12 @@
   HelpText<"Enable LTO in 'full' mode">;
 def fno_lto : Flag<["-"], "fno-lto">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
   HelpText<"Disable LTO mode (default)">;
+def foffload_lto_EQ : Joined<["-"], "foffload-lto=">, Flags<[CoreOption]>, Group<f_Group>,
+  HelpText<"Set LTO mode to either 'full' or 'thin' for offload compilation">, Values<"thin,full">;
+def foffload_lto : Flag<["-"], "foffload-lto">, Flags<[CoreOption]>, Group<f_Group>,
+  HelpText<"Enable LTO in 'full' mode for offload compilation">;
+def fno_offload_lto : Flag<["-"], "fno-offload-lto">, Flags<[CoreOption]>, Group<f_Group>,
+  HelpText<"Disable LTO mode (default) for offload compilation">;
 def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
   Flags<[CC1Option]>, Group<f_Group>,
   HelpText<"Controls the backend parallelism of -flto=thin (default "
Index: clang/include/clang/Driver/Driver.h
===================================================================
--- clang/include/clang/Driver/Driver.h
+++ clang/include/clang/Driver/Driver.h
@@ -84,6 +84,9 @@
   /// LTO mode selected via -f(no-)?lto(=.*)? options.
   LTOKind LTOMode;
 
+  /// LTO mode selected via -f(no-offload-)?lto(=.*)? options.
+  LTOKind OffloadLTOMode;
+
 public:
   enum OpenMPRuntimeKind {
     /// An unknown OpenMP runtime. We can't generate effective OpenMP code
@@ -559,10 +562,14 @@
   bool ShouldEmitStaticLibrary(const llvm::opt::ArgList &Args) const;
 
   /// Returns true if we are performing any kind of LTO.
-  bool isUsingLTO() const { return LTOMode != LTOK_None; }
+  bool isUsingLTO(bool IsOffload = false) const {
+    return getLTOMode(IsOffload) != LTOK_None;
+  }
 
   /// Get the specific kind of LTO being performed.
-  LTOKind getLTOMode() const { return LTOMode; }
+  LTOKind getLTOMode(bool IsOffload = false) const {
+    return IsOffload ? OffloadLTOMode : LTOMode;
+  }
 
 private:

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D99683: [HIP] Support ThinLTO

Reply via email to