yaxunl updated this revision to Diff 344949.
yaxunl marked an inline comment as done.
yaxunl added a comment.
Revised by Teresa's comments
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D99683/new/
https://reviews.llvm.org/D99683
Files:
clang/include/clang/Driver/Driver.h
clang/include/clang/Driver/Options.td
clang/lib/Driver/Driver.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Driver/ToolChains/HIP.cpp
clang/test/Driver/hip-options.hip
llvm/lib/Transforms/IPO/FunctionImport.cpp
llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
llvm/test/Transforms/FunctionImport/funcimport.ll
llvm/test/Transforms/FunctionImport/noinline.ll
Index: llvm/test/Transforms/FunctionImport/noinline.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/FunctionImport/noinline.ll
@@ -0,0 +1,23 @@
+; Do setup work for all below tests: generate bitcode and combined index
+; RUN: opt -module-summary %s -o %t.main.bc
+; RUN: opt -module-summary %p/Inputs/noinline.ll -o %t.inputs.noinline.bc
+; RUN: llvm-lto -thinlto -o %t.summary %t.main.bc %t.inputs.noinline.bc
+
+; Attempt the import now, ensure below that file containing noinline
+; is not imported by default but imported with -force-import-all.
+
+; RUN: opt -function-import -summary-file %t.summary.thinlto.bc %t.main.bc -S 2>&1 \
+; RUN: | FileCheck -check-prefix=NOIMPORT %s
+; RUN: opt -function-import -force-import-all -summary-file %t.summary.thinlto.bc \
+; RUN: %t.main.bc -S 2>&1 | FileCheck -check-prefix=IMPORT %s
+
+define i32 @main() #0 {
+entry:
+ %f = alloca i64, align 8
+ call void @foo(i64* %f)
+ ret i32 0
+}
+
+; NOIMPORT: declare void @foo(i64*)
+; IMPORT: define available_externally void @foo
+declare void @foo(i64*) #1
Index: llvm/test/Transforms/FunctionImport/funcimport.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/funcimport.ll
+++ llvm/test/Transforms/FunctionImport/funcimport.ll
@@ -1,3 +1,5 @@
+; REQUIRES: x86-registered-target
+
; Do setup work for all below tests: generate bitcode and combined index
; RUN: opt -module-summary %s -o %t.bc
; RUN: opt -module-summary %p/Inputs/funcimport.ll -o %t2.bc
@@ -15,6 +17,13 @@
; RUN: opt -function-import -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
; INSTLIM5-NOT: @staticfunc.llvm.
+; Test force import all
+; RUN: llvm-lto -thinlto-action=run -force-import-all %t.bc %t2.bc 2>&1 \
+; RUN: | FileCheck %s --check-prefix=IMPORTALL
+; IMPORTALL-DAG: Error importing module: Failed to import function weakalias due to InterposableLinkage
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
define i32 @main() #0 {
entry:
Index: llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
+++ llvm/test/Transforms/FunctionImport/adjustable_threshold.ll
@@ -11,7 +11,15 @@
; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=10 -import-instr-evolution-factor=0.5 -S | FileCheck %s --check-prefix=INSTLIM-PROGRESSIVE
; INSTLIM-PROGRESSIVE-NOT: call void @staticfunc
-
+; Test force import all
+; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc \
+; RUN: -import-instr-limit=1 -force-import-all -S \
+; RUN: | FileCheck %s --check-prefix=IMPORTALL
+; IMPORTALL-DAG: define available_externally void @globalfunc1()
+; IMPORTALL-DAG: define available_externally void @trampoline()
+; IMPORTALL-DAG: define available_externally void @largefunction()
+; IMPORTALL-DAG: define available_externally hidden void @staticfunc2.llvm.0()
+; IMPORTALL-DAG: define available_externally void @globalfunc2()
declare void @globalfunc1()
declare void @globalfunc2()
Index: llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/FunctionImport/Inputs/noinline.ll
@@ -0,0 +1,8 @@
+define void @foo(i64* %v) #0 {
+entry:
+ %v.addr = alloca i64*, align 8
+ store i64* %v, i64** %v.addr, align 8
+ ret void
+}
+
+attributes #0 = { noinline }
\ No newline at end of file
Index: llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
===================================================================
--- llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
+++ llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll
@@ -1,3 +1,6 @@
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
@globalvar = global i32 1, align 4
@staticvar = internal global i32 1, align 4
@staticconstvar = internal unnamed_addr constant [2 x i32] [i32 10, i32 20], align 4
Index: llvm/lib/Transforms/IPO/FunctionImport.cpp
===================================================================
--- llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -84,6 +84,10 @@
"import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
cl::desc("Only import first N functions if N>=0 (default -1)"));
+static cl::opt<bool>
+ ForceImportAll("force-import-all", cl::init(false), cl::Hidden,
+ cl::desc("Import functions with noinline attribute"));
+
static cl::opt<float>
ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
cl::Hidden, cl::value_desc("x"),
@@ -227,7 +231,7 @@
}
if ((Summary->instCount() > Threshold) &&
- !Summary->fflags().AlwaysInline) {
+ !Summary->fflags().AlwaysInline && !ForceImportAll) {
Reason = FunctionImporter::ImportFailureReason::TooLarge;
return false;
}
@@ -240,7 +244,7 @@
}
// Don't bother importing if we can't inline it anyway.
- if (Summary->fflags().NoInline) {
+ if (Summary->fflags().NoInline && !ForceImportAll) {
Reason = FunctionImporter::ImportFailureReason::NoInline;
return false;
}
@@ -487,17 +491,28 @@
FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
VI, Edge.second.getHotness(), Reason, 1);
}
- LLVM_DEBUG(
- dbgs() << "ignored! No qualifying callee with summary found.\n");
- continue;
+ if (ForceImportAll) {
+ std::string Msg = std::string("Failed to import function ") +
+ VI.name().str() + " due to " +
+ getFailureName(Reason);
+ auto Error = make_error<StringError>(
+ Msg, std::make_error_code(std::errc::operation_not_supported));
+ logAllUnhandledErrors(std::move(Error), errs(),
+ "Error importing module: ");
+ break;
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
}
// "Resolve" the summary
CalleeSummary = CalleeSummary->getBaseObject();
ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
- assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
- (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
+ assert((ResolvedCalleeSummary->fflags().AlwaysInline || ForceImportAll ||
+ (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
"selectCallee() didn't honor the threshold");
auto ExportModulePath = ResolvedCalleeSummary->modulePath();
Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -56,3 +56,17 @@
// RUN: --offload-arch=gfx906 -fgpu-inline-threshold=1000 %s 2>&1 | FileCheck -check-prefix=THRESH %s
// THRESH: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-mllvm" "-inline-threshold=1000"
// THRESH-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-inline-threshold=1000"
+
+// Check -foffload-lto=thin translated correctly.
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
+// RUN: --cuda-gpu-arch=gfx906 -foffload-lto=thin %s 2>&1 \
+// RUN: | FileCheck -check-prefix=THINLTO %s
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
+// RUN: --cuda-gpu-arch=gfx906 -fgpu-rdc -foffload-lto=thin %s 2>&1 \
+// RUN: | FileCheck -check-prefix=THINLTO %s
+
+// THINLTO-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-flto-unit"
+// THINLTO: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-flto=thin" "-flto-unit"
+// THINLTO: lld{{.*}}"-plugin-opt=mcpu=gfx906" "-plugin-opt=thinlto" "-plugin-opt=-force-import-all"
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -49,8 +49,8 @@
auto &TC = getToolChain();
auto &D = TC.getDriver();
assert(!Inputs.empty() && "Must have at least one input.");
- addLTOOptions(TC, Args, LldArgs, Output, Inputs[0],
- D.getLTOMode() == LTOK_Thin);
+ bool IsThinLTO = D.getLTOMode(/*IsOffload=*/true) == LTOK_Thin;
+ addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], IsThinLTO);
// Extract all the -m options
std::vector<llvm::StringRef> Features;
@@ -66,6 +66,12 @@
if (!Features.empty())
LldArgs.push_back(Args.MakeArgString(MAttrString));
+ // ToDo: Remove this option after AMDGPU backend supports ISA-level linking.
+ // Since AMDGPU backend currently does not support ISA-level linking, all
+ // called functions need to be imported.
+ if (IsThinLTO)
+ LldArgs.push_back(Args.MakeArgString("-plugin-opt=-force-import-all"));
+
for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
LldArgs.push_back(
Args.MakeArgString(Twine("-plugin-opt=") + A->getValue(0)));
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -4176,6 +4176,10 @@
bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP);
bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
bool IsHeaderModulePrecompile = isa<HeaderModulePrecompileJobAction>(JA);
+ bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
+ JA.isDeviceOffloading(Action::OFK_Host));
+ bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction);
+ auto LTOMode = D.getLTOMode(IsDeviceOffloadAction);
// A header module compilation doesn't have a main input file, so invent a
// fake one as a placeholder.
@@ -4427,13 +4431,23 @@
if (JA.getType() == types::TY_LLVM_BC)
CmdArgs.push_back("-emit-llvm-uselists");
- // Device-side jobs do not support LTO.
- bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
- JA.isDeviceOffloading(Action::OFK_Host));
-
- if (D.isUsingLTO() && !isDeviceOffloadAction) {
- Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
- CmdArgs.push_back("-flto-unit");
+ if (IsUsingLTO) {
+ if (!IsDeviceOffloadAction) {
+ Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
+ CmdArgs.push_back("-flto-unit");
+ } else if (Triple.isAMDGPU()) {
+ // Only AMDGPU supports device-side LTO
+ assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin);
+ CmdArgs.push_back(Args.MakeArgString(
+ Twine("-flto=") + (LTOMode == LTOK_Thin ? "thin" : "full")));
+ CmdArgs.push_back("-flto-unit");
+ } else {
+ D.Diag(diag::err_drv_unsupported_opt_for_target)
+ << Args.getLastArg(options::OPT_foffload_lto,
+ options::OPT_foffload_lto_EQ)
+ ->getAsString(Args)
+ << Triple.getTriple();
+ }
}
}
@@ -4458,7 +4472,7 @@
// Embed-bitcode option.
// Only white-listed flags below are allowed to be embedded.
- if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() &&
+ if (C.getDriver().embedBitcodeInObject() && !IsUsingLTO &&
(isa<BackendJobAction>(JA) || isa<AssembleJobAction>(JA))) {
// Add flags implied by -fembed-bitcode.
Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ);
@@ -4575,7 +4589,7 @@
return;
}
- if (C.getDriver().embedBitcodeMarkerOnly() && !C.getDriver().isUsingLTO())
+ if (C.getDriver().embedBitcodeMarkerOnly() && !IsUsingLTO)
CmdArgs.push_back("-fembed-bitcode=marker");
// We normally speed up the clang process a bit by skipping destructors at
@@ -6412,7 +6426,7 @@
// be added so both IR can be captured.
if ((C.getDriver().isSaveTempsEnabled() ||
JA.isHostOffloading(Action::OFK_OpenMP)) &&
- !(C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO()) &&
+ !(C.getDriver().embedBitcodeInObject() && !IsUsingLTO) &&
isa<CompileJobAction>(JA))
CmdArgs.push_back("-disable-llvm-passes");
@@ -6545,7 +6559,7 @@
if (VirtualFunctionElimination) {
// VFE requires full LTO (currently, this might be relaxed to allow ThinLTO
// in the future).
- if (D.getLTOMode() != LTOK_Full)
+ if (LTOMode != LTOK_Full)
D.Diag(diag::err_drv_argument_only_allowed_with)
<< "-fvirtual-function-elimination"
<< "-flto=full";
@@ -6564,7 +6578,7 @@
}
if (WholeProgramVTables) {
- if (!D.isUsingLTO())
+ if (!IsUsingLTO)
D.Diag(diag::err_drv_argument_only_allowed_with)
<< "-fwhole-program-vtables"
<< "-flto";
@@ -6573,7 +6587,7 @@
bool DefaultsSplitLTOUnit =
(WholeProgramVTables || Sanitize.needsLTO()) &&
- (D.getLTOMode() == LTOK_Full || TC.canSplitThinLTOUnit());
+ (LTOMode == LTOK_Full || TC.canSplitThinLTOUnit());
bool SplitLTOUnit =
Args.hasFlag(options::OPT_fsplit_lto_unit,
options::OPT_fno_split_lto_unit, DefaultsSplitLTOUnit);
@@ -6619,7 +6633,7 @@
// Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
// on, we need to pass these flags as linker flags and that will be handled
// outside of the compiler.
- if (!D.isUsingLTO()) {
+ if (!IsUsingLTO) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-enable-order-file-instrumentation");
}
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -594,16 +594,18 @@
}
// Parse the LTO options and record the type of LTO compilation
-// based on which -f(no-)?lto(=.*)? option occurs last.
-void Driver::setLTOMode(const llvm::opt::ArgList &Args) {
- LTOMode = LTOK_None;
- if (!Args.hasFlag(options::OPT_flto, options::OPT_flto_EQ,
- options::OPT_fno_lto, false))
- return;
+// based on which -f(no-)?lto(=.*)? or -f(no-)?offload-lto(=.*)?
+// option occurs last.
+static llvm::Optional<driver::LTOKind>
+parseLTOMode(Driver &D, const llvm::opt::ArgList &Args, OptSpecifier OptPos,
+ OptSpecifier OptNeg, OptSpecifier OptEq) {
+ driver::LTOKind LTOMode = LTOK_None;
+ if (!Args.hasFlag(OptPos, OptEq, OptNeg, false))
+ return None;
StringRef LTOName("full");
- const Arg *A = Args.getLastArg(options::OPT_flto_EQ);
+ const Arg *A = Args.getLastArg(OptEq);
if (A)
LTOName = A->getValue();
@@ -614,9 +616,25 @@
if (LTOMode == LTOK_Unknown) {
assert(A);
- Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName()
- << A->getValue();
+ D.Diag(diag::err_drv_unsupported_option_argument)
+ << A->getOption().getName() << A->getValue();
+ return None;
}
+ return LTOMode;
+}
+
+// Parse the LTO options.
+void Driver::setLTOMode(const llvm::opt::ArgList &Args) {
+ LTOMode = LTOK_None;
+ if (auto M = parseLTOMode(*this, Args, options::OPT_flto,
+ options::OPT_fno_lto, options::OPT_flto_EQ))
+ LTOMode = M.getValue();
+
+ OffloadLTOMode = LTOK_None;
+ if (auto M = parseLTOMode(*this, Args, options::OPT_foffload_lto,
+ options::OPT_fno_offload_lto,
+ options::OPT_foffload_lto_EQ))
+ OffloadLTOMode = M.getValue();
}
/// Compute the desired OpenMP runtime from the flags provided.
@@ -2930,11 +2948,12 @@
// a fat binary containing all the code objects for different GPU's.
// The fat binary is then an input to the host action.
for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- if (GPUSanitize) {
+ if (GPUSanitize || C.getDriver().isUsingLTO(/*IsOffload=*/true)) {
// When GPU sanitizer is enabled, since we need to link in the
// the sanitizer runtime library after the sanitize pass, we have
// to skip the backend and assemble phases and use lld to link
- // the bitcode.
+ // the bitcode. The same happens if users request to use LTO
+ // explicitly.
ActionList AL;
AL.push_back(CudaDeviceActions[I]);
// Create a link action to link device IR with device library
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1943,6 +1943,12 @@
HelpText<"Enable LTO in 'full' mode">;
def fno_lto : Flag<["-"], "fno-lto">, Flags<[CoreOption, CC1Option]>, Group<f_Group>,
HelpText<"Disable LTO mode (default)">;
+def foffload_lto_EQ : Joined<["-"], "foffload-lto=">, Flags<[CoreOption]>, Group<f_Group>,
+ HelpText<"Set LTO mode to either 'full' or 'thin' for offload compilation">, Values<"thin,full">;
+def foffload_lto : Flag<["-"], "foffload-lto">, Flags<[CoreOption]>, Group<f_Group>,
+ HelpText<"Enable LTO in 'full' mode for offload compilation">;
+def fno_offload_lto : Flag<["-"], "fno-offload-lto">, Flags<[CoreOption]>, Group<f_Group>,
+ HelpText<"Disable LTO mode (default) for offload compilation">;
def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
Flags<[CC1Option]>, Group<f_Group>,
HelpText<"Controls the backend parallelism of -flto=thin (default "
Index: clang/include/clang/Driver/Driver.h
===================================================================
--- clang/include/clang/Driver/Driver.h
+++ clang/include/clang/Driver/Driver.h
@@ -84,6 +84,9 @@
/// LTO mode selected via -f(no-)?lto(=.*)? options.
LTOKind LTOMode;
+ /// LTO mode selected via -f(no-offload-)?lto(=.*)? options.
+ LTOKind OffloadLTOMode;
+
public:
enum OpenMPRuntimeKind {
/// An unknown OpenMP runtime. We can't generate effective OpenMP code
@@ -559,10 +562,14 @@
bool ShouldEmitStaticLibrary(const llvm::opt::ArgList &Args) const;
/// Returns true if we are performing any kind of LTO.
- bool isUsingLTO() const { return LTOMode != LTOK_None; }
+ bool isUsingLTO(bool IsOffload = false) const {
+ return getLTOMode(IsOffload) != LTOK_None;
+ }
/// Get the specific kind of LTO being performed.
- LTOKind getLTOMode() const { return LTOMode; }
+ LTOKind getLTOMode(bool IsOffload = false) const {
+ return IsOffload ? OffloadLTOMode : LTOMode;
+ }
private:
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits