https://github.com/shawbyoung created https://github.com/llvm/llvm-project/pull/95049
- **Rebase: [Facebook] Add clang driver options to test debug info and BOLT** - **Rebase: [Facebook] [MC] Introduce NeverAlign fragment type** - **[BOLT] Fix ValidateMemRefs** - **[BOLT][NFC] Add sink block to flow CFG in profile inference** >From c771a162fe79eeaee285a169e7611609f529107d Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aau...@meta.com> Date: Tue, 1 Jun 2021 11:37:41 -0700 Subject: [PATCH 1/4] Rebase: [Facebook] Add clang driver options to test debug info and BOLT Summary: This is an essential piece of infrastructure for us to be continuously testing debug info with BOLT. We can't only make changes to a test repo because we need to change debuginfo tests to call BOLT, hence, this diff needs to sit in our opensource repo. But when upstreaming to LLVM, this should be kept BOLT-only outside of LLVM. When upstreaming, we need to git diff and check all folders that are being modified by our commits and discard this one (and leave as an internal diff). To test BOLT in debuginfo tests, configure it with -DLLVM_TEST_BOLT=ON. Then run check-lldb and check-debuginfo. 
Manual rebase conflict history: https://phabricator.intern.facebook.com/D29205224 https://phabricator.intern.facebook.com/D29564078 https://phabricator.intern.facebook.com/D33289118 https://phabricator.intern.facebook.com/D34957174 https://phabricator.intern.facebook.com/D35317341 Test Plan: tested locally Configured with: -DLLVM_ENABLE_PROJECTS="clang;lld;lldb;compiler-rt;bolt;debuginfo-tests" -DLLVM_TEST_BOLT=ON Ran test suite with: ninja check-debuginfo ninja check-lldb Reviewers: maks, #llvm-bolt Reviewed By: maks Subscribers: ayermolo, phabricatorlinter Differential Revision: https://phabricator.intern.facebook.com/D46256657 Tasks: T92898286 --- clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/Driver/ToolChains/Gnu.cpp | 29 ++++++++++++++++++++++++++ cross-project-tests/lit.cfg.py | 14 ++++++++++++- cross-project-tests/lit.site.cfg.py.in | 4 ++++ lldb/test/API/lit.cfg.py | 5 +++++ lldb/test/API/lit.site.cfg.py.in | 8 +++++++ lldb/test/Shell/helper/toolchain.py | 5 +++++ lldb/test/Shell/lit.site.cfg.py.in | 9 ++++++++ llvm/CMakeLists.txt | 4 ++++ 9 files changed, 81 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1637a114fcce1..082ca027261b0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5461,6 +5461,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, MarshallingInfoFlag<CodeGenOpts<"InstrumentForProfiling">>; def pipe : Flag<["-", "--"], "pipe">, HelpText<"Use pipes between commands, when possible">; +// Facebook T92898286 +def post_link_optimize : Flag<["--"], "post-link-optimize">, + HelpText<"Apply post-link optimizations using BOLT">; +// End Facebook T92898286 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">; def prebind : Flag<["-"], "prebind">; def preload : Flag<["-"], "preload">; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp 
b/clang/lib/Driver/ToolChains/Gnu.cpp index b141e5f2adfab..f7611af5763ab 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -672,12 +672,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + // Facebook T92898286 + if (Args.hasArg(options::OPT_post_link_optimize)) + CmdArgs.push_back("-q"); + // End Facebook T92898286 + Args.AddAllArgs(CmdArgs, options::OPT_T); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs, Output)); + // Facebook T92898286 + if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename()) + return; + + const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv")); + ArgStringList MoveCmdArgs; + MoveCmdArgs.push_back(Output.getFilename()); + const char *PreBoltBin = + Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt"); + MoveCmdArgs.push_back(PreBoltBin); + C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(), + MvExec, MoveCmdArgs, std::nullopt)); + + ArgStringList BoltCmdArgs; + const char *BoltExec = + Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt")); + BoltCmdArgs.push_back(PreBoltBin); + BoltCmdArgs.push_back("-reorder-blocks=reverse"); + BoltCmdArgs.push_back("-update-debug-sections"); + BoltCmdArgs.push_back("-o"); + BoltCmdArgs.push_back(Output.getFilename()); + C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(), + BoltExec, BoltCmdArgs, std::nullopt)); + // End Facebook T92898286 } void tools::gnutools::Assembler::ConstructJob(Compilation &C, diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 774c4eaf4d976..619634578dfe6 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -84,7 +84,13 @@ def get_required_attr(config, attr_name): # use_clang() and use_lld() respectively, so set them 
to "", if needed. if not hasattr(config, "clang_src_dir"): config.clang_src_dir = "" -llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# Facebook T92898286 +should_test_bolt = get_required_attr(config, "llvm_test_bolt") +if should_test_bolt: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"]) +else: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# End Facebook T92898286 if not hasattr(config, "lld_src_dir"): config.lld_src_dir = "" @@ -293,3 +299,9 @@ def get_clang_default_dwarf_version_string(triple): # Allow 'REQUIRES: XXX-registered-target' in tests. for arch in config.targets_to_build: config.available_features.add(arch.lower() + "-registered-target") + +# Facebook T92898286 +# Ensure the user's PYTHONPATH is included. +if "PYTHONPATH" in os.environ: + config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] +# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 39458dfc79afd..2d53cd377f033 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index d934349fe3ca3..d4a62c51458cc 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -248,6 +248,11 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] +# Facebook T92898286 +if is_configured("llvm_test_bolt"): + dotest_cmd += ["-E", '"--post-link-optimize"'] +# End Facebook T92898286 + if ( "lldb-repro-capture" in config.available_features 
or "lldb-repro-replay" in config.available_features diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 8b2d09ae41cd2..602f45759e48f 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,9 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + # Plugins lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@' if lldb_build_intel_pt == '1': diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 255955fc70d8c..7b7be06643166 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -165,6 +165,11 @@ def use_support_substitutions(config): if config.cmake_sysroot: host_flags += ["--sysroot={}".format(config.cmake_sysroot)] + # Facebook T92898286 + if config.llvm_test_bolt: + host_flags += ["--post-link-optimize"] + # End Facebook T92898286 + host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index b69e7bce1bc0b..fe8323734b7db 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,5 +1,10 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + + config.llvm_src_root = "@LLVM_SOURCE_DIR@" 
config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -31,6 +36,10 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index cbf4db60a6e18..6beb7ed6108c6 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -711,6 +711,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF) +# Facebook T92898286 +option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF) +# End Facebook T92898286 + # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX")) >From 08c2cedcd75022d8ddb6ac7e0469115bd45e657b Mon Sep 17 00:00:00 2001 From: Rafael Auler <rafaelau...@fb.com> Date: Thu, 5 Aug 2021 14:17:07 -0700 Subject: [PATCH 2/4] Rebase: [Facebook] [MC] Introduce NeverAlign fragment type Summary: Introduce NeverAlign fragment type. The intended usage of this fragment is to insert it before a pair of macro-op fusion eligible instructions. NeverAlign fragment ensures that the next fragment (first instruction in the pair) does not end at a given alignment boundary by emitting a minimal size nop if necessary. 
In effect, it ensures that a pair of macro-fusible instructions is not split by a given alignment boundary, which is a precondition for macro-op fusion in modern Intel Cores (64B = cache line size, see Intel Architecture Optimization Reference Manual, 2.3.2.1 Legacy Decode Pipeline: Macro-Fusion). This patch introduces functionality used by BOLT when emitting code with MacroFusion alignment already in place. The use case is different from BoundaryAlign and instruction bundling: - BoundaryAlign can be extended to perform the desired alignment for the first instruction in the macro-op fusion pair (D101817). However, this approach has higher overhead because BoundaryAlign relies on relaxation in the general case - see https://reviews.llvm.org/D97982#2710638. - Instruction bundling: the intent of the NeverAlign fragment is to prevent the first instruction in a pair from ending at a given alignment boundary, by inserting at most one minimum-size nop. It's OK if either instruction crosses the cache line. Padding both instructions using bundles so that they do not cross the alignment boundary would result in excessive padding. There's no straightforward way to request instruction bundling to avoid a given end alignment for the first instruction in the bundle. 
LLVM: https://reviews.llvm.org/D97982 Manual rebase conflict history: https://phabricator.intern.facebook.com/D30142613 Test Plan: sandcastle Reviewers: #llvm-bolt Subscribers: phabricatorlinter Differential Revision: https://phabricator.intern.facebook.com/D31361547 --- bolt/lib/Core/BinaryEmitter.cpp | 1 + llvm/include/llvm/MC/MCFragment.h | 22 ++ llvm/include/llvm/MC/MCObjectStreamer.h | 2 + llvm/include/llvm/MC/MCStreamer.h | 6 + llvm/lib/MC/MCAssembler.cpp | 118 ++++++---- llvm/lib/MC/MCFragment.cpp | 12 + llvm/lib/MC/MCObjectStreamer.cpp | 5 + llvm/lib/MC/MCStreamer.cpp | 2 + .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 24 ++ llvm/test/MC/X86/directive-avoid_end_align.s | 208 ++++++++++++++++++ 10 files changed, 363 insertions(+), 37 deletions(-) create mode 100644 llvm/test/MC/X86/directive-avoid_end_align.s diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 0b44acb0816f2..e09b9ff92de06 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -485,6 +485,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // This assumes the second instruction in the macro-op pair will get // assigned to its own MCRelaxableFragment. Since all JCC instructions // are relaxable, we should be safe. + Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI); } if (!EmitCodeOnly) { diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index a9b19dc56f16a..256d98423e030 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -33,6 +33,7 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> { public: enum FragmentType : uint8_t { FT_Align, + FT_NeverAlign, FT_Data, FT_CompactEncodedInst, FT_Fill, @@ -344,6 +345,27 @@ class MCAlignFragment : public MCFragment { } }; +class MCNeverAlignFragment : public MCFragment { + /// The alignment the end of the next fragment should avoid. 
+ unsigned Alignment; + + /// When emitting Nops some subtargets have specific nop encodings. + const MCSubtargetInfo &STI; + +public: + MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI, + MCSection *Sec = nullptr) + : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {} + + unsigned getAlignment() const { return Alignment; } + + const MCSubtargetInfo &getSubtargetInfo() const { return STI; } + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_NeverAlign; + } +}; + class MCFillFragment : public MCFragment { uint8_t ValueSize; /// Value to use for filling bytes. diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index e212d54613980..c7d760721e369 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -157,6 +157,8 @@ class MCObjectStreamer : public MCStreamer { unsigned MaxBytesToEmit = 0) override; void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0) override; + void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI) override; void emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) override; void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index b7468cf70a664..dd813192d9ca0 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -887,6 +887,12 @@ class MCStreamer { virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0); + /// If the end of the fragment following this NeverAlign fragment ever gets + /// aligned to \p ByteAlignment, this fragment emits a single nop before the + /// following fragment to break this end-alignment. 
+ virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI); + /// Emit some number of copies of \p Value until the byte offset \p /// Offset is reached. /// diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index ad30b5ce9e631..62baeb93ea7d0 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -298,6 +298,43 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup, return IsResolved; } +/// Check if the branch crosses the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch cross the boundary. +static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (StartAddr >> Log2(BoundaryAlignment)) != + ((EndAddr - 1) >> Log2(BoundaryAlignment)); +} + +/// Check if the branch is against the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch is against the boundary. +static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; +} + +/// Check if the branch needs padding. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch needs padding. 
+static bool needPadding(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || + isAgainstBoundary(StartAddr, Size, BoundaryAlignment); +} + uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const { assert(getBackendPtr() && "Requires assembler backend"); @@ -358,6 +395,41 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return Size; } + case MCFragment::FT_NeverAlign: { + // Disclaimer: NeverAlign fragment size depends on the size of its immediate + // successor, but NeverAlign need not be a MCRelaxableFragment. + // NeverAlign fragment size is recomputed if the successor is relaxed: + // - If RelaxableFragment is relaxed, it gets invalidated by marking its + // predecessor as LastValidFragment. + // - This forces the assembler to call MCAsmLayout::layoutFragment on that + // relaxable fragment, which in turn will always ask the predecessor to + // compute its size (see "computeFragmentSize(prev)" in layoutFragment). + // + // In short, the simplest way to ensure that computeFragmentSize() is sane + // is to establish the following rule: it should never examine fragments + // after the current fragment in the section. If we logically need to + // examine any fragment after the current fragment, we need to do that using + // relaxation, inside MCAssembler::layoutSectionOnce. 
+ const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F); + const MCFragment *NF = F.getNextNode(); + uint64_t Offset = Layout.getFragmentOffset(&NAF); + size_t NextFragSize = 0; + if (const auto *NextFrag = dyn_cast<MCRelaxableFragment>(NF)) { + NextFragSize = NextFrag->getContents().size(); + } else if (const auto *NextFrag = dyn_cast<MCDataFragment>(NF)) { + NextFragSize = NextFrag->getContents().size(); + } else { + llvm_unreachable("Didn't find the expected fragment after NeverAlign"); + } + // Check if the next fragment ends at the alignment we want to avoid. + if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) { + // Avoid this alignment by introducing minimum nop. + assert(getBackend().getMinimumNopSize() != NAF.getAlignment()); + return getBackend().getMinimumNopSize(); + } + return 0; + } + case MCFragment::FT_Org: { const MCOrgFragment &OF = cast<MCOrgFragment>(F); MCValue Value; @@ -581,6 +653,15 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, break; } + case MCFragment::FT_NeverAlign: { + const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F); + if (!Asm.getBackend().writeNopData(OS, FragmentSize, + &NAF.getSubtargetInfo())) + report_fatal_error("unable to write nop sequence of " + + Twine(FragmentSize) + " bytes"); + break; + } + case MCFragment::FT_Data: ++stats::EmittedDataFragments; OS << cast<MCDataFragment>(F).getContents(); @@ -1052,43 +1133,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { return OldSize != LF.getContents().size(); } -/// Check if the branch crosses the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch cross the boundary. 
-static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (StartAddr >> Log2(BoundaryAlignment)) != - ((EndAddr - 1) >> Log2(BoundaryAlignment)); -} - -/// Check if the branch is against the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch is against the boundary. -static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; -} - -/// Check if the branch needs padding. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch needs padding. 
-static bool needPadding(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || - isAgainstBoundary(StartAddr, Size, BoundaryAlignment); -} - bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout, MCBoundaryAlignFragment &BF) { // BoundaryAlignFragment that doesn't need to align any fragment should not be diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index a8da46dbd8727..2626da7e0391a 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -274,6 +274,9 @@ void MCFragment::destroy() { case FT_Align: delete cast<MCAlignFragment>(this); return; + case FT_NeverAlign: + delete cast<MCNeverAlignFragment>(this); + return; case FT_Data: delete cast<MCDataFragment>(this); return; @@ -342,6 +345,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << "<"; switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; + case MCFragment::FT_NeverAlign: + OS << "MCNeverAlignFragment"; + break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; case MCFragment::FT_CompactEncodedInst: OS << "MCCompactEncodedInstFragment"; break; @@ -381,6 +387,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; break; } + case MCFragment::FT_NeverAlign: { + const MCNeverAlignFragment *NAF = cast<MCNeverAlignFragment>(this); + OS << "\n "; + OS << " Alignment:" << NAF->getAlignment() << ">"; + break; + } case MCFragment::FT_Data: { const auto *DF = cast<MCDataFragment>(this); OS << "\n "; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 0ccade91677a4..117475b7dd90b 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -658,6 +658,11 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment, cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true, STI); } +void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned 
ByteAlignment, + const MCSubtargetInfo &STI) { + insert(new MCNeverAlignFragment(ByteAlignment, STI)); +} + void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) { diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 199d865ea3496..a97cba6c89972 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1235,6 +1235,8 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value, unsigned MaxBytesToEmit) {} void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit) {} +void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI) {} void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) {} void MCStreamer::emitBundleAlignMode(Align Alignment) {} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 6623106109316..6c6bd2cf31e86 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1153,6 +1153,7 @@ class X86AsmParser : public MCTargetAsmParser { bool parseDirectiveArch(); bool parseDirectiveNops(SMLoc L); bool parseDirectiveEven(SMLoc L); + bool parseDirectiveAvoidEndAlign(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); /// CodeView FPO data directives. @@ -4601,6 +4602,8 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal == ".nops") return parseDirectiveNops(DirectiveID.getLoc()); + else if (IDVal == ".avoid_end_align") + return parseDirectiveAvoidEndAlign(DirectiveID.getLoc()); else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") @@ -4695,6 +4698,27 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { return false; } +/// Directive for NeverAlign fragment testing, not for general usage! 
+/// parseDirectiveAvoidEndAlign +/// ::= .avoid_end_align alignment +bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) { + int64_t Alignment = 0; + SMLoc AlignmentLoc; + AlignmentLoc = getTok().getLoc(); + if (getParser().checkForValidSection() || + getParser().parseAbsoluteExpression(Alignment)) + return true; + + if (getParser().parseEOL("unexpected token in directive")) + return true; + + if (Alignment <= 0) + return Error(AlignmentLoc, "expected a positive alignment"); + + getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI()); + return false; +} + /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s new file mode 100644 index 0000000000000..1d748401edc12 --- /dev/null +++ b/llvm/test/MC/X86/directive-avoid_end_align.s @@ -0,0 +1,208 @@ +# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s +# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR + +# avoid_end_align has no effect since test doesn't end at alignment boundary: +.avoid_end_align 64 +# CHECK-NOT: nop + testl %eax, %eax +# CHECK: testl %eax, %eax + je .LBB0 + +.fill 58, 1, 0x00 +# NeverAlign followed by MCDataFragment: +# avoid_end_align inserts nop because `test` would end at alignment boundary: +.avoid_end_align 64 +# CHECK: 3e: nop + testl %eax, %eax +# CHECK-NEXT: 3f: testl %eax, %eax + je .LBB0 +# CHECK-NEXT: 41: je +.LBB0: + retq + +.p2align 6 +.L0: +.nops 57 + int3 +# NeverAlign followed by RelaxableFragment: +.avoid_end_align 64 +# CHECK: ba: nop + cmpl $(.L1-.L0), %eax +# CHECK-NEXT: bb: cmpl + je .L0 +# CHECK-NEXT: c1: je +.nops 65 +.L1: + +############################################################################### +# Experiment A: +# Check that NeverAlign doesn't introduce infinite loops in 
layout. +# Control: +# 1. NeverAlign fragment is not added, +# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively), +# 3. cmp and jcc are placed such that to be split by 64B alignment boundary. +# 4. jcc would be relaxed to a longer format if at least one byte is added +# between .L10 and je itself, e.g. by adding a NeverAlign padding byte, +# or relaxing cmp instruction. +# 5. cmp would be relaxed to a longer format if at least one byte is added +# between .L11 and .L12, e.g. due to relaxing jcc instruction. +.p2align 6 +# CHECK: 140: int3 +.fill 2, 1, 0xcc +.L10: +.nops 122 + int3 +# CHECK: 1bc: int3 +# no avoid_end_align here +# CHECK-NOT: nop + cmp $(.L12-.L11), %eax +# CHECK: 1bd: cmpl +.L11: + je .L10 +# CHECK-NEXT: 1c0: je +.nops 125 +.L12: + +# Experiment: +# Same setup as control, except NeverAlign fragment is added before cmp. +# Expected effect: +# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B +# alignment boundary, +# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc +# are no longer split by an alignment boundary (cmp crosses the boundary). +# 5. NeverAlign padding is removed. +# 6. cmp and jcc instruction remain in relaxed form. +# 7. Relaxation converges, layout succeeds. +.p2align 6 +# CHECK: 240: int3 +.fill 2, 1, 0xcc +.L20: +.nops 122 + int3 +# CHECK: 2bc: int3 +.avoid_end_align 64 +# CHECK-NOT: nop + cmp $(.L22-.L21), %eax +# CHECK-NEXT: 2bd: cmpl +.L21: + je .L20 +# CHECK-NEXT: 2c3: je +.nops 125 +.L22: + +############################################################################### +# Experiment B: similar to exp A, but we check that once NeverAlign padding is +# removed from the layout (exp A, experiment step 5), the increased distance +# between the symbols L33 and L34 triggers the relaxation of instruction at +# label L32. 
+# +# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to +# steps 2-3 of exp A, experiment: +# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# => short cmp under L32 +.p2align 6 +# CHECK: 380: int3 +.fill 2, 1, 0xcc +.L30: +.nops 122 + int3 +# CHECK: 3fc: int3 + hlt +#.avoid_end_align 64 +.L33: + cmp $(.L32-.L31), %eax +# CHECK: 3fe: cmpl +.L31: + je .L30 +# CHECK-NEXT: 404: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 47c: int3 +.L34: +.nops 9 +.L32: + cmp $(.L33-.L34), %eax +# CHECK: 487: cmp +# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128) + int3 +# CHECK-NEXT: 48a: int3 + +# Control 2: leaving out a byte at L43 (site of NeverAlign), plus +# relaxed jcc and cmp leads to a relaxed cmp under L42 (-129 as cmp's immediate) +.p2align 6 +# CHECK: 4c0: int3 +.fill 2, 1, 0xcc +.L40: +.nops 122 + int3 +# CHECK: 53c: int3 +# int3 +#.avoid_end_align 64 +.L43: + cmp $(.L42-.L41+0x100), %eax +# CHECK: 53d: cmpl +.L41: + je .L40+0x100 +# CHECK-NEXT: 543: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 5bc: int3 +.L44: +.nops 9 +.L42: + cmp $(.L43-.L44), %eax +# CHECK: 5c7: cmp +# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 5cd: int3 + +# Experiment +# Checking if removing NeverAlign padding at L53 as a result of alignment and +# relaxation of cmp and jcc following it (see exp A), thus reproducing the case +# in Control 2 (getting a relaxed cmp under L52), is handled correctly. 
+.p2align 6 +# CHECK: 600: int3 +.fill 2, 1, 0xcc +.L50: +.nops 122 + int3 +# CHECK: 67c: int3 +.avoid_end_align 64 +.L53: +# CHECK-NOT: nop + cmp $(.L52-.L51), %eax +# CHECK-NEXT: 67d: cmpl +.L51: + je .L50 +# CHECK-NEXT: 683: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 6fc: int3 +.L54: +.nops 9 +.L52: + cmp $(.L53-.L54), %eax +# CHECK: 707: cmp +# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 70d: int3 + +.ifdef ERR +# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression +.avoid_end_align +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression +.avoid_end_align x +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment +.avoid_end_align 0 +# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive +.avoid_end_align 64, 0 +.endif >From a813b21ba6cf623aa3ff86a0f93a04fb4d53a334 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko <m...@meta.com> Date: Tue, 16 Apr 2024 14:48:55 -0700 Subject: [PATCH 3/4] [BOLT] Fix ValidateMemRefs Summary: In ValidateMemRefs pass, when we validate references of the form `Symbol + Addend`, we should check Symbol against aliasing a jump table instead of the Symbol + Addend value. 
https://github.com/llvm/llvm-project/pull/88838 Test Plan: NFC Reviewers: aaupov, #llvm-bolt Reviewed By: aaupov Differential Revision: https://phabricator.intern.facebook.com/D56213679 Tags: accept2ship --- bolt/lib/Passes/ValidateMemRefs.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bolt/lib/Passes/ValidateMemRefs.cpp b/bolt/lib/Passes/ValidateMemRefs.cpp index f29a97c43f497..1d2c230fa7106 100644 --- a/bolt/lib/Passes/ValidateMemRefs.cpp +++ b/bolt/lib/Passes/ValidateMemRefs.cpp @@ -29,8 +29,7 @@ bool ValidateMemRefs::checkAndFixJTReference(BinaryFunction &BF, MCInst &Inst, if (!BD) return false; - const uint64_t TargetAddress = BD->getAddress() + Offset; - JumpTable *JT = BC.getJumpTableContainingAddress(TargetAddress); + JumpTable *JT = BC.getJumpTableContainingAddress(BD->getAddress()); if (!JT) return false; @@ -41,10 +40,10 @@ bool ValidateMemRefs::checkAndFixJTReference(BinaryFunction &BF, MCInst &Inst, // Accessing a jump table in another function. This is not a // legitimate jump table access, we need to replace the reference to // the jump table label with a regular rodata reference. Get a - // non-JT reference by fetching the symbol 1 byte before the JT - // label. - MCSymbol *NewSym = BC.getOrCreateGlobalSymbol(TargetAddress - 1, "DATAat"); - BC.MIB->setOperandToSymbolRef(Inst, OperandNum, NewSym, 1, &*BC.Ctx, 0); + // non-JT reference by fetching the symbol 1 byte before the JT label. 
+ MCSymbol *NewSym = BC.getOrCreateGlobalSymbol(BD->getAddress() - 1, "DATAat"); + BC.MIB->setOperandToSymbolRef(Inst, OperandNum, NewSym, Offset + 1, &*BC.Ctx, + 0); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: replaced reference @" << BF.getPrintName() << " from " << BD->getName() << " to " << NewSym->getName() << " + 1\n"); >From e8ae5406d823dad7c3dba1a66ca53d6ca06e009e Mon Sep 17 00:00:00 2001 From: shawbyoung <shawbyo...@gmail.com> Date: Mon, 10 Jun 2024 12:08:38 -0700 Subject: [PATCH 4/4] [BOLT][NFC] Add sink block to flow CFG in profile inference Test Plan: tbd Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D58380996 --- bolt/lib/Profile/StaleProfileMatching.cpp | 37 ++++++++++++++++--- .../Transforms/Utils/SampleProfileInference.h | 3 +- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 365bc5389266d..f11869e50f46d 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -309,22 +309,33 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { FlowFunction Func; // Add a special "dummy" source so that there is always a unique entry point. 
- // Because of the extra source, for all other blocks in FlowFunction it holds - // that Block.Index == BB->getIndex() + 1 FlowBlock EntryBlock; EntryBlock.Index = 0; Func.Blocks.push_back(EntryBlock); + auto BinaryBlockIsExit = [&](const BinaryBasicBlock &BB) { + if ( BB.successors().empty() ) + return true; + return false; + }; + // Create FlowBlock for every basic block in the binary function for (const BinaryBasicBlock *BB : BlockOrder) { Func.Blocks.emplace_back(); FlowBlock &Block = Func.Blocks.back(); Block.Index = Func.Blocks.size() - 1; + Block.HasSuccessors = BinaryBlockIsExit(*BB); (void)BB; assert(Block.Index == BB->getIndex() + 1 && "incorrectly assigned basic block index"); } + // Add a special "dummy" sink block so there is always a unique sink + FlowBlock SinkBlock; + SinkBlock.Index = Func.Blocks.size(); + Func.Blocks.push_back(SinkBlock); + Func.Sink = SinkBlock.Index; + // Create FlowJump for each jump between basic blocks in the binary function std::vector<uint64_t> InDegree(Func.Blocks.size(), 0); for (const BinaryBasicBlock *SrcBB : BlockOrder) { @@ -360,18 +371,29 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { // Add dummy edges to the extra sources. If there are multiple entry blocks, // add an unlikely edge from 0 to the subsequent ones assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors"); - for (uint64_t I = 1; I < Func.Blocks.size(); I++) { + for (uint64_t I = 1; I < BlockOrder.size() + 1; I++) { const BinaryBasicBlock *BB = BlockOrder[I - 1]; if (BB->isEntryPoint() || InDegree[I] == 0) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = 0; + Jump.Source = Func.Entry; Jump.Target = I; if (!BB->isEntryPoint()) Jump.IsUnlikely = true; } } + // Add dummy edges from the exit blocks to the sink block. 
+ for (uint64_t I = 1; I < BlockOrder.size() + 1; I++) { + FlowBlock &Block = Func.Blocks[I]; + if (Block.HasSuccessors) { + Func.Jumps.emplace_back(); + FlowJump &Jump = Func.Jumps.back(); + Jump.Source = I; + Jump.Target = Func.Sink; + } + } + // Create necessary metadata for the flow function for (FlowJump &Jump : Func.Jumps) { assert(Jump.Source < Func.Blocks.size()); @@ -395,7 +417,7 @@ void matchWeightsByHashes(BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { - assert(Func.Blocks.size() == BlockOrder.size() + 1); + assert(Func.Blocks.size() == BlockOrder.size() + 2); std::vector<FlowBlock *> Blocks; std::vector<BlendedBlockHash> BlendedHashes; @@ -618,7 +640,7 @@ void assignProfile(BinaryFunction &BF, FlowFunction &Func) { BinaryContext &BC = BF.getBinaryContext(); - assert(Func.Blocks.size() == BlockOrder.size() + 1); + assert(Func.Blocks.size() == BlockOrder.size() + 2); for (uint64_t I = 0; I < BlockOrder.size(); I++) { FlowBlock &Block = Func.Blocks[I + 1]; BinaryBasicBlock *BB = BlockOrder[I]; @@ -640,6 +662,9 @@ void assignProfile(BinaryFunction &BF, if (Jump->Flow == 0) continue; + // Skip the artificial sink block + if (Jump->Target == Func.Sink) + continue; BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1]; // Check if the edge corresponds to a regular jump or a landing pad if (BB->getSuccessor(SuccBB.getLabel())) { diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h index b4ea1ad840f9d..432c5e39dd35e 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -31,10 +31,10 @@ struct FlowBlock { uint64_t Flow{0}; std::vector<FlowJump *> SuccJumps; std::vector<FlowJump *> PredJumps; + bool HasSuccessors {false}; /// Check if it is the entry block in the function. 
bool isEntry() const { return PredJumps.empty(); } - /// Check if it is an exit block in the function. bool isExit() const { return SuccJumps.empty(); } }; @@ -57,6 +57,7 @@ struct FlowFunction { std::vector<FlowJump> Jumps; /// The index of the entry block. uint64_t Entry{0}; + uint64_t Sink{0}; }; /// Various thresholds and options controlling the behavior of the profile _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits