llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-bolt Author: Amir Ayupov (aaupov) <details> <summary>Changes</summary> Intel's Architectural LBR supports capturing branch type information as part of the LBR stack (SDM Vol 3B, part 2, October 2024): ``` 20.1.3.2 Branch Types The IA32_LBR_x_INFO.BR_TYPE and IA32_LER_INFO.BR_TYPE fields encode the branch types as shown in Table 20-3. Table 20-3. IA32_LBR_x_INFO and IA32_LER_INFO Branch Type Encodings Encoding | Branch Type 0000B | COND 0001B | NEAR_IND_JMP 0010B | NEAR_REL_JMP 0011B | NEAR_IND_CALL 0100B | NEAR_REL_CALL 0101B | NEAR_RET 011xB | Reserved 1xxxB | OTHER_BRANCH For a list of branch operations that fall into the categories above, see Table 20-2. Table 20-2. Branch Type Filtering Details Branch Type | Operations Recorded COND | Jcc, J*CXZ, and LOOP* NEAR_IND_JMP | JMP r/m* NEAR_REL_JMP | JMP rel* NEAR_IND_CALL | CALL r/m* NEAR_REL_CALL | CALL rel* (excluding CALLs to the next sequential IP) NEAR_RET | RET (0C3H) OTHER_BRANCH | JMP/CALL ptr*, JMP/CALL m*, RET (0C8H), SYS*, interrupts, exceptions (other than debug exceptions), IRET, INT3, INTn, INTO, TSX Abort, EENTER, ERESUME, EEXIT, AEX, INIT, SIPI, RSM ``` The Linux kernel can preserve the branch type when `save_type` is enabled, even if the CPU does not support Architectural LBR: https://github.com/torvalds/linux/blob/f09079bd04a924c72d555cd97942d5f8d7eca98c/tools/perf/Documentation/perf-record.txt#L457-L460 > - save_type: save branch type during sampling in case binary is not available later. For the platforms with Intel Arch LBR support (12th-Gen+ client or 4th-Gen Xeon+ server), the save branch type is unconditionally enabled when the taken branch stack sampling is enabled. This information is needed to disambiguate external returns (from DSO/JIT) to an entry point or a landing pad, when BOLT can't disassemble the branch source. This patch adds a new pre-aggregated trace type (R). 
Test Plan: updated callcont-fallthru.s --- Full diff: https://github.com/llvm/llvm-project/pull/143296.diff 4 Files Affected: - (modified) bolt/include/bolt/Profile/DataAggregator.h (+6-1) - (modified) bolt/lib/Profile/DataAggregator.cpp (+9-2) - (modified) bolt/test/X86/callcont-fallthru.s (+13) - (modified) bolt/test/link_fdata.py (+2-2) ``````````diff diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 96969cf53baca..ae66c58e127cd 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -109,6 +109,7 @@ class DataAggregator : public DataReader { static constexpr const uint64_t BR_ONLY = -1ULL; static constexpr const uint64_t FT_ONLY = -1ULL; static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL; + static constexpr const uint64_t BR_EXTERNAL_RETURN = -3ULL; uint64_t Branch; uint64_t From; @@ -388,7 +389,7 @@ class DataAggregator : public DataReader { /// File format syntax: /// E <event> /// S <start> <count> - /// T <start> <end> <ft_end> <count> + /// [TR] <start> <end> <ft_end> <count> /// B <start> <end> <count> <mispred_count> /// [Ff] <start> <end> <count> /// @@ -403,6 +404,7 @@ class DataAggregator : public DataReader { /// jump to the block /// T - an aggregated trace: branch from <start> to <end> with a fall-through /// to <ft_end> + /// R - an aggregated trace originating at a return /// /// <id> - build id of the object containing the address. We can skip it for /// the main binary and use "X" for an unknown object. 
This will save some @@ -532,6 +534,9 @@ inline raw_ostream &operator<<(raw_ostream &OS, case DataAggregator::Trace::FT_ONLY: case DataAggregator::Trace::FT_EXTERNAL_ORIGIN: break; + case DataAggregator::Trace::BR_EXTERNAL_RETURN: + OS << "0 -> "; + break; default: OS << Twine::utohexstr(T.Branch) << " -> "; } diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 11d282e98413b..c28dd6e57f8e4 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -1194,6 +1194,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { INVALID = 0, EVENT_NAME, // E TRACE, // T + RETURN, // R SAMPLE, // S BRANCH, // B FT, // F @@ -1224,6 +1225,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Type = StringSwitch<AggregatedLBREntry>(Str) .Case("T", TRACE) + .Case("R", RETURN) .Case("S", SAMPLE) .Case("E", EVENT_NAME) .Case("B", BRANCH) @@ -1237,7 +1239,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { } using SSI = StringSwitch<int>; - AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2); + AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2); CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1); } @@ -1295,8 +1297,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Addr[0] = Location(Type == FT ? 
Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN); } - if (Type == BRANCH) { + if (Type == BRANCH) Addr[2] = Location(Trace::BR_ONLY); + + if (Type == RETURN) { + if (!Addr[0]->Offset) + Addr[0]->Offset = Trace::BR_EXTERNAL_RETURN; + Returns.emplace(Addr[0]->Offset); } Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset}; diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index c2ef024db9475..63142903c80d2 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -10,6 +10,8 @@ # RUN: link_fdata %s %t %t.pa-ret PREAGG-RET # Trace from an external location to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT +# Return trace to a landing pad/entry point call continuation +# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT # RUN: llvm-strip --strip-unneeded %t -o %t.strip @@ -38,6 +40,15 @@ # RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \ # RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP +## Check pre-aggregated return traces from external location attach call +## continuation fallthrough count to secondary entry point (unstripped) +# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH +## Check pre-aggregated return traces from external location attach call +## continuation fallthrough count to landing pad (stripped, landing pad) +# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH + ## Check pre-aggregated traces don't report zero-sized PLT fall-through as ## invalid trace # RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \ @@ -92,6 +103,8 @@ Ltmp4_br: # PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1 ## Target is a secondary entry point (unstripped) or a landing pad (stripped) # PREAGG-EXT: T 
X:0 #Ltmp3# #Ltmp3_br# 1 +## Pre-aggregated return trace +# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1 # CHECK-ATTACH: callq foo # CHECK-ATTACH-NEXT: count: 1 diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py index 5a9752068bb9f..cb6b3c7baaab5 100755 --- a/bolt/test/link_fdata.py +++ b/bolt/test/link_fdata.py @@ -36,9 +36,9 @@ fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)") # Pre-aggregated profile: -# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>] +# {T|R|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>] # <loc>: [<id>:]<offset> -preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)") +preagg_pat = re.compile(r"(?P<type>[TRSBFf]) (?P<offsets_count>.*)") # No-LBR profile: # <is symbol?> <closest elf symbol or DSO name> <relative address> <count> `````````` </details> https://github.com/llvm/llvm-project/pull/143296 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits