I have a wild guess at a potential fix in the meantime: diff --cc llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 3add561c66d5,3add561c66d5..bedbca9dd4b7 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@@ -3242,9 -3242,9 +3242,9 @@@ bool llvm::FoldBranchToCommonDest(Branc // as "bonus instructions", and only allow this transformation when the // number of the bonus instructions we'll need to create when cloning into // each predecessor does not exceed a certain threshold. -- unsigned NumBonusInsts = 0; bool SawVectorOp = false; const unsigned PredCount = Preds.size(); ++ InstructionCost IC; for (Instruction &I : *BB) { // Don't check the branch condition comparison itself. if (&I == Cond) @@@ -3258,16 -3258,16 +3258,13 @@@ SawVectorOp |= isVectorOp(I);
// Account for the cost of duplicating this instruction into each -- // predecessor. Ignore free instructions. -- if (!TTI || -- TTI->getUserCost(&I, CostKind) != TargetTransformInfo::TCC_Free) { -- NumBonusInsts += PredCount; -- -- // Early exits once we reach the limit. -- if (NumBonusInsts > -- BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) -- return false; -- } ++ // predecessor. ++ IC += (TTI ? TTI->getUserCost(&I, CostKind) ++ : TargetTransformInfo::TCC_Basic) * ++ PredCount; ++ // Early exits once we reach the limit. ++ if (IC > BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) ++ return false; auto IsBCSSAUse = [BB, &I](Use &U) { auto *UI = cast<Instruction>(U.getUser()); @@@ -3280,9 -3280,9 +3277,8 @@@ if (!all_of(I.uses(), IsBCSSAUse)) return false; } -- if (NumBonusInsts > -- BonusInstThreshold * -- (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) ++ if (IC > BonusInstThreshold * ++ (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) return false; // Ok, we have the budget. Perform the transformation. On Mon, Sep 27, 2021 at 10:05 AM Arthur Eubanks <aeuba...@google.com> wrote: > Could I get the source file with S_regmatch()? > > On Mon, Sep 27, 2021 at 6:07 AM Maxim Kuvyrkov <maxim.kuvyr...@linaro.org> > wrote: > >> Hi Arthur, >> >> Your patch seems to be slowing down 400.perlbench by 6% — due to slow >> down of its hot function S_regmatch() by 14%. >> >> Could you take a look if this is easily fixable, please? 
>> >> Regards, >> >> -- >> Maxim Kuvyrkov >> https://www.linaro.org >> >> > On 24 Sep 2021, at 15:07, ci_not...@linaro.org wrote: >> > >> > After llvm commit e7249e4acf3cf9438d6d9e02edecebd5b622a4dc >> > Author: Arthur Eubanks <aeuba...@google.com> >> > >> > [SimplifyCFG] Ignore free instructions when computing cost for >> folding branch to common dest >> > >> > the following benchmarks slowed down by more than 2%: >> > - 400.perlbench slowed down by 6% from 9730 to 10312 perf samples >> > - 400.perlbench:[.] S_regmatch slowed down by 14% from 3660 to 4188 >> perf samples >> > >> > Below reproducer instructions can be used to re-build both "first_bad" >> and "last_good" cross-toolchains used in this bisection. Naturally, the >> scripts will fail when triggering benchmarking jobs if you don't have >> access to Linaro TCWG CI. >> > >> > For your convenience, we have uploaded tarballs with pre-processed >> source and assembly files at: >> > - First_bad save-temps: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-e7249e4acf3cf9438d6d9e02edecebd5b622a4dc/save-temps/ >> > - Last_good save-temps: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-32a50078657dd8beead327a3478ede4e9d730432/save-temps/ >> > - Baseline save-temps: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-baseline/save-temps/ >> > >> > Configuration: >> > - Benchmark: SPEC CPU2006 >> > - Toolchain: Clang + Glibc + LLVM Linker >> > - Version: all components were built from their tip of trunk >> > - Target: aarch64-linux-gnu >> > - Compiler flags: -O3 >> > - Hardware: NVidia TX1 4x Cortex-A57 >> > >> > This benchmarking CI is work-in-progress, and we welcome feedback and >> suggestions at linaro-toolchain@lists.linaro.org . 
Our improvement >> plans include adding support for SPEC CPU2017 benchmarks and providing "perf >> report/annotate" data behind these reports. >> > >> > THIS IS THE END OF INTERESTING STUFF. BELOW ARE LINKS TO BUILDS, >> REPRODUCTION INSTRUCTIONS, AND THE RAW COMMIT. >> > >> > This commit has regressed these CI configurations: >> > - tcwg_bmk_llvm_tx1/llvm-master-aarch64-spec2k6-O3 >> > >> > First_bad build: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-e7249e4acf3cf9438d6d9e02edecebd5b622a4dc/ >> > Last_good build: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-32a50078657dd8beead327a3478ede4e9d730432/ >> > Baseline build: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/build-baseline/ >> > Even more details: >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/ >> > >> > Reproduce builds: >> > <cut> >> > mkdir investigate-llvm-e7249e4acf3cf9438d6d9e02edecebd5b622a4dc >> > cd investigate-llvm-e7249e4acf3cf9438d6d9e02edecebd5b622a4dc >> > >> > # Fetch scripts >> > git clone https://git.linaro.org/toolchain/jenkins-scripts >> > >> > # Fetch manifests and test.sh script >> > mkdir -p artifacts/manifests >> > curl -o artifacts/manifests/build-baseline.sh >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/manifests/build-baseline.sh >> --fail >> > curl -o artifacts/manifests/build-parameters.sh >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/manifests/build-parameters.sh >> --fail >> > curl -o artifacts/test.sh >> https://ci.linaro.org/job/tcwg_bmk_ci_llvm-bisect-tcwg_bmk_tx1-llvm-master-aarch64-spec2k6-O3/23/artifact/artifacts/test.sh >> --fail >> 
> chmod +x artifacts/test.sh >> > >> > # Reproduce the baseline build (build all pre-requisites) >> > ./jenkins-scripts/tcwg_bmk-build.sh @@ >> artifacts/manifests/build-baseline.sh >> > >> > # Save baseline build state (which is then restored in >> artifacts/test.sh) >> > mkdir -p ./bisect >> > rsync -a --del --delete-excluded --exclude /bisect/ --exclude >> /artifacts/ --exclude /llvm/ ./ ./bisect/baseline/ >> > >> > cd llvm >> > >> > # Reproduce first_bad build >> > git checkout --detach e7249e4acf3cf9438d6d9e02edecebd5b622a4dc >> > ../artifacts/test.sh >> > >> > # Reproduce last_good build >> > git checkout --detach 32a50078657dd8beead327a3478ede4e9d730432 >> > ../artifacts/test.sh >> > >> > cd .. >> > </cut> >> > >> > Full commit (up to 1000 lines): >> > <cut> >> > commit e7249e4acf3cf9438d6d9e02edecebd5b622a4dc >> > Author: Arthur Eubanks <aeuba...@google.com> >> > Date: Fri Aug 27 12:32:59 2021 -0700 >> > >> > [SimplifyCFG] Ignore free instructions when computing cost for >> folding branch to common dest >> > >> > When determining whether to fold branches to a common destination by >> > merging two blocks, SimplifyCFG will count the number of >> instructions to >> > be moved into the first basic block. However, there's no reason to >> count >> > free instructions like bitcasts and other similar instructions. >> > >> > This resolves missed branch foldings with -fstrict-vtable-pointers in >> > llvm-test-suite's lambda benchmark. 
>> > >> > Reviewed By: spatel >> > >> > Differential Revision: https://reviews.llvm.org/D108837 >> > --- >> > llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 17 ++++++----- >> > llvm/test/CodeGen/AArch64/csr-split.ll | 34 >> +++++++++++----------- >> > .../fold-branch-to-common-dest-free-cost.ll | 5 ++-- >> > 3 files changed, 29 insertions(+), 27 deletions(-) >> > >> > diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp >> b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp >> > index 2ff98b238de0..a3bd89e72af9 100644 >> > --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp >> > +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp >> > @@ -3258,13 +3258,16 @@ bool llvm::FoldBranchToCommonDest(BranchInst >> *BI, DomTreeUpdater *DTU, >> > SawVectorOp |= isVectorOp(I); >> > >> > // Account for the cost of duplicating this instruction into each >> > - // predecessor. >> > - NumBonusInsts += PredCount; >> > - >> > - // Early exits once we reach the limit. >> > - if (NumBonusInsts > >> > - BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) >> > - return false; >> > + // predecessor. Ignore free instructions. >> > + if (!TTI || >> > + TTI->getUserCost(&I, CostKind) != >> TargetTransformInfo::TCC_Free) { >> > + NumBonusInsts += PredCount; >> > + >> > + // Early exits once we reach the limit. 
>> > + if (NumBonusInsts > >> > + BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) >> > + return false; >> > + } >> > >> > auto IsBCSSAUse = [BB, &I](Use &U) { >> > auto *UI = cast<Instruction>(U.getUser()); >> > diff --git a/llvm/test/CodeGen/AArch64/csr-split.ll >> b/llvm/test/CodeGen/AArch64/csr-split.ll >> > index 1bee7f05acec..de85b4313433 100644 >> > --- a/llvm/test/CodeGen/AArch64/csr-split.ll >> > +++ b/llvm/test/CodeGen/AArch64/csr-split.ll >> > @@ -82,22 +82,22 @@ define dso_local signext i32 @test2(i32* %p1) >> local_unnamed_addr { >> > ; CHECK-NEXT: .cfi_def_cfa_offset 16 >> > ; CHECK-NEXT: .cfi_offset w19, -8 >> > ; CHECK-NEXT: .cfi_offset w30, -16 >> > -; CHECK-NEXT: cbz x0, .LBB1_2 >> > -; CHECK-NEXT: // %bb.1: // %if.end >> > +; CHECK-NEXT: cbz x0, .LBB1_3 >> > +; CHECK-NEXT: // %bb.1: // %entry >> > ; CHECK-NEXT: adrp x8, a >> > ; CHECK-NEXT: ldrsw x8, [x8, :lo12:a] >> > ; CHECK-NEXT: mov x19, x0 >> > ; CHECK-NEXT: cmp x8, x0 >> > -; CHECK-NEXT: b.eq .LBB1_3 >> > -; CHECK-NEXT: .LBB1_2: // %return >> > -; CHECK-NEXT: mov w0, wzr >> > -; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload >> > -; CHECK-NEXT: ret >> > -; CHECK-NEXT: .LBB1_3: // %if.then2 >> > +; CHECK-NEXT: b.ne .LBB1_3 >> > +; CHECK-NEXT: // %bb.2: // %if.then2 >> > ; CHECK-NEXT: bl callVoid >> > ; CHECK-NEXT: mov x0, x19 >> > ; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload >> > ; CHECK-NEXT: b callNonVoid >> > +; CHECK-NEXT: .LBB1_3: // %return >> > +; CHECK-NEXT: mov w0, wzr >> > +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload >> > +; CHECK-NEXT: ret >> > ; >> > ; CHECK-APPLE-LABEL: test2: >> > ; CHECK-APPLE: ; %bb.0: ; %entry >> > @@ -108,26 +108,26 @@ define dso_local signext i32 @test2(i32* %p1) >> local_unnamed_addr { >> > ; CHECK-APPLE-NEXT: .cfi_offset w29, -16 >> > ; CHECK-APPLE-NEXT: .cfi_offset w19, -24 >> > ; CHECK-APPLE-NEXT: .cfi_offset w20, -32 >> > -; CHECK-APPLE-NEXT: cbz x0, LBB1_2 >> > -; CHECK-APPLE-NEXT: ; 
%bb.1: ; %if.end >> > +; CHECK-APPLE-NEXT: cbz x0, LBB1_3 >> > +; CHECK-APPLE-NEXT: ; %bb.1: ; %entry >> > ; CHECK-APPLE-NEXT: Lloh2: >> > ; CHECK-APPLE-NEXT: adrp x8, _a@PAGE >> > ; CHECK-APPLE-NEXT: Lloh3: >> > ; CHECK-APPLE-NEXT: ldrsw x8, [x8, _a@PAGEOFF] >> > ; CHECK-APPLE-NEXT: mov x19, x0 >> > ; CHECK-APPLE-NEXT: cmp x8, x0 >> > -; CHECK-APPLE-NEXT: b.eq LBB1_3 >> > -; CHECK-APPLE-NEXT: LBB1_2: ; %return >> > -; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload >> > -; CHECK-APPLE-NEXT: mov w0, wzr >> > -; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload >> > -; CHECK-APPLE-NEXT: ret >> > -; CHECK-APPLE-NEXT: LBB1_3: ; %if.then2 >> > +; CHECK-APPLE-NEXT: b.ne LBB1_3 >> > +; CHECK-APPLE-NEXT: ; %bb.2: ; %if.then2 >> > ; CHECK-APPLE-NEXT: bl _callVoid >> > ; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload >> > ; CHECK-APPLE-NEXT: mov x0, x19 >> > ; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload >> > ; CHECK-APPLE-NEXT: b _callNonVoid >> > +; CHECK-APPLE-NEXT: LBB1_3: ; %return >> > +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload >> > +; CHECK-APPLE-NEXT: mov w0, wzr >> > +; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload >> > +; CHECK-APPLE-NEXT: ret >> > ; CHECK-APPLE-NEXT: .loh AdrpLdr Lloh2, Lloh3 >> > entry: >> > %tobool = icmp eq i32* %p1, null >> > diff --git >> a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll >> b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll >> > index ace2a5ed35ca..27df5ec44582 100644 >> > --- >> a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll >> > +++ >> b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll >> > @@ -8,12 +8,11 @@ declare void @g2() >> > >> > define void @f(i8* %a, i8* %b, i1 %c, i1 %d, i1 %e) { >> > ; CHECK-LABEL: @f( >> > -; CHECK-NEXT: br i1 [[C:%.*]], label [[L1:%.*]], label [[L3:%.*]] >> > -; CHECK: l1: >> 
> ; CHECK-NEXT: [[A1:%.*]] = call i8* >> @llvm.strip.invariant.group.p0i8(i8* [[A:%.*]]) >> > ; CHECK-NEXT: [[B1:%.*]] = call i8* >> @llvm.strip.invariant.group.p0i8(i8* [[B:%.*]]) >> > ; CHECK-NEXT: [[I:%.*]] = icmp eq i8* [[A1]], [[B1]] >> > -; CHECK-NEXT: br i1 [[I]], label [[L2:%.*]], label [[L3]] >> > +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C:%.*]], i1 [[I]], i1 >> false >> > +; CHECK-NEXT: br i1 [[OR_COND]], label [[L2:%.*]], label [[L3:%.*]] >> > ; CHECK: l2: >> > ; CHECK-NEXT: call void @g1() >> > ; CHECK-NEXT: br label [[RET:%.*]] >> > </cut> >> >> _______________________________________________ linaro-toolchain mailing list linaro-toolchain@lists.linaro.org https://lists.linaro.org/mailman/listinfo/linaro-toolchain