On Wed, Nov 11, 2015 at 11:27:16AM +1100, Benjamin Herrenschmidt wrote: > On ppc64 especially, we flush the tlb on any slbie or tlbie instruction. > > However, those instructions often come in bursts of 3 or more (context > switch will favor a series of slbie's for example to an slbia if the > SLB has less than a certain number of entries in it, and tlbie's can > happen in a series, with PAPR, H_BULK_REMOVE can remove up to 4 entries > at a time. > > Doing a tlb_flush() each time is a waste of time. We end up doing a memset > of the whole TLB, reloading it for the next instruction, memset'ing again, > etc... > > Those instructions don't have to take effect immediately. For slbie, they > can wait for the next context synchronizing event. For tlbie, the next > tlbsync. > > This implements batching by keeping a flag that indicates that we have a > TLB in need of flushing. We check it on interrupts, rfi's, isync's and > tlbsync and flush the TLB if needed. > > This reduces the number of tlb_flush() on a boot to a ubuntu installer > first dialog screen from roughly 360K down to 36K. 
> > Signed-off-by: Benjamin Herrenschmidt <[email protected]> > --- > hw/ppc/spapr_hcall.c | 12 +++++++++--- > target-ppc/cpu.h | 2 ++ > target-ppc/excp_helper.c | 9 +++++++++ > target-ppc/helper.h | 1 + > target-ppc/helper_regs.h | 13 +++++++++++++ > target-ppc/mmu-hash64.c | 12 +++--------- > target-ppc/mmu_helper.c | 9 ++++++++- > target-ppc/translate.c | 39 ++++++++++++++++++++++++++++++++++++--- > 8 files changed, 81 insertions(+), 16 deletions(-) > > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index cebceea..7e2cb4b 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -220,6 +220,7 @@ static target_ulong h_remove(PowerPCCPU *cpu, > sPAPRMachineState *spapr, > > switch (ret) { > case REMOVE_SUCCESS: > + check_tlb_flush(env); > return H_SUCCESS; > > case REMOVE_NOT_FOUND: > @@ -257,6 +258,7 @@ static target_ulong h_bulk_remove(PowerPCCPU *cpu, > sPAPRMachineState *spapr, > target_ulong opcode, target_ulong *args) > { > CPUPPCState *env = &cpu->env; > + target_ulong rc = H_SUCCESS; > int i; > > for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { > @@ -290,14 +292,18 @@ static target_ulong h_bulk_remove(PowerPCCPU *cpu, > sPAPRMachineState *spapr, > break; > > case REMOVE_PARM: > - return H_PARAMETER; > + rc = H_PARAMETER; > + goto exit; > > case REMOVE_HW: > - return H_HARDWARE; > + rc = H_HARDWARE; > + goto exit; > } > } > + exit: > + check_tlb_flush(env); > > - return H_SUCCESS; > + return rc; > } > > static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr, > diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h > index aaa7117..e6c43f9 100644 > --- a/target-ppc/cpu.h > +++ b/target-ppc/cpu.h > @@ -1013,6 +1013,8 @@ struct CPUPPCState { > /* PowerPC 64 SLB area */ > ppc_slb_t slb[MAX_SLB_ENTRIES]; > int32_t slb_nr; > + /* tcg TLB needs flush (deferred slb inval instruction typically) */ > + uint32_t tlb_need_flush; > #endif > /* segment registers */ > hwaddr htab_base; > diff --git a/target-ppc/excp_helper.c 
b/target-ppc/excp_helper.c > index 3e39098..c1d6605 100644 > --- a/target-ppc/excp_helper.c > +++ b/target-ppc/excp_helper.c > @@ -671,6 +671,11 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int > excp_model, int excp) > /* Reset exception state */ > cs->exception_index = POWERPC_EXCP_NONE; > env->error_code = 0; > + > + /* Any interrupt is context synchronizing, check if TCG TLB > + * needs a delayed flush on ppc64 > + */ > + check_tlb_flush(env); > } > > void ppc_cpu_do_interrupt(CPUState *cs) > @@ -692,6 +697,7 @@ static void ppc_hw_interrupt(CPUPPCState *env) > __func__, env, env->pending_interrupts, > cs->interrupt_request, (int)msr_me, (int)msr_ee); > #endif > + > /* External reset */ > if (env->pending_interrupts & (1 << PPC_INTERRUPT_RESET)) { > env->pending_interrupts &= ~(1 << PPC_INTERRUPT_RESET); > @@ -896,6 +902,9 @@ static inline void do_rfi(CPUPPCState *env, target_ulong > nip, target_ulong msr, > * as rfi is always the last insn of a TB > */ > cs->interrupt_request |= CPU_INTERRUPT_EXITTB; > + > + /* Context synchronizing: check if TCG TLB needs flush */ > + check_tlb_flush(env); > } > > void helper_rfi(CPUPPCState *env) > diff --git a/target-ppc/helper.h b/target-ppc/helper.h > index 869be15..ff2d50b 100644 > --- a/target-ppc/helper.h > +++ b/target-ppc/helper.h > @@ -16,6 +16,7 @@ DEF_HELPER_1(rfmci, void, env) > DEF_HELPER_1(rfid, void, env) > DEF_HELPER_1(hrfid, void, env) > #endif > +DEF_HELPER_1(check_tlb_flush, void, env) > #endif > > DEF_HELPER_3(lmw, void, env, tl, i32) > diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h > index f7edd5b..57da931 100644 > --- a/target-ppc/helper_regs.h > +++ b/target-ppc/helper_regs.h > @@ -151,4 +151,17 @@ static inline int hreg_store_msr(CPUPPCState *env, > target_ulong value, > return excp; > } > > +#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64) > +static inline void check_tlb_flush(CPUPPCState *env) > +{ > + CPUState *cs = CPU(ppc_env_get_cpu(env)); > + if 
(env->tlb_need_flush) { > + env->tlb_need_flush = 0; > + tlb_flush(cs, 1); > + } > +} > +#else > +static inline void check_tlb_flush(CPUPPCState *env) { } > +#endif > + > #endif /* !defined(__HELPER_REGS_H__) */ > diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c > index 7df6ede..71e1d14 100644 > --- a/target-ppc/mmu-hash64.c > +++ b/target-ppc/mmu-hash64.c > @@ -97,10 +97,8 @@ void dump_slb(FILE *f, fprintf_function cpu_fprintf, > CPUPPCState *env) > > void helper_slbia(CPUPPCState *env) > { > - PowerPCCPU *cpu = ppc_env_get_cpu(env); > - int n, do_invalidate; > + int n; > > - do_invalidate = 0; > /* XXX: Warning: slbia never invalidates the first segment */ > for (n = 1; n < env->slb_nr; n++) { > ppc_slb_t *slb = &env->slb[n]; > @@ -111,17 +109,13 @@ void helper_slbia(CPUPPCState *env) > * and we still don't have a tlb_flush_mask(env, n, mask) > * in QEMU, we just invalidate all TLBs > */ > - do_invalidate = 1; > + env->tlb_need_flush = true; > } > } > - if (do_invalidate) { > - tlb_flush(CPU(cpu), 1); > - } > } > > void helper_slbie(CPUPPCState *env, target_ulong addr) > { > - PowerPCCPU *cpu = ppc_env_get_cpu(env); > ppc_slb_t *slb; > > slb = slb_lookup(env, addr); > @@ -136,7 +130,7 @@ void helper_slbie(CPUPPCState *env, target_ulong addr) > * and we still don't have a tlb_flush_mask(env, n, mask) > * in QEMU, we just invalidate all TLBs > */ > - tlb_flush(CPU(cpu), 1); > + env->tlb_need_flush = true; > } > } > > diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c > index e52d0e5..54bc5d1 100644 > --- a/target-ppc/mmu_helper.c > +++ b/target-ppc/mmu_helper.c > @@ -23,6 +23,7 @@ > #include "mmu-hash64.h" > #include "mmu-hash32.h" > #include "exec/cpu_ldst.h" > +#include "helper_regs.h" > > //#define DEBUG_MMU > //#define DEBUG_BATS > @@ -1940,6 +1941,7 @@ void ppc_tlb_invalidate_all(CPUPPCState *env) > case POWERPC_MMU_2_03: > case POWERPC_MMU_2_06: > case POWERPC_MMU_2_07: > + env->tlb_need_flush = 0; > #endif /* defined(TARGET_PPC64) 
*/ > tlb_flush(CPU(cpu), 1); > break;
Any particular reason you're leaving this one as an immediate rather
than a deferred flush?
> @@ -2019,7 +2021,7 @@ void ppc_tlb_invalidate_one(CPUPPCState *env,
> target_ulong addr)
> * and we still don't have a tlb_flush_mask(env, n, mask) in
> QEMU,
> * we just invalidate all TLBs
> */
> - tlb_flush(CPU(cpu), 1);
> + env->tlb_need_flush = 1;
> break;
> #endif /* defined(TARGET_PPC64) */
> default:
> @@ -2904,6 +2906,11 @@ void helper_booke206_tlbflush(CPUPPCState *env,
> target_ulong type)
> }
>
>
> +void helper_check_tlb_flush(CPUPPCState *env)
> +{
> + check_tlb_flush(env);
> +}
> +
>
> /*****************************************************************************/
>
> /* try to fill the TLB and return an exception if error. If retaddr is
> diff --git a/target-ppc/translate.c b/target-ppc/translate.c
> index 6d9f252..e18d204 100644
> --- a/target-ppc/translate.c
> +++ b/target-ppc/translate.c
> @@ -3299,9 +3299,32 @@ static void gen_eieio(DisasContext *ctx)
> {
> }
>
> +#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64)
> +static inline void gen_check_tlb_flush(DisasContext *ctx)
> +{
> + TCGv_i32 t = tcg_temp_new_i32();
> + TCGLabel *l = gen_new_label();
> +
> + tcg_gen_ld_i32(t, cpu_env, offsetof(CPUPPCState, tlb_need_flush));
> + tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, l);
> + gen_helper_check_tlb_flush(cpu_env);
> + gen_set_label(l);
> + tcg_temp_free_i32(t);
> +}
> +#else
> +static inline void gen_check_tlb_flush(DisasContext *ctx) { }
> +#endif
> +
> /* isync */
> static void gen_isync(DisasContext *ctx)
> {
> + /*
> + * We need to check for a pending TLB flush. This can only happen in
> + * kernel mode however so check MSR_PR
> + */
> + if (!ctx->pr) {
> + gen_check_tlb_flush(ctx);
> + }
> gen_stop_exception(ctx);
> }
>
> @@ -3458,6 +3481,15 @@ STCX(stqcx_, 16);
> /* sync */
> static void gen_sync(DisasContext *ctx)
> {
> + uint32_t l = (ctx->opcode >> 21) & 3;
> +
> + /*
> + * For l == 2, it's a ptesync, We need to check for a pending TLB flush.
> + * This can only happen in kernel mode however so check MSR_PR as well.
> + */
> + if (l == 2 && !ctx->pr) {
> + gen_check_tlb_flush(ctx);
> + }
> }
>
> /* wait */
> @@ -4851,10 +4883,11 @@ static void gen_tlbsync(DisasContext *ctx)
> gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
> return;
> }
> - /* This has no effect: it should ensure that all previous
> - * tlbie have completed
> + /* tlbsync is a nop for server, ptesync handles delayed tlb flush,
> + * embedded however needs to deal with tlbsync. We don't try to be
> + * fancy and swallow the overhead of checking for both.
> */
> - gen_stop_exception(ctx);
> + gen_check_tlb_flush(ctx);
> #endif
> }
>
Should you be clearing the pending flush flag in cpu_reset()?
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
signature.asc
Description: PGP signature
