On Tue, 4 Dec 2018, Julien Grall wrote:
> Set/Way operations are used to perform maintenance on a given cache.
> At the moment, Set/Way operations are not trapped and therefore a guest
> OS will directly act on the local cache. However, a vCPU may migrate to
> another pCPU in the middle of the operation. This will result in caches
> with stale data (Set/Way operations are not propagated), potentially
> causing a crash. This may be the cause of the heisenbug noticed in
> Osstest [1].
>
> Furthermore, Set/Way operations are not available on system caches.
> This means that an OS, such as 32-bit Linux, relying on those
> operations to fully clean the caches before disabling the MMU may break
> because data may still sit in system caches and not in RAM.
>
> For more details about Set/Way, see the talk "The Art of Virtualizing
> Cache Maintenance" given at Xen Summit 2018 [2].
>
> In the context of Xen, we need to trap Set/Way operations and emulate
> them. Per the Arm Arm (B1.14.4 in DDI 0406C.c), Set/Way operations are
> difficult to virtualize, so we can assume that a guest OS using them
> will suffer the consequences (i.e. slowness) until developers remove
> all usage of Set/Way.
>
> As the software is not allowed to infer the Set/Way to Physical Address
> mapping, Xen will need to go through the guest P2M and clean &
> invalidate all the entries mapped.
>
> Because Set/Way operations happen in batches (a loop over all the
> sets/ways of a cache), Xen would need to go through the P2M for every
> instruction. This is quite expensive and would severely impact the
> guest OS. The implementation re-uses the KVM policy to limit the number
> of flushes:
> - If we trap a Set/Way operation, we enable VM trapping (i.e.
>   HCR_EL2.TVM) to detect the caches being turned on/off, and do a full
>   clean.
> - We clean the caches when they are turned on and off.
> - Once the caches are enabled, we stop trapping VM instructions.
>
> [1] https://lists.xenproject.org/archives/html/xen-devel/2017-09/msg03191.html
> [2] https://fr.slideshare.net/xen_com_mgr/virtualizing-cache
>
> Signed-off-by: Julien Grall <[email protected]>
>
> ---
> Changes in v2:
> - Fix emulation for Set/Way cache flush arm64 sysreg
> - Add support for preemption
> - Check cache status on every VM traps in Arm64
> - Remove spurious change
> ---
> xen/arch/arm/arm64/vsysreg.c | 17 ++++++++
> xen/arch/arm/p2m.c | 92 ++++++++++++++++++++++++++++++++++++++++++++
> xen/arch/arm/traps.c | 25 +++++++++++-
> xen/arch/arm/vcpreg.c | 22 +++++++++++
> xen/include/asm-arm/domain.h | 8 ++++
> xen/include/asm-arm/p2m.h | 20 ++++++++++
> 6 files changed, 183 insertions(+), 1 deletion(-)
>
> diff --git a/xen/arch/arm/arm64/vsysreg.c b/xen/arch/arm/arm64/vsysreg.c
> index 16ac9c344a..8a85507d9d 100644
> --- a/xen/arch/arm/arm64/vsysreg.c
> +++ b/xen/arch/arm/arm64/vsysreg.c
> @@ -34,9 +34,14 @@
> static bool vreg_emulate_##reg(struct cpu_user_regs *regs, \
> uint64_t *r, bool read) \
> { \
> + struct vcpu *v = current; \
> + bool cache_enabled = vcpu_has_cache_enabled(v); \
> + \
> GUEST_BUG_ON(read); \
> WRITE_SYSREG64(*r, reg); \
> \
> + p2m_toggle_cache(v, cache_enabled); \
> + \
> return true; \
> }
>
> @@ -85,6 +90,18 @@ void do_sysreg(struct cpu_user_regs *regs,
> break;
>
> /*
> + * HCR_EL2.TSW
> + *
> + * ARMv8 (DDI 0487B.b): Table D1-42
> + */
> + case HSR_SYSREG_DCISW:
> + case HSR_SYSREG_DCCSW:
> + case HSR_SYSREG_DCCISW:
> + if ( !hsr.sysreg.read )
> + p2m_set_way_flush(current);
> + break;
> +
> + /*
> * HCR_EL2.TVM
> *
> * ARMv8 (DDI 0487D.a): Table D1-38
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index ca9f0d9ebe..8ee6ff7bd7 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -3,6 +3,7 @@
> #include <xen/iocap.h>
> #include <xen/lib.h>
> #include <xen/sched.h>
> +#include <xen/softirq.h>
>
> #include <asm/event.h>
> #include <asm/flushtlb.h>
> @@ -1620,6 +1621,97 @@ int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
> return rc;
> }
>
> +/*
> + * Clean & invalidate RAM associated to the guest vCPU.
> + *
> + * The function can only work with the current vCPU and should be called
> + * with IRQ enabled as the vCPU could get preempted.
> + */
> +void p2m_flush_vm(struct vcpu *v)
> +{
> + int rc;
> + gfn_t start = _gfn(0);
> +
> + ASSERT(v == current);
> + ASSERT(local_irq_is_enabled());
> + ASSERT(v->arch.need_flush_to_ram);
> +
> + do
> + {
> + rc = p2m_cache_flush_range(v->domain, &start, _gfn(ULONG_MAX));
> + if ( rc == -ERESTART )
> + do_softirq();
Shouldn't we store somewhere how far we got? Specifically, the value of
`start' when rc == -ERESTART? Maybe we could change need_flush_to_ram to
a gfn_t and use it to store `start'?
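
Rough sketch of what I have in mind (completely untested, just to
illustrate the idea; the `flush_start' field name and the use of
INVALID_GFN as the "no flush pending" sentinel are placeholders):

    /* In struct arch_vcpu, replacing the bool: */
    gfn_t flush_start;   /* INVALID_GFN means no flush pending */

    void p2m_flush_vm(struct vcpu *v)
    {
        int rc;
        gfn_t start = v->arch.flush_start;

        ASSERT(v == current);
        ASSERT(local_irq_is_enabled());
        ASSERT(!gfn_eq(start, INVALID_GFN));

        do
        {
            rc = p2m_cache_flush_range(v->domain, &start, _gfn(ULONG_MAX));
            if ( rc == -ERESTART )
            {
                /*
                 * Record how far we got before giving softirqs a chance
                 * to run (and possibly preempt us).
                 */
                v->arch.flush_start = start;
                do_softirq();
            }
        } while ( rc == -ERESTART );

        if ( rc != 0 )
            gprintk(XENLOG_WARNING,
                    "P2M has not been correctly cleaned (rc = %d)\n", rc);

        v->arch.flush_start = INVALID_GFN;
    }

    /*
     * p2m_set_way_flush()/p2m_toggle_cache() would then set flush_start
     * to _gfn(0), and check_for_vcpu_work() would compare it against
     * INVALID_GFN instead of reading the bool.
     */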
> + } while ( rc == -ERESTART );
> +
> + if ( rc != 0 )
> + gprintk(XENLOG_WARNING,
> + "P2M has not been correctly cleaned (rc = %d)\n",
> + rc);
> +
> + v->arch.need_flush_to_ram = false;
> +}
> +
> +/*
> + * See note at ARMv7 ARM B1.14.4 (DDI 0406C.c) (TL;DR: S/W ops are not
> + * easily virtualized).
> + *
> + * Main problems:
> + * - S/W ops are local to a CPU (not broadcast)
> + * - We have line migration behind our back (speculation)
> + * - System caches don't support S/W at all (damn!)
> + *
> + * In the face of the above, the best we can do is to try and convert
> + * S/W ops to VA ops. Because the guest is not allowed to infer the S/W
> + * to PA mapping, it can only use S/W to nuke the whole cache, which is
> + * rather a good thing for us.
> + *
> + * Also, it is only used when turning caches on/off ("The expected
> + * usage of the cache maintenance instructions that operate by set/way
> + * is associated with the powerdown and powerup of caches, if this is
> + * required by the implementation.").
> + *
> + * We use the following policy:
> + * - If we trap a S/W operation, we enable VM trapping to detect
> + * caches being turned on/off, and do a full clean.
> + *
> + * - We flush the caches on both caches being turned on and off.
> + *
> + * - Once the caches are enabled, we stop trapping VM ops.
> + */
> +void p2m_set_way_flush(struct vcpu *v)
> +{
> + /* This function can only work with the current vCPU. */
> + ASSERT(v == current);
> +
> + if ( !(v->arch.hcr_el2 & HCR_TVM) )
> + {
> + v->arch.need_flush_to_ram = true;
> + vcpu_hcr_set_flags(v, HCR_TVM);
> + }
> +}
> +
> +void p2m_toggle_cache(struct vcpu *v, bool was_enabled)
> +{
> + bool now_enabled = vcpu_has_cache_enabled(v);
> +
> + /* This function can only work with the current vCPU. */
> + ASSERT(v == current);
> +
> + /*
> + * If switching the MMU+caches on, need to invalidate the caches.
> + * If switching it off, need to clean the caches.
> + * Clean + invalidate does the trick always.
> + */
> + if ( was_enabled != now_enabled )
> + {
> + v->arch.need_flush_to_ram = true;
> + }
> +
> + /* Caches are now on, stop trapping VM ops (until a S/W op) */
> + if ( now_enabled )
> + vcpu_hcr_clear_flags(v, HCR_TVM);
> +}
> +
> mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
> {
> return p2m_lookup(d, gfn, NULL);
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index 02665cc7b4..221c762ada 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -97,7 +97,7 @@ register_t get_default_hcr_flags(void)
> {
> return (HCR_PTW|HCR_BSU_INNER|HCR_AMO|HCR_IMO|HCR_FMO|HCR_VM|
> (vwfi != NATIVE ? (HCR_TWI|HCR_TWE) : 0) |
> - HCR_TSC|HCR_TAC|HCR_SWIO|HCR_TIDCP|HCR_FB);
> + HCR_TSC|HCR_TAC|HCR_SWIO|HCR_TIDCP|HCR_FB|HCR_TSW);
> }
>
> static enum {
> @@ -2258,10 +2258,33 @@ static void check_for_pcpu_work(void)
> }
> }
>
> +/*
> + * Process pending work for the vCPU. Any call should be fast or
> + * implement preemption.
> + */
> +static void check_for_vcpu_work(void)
> +{
> + struct vcpu *v = current;
> +
> + if ( likely(!v->arch.need_flush_to_ram) )
> + return;
> +
> + /*
> + * Give a chance for the pCPU to process work before handling the vCPU
> + * pending work.
> + */
> + check_for_pcpu_work();
This is a bit awkward: basically we need to call check_for_pcpu_work()
before check_for_vcpu_work(), and again after it. This is basically what
we are doing:

  check_for_pcpu_work()
  check_for_vcpu_work()
  check_for_pcpu_work()

Sounds like we should come up with something better, but I don't have a
concrete suggestion :-)
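
Just to spell out the resulting flow when need_flush_to_ram is set (a
paraphrase of the patch, with the IRQ enable/disable elided):

    /* leave_hypervisor_tail(), expanded: */
    check_for_pcpu_work();   /* called from inside check_for_vcpu_work() */
    p2m_flush_vm(current);   /* ditto */
    check_for_pcpu_work();   /* the existing call in leave_hypervisor_tail() */

    vgic_sync_to_lrs();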
> + local_irq_enable();
> + p2m_flush_vm(v);
> + local_irq_disable();
> +}
> +
> void leave_hypervisor_tail(void)
> {
> local_irq_disable();
>
> + check_for_vcpu_work();
> check_for_pcpu_work();
>
> vgic_sync_to_lrs();
> diff --git a/xen/arch/arm/vcpreg.c b/xen/arch/arm/vcpreg.c
> index 550c25ec3f..cdc91cdf5b 100644
> --- a/xen/arch/arm/vcpreg.c
> +++ b/xen/arch/arm/vcpreg.c
> @@ -51,9 +51,14 @@
> #define TVM_REG(sz, func, reg...) \
> static bool func(struct cpu_user_regs *regs, uint##sz##_t *r, bool read) \
> { \
> + struct vcpu *v = current; \
> + bool cache_enabled = vcpu_has_cache_enabled(v); \
> + \
> GUEST_BUG_ON(read); \
> WRITE_SYSREG##sz(*r, reg); \
> \
> + p2m_toggle_cache(v, cache_enabled); \
> + \
> return true; \
> }
>
> @@ -71,6 +76,8 @@ static bool func(struct cpu_user_regs *regs, uint##sz##_t *r, bool read) \
> static bool vreg_emulate_##xreg(struct cpu_user_regs *regs, uint32_t *r, \
> bool read, bool hi) \
> { \
> + struct vcpu *v = current; \
> + bool cache_enabled = vcpu_has_cache_enabled(v); \
> register_t reg = READ_SYSREG(xreg); \
> \
> GUEST_BUG_ON(read); \
> @@ -86,6 +93,8 @@ static bool vreg_emulate_##xreg(struct cpu_user_regs *regs, uint32_t *r, \
> } \
> WRITE_SYSREG(reg, xreg); \
> \
> + p2m_toggle_cache(v, cache_enabled); \
> + \
> return true; \
> } \
> \
> @@ -186,6 +195,19 @@ void do_cp15_32(struct cpu_user_regs *regs, const union hsr hsr)
> break;
>
> /*
> + * HCR_EL2.TSW
> + *
> + * ARMv7 (DDI 0406C.b): B1.14.6
> + * ARMv8 (DDI 0487B.b): Table D1-42
> + */
> + case HSR_CPREG32(DCISW):
> + case HSR_CPREG32(DCCSW):
> + case HSR_CPREG32(DCCISW):
> + if ( !cp32.read )
> + p2m_set_way_flush(current);
> + break;
> +
> + /*
> * HCR_EL2.TVM
> *
> * ARMv8 (DDI 0487D.a): Table D1-38
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 175de44927..f16b973e0d 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -202,6 +202,14 @@ struct arch_vcpu
> struct vtimer phys_timer;
> struct vtimer virt_timer;
> bool vtimer_initialized;
> +
> + /*
> + * The full P2M may require some cleaning (e.g. when emulating
> + * set/way). As the action can take a long time, it requires
> + * preemption. So this is deferred until we return to the guest.
The reason for delaying the call to p2m_flush_vm until we return to the
guest is that we need to call do_softirq to check whether we need to be
preempted and we cannot make that call in the middle of the vcpreg.c
handlers, right?
> + */
> + bool need_flush_to_ram;
> +
> } __cacheline_aligned;
>
> void vcpu_show_execution_state(struct vcpu *);
> diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
> index a633e27cc9..79abcb5a63 100644
> --- a/xen/include/asm-arm/p2m.h
> +++ b/xen/include/asm-arm/p2m.h
> @@ -6,6 +6,8 @@
> #include <xen/rwlock.h>
> #include <xen/mem_access.h>
>
> +#include <asm/current.h>
>
> #define paddr_bits PADDR_BITS
>
> /* Holds the bit size of IPAs in p2m tables. */
> @@ -237,6 +239,12 @@ bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn);
> */
> int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end);
>
> +void p2m_set_way_flush(struct vcpu *v);
> +
> +void p2m_toggle_cache(struct vcpu *v, bool was_enabled);
> +
> +void p2m_flush_vm(struct vcpu *v);
> +
> /*
> * Map a region in the guest p2m with a specific p2m type.
> * The memory attributes will be derived from the p2m type.
> @@ -364,6 +372,18 @@ static inline int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
> return -EOPNOTSUPP;
> }
>
> +/*
> + * A vCPU has cache enabled only when the MMU is enabled and data cache
> + * is enabled.
> + */
> +static inline bool vcpu_has_cache_enabled(struct vcpu *v)
> +{
> + /* Only works with the current vCPU */
> + ASSERT(current == v);
> +
> + return (READ_SYSREG32(SCTLR_EL1) & (SCTLR_C|SCTLR_M)) == (SCTLR_C|SCTLR_M);
> +}
> +
> #endif /* _XEN_P2M_H */
>
> /*
_______________________________________________
Xen-devel mailing list
[email protected]
https://lists.xenproject.org/mailman/listinfo/xen-devel