On 14.11.2017 06:12, Rik van Riel wrote:
> Currently, every time a VCPU is scheduled out, the host kernel will
> first save the guest FPU/xstate context, then load the qemu userspace
> FPU context, only to then immediately save the qemu userspace FPU
> context back to memory. When scheduling in a VCPU, the same extraneous
> FPU loads and saves are done.
> 
> This could be avoided by moving from a model where the guest FPU is
> loaded and stored with preemption disabled, to a model where the
> qemu userspace FPU is swapped out for the guest FPU context for
> the duration of the KVM_RUN ioctl.
> 
> This is done under the VCPU mutex, which is also taken when other
> tasks inspect the VCPU FPU context, so the code should already be
> safe for this change. That should come as no surprise, given that
> s390 already has this optimization.
> 
> No performance changes were detected in quick ping-pong tests on
> my 4 socket system, which is expected since an FPU+xstate load is
> on the order of 0.1us, while ping-ponging between CPUs is on the
> order of 20us, and somewhat noisy. 
> 
> There may be other tests where performance changes are noticeable.
> 
> Signed-off-by: Rik van Riel <[email protected]>
> Suggested-by: Christian Borntraeger <[email protected]>
> ---
>  arch/x86/include/asm/kvm_host.h | 13 +++++++++++++
>  arch/x86/kvm/x86.c              | 29 ++++++++++++-----------------
>  2 files changed, 25 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c73e493adf07..92e66685249e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h

We should also get rid of guest_fpu_loaded now, right?


> @@ -536,7 +536,20 @@ struct kvm_vcpu_arch {
>       struct kvm_mmu_memory_cache mmu_page_cache;
>       struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +     /*
> +      * QEMU userspace and the guest each have their own FPU state.
> +      * In vcpu_run, we switch between the user and guest FPU contexts.
> +      * While running a VCPU, the VCPU thread will have the guest FPU
> +      * context.
> +      *
> +      * Note that while the PKRU state lives inside the fpu registers,
> +      * it is switched out separately at VMENTER and VMEXIT time. The
> +      * "guest_fpu" state here contains the guest FPU context, with the
> +      * host PKRU bits.
> +      */
> +     struct fpu user_fpu;
>       struct fpu guest_fpu;
> +
>       u64 xcr0;
>       u64 guest_supported_xcr0;
>       u32 guest_xstate_size;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 03869eb7fcd6..59912b20a830 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2917,7 +2917,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>       srcu_read_unlock(&vcpu->kvm->srcu, idx);
>       pagefault_enable();
>       kvm_x86_ops->vcpu_put(vcpu);
> -     kvm_put_guest_fpu(vcpu);
>       vcpu->arch.last_host_tsc = rdtsc();
>  }
>  
> @@ -6908,7 +6907,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>       preempt_disable();
>  
>       kvm_x86_ops->prepare_guest_switch(vcpu);
> -     kvm_load_guest_fpu(vcpu);
>  
>       /*
>        * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
> @@ -7095,6 +7093,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>  
>       vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
>  
> +     kvm_load_guest_fpu(vcpu);
> +
>       for (;;) {
>               if (kvm_vcpu_running(vcpu)) {
>                       r = vcpu_enter_guest(vcpu);
> @@ -7132,6 +7132,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
>               }
>       }
>  
> +     kvm_put_guest_fpu(vcpu);
> +
>       srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
>  
>       return r;
> @@ -7663,32 +7665,25 @@ static void fx_init(struct kvm_vcpu *vcpu)
>       vcpu->arch.cr0 |= X86_CR0_ET;
>  }
>  
> +/* Swap (qemu) user FPU context for the guest FPU context. */
>  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -     if (vcpu->guest_fpu_loaded)
> -             return;
> -
> -     /*
> -      * Restore all possible states in the guest,
> -      * and assume host would use all available bits.
> -      * Guest xcr0 would be loaded later.
> -      */
> -     vcpu->guest_fpu_loaded = 1;
> -     __kernel_fpu_begin();
> +     preempt_disable();
> +     copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
>       /* PKRU is separately restored in kvm_x86_ops->run.  */
>       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
>                               ~XFEATURE_MASK_PKRU);
> +     preempt_enable();
>       trace_kvm_fpu(1);
>  }
>  
> +/* When vcpu_run ends, restore user space FPU context. */
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  {
> -     if (!vcpu->guest_fpu_loaded)
> -             return;
> -
> -     vcpu->guest_fpu_loaded = 0;
> +     preempt_disable();
>       copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
> -     __kernel_fpu_end();
> +     copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
> +     preempt_enable();
>       ++vcpu->stat.fpu_reload;
>       trace_kvm_fpu(0);
>  }
> 

emulator_get_fpu() does a kvm_load_guest_fpu(). Doesn't that mean that
this call is no longer needed (at least when the emulator code is called
from inside the vcpu_run loop)?

Also, what about the preempt_disable() at that point -- is it still needed?


-- 

Thanks,

David / dhildenb

Reply via email to