Hi,

When profiling a VM migration I noticed that on the sender side a significant 
(~45%) time is spent in PV Dom0 Linux taking an emulation fault in 
unmap_page_range [1]:

The call comes from zap_pte_range (pgtable_64.h, inlined):
 ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);

This has 2 implementations: native and a generic one where pte_clear is 
implemented using set_pte_at with a Xen pvop.

As a proof of concept I’ve deleted the native implementation [2], which makes 
it fall back to the generic implementation [3].
This is not necessarily safe on SMP (it reads and clears as 2 separate steps), 
but shows that a paravirt op is faster than emulation.

To fix this we may need to partially revert  
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4cdd9c8931767e1c56a51a1078d33a8c340f4405

To test this more easily I’ve written a small test program that maps/unmaps 
pages from a domain in Dom0 [4].
Before: 3.26932 +- 0.00503 seconds time elapsed  ( +-  0.15% )
After: 0.75642 +- 0.00202 seconds time elapsed  ( +-  0.27% )

It is more than 4x faster to use the paravirt ops than trapping and emulating.
From a functional point of view the above commit is correct, Xen doesn’t need a 
dedicated PV operation: trap and emulation works.
But from a performance point of view I’d say that Xen does need it. This is a 
hot-path during migration, and it’d be worthwhile to optimise it.

Just deleting the native implementation is probably not the solution, since we 
also want a value returned, and the existing PV operation is void.
It probably needs a new PV operation (re)introduced with the desired semantics?

Best regards,
--Edwin

[1]: 
https://cdn.jsdelivr.net/gh/edwintorok/xen@pmustack-coverletter/docs/tmp/migrate-send.svg?x=950.6&y=2197

[2]:
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 690c0307afed..ab6318bb5676 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1097,13 +1097,9 @@ extern int ptep_test_and_clear_young(struct 
vm_area_struct *vma,
 extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long 
addr,
-                                      pte_t *ptep)
-{
-       pte_t pte = native_ptep_get_and_clear(ptep);
-       return pte;
-}
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+                                      unsigned long address,
+                                      pte_t *ptep);

 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,

[3]: 
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t pte = *ptep;
        pte_clear(mm, address, ptep);
        return pte;
}
#endif
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep) { set_pte_at(mm, addr, ptep, __pte(0)); }

[4]:
$ cat >main.c <<EOF
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <xenctrl.h>
#include <xenforeignmemory.h>

int main(int argc, char *argv[]) {
  if (argc != 3) {
    fprintf(stderr, "Usage: %s <domid> <batchsize>\n", argv[0]);
    return 1;
  }
  uint32_t domid = atoi(argv[1]);
  uint32_t batch_size = atoi(argv[2]);

  if (!domid || !batch_size) {
    fprintf(stderr, "Invalid arguments, expected 2 integers");
    return 2;
  }

  xenforeignmemory_handle *handle = xenforeignmemory_open(NULL, 0);
  if (!handle)
    return 3;

  xc_interface *xc = xc_interface_open(NULL, NULL, 0);
  if (!xc)
    return 4;

  xen_pfn_t nr_pfns;
  if (xc_domain_nr_gpfns(xc, domid, &nr_pfns) < 0)
    return 5;

  unsigned mappings_size = nr_pfns / batch_size + 1;
  void **mappings = calloc(mappings_size, sizeof(void *));
  if (!mappings) {
    perror("calloc");
    return 6;
  }

  for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
    xen_pfn_t arr[batch_size];
    int err[batch_size];

    for (unsigned j = 0; j < batch_size; j++)
      arr[j] = i + j;

    unsigned long idx = i / batch_size;
    assert(idx < mappings_size);
    mappings[idx] = xenforeignmemory_map(
        handle, domid, PROT_READ, sizeof(arr) / sizeof(arr[0]), arr, err);
    if (!mappings[idx])
      return 7;
  }

  for (xen_pfn_t i = 0; i < nr_pfns; i += batch_size) {
    unsigned long idx = i / batch_size;
    assert(idx < mappings_size);
    if (xenforeignmemory_unmap(handle, mappings[idx], batch_size))
      return 8;
  }

  free(mappings);
  xc_interface_close(xc);
  xenforeignmemory_close(handle);

  return EXIT_SUCCESS;
}
EOF

$ cat >meson.build <<EOF
project('xfm_scale', 'c', default_options:['c_std=gnu11'])
xfm = dependency('xenforeignmemory')
xc = dependency('xencontrol’)
executable('xfm_scale', 'main.c', dependencies: [xfm, xc])
EOF

Reply via email to