Implement a heuristic for x86 HVM guests which tries to use superpages while populating the guest physmap on live migration. This should improve memory access performance for these guests.
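For illustration only (not part of the patch itself): a minimal sketch of the order-selection policy that x86_hvm_get_physmap_order() below implements, assuming the PVH-vs-HVM guess made from the first record and ignoring the clamping against the end of the p2m; the sketch_* names are invented for this example.

/*
 * Sketch only: pick a mapping order for a pfn.  Mirrors the intent of
 * x86_hvm_get_physmap_order(); the constants correspond to S_PAGE_1GB_ORDER
 * and S_PAGE_2MB_ORDER (orders are in units of 4KiB pages).
 */
#include <stdbool.h>

#define SKETCH_ORDER_4K    0   /* 4KiB pages */
#define SKETCH_ORDER_2MB   9   /* 2MiB superpage = 2^9 4KiB pages */
#define SKETCH_ORDER_1GB  18   /* 1GiB superpage = 2^18 4KiB pages */

static unsigned int sketch_get_order(unsigned long pfn, bool pvh_guest)
{
    switch ( pfn >> SKETCH_ORDER_1GB )
    {
    case 3:
        /* The fourth GB holds the low MMIO hole: cap at 2MiB. */
        return SKETCH_ORDER_2MB;
    case 0:
        if ( !pvh_guest )
            /* Plain HVM has I/O holes below 2MiB (VGA): use 4KiB there. */
            return pfn > 0x1ff ? SKETCH_ORDER_2MB : SKETCH_ORDER_4K;
        /* Fall through: PVH maps the first GB contiguously from 0. */
    default:
        return SKETCH_ORDER_1GB;
    }
}

In short: 1GiB mappings wherever possible, 2MiB for the GB containing the low MMIO hole, and 4KiB below 2MiB for plain HVM guests.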
Signed-off-by: Andrei Semenov <[email protected]>
---
 tools/include/xen-tools/libs.h           |  4 ++
 tools/libs/guest/xg_private.h            |  3 +
 tools/libs/guest/xg_sr_common.h          | 18 ++++-
 tools/libs/guest/xg_sr_restore.c         | 60 +++++++---------
 tools/libs/guest/xg_sr_restore_x86_hvm.c | 88 +++++++++++++++++++++++-
 tools/libs/guest/xg_sr_restore_x86_pv.c  | 22 +++++-
 6 files changed, 154 insertions(+), 41 deletions(-)

diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h
index a16e0c3807..bdd903eb7b 100644
--- a/tools/include/xen-tools/libs.h
+++ b/tools/include/xen-tools/libs.h
@@ -63,4 +63,8 @@
 #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
 #endif
 
+#ifndef ROUNDDOWN
+#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))
+#endif
+
 #endif /* __XEN_TOOLS_LIBS__ */
diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h
index 09e24f1227..dcf63b5188 100644
--- a/tools/libs/guest/xg_private.h
+++ b/tools/libs/guest/xg_private.h
@@ -134,6 +134,9 @@ typedef uint64_t x86_pgentry_t;
 #define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86)
 #define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1))
 
+#define S_PAGE_1GB_ORDER 18
+#define S_PAGE_2MB_ORDER 9
+
 #define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
 
 static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h
index 941e24d7b7..96365e05a8 100644
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -137,7 +137,8 @@ struct xc_sr_restore_ops
     bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
 
     /* Set the GFN of a PFN. */
-    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
+    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn,
+                    unsigned int order);
 
     /* Set the type of a PFN. */
     void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
@@ -175,6 +176,17 @@ struct xc_sr_restore_ops
 #define BROKEN_CHANNEL 2
     int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 
+    /**
+     * Guest physmap population order is based on a heuristic which is family
+     * dependent. The x86 HVM heuristic is interested in observing the whole
+     * record (the first) in order to guess how the physmap should be populated.
+     */
+    void (*guess_physmap)(struct xc_sr_context *ctx, unsigned int count,
+                          const xen_pfn_t *pfns, const uint32_t *types);
+
+    /* Get the physmap population order for a given PFN. */
+    int (*get_physmap_order)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
     /**
      * Perform any actions required after the static data has arrived. Called
      * when the STATIC_DATA_COMPLETE record has been recieved/inferred.
@@ -404,6 +416,10 @@ struct xc_sr_context
                 {
                     /* HVM context blob. */
                     struct xc_sr_blob context;
+
+                    /* Set guest type (based on the first record). */
+                    bool set_guest_type;
+                    bool pvh_guest;
                 } restore;
             };
         } hvm;
diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c
index 074b56d263..af864bd5ea 100644
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -86,18 +86,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
  * avoid realloc()ing too excessively, the size increased to the nearest power
  * of two large enough to contain the required pfn.
  */
-static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn,
+                             unsigned int order)
 {
     xc_interface *xch = ctx->xch;
+    xen_pfn_t start_pfn = ROUNDDOWN(pfn, order),
+        end_pfn = (ROUNDUP(pfn + 1, order) - 1);
 
-    if ( pfn > ctx->restore.max_populated_pfn )
+    if ( end_pfn > ctx->restore.max_populated_pfn )
     {
         xen_pfn_t new_max;
         size_t old_sz, new_sz;
         unsigned long *p;
 
         /* Round up to the nearest power of two larger than pfn, less 1. */
-        new_max = pfn;
+        new_max = end_pfn;
         new_max |= new_max >> 1;
         new_max |= new_max >> 2;
         new_max |= new_max >> 4;
@@ -123,8 +126,11 @@ static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
         ctx->restore.max_populated_pfn = new_max;
     }
 
-    assert(!test_bit(pfn, ctx->restore.populated_pfns));
-    set_bit(pfn, ctx->restore.populated_pfns);
+    for ( pfn = start_pfn; pfn <= end_pfn; ++pfn )
+    {
+        assert(!test_bit(pfn, ctx->restore.populated_pfns));
+        set_bit(pfn, ctx->restore.populated_pfns);
+    }
 
     return 0;
 }
@@ -138,60 +144,40 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
                   const xen_pfn_t *original_pfns, const uint32_t *types)
 {
     xc_interface *xch = ctx->xch;
-    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
-        *pfns = malloc(count * sizeof(*pfns));
-    unsigned int i, nr_pfns = 0;
+    xen_pfn_t mfn, pfn;
+    unsigned int i, order;
     int rc = -1;
 
-    if ( !mfns || !pfns )
-    {
-        ERROR("Failed to allocate %zu bytes for populating the physmap",
-              2 * count * sizeof(*mfns));
-        goto err;
-    }
+    /* Feed this record to the family-dependent heuristic to guess the physmap */
+    ctx->restore.ops.guess_physmap(ctx, count, original_pfns, types);
 
     for ( i = 0; i < count; ++i )
     {
         if ( (!types || page_type_to_populate(types[i])) &&
              !pfn_is_populated(ctx, original_pfns[i]) )
         {
-            rc = pfn_set_populated(ctx, original_pfns[i]);
+            order = ctx->restore.ops.get_physmap_order(ctx, original_pfns[i]);
+            rc = pfn_set_populated(ctx, original_pfns[i], order);
             if ( rc )
                 goto err;
 
-            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
-            ++nr_pfns;
-        }
-    }
-
-    if ( nr_pfns )
-    {
-        rc = xc_domain_populate_physmap_exact(
-            xch, ctx->domid, nr_pfns, 0, 0, mfns);
-        if ( rc )
-        {
-            PERROR("Failed to populate physmap");
-            goto err;
-        }
-        for ( i = 0; i < nr_pfns; ++i )
-        {
-            if ( mfns[i] == INVALID_MFN )
+            pfn = mfn = ROUNDDOWN(original_pfns[i], order);
+            rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, order, 0,
+                                                  &mfn);
+            if ( rc || (mfn == INVALID_MFN) )
             {
-                ERROR("Populate physmap failed for pfn %u", i);
+                ERROR("Failed to populate physmap for pfn %lu (%u)", pfn, order);
                 rc = -1;
                 goto err;
             }
 
-            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+            ctx->restore.ops.set_gfn(ctx, pfn, mfn, order);
         }
     }
 
     rc = 0;
 
  err:
-    free(pfns);
-    free(mfns);
-
     return rc;
 }
 
diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c
index d6ea6f3012..2e525443ab 100644
--- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
@@ -110,7 +110,7 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
 /* restore_ops function.
  */
 static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
-                            xen_pfn_t gfn)
+                            xen_pfn_t gfn, unsigned int order)
 {
     /* no op */
 }
@@ -161,6 +161,8 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
     }
 #endif
 
+    ctx->x86.hvm.restore.set_guest_type = true;
+
     return 0;
 }
 
@@ -192,6 +194,88 @@ static int x86_hvm_process_record(struct xc_sr_context *ctx,
     }
 }
 
+/*
+ * We consider that a PVH guest physmap starts at 0 and contiguously covers
+ * the physical memory space for the first GB of memory. An HVM guest will
+ * have I/O holes in the first 2MB of memory space (at least for VGA).
+ * Therefore we should observe the very first record (which comes in physmap
+ * order) to find out how we should map this first GB.
+ * To map the rest of the memory space in both cases (PVH or HVM) we will use
+ * the maximum available order (up to 1GB), except for the fourth GB which
+ * holds the low MMIO hole (at least for the LAPIC MMIO window and for
+ * potential passed-through or emulated PCI device BARs).
+ */
+static void x86_hvm_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
+                                  const xen_pfn_t *pfns, const uint32_t *types)
+{
+    xen_pfn_t prev;
+    unsigned int i;
+
+
+    if ( !ctx->x86.hvm.restore.set_guest_type )
+        return;
+
+    for ( i = 0, prev = INVALID_PFN; i < count; ++i )
+    {
+        if ( !types || page_type_to_populate(types[i]) )
+        {
+            if ( prev == INVALID_MFN )
+            {
+                if (pfns[i] != 0)
+                    break;
+            }
+            else
+            {
+                if ( pfns[i] != (prev + 1) )
+                    break;
+            }
+            prev = pfns[i];
+        }
+    }
+
+    ctx->x86.hvm.restore.pvh_guest = (i == count) ? true : false;
+    ctx->x86.hvm.restore.set_guest_type = false;
+}
+
+/*
+ * Get the physmap population order for the given PFN, based on the guess.
+ */
+static int x86_hvm_get_physmap_order(const struct xc_sr_context *ctx,
+                                     xen_pfn_t pfn)
+{
+    int order;
+
+    if ( pfn >= ctx->restore.p2m_size )
+        return 0;
+
+    switch (pfn >> S_PAGE_1GB_ORDER)
+    {
+    case 3:
+        /* The fourth GB of memory is mapped with 2MB superpages. */
+        order = S_PAGE_2MB_ORDER;
+        break;
+    case 0:
+        if (!ctx->x86.hvm.restore.pvh_guest)
+        {
+            /* The first 2MB is mapped with 4K pages for HVM guests. */
+            order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
+            break;
+        }
+    default:
+        order = S_PAGE_1GB_ORDER;
+    }
+
+    if ( ((ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1) >= ctx->restore.p2m_size) &&
+         order == S_PAGE_1GB_ORDER )
+        order = S_PAGE_2MB_ORDER;
+
+    if ( ((ROUNDUP(pfn + 1, S_PAGE_2MB_ORDER) - 1) >= ctx->restore.p2m_size) &&
+         order == S_PAGE_2MB_ORDER )
+        order = 0;
+
+    return order;
+}
+
 /*
  * restore_ops function. Sets extra hvm parameters and seeds the grant table.
  */
@@ -258,6 +342,8 @@ struct xc_sr_restore_ops restore_ops_x86_hvm =
     .localise_page         = x86_hvm_localise_page,
     .setup                 = x86_hvm_setup,
     .process_record        = x86_hvm_process_record,
+    .guess_physmap         = x86_hvm_guess_physmap,
+    .get_physmap_order     = x86_hvm_get_physmap_order,
     .static_data_complete  = x86_static_data_complete,
     .stream_complete       = x86_hvm_stream_complete,
     .cleanup               = x86_hvm_cleanup,
diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c
index dc50b0f5a8..f8545f941a 100644
--- a/tools/libs/guest/xg_sr_restore_x86_pv.c
+++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
@@ -59,7 +59,7 @@ static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
     ctx->x86.pv.max_pfn = max_pfn;
     for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
     {
-        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
+        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN, 0);
         ctx->restore.ops.set_page_type(ctx, i, 0);
     }
 
@@ -947,9 +947,10 @@ static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
 
 /* restore_ops function. */
 static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
-                           xen_pfn_t mfn)
+                           xen_pfn_t mfn, unsigned int order)
 {
     assert(pfn <= ctx->x86.pv.max_pfn);
+    assert(!order);
 
     if ( ctx->x86.pv.width == sizeof(uint64_t) )
         /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */
@@ -1113,6 +1114,21 @@ static int x86_pv_process_record(struct xc_sr_context *ctx,
     }
 }
 
+/*
+ * There's no reliable heuristic which can predict the PV guest physmap.
+ * Therefore order 0 will always be used.
+ */
+static void x86_pv_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
+                                 const xen_pfn_t *pfns, const uint32_t *types)
+{
+}
+
+static int x86_pv_get_physmap_order(const struct xc_sr_context *ctx,
+                                    xen_pfn_t pfn)
+{
+    return 0;
+}
+
 /*
  * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
  * rewrite the p2m and seed the grant table.
@@ -1194,6 +1210,8 @@ struct xc_sr_restore_ops restore_ops_x86_pv =
     .localise_page         = x86_pv_localise_page,
     .setup                 = x86_pv_setup,
     .process_record        = x86_pv_process_record,
+    .guess_physmap         = x86_pv_guess_physmap,
+    .get_physmap_order     = x86_pv_get_physmap_order,
     .static_data_complete  = x86_static_data_complete,
     .stream_complete       = x86_pv_stream_complete,
     .cleanup               = x86_pv_cleanup,
-- 
2.34.1

Andrei Semenov | Vates XCP-ng Developer
XCP-ng & Xen Orchestra - Vates solutions
w: vates.fr | xcp-ng.org | xen-orchestra.com
