A single shared ioreq page provides PAGE_SIZE/sizeof(ioreq_t) = 128 slots, limiting HVM guests to 128 vCPUs. To support more vCPUs, extend the ioreq server to use xvzalloc_array() to allocate a contiguous virtual array of ioreq_t slots sized to d->max_vcpus, backed by potentially non-contiguous physical pages.
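
For illustration, the per-page slot arithmetic this relies on mirrors the IOREQS_PER_PAGE / nr_ioreq_pages() helpers added to xen/include/xen/ioreq.h below (the 128 figure assumes 4K pages and the 32-byte ioreq_t from the public header):

    /* 4096 / 32 = 128 ioreq slots per page */
    #define IOREQS_PER_PAGE (PAGE_SIZE / sizeof(ioreq_t))

    /* Pages needed so every vCPU gets its own slot, e.g. 129 vCPUs -> 2 pages */
    static inline unsigned int nr_ioreq_pages(const struct domain *d)
    {
        return DIV_ROUND_UP(d->max_vcpus, IOREQS_PER_PAGE);
    }
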
For the GFN-mapped path (x86), individual pages are mapped via prepare_ring_for_helper() and then combined into a single contiguous VA using vmap(). The number of ioreq pages is computed at runtime via nr_ioreq_pages(d) = DIV_ROUND_UP(d->max_vcpus, IOREQS_PER_PAGE), so small VMs only allocate one page. All existing single-page paths (bufioreq, legacy clients) remain unchanged. Mark the now-unused shared_iopage_t in the public header as deprecated. (A condensed sketch of this mapping flow is included after the patch.)

Signed-off-by: Julian Vetter <[email protected]>
---
Changes in v2
- Use xvzalloc_array to allocate the contiguous region
- Removed unnecessary includes
- nr_ioreq_pages is now based on d->max_vcpus and not the HVM_MAX_VCPUS define
- Reduced indentation by 1 level in hvm_alloc_ioreq_gfns
- Added blank lines between declarations and statements
- Added comment why we can just return in hvm_add_ioreq_gfn without rollback
---
 xen/arch/x86/hvm/ioreq.c       | 198 ++++++++++++++++++++++++++++++++-
 xen/common/ioreq.c             |  95 ++++++++++++----
 xen/include/public/hvm/ioreq.h |   5 +
 xen/include/xen/ioreq.h        |  13 ++-
 4 files changed, 285 insertions(+), 26 deletions(-)

diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 5ebc48dbd4..a77f00dd96 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -6,6 +6,7 @@
  */
 
 #include <xen/domain.h>
+#include <xen/domain_page.h>
 #include <xen/event.h>
 #include <xen/init.h>
 #include <xen/ioreq.h>
@@ -15,6 +16,7 @@
 #include <xen/sched.h>
 #include <xen/softirq.h>
 #include <xen/trace.h>
+#include <xen/vmap.h>
 #include <xen/vpci.h>
 
 #include <asm/hvm/emulate.h>
@@ -89,6 +91,39 @@ static gfn_t hvm_alloc_ioreq_gfn(struct ioreq_server *s)
     return hvm_alloc_legacy_ioreq_gfn(s);
 }
 
+static gfn_t hvm_alloc_ioreq_gfns(struct ioreq_server *s,
+                                  unsigned int nr_pages)
+{
+    struct domain *d = s->target;
+    unsigned long mask;
+    unsigned int i, run;
+
+    if ( nr_pages == 1 )
+        return hvm_alloc_ioreq_gfn(s);
+
+    /* Find nr_pages consecutive set bits */
+    mask = d->arch.hvm.ioreq_gfn.mask;
+
+    for ( i = 0, run = 0; i < BITS_PER_LONG; i++ )
+    {
+        if ( !test_bit(i, &mask) )
+            run = 0;
+        else if ( ++run == nr_pages )
+        {
+            /* Found a run - clear all bits and return base GFN */
+            unsigned int start = i - nr_pages + 1;
+            unsigned int j;
+
+            for ( j = start; j <= i; j++ )
+                clear_bit(j, &d->arch.hvm.ioreq_gfn.mask);
+
+            return _gfn(d->arch.hvm.ioreq_gfn.base + start);
+        }
+    }
+
+    return INVALID_GFN;
+}
+
 static bool hvm_free_legacy_ioreq_gfn(struct ioreq_server *s,
                                       gfn_t gfn)
 {
@@ -121,11 +156,23 @@ static void hvm_free_ioreq_gfn(struct ioreq_server *s, gfn_t gfn)
     }
 }
 
+static void hvm_free_ioreq_gfns(struct ioreq_server *s, gfn_t gfn,
+                                unsigned int nr_pages)
+{
+    unsigned int i;
+
+    for ( i = 0; i < nr_pages; i++ )
+        hvm_free_ioreq_gfn(s, gfn_add(gfn, i));
+}
+
 static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    unsigned int i, nr_pages;
 
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
     if ( gfn_eq(iorp->gfn, INVALID_GFN) )
         return;
 
@@ -134,16 +181,41 @@ static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf)
 
     hvm_free_ioreq_gfn(s, iorp->gfn);
     iorp->gfn = INVALID_GFN;
+        return;
+    }
+
+    if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+        return;
+
+    nr_pages = nr_ioreq_pages(s->target);
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        struct page_info *pg = vmap_to_page((char *)s->ioreq +
+                                            i * PAGE_SIZE);
+
+        put_page_and_type(pg);
+        put_page(pg);
+    }
+
+    vunmap(s->ioreq);
+    s->ioreq = NULL;
+
+    hvm_free_ioreq_gfns(s, s->ioreq_gfn, nr_pages);
+    s->ioreq_gfn = INVALID_GFN;
 }
 
 static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    unsigned int i, nr_pages;
+    gfn_t base_gfn;
+    mfn_t *mfns;
     int rc;
 
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
     if ( iorp->page )
     {
         /*
@@ -173,30 +245,122 @@ static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf)
 
         return rc;
     }
+
+    /* ioreq: multi-page with contiguous VA */
+    if ( s->ioreq )
+    {
+        if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+            return -EPERM;
+        return 0;
+    }
+
+    if ( d->is_dying )
+        return -EINVAL;
+
+    nr_pages = nr_ioreq_pages(d);
+    base_gfn = hvm_alloc_ioreq_gfns(s, nr_pages);
+
+    if ( gfn_eq(base_gfn, INVALID_GFN) )
+        return -ENOMEM;
+
+    mfns = xmalloc_array(mfn_t, nr_pages);
+    if ( !mfns )
+    {
+        hvm_free_ioreq_gfns(s, base_gfn, nr_pages);
+        return -ENOMEM;
+    }
+
+    /*
+     * Use prepare_ring_for_helper() to obtain page and type references
+     * for each GFN. Discard its per-page VA immediately, as all pages
+     * will be combined into a single contiguous VA via vmap() below.
+     */
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        struct page_info *pg;
+        void *va;
+
+        rc = prepare_ring_for_helper(d, gfn_x(base_gfn) + i, &pg, &va);
+        if ( rc )
+            goto fail;
+
+        /* Discard per-page VA */
+        unmap_domain_page_global(va);
+        mfns[i] = page_to_mfn(pg);
+    }
+
+    /* Map all mfns as single contiguous VA */
+    s->ioreq = vmap(mfns, nr_pages);
+    if ( !s->ioreq )
+    {
+        rc = -ENOMEM;
+        goto fail;
+    }
+
+    s->ioreq_gfn = base_gfn;
+    xfree(mfns);
+
+    return 0;
+
+ fail:
+    while ( i-- > 0 )
+    {
+        struct page_info *pg = mfn_to_page(mfns[i]);
+
+        put_page_and_type(pg);
+        put_page(pg);
+    }
+    hvm_free_ioreq_gfns(s, base_gfn, nr_pages);
+    xfree(mfns);
+
+    return rc;
 }
 
 static void hvm_remove_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    unsigned int i, nr_pages;
 
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
     if ( gfn_eq(iorp->gfn, INVALID_GFN) )
         return;
 
     if ( p2m_remove_page(d, iorp->gfn, page_to_mfn(iorp->page), 0) )
         domain_crash(d);
     clear_page(iorp->va);
+        return;
+    }
+
+    if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+        return;
+
+    nr_pages = nr_ioreq_pages(d);
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        gfn_t gfn = gfn_add(s->ioreq_gfn, i);
+        struct page_info *pg = vmap_to_page((char *)s->ioreq +
+                                            i * PAGE_SIZE);
+
+        if ( p2m_remove_page(d, gfn, page_to_mfn(pg), 0) )
+            domain_crash(d);
+    }
+
+    memset(s->ioreq, 0, nr_pages * PAGE_SIZE);
 }
 
 static int hvm_add_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    unsigned int i, nr_pages;
     int rc;
 
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
     if ( gfn_eq(iorp->gfn, INVALID_GFN) )
         return 0;
 
@@ -208,6 +372,32 @@ static int hvm_add_ioreq_gfn(struct ioreq_server *s, bool buf)
 
         return rc;
     }
+
+    if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+        return 0;
+
+    nr_pages = nr_ioreq_pages(d);
+    memset(s->ioreq, 0, nr_pages * PAGE_SIZE);
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        gfn_t gfn = gfn_add(s->ioreq_gfn, i);
+        struct page_info *pg = vmap_to_page((char *)s->ioreq +
+                                            i * PAGE_SIZE);
+
+        rc = p2m_add_page(d, gfn, page_to_mfn(pg), 0, p2m_ram_rw);
+        if ( rc )
+            /*
+             * No rollback of previously added pages: the caller
+             * (arch_ioreq_server_disable) has no error handling path,
+             * and partial failure here will be cleaned up when the
+             * ioreq server is eventually destroyed.
+             */
+            return rc;
+
+        paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn)));
+    }
+
+    return 0;
 }
 
 int arch_ioreq_server_map_pages(struct ioreq_server *s)
diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
index 5d722c8d4e..0ad86d3af3 100644
--- a/xen/common/ioreq.c
+++ b/xen/common/ioreq.c
@@ -26,6 +26,7 @@
 #include <xen/paging.h>
 #include <xen/sched.h>
 #include <xen/trace.h>
+#include <xen/xvmalloc.h>
 
 #include <asm/guest_atomics.h>
 #include <asm/ioreq.h>
@@ -95,12 +96,10 @@ static struct ioreq_server *get_ioreq_server(const struct domain *d,
 
 static ioreq_t *get_ioreq(struct ioreq_server *s, struct vcpu *v)
 {
-    shared_iopage_t *p = s->ioreq.va;
-
     ASSERT((v == current) || !vcpu_runnable(v));
-    ASSERT(p != NULL);
+    ASSERT(s->ioreq != NULL);
 
-    return &p->vcpu_ioreq[v->vcpu_id];
+    return &s->ioreq[v->vcpu_id];
 }
 
 /*
@@ -260,9 +259,32 @@ bool vcpu_ioreq_handle_completion(struct vcpu *v)
 
 static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct ioreq_page *iorp;
     struct page_info *page;
 
+    if ( !buf )
+    {
+        if ( s->ioreq )
+        {
+            /*
+             * If a guest frame has already been mapped (which may happen
+             * on demand if ioreq_server_get_info() is called), then
+             * allocating a page is not permitted.
+             */
+            if ( !gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+                return -EPERM;
+
+            return 0;
+        }
+
+        s->ioreq = xvzalloc_array(ioreq_t, s->target->max_vcpus);
+
+        return s->ioreq ? 0 : -ENOMEM;
+    }
+
+    /* bufioreq: single page allocation */
+    iorp = &s->bufioreq;
+
     if ( iorp->page )
     {
         /*
@@ -309,8 +331,17 @@ static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf)
 
 static void ioreq_server_free_mfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    struct page_info *page = iorp->page;
+    struct ioreq_page *iorp;
+    struct page_info *page;
+
+    if ( !buf )
+    {
+        XVFREE(s->ioreq);
+        return;
+    }
+
+    iorp = &s->bufioreq;
+    page = iorp->page;
 
     if ( !page )
         return;
@@ -334,11 +365,29 @@ bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
 
     FOR_EACH_IOREQ_SERVER(d, id, s)
     {
-        if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
+        if ( s->bufioreq.page == page )
         {
             found = true;
             break;
         }
+
+        if ( s->ioreq )
+        {
+            unsigned int i;
+
+            for ( i = 0; i < nr_ioreq_pages(d); i++ )
+            {
+                if ( vmap_to_page((char *)s->ioreq +
+                                  i * PAGE_SIZE) == page )
+                {
+                    found = true;
+                    break;
+                }
+            }
+
+            if ( found )
+                break;
+        }
     }
 
     rspin_unlock(&d->ioreq_server.lock);
@@ -351,7 +400,7 @@ static void ioreq_server_update_evtchn(struct ioreq_server *s,
 {
     ASSERT(spin_is_locked(&s->lock));
 
-    if ( s->ioreq.va != NULL )
+    if ( s->ioreq != NULL )
     {
         ioreq_t *p = get_ioreq(s, sv->vcpu);
 
@@ -591,7 +640,7 @@ static int ioreq_server_init(struct ioreq_server *s,
     INIT_LIST_HEAD(&s->ioreq_vcpu_list);
     spin_lock_init(&s->bufioreq_lock);
 
-    s->ioreq.gfn = INVALID_GFN;
+    s->ioreq_gfn = INVALID_GFN;
     s->bufioreq.gfn = INVALID_GFN;
 
     rc = ioreq_server_alloc_rangesets(s, id);
@@ -770,7 +819,7 @@ static int ioreq_server_get_info(struct domain *d, ioservid_t id,
     }
 
     if ( ioreq_gfn )
-        *ioreq_gfn = gfn_x(s->ioreq.gfn);
+        *ioreq_gfn = gfn_x(s->ioreq_gfn);
 
     if ( HANDLE_BUFIOREQ(s) )
     {
@@ -813,26 +862,30 @@ int ioreq_server_get_frame(struct domain *d, ioservid_t id,
     if ( rc )
         goto out;
 
-    switch ( idx )
+    if ( idx == XENMEM_resource_ioreq_server_frame_bufioreq )
     {
-    case XENMEM_resource_ioreq_server_frame_bufioreq:
         rc = -ENOENT;
         if ( !HANDLE_BUFIOREQ(s) )
             goto out;
 
         *mfn = page_to_mfn(s->bufioreq.page);
         rc = 0;
-        break;
-
-    case XENMEM_resource_ioreq_server_frame_ioreq(0):
-        *mfn = page_to_mfn(s->ioreq.page);
-        rc = 0;
-        break;
+    }
+    else if ( (idx >= XENMEM_resource_ioreq_server_frame_ioreq(0)) &&
+              (idx < XENMEM_resource_ioreq_server_frame_ioreq(nr_ioreq_pages(d))) )
+    {
+        unsigned int page_idx = idx - XENMEM_resource_ioreq_server_frame_ioreq(0);
 
-    default:
         rc = -EINVAL;
-        break;
+        if ( idx >= XENMEM_resource_ioreq_server_frame_ioreq(0) &&
+             page_idx < nr_ioreq_pages(d) && s->ioreq )
+        {
+            *mfn = vmap_to_mfn((char *)s->ioreq + page_idx * PAGE_SIZE);
+            rc = 0;
+        }
     }
+    else
+        rc = -EINVAL;
 
  out:
     rspin_unlock(&d->ioreq_server.lock);
diff --git a/xen/include/public/hvm/ioreq.h b/xen/include/public/hvm/ioreq.h
index 7a6bc760d0..1c1a9e61ae 100644
--- a/xen/include/public/hvm/ioreq.h
+++ b/xen/include/public/hvm/ioreq.h
@@ -49,6 +49,11 @@ struct ioreq {
 };
 typedef struct ioreq ioreq_t;
 
+/*
+ * Deprecated: shared_iopage is no longer used by Xen internally.
+ * The ioreq server now uses a dynamically sized ioreq_t array
+ * to support more than 128 vCPUs.
+ */
 struct shared_iopage {
     struct ioreq vcpu_ioreq[1];
 };
diff --git a/xen/include/xen/ioreq.h b/xen/include/xen/ioreq.h
index e86f0869fa..a4c7621f3f 100644
--- a/xen/include/xen/ioreq.h
+++ b/xen/include/xen/ioreq.h
@@ -19,9 +19,19 @@
 #ifndef __XEN_IOREQ_H__
 #define __XEN_IOREQ_H__
 
+#include <xen/macros.h>
 #include <xen/sched.h>
 
 #include <public/hvm/dm_op.h>
+#include <public/hvm/ioreq.h>
+
+/* 4096 / 32 = 128 ioreq slots per page */
+#define IOREQS_PER_PAGE (PAGE_SIZE / sizeof(ioreq_t))
+
+static inline unsigned int nr_ioreq_pages(const struct domain *d)
+{
+    return DIV_ROUND_UP(d->max_vcpus, IOREQS_PER_PAGE);
+}
 
 struct ioreq_page {
     gfn_t gfn;
@@ -45,7 +55,8 @@ struct ioreq_server {
     /* Lock to serialize toolstack modifications */
     spinlock_t lock;
 
-    struct ioreq_page ioreq;
+    ioreq_t          *ioreq;
+    gfn_t             ioreq_gfn;
     struct list_head  ioreq_vcpu_list;
     struct ioreq_page bufioreq;
-- 
2.51.0

-- 
Julian Vetter | Vates Hypervisor & Kernel Developer
XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
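
For reference, here is the condensed sketch of the GFN-mapped flow promised in the commit message. It is illustrative only, not functional as-is: error handling, the GFN range allocation and the unwind path are elided, and map_ioreq_pages_sketch() is a made-up name; it simply condenses the hvm_map_ioreq_gfn() hunk above (per-page references via prepare_ring_for_helper(), then one vmap() over the collected MFNs).

    /* Hypothetical condensation of the multi-page mapping flow above. */
    static ioreq_t *map_ioreq_pages_sketch(struct domain *d, gfn_t base_gfn,
                                           unsigned int nr_pages, mfn_t *mfns)
    {
        unsigned int i;

        for ( i = 0; i < nr_pages; i++ )
        {
            struct page_info *pg;
            void *va;

            /* Take page and type references for each guest frame. */
            if ( prepare_ring_for_helper(d, gfn_x(base_gfn) + i, &pg, &va) )
                return NULL; /* the real code unwinds via its fail: path */

            /* The per-page VA is not needed; only the MFN is kept. */
            unmap_domain_page_global(va);
            mfns[i] = page_to_mfn(pg);
        }

        /* Stitch the (possibly discontiguous) MFNs into one contiguous VA. */
        return vmap(mfns, nr_pages);
    }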
