A single shared ioreq page provides PAGE_SIZE/sizeof(ioreq_t) = 128 slots, limiting HVM guests to 128 vCPUs. To support more vCPUs, the struct ioreq_page 'ioreq' member of struct ioreq_server is replaced with an ioreq_t pointer backed by a dynamically sized allocation.

For the resource mapping path (XENMEM_acquire_resource), the common ioreq_server_alloc_mfn() allocates nr_ioreq_pages(d) domain heap pages with MEMF_no_refcount and writable type references, then maps them as one contiguous VA range via vmap(). Teardown recovers the pages from the vmap address via vmap_size() and vmap_to_page().

The legacy GFN-mapped path continues to support only a single ioreq page: for domains whose vCPU count exceeds PAGE_SIZE/sizeof(ioreq_t), hvm_map_ioreq_gfn() returns -EOPNOTSUPP, and such domains must use the resource mapping interface instead. This also avoids fragmentation from allocating multiple GFNs out of the legacy GFN pool. The GFN path now uses vmap() for the Xen-side mapping, consistent with ioreq_server_alloc_mfn(), so that vmap_to_page() can recover the page during teardown.
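For reference, a minimal userspace sketch (not part of the patch) of the slot arithmetic the series relies on, assuming PAGE_SIZE == 4096 and sizeof(ioreq_t) == 32 as on typical x86 builds; it mirrors the rounding done by the nr_ioreq_pages() helper added to xen/include/xen/ioreq.h rather than reproducing it:

/*
 * Sketch only: computes how many ioreq pages a domain needs, assuming
 * 4 KiB pages and a 32-byte ioreq_t, i.e. 128 slots per page.
 */
#include <stdio.h>

#define PAGE_SIZE    4096u
#define IOREQ_SIZE     32u  /* assumed sizeof(ioreq_t) */

static unsigned int nr_ioreq_pages(unsigned int max_vcpus)
{
    unsigned int slots_per_page = PAGE_SIZE / IOREQ_SIZE;       /* 128 */

    /* Same rounding as DIV_ROUND_UP(d->max_vcpus, slots_per_page). */
    return (max_vcpus + slots_per_page - 1) / slots_per_page;
}

int main(void)
{
    printf("128 vCPUs -> %u page(s)\n", nr_ioreq_pages(128));   /* 1 */
    printf("129 vCPUs -> %u page(s)\n", nr_ioreq_pages(129));   /* 2 */
    printf("384 vCPUs -> %u page(s)\n", nr_ioreq_pages(384));   /* 3 */
    return 0;
}

Any guest needing more than one page's worth of slots therefore has to use XENMEM_acquire_resource, since the legacy GFN path is capped at a single page.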
Signed-off-by: Julian Vetter <[email protected]> --- Changes in v4: - Dropped the multi-page support for the legacy GFN-mapped path - When 'nr_ioreq_pages(d) > 1' -> -EOPNOTSUPP - But now also use vmap(), same as in resource mapping path, so new s->ioreq pointer is used in either case - Mirror exactly what prepare_ring_for_helper/destroy_ring_for_helper is doing except the __map_domain_page_global (replaced by vmap/vunmap) - Replaced xvzalloc_array() by xvmalloc_array() -> No need to have mfns array zero'ed - Inverted logic in ioreq_server_alloc_mfn() and ioreq_server_free_mfn() to check for 'if (buf)' (same as in the GFN-mapped path) - Fixed ioreq_server_alloc_mfn -> Correctly call alloc_domheap_page + get_page_and_type - Fixed error handling and teardown path to correctly call put_page_alloc_ref + put_page_and_type just like for the bufioreq - Kept shared_iopage_t so QEMU, varstored, etc. can just reference into it via p->ioreq[cpu_id] -> No modification needed --- xen/arch/x86/hvm/ioreq.c | 153 ++++++++++++++++++++++++++++++++---- xen/common/ioreq.c | 166 ++++++++++++++++++++++++++++++++------- xen/include/xen/ioreq.h | 10 ++- 3 files changed, 286 insertions(+), 43 deletions(-) diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c index 355b2ba12c..ec4f210768 100644 --- a/xen/arch/x86/hvm/ioreq.c +++ b/xen/arch/x86/hvm/ioreq.c @@ -15,6 +15,7 @@ #include <xen/sched.h> #include <xen/softirq.h> #include <xen/trace.h> +#include <xen/vmap.h> #include <xen/vpci.h> #include <asm/hvm/emulate.h> @@ -123,9 +124,10 @@ static void hvm_free_ioreq_gfn(struct ioreq_server *s, gfn_t gfn) static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf) { - struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq; - + if ( buf ) { + struct ioreq_page *iorp = &s->bufioreq; + if ( gfn_eq(iorp->gfn, INVALID_GFN) ) return; @@ -134,16 +136,33 @@ static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf) hvm_free_ioreq_gfn(s, iorp->gfn); iorp->gfn = INVALID_GFN; + return; } + + if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) ) + return; + + put_page_and_type(vmap_to_page((void *)s->ioreq)); + vunmap(s->ioreq); + s->ioreq = NULL; + + hvm_free_ioreq_gfn(s, s->ioreq_gfn); + s->ioreq_gfn = INVALID_GFN; } static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf) { struct domain *d = s->target; - struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq; + gfn_t base_gfn; + struct page_info *pg; + p2m_type_t p2mt; + mfn_t mfn; int rc; + if ( buf ) { + struct ioreq_page *iorp = &s->bufioreq; + if ( iorp->page ) { /* @@ -173,35 +192,141 @@ static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf) return rc; } + + if ( s->ioreq ) + { + /* + * If pages have already been allocated (which will happen on + * demand if ioreq_server_get_frame() is called), then + * mapping a guest frame is not permitted. + */ + if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) ) + return -EPERM; + return 0; + } + + /* + * The legacy GFN path supports only a single ioreq page. Guests requiring + * more ioreq slots must use the resource mapping interface + * (XENMEM_acquire_resource). + */ + if ( nr_ioreq_pages(d) > 1 ) + return -EOPNOTSUPP; + + if ( d->is_dying ) + return -EINVAL; + + base_gfn = hvm_alloc_ioreq_gfn(s); + + if ( gfn_eq(base_gfn, INVALID_GFN) ) + return -ENOMEM; + + /* + * The page management is the same as prepare_ring_for_helper(), but vmap() + * is used instead of __map_domain_page_global() to be consistent with + * ioreq_server_alloc_mfn(), which uses vmap() to map potentially multiple + * pages. 
This also allows vmap_to_page() to recover the struct page_info * + * from s->ioreq during teardown. + */ + rc = check_get_page_from_gfn(d, base_gfn, false, &p2mt, &pg); + if ( rc ) + { + if ( rc == -EAGAIN ) + rc = -ENOENT; + goto fail; + } + + if ( !get_page_type(pg, PGT_writable_page) ) + { + put_page(pg); + rc = -EINVAL; + goto fail; + } + + mfn = page_to_mfn(pg); + s->ioreq = vmap(&mfn, 1); + if ( !s->ioreq ) + { + put_page_and_type(pg); + rc = -ENOMEM; + goto fail; + } + + s->ioreq_gfn = base_gfn; + return 0; + + fail: + hvm_free_ioreq_gfn(s, base_gfn); + return rc; } static void hvm_remove_ioreq_gfn(struct ioreq_server *s, bool buf) { struct domain *d = s->target; - struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq; + struct page_info *pg; + void *va; + gfn_t gfn; - if ( gfn_eq(iorp->gfn, INVALID_GFN) ) - return; + if ( buf ) + { + struct ioreq_page *iorp = &s->bufioreq; + + if ( gfn_eq(iorp->gfn, INVALID_GFN) ) + return; + + gfn = iorp->gfn; + pg = iorp->page; + va = iorp->va; + } + else + { + if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) ) + return; - if ( p2m_remove_page(d, iorp->gfn, page_to_mfn(iorp->page), 0) ) + gfn = s->ioreq_gfn; + pg = vmap_to_page(s->ioreq); + va = s->ioreq; + } + + if ( p2m_remove_page(d, gfn, page_to_mfn(pg), 0) ) domain_crash(d); - clear_page(iorp->va); + clear_page(va); } static int hvm_add_ioreq_gfn(struct ioreq_server *s, bool buf) { struct domain *d = s->target; - struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq; + struct page_info *pg; + void *va; + gfn_t gfn; int rc; - if ( gfn_eq(iorp->gfn, INVALID_GFN) ) - return 0; + if ( buf ) + { + struct ioreq_page *iorp = &s->bufioreq; + + if ( gfn_eq(iorp->gfn, INVALID_GFN) ) + return 0; + + gfn = iorp->gfn; + pg = iorp->page; + va = iorp->va; + } + else + { + if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) ) + return 0; + + gfn = s->ioreq_gfn; + pg = vmap_to_page(s->ioreq); + va = s->ioreq; + } - clear_page(iorp->va); + clear_page(va); - rc = p2m_add_page(d, iorp->gfn, page_to_mfn(iorp->page), 0, p2m_ram_rw); + rc = p2m_add_page(d, gfn, page_to_mfn(pg), 0, p2m_ram_rw); if ( rc == 0 ) - paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn))); + paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn))); return rc; } diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c index 2e284ad26c..5a09e2ba36 100644 --- a/xen/common/ioreq.c +++ b/xen/common/ioreq.c @@ -26,6 +26,8 @@ #include <xen/paging.h> #include <xen/sched.h> #include <xen/trace.h> +#include <xen/vmap.h> +#include <xen/xvmalloc.h> #include <asm/guest_atomics.h> #include <asm/ioreq.h> @@ -95,12 +97,10 @@ static struct ioreq_server *get_ioreq_server(const struct domain *d, static ioreq_t *get_ioreq(struct ioreq_server *s, struct vcpu *v) { - shared_iopage_t *p = s->ioreq.va; - ASSERT((v == current) || !vcpu_runnable(v)); - ASSERT(p != NULL); + ASSERT(s->ioreq != NULL); - return &p->vcpu_ioreq[v->vcpu_id]; + return &s->ioreq[v->vcpu_id]; } /* @@ -260,10 +260,16 @@ bool vcpu_ioreq_handle_completion(struct vcpu *v) static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf) { - struct ioreq_page *iorp = buf ? 
&s->bufioreq : &s->ioreq; - struct page_info *page; + unsigned int i, nr_pages; + mfn_t *mfns; + int rc; + /* bufioreq: single page allocation */ + if ( buf ) { + struct ioreq_page *iorp = &s->bufioreq; + struct page_info *page; + if ( iorp->page ) { /* @@ -294,26 +300,92 @@ static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf) iorp->va = __map_domain_page_global(page); if ( !iorp->va ) - goto fail; + { + put_page_alloc_ref(page); + put_page_and_type(page); + return -ENOMEM; + } iorp->page = page; clear_page(iorp->va); return 0; } - fail: - put_page_alloc_ref(page); - put_page_and_type(page); + /* ioreq: multi-page allocation */ + if ( s->ioreq ) + { + /* + * If a guest frame has already been mapped (which may happen + * on demand if ioreq_server_get_info() is called), then + * allocating a page is not permitted. + */ + if ( !gfn_eq(s->ioreq_gfn, INVALID_GFN) ) + return -EPERM; + + return 0; + } + + nr_pages = nr_ioreq_pages(s->target); + mfns = xvmalloc_array(mfn_t, nr_pages); + + if ( !mfns ) + return -ENOMEM; + + for ( i = 0; i < nr_pages; i++ ) + { + struct page_info *pg = alloc_domheap_page(s->target, + MEMF_no_refcount); + + rc = -ENOMEM; + if ( !pg ) + goto fail_pages; + + if ( !get_page_and_type(pg, s->target, PGT_writable_page) ) + { + /* + * The domain can't possibly know about this page yet, so + * failure here is a clear indication of something fishy + * going on. + */ + domain_crash(s->emulator); + rc = -ENODATA; + goto fail_pages; + } + + mfns[i] = page_to_mfn(pg); + } + + s->ioreq = vmap(mfns, nr_pages); + if ( !s->ioreq ) + { + rc = -ENOMEM; + i = nr_pages; + goto fail_pages; + } + + memset(s->ioreq, 0, nr_pages * PAGE_SIZE); + xvfree(mfns); + return 0; - return -ENOMEM; + fail_pages: + while ( i-- > 0 ) + { + struct page_info *pg = mfn_to_page(mfns[i]); + + put_page_alloc_ref(pg); + put_page_and_type(pg); + } + xvfree(mfns); + return rc; } static void ioreq_server_free_mfn(struct ioreq_server *s, bool buf) { - struct ioreq_page *iorp = buf ? 
&s->bufioreq : &s->ioreq; - struct page_info *page = iorp->page; - + if ( buf ) { + struct ioreq_page *iorp = &s->bufioreq; + struct page_info *page = iorp->page; + if ( !page ) return; @@ -324,6 +396,23 @@ static void ioreq_server_free_mfn(struct ioreq_server *s, bool buf) put_page_alloc_ref(page); put_page_and_type(page); + return; + } + + if ( s->ioreq ) + { + unsigned int i, nr_pages = vmap_size(s->ioreq); + + for ( i = 0; i < nr_pages; i++ ) + { + struct page_info *pg = vmap_to_page((void *)s->ioreq + + i * PAGE_SIZE); + + put_page_alloc_ref(pg); + put_page_and_type(pg); + } + vunmap(s->ioreq); + s->ioreq = NULL; } } @@ -337,11 +426,29 @@ bool is_ioreq_server_page(struct domain *d, const struct page_info *page) FOR_EACH_IOREQ_SERVER(d, id, s) { - if ( (s->ioreq.page == page) || (s->bufioreq.page == page) ) + if ( s->bufioreq.page == page ) { found = true; break; } + + if ( s->ioreq ) + { + unsigned int i; + + for ( i = 0; i < nr_ioreq_pages(d); i++ ) + { + if ( vmap_to_page((char *)s->ioreq + + i * PAGE_SIZE) == page ) + { + found = true; + break; + } + } + + if ( found ) + break; + } } rspin_unlock(&d->ioreq_server.lock); @@ -354,7 +461,7 @@ static void ioreq_server_update_evtchn(struct ioreq_server *s, { ASSERT(spin_is_locked(&s->lock)); - if ( s->ioreq.va != NULL ) + if ( s->ioreq != NULL ) { ioreq_t *p = get_ioreq(s, sv->vcpu); @@ -594,7 +701,7 @@ static int ioreq_server_init(struct ioreq_server *s, INIT_LIST_HEAD(&s->ioreq_vcpu_list); spin_lock_init(&s->bufioreq_lock); - s->ioreq.gfn = INVALID_GFN; + s->ioreq_gfn = INVALID_GFN; s->bufioreq.gfn = INVALID_GFN; rc = ioreq_server_alloc_rangesets(s, id); @@ -773,7 +880,7 @@ static int ioreq_server_get_info(struct domain *d, ioservid_t id, } if ( ioreq_gfn ) - *ioreq_gfn = gfn_x(s->ioreq.gfn); + *ioreq_gfn = gfn_x(s->ioreq_gfn); if ( HANDLE_BUFIOREQ(s) ) { @@ -816,26 +923,29 @@ int ioreq_server_get_frame(struct domain *d, ioservid_t id, if ( rc ) goto out; - switch ( idx ) + if ( idx == XENMEM_resource_ioreq_server_frame_bufioreq ) { - case XENMEM_resource_ioreq_server_frame_bufioreq: rc = -ENOENT; if ( !HANDLE_BUFIOREQ(s) ) goto out; *mfn = page_to_mfn(s->bufioreq.page); rc = 0; - break; - - case XENMEM_resource_ioreq_server_frame_ioreq(0): - *mfn = page_to_mfn(s->ioreq.page); - rc = 0; - break; + } + else if ( idx >= XENMEM_resource_ioreq_server_frame_ioreq(0) && + idx < XENMEM_resource_ioreq_server_frame_ioreq(nr_ioreq_pages(d)) ) + { + unsigned int page_idx = idx - XENMEM_resource_ioreq_server_frame_ioreq(0); - default: rc = -EINVAL; - break; + if ( s->ioreq ) + { + *mfn = vmap_to_mfn((void *)s->ioreq + page_idx * PAGE_SIZE); + rc = 0; + } } + else + rc = -EINVAL; out: rspin_unlock(&d->ioreq_server.lock); diff --git a/xen/include/xen/ioreq.h b/xen/include/xen/ioreq.h index e86f0869fa..41650a59ca 100644 --- a/xen/include/xen/ioreq.h +++ b/xen/include/xen/ioreq.h @@ -19,9 +19,16 @@ #ifndef __XEN_IOREQ_H__ #define __XEN_IOREQ_H__ +#include <xen/macros.h> #include <xen/sched.h> #include <public/hvm/dm_op.h> +#include <public/hvm/ioreq.h> + +static inline unsigned int nr_ioreq_pages(const struct domain *d) +{ + return DIV_ROUND_UP(d->max_vcpus, PAGE_SIZE / sizeof(ioreq_t)); +} struct ioreq_page { gfn_t gfn; @@ -45,7 +52,8 @@ struct ioreq_server { /* Lock to serialize toolstack modifications */ spinlock_t lock; - struct ioreq_page ioreq; + ioreq_t *ioreq; + gfn_t ioreq_gfn; struct list_head ioreq_vcpu_list; struct ioreq_page bufioreq; -- 2.51.0 -- Julian Vetter | Vates Hypervisor & Kernel Developer XCP-ng & Xen Orchestra - Vates 
solutions
web: https://vates.tech
