Hi Shameer, Cedric,
> Subject: [PATCH 3/3] hw/vfio/region: Create dmabuf for PCI BAR per region
>
> From: Nicolin Chen <[email protected]>
>
> Linux now provides a VFIO dmabuf exporter to expose PCI BAR memory for
> P2P
> use cases. Create a dmabuf for each mapped BAR region after the mmap is
> set
> up, and store the returned fd in the region's RAMBlock. This allows QEMU
> to
> pass the fd to dma_map_file(), enabling iommufd to import the dmabuf and
> map
> the BAR correctly in the host IOMMU page table.
>
> If the kernel lacks support or dmabuf setup fails, QEMU skips the setup
> and continues with normal mmap handling.
>
> Signed-off-by: Nicolin Chen <[email protected]>
> Signed-off-by: Shameer Kolothum <[email protected]>
> ---
> hw/vfio/region.c | 57
> +++++++++++++++++++++++++++++++++++++++++++-
> hw/vfio/trace-events | 1 +
> 2 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index b165ab0b93..6949f6779c 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -29,6 +29,7 @@
> #include "qemu/error-report.h"
> #include "qemu/units.h"
> #include "monitor/monitor.h"
> +#include "system/ramblock.h"
> #include "vfio-helpers.h"
>
> /*
> @@ -238,13 +239,52 @@ static void vfio_subregion_unmap(VFIORegion
> *region, int index)
> region->mmaps[index].mmap = NULL;
> }
>
> +static int vfio_region_create_dma_buf(VFIORegion *region)
Would it make sense to consolidate this implementation with the one from my
series:
https://lore.kernel.org/qemu-devel/[email protected]/
so that it is a bit more generic and can also be invoked from outside of VFIO?
Or, is it ok to have two dmabuf implementations: one that is internal to VFIO
and takes a VFIORegion as input like this one and another one that takes a
VFIODevice and iovec as input and can be invoked externally?
Thanks,
Vivek
> +{
> + g_autofree struct vfio_device_feature *feature = NULL;
> + VFIODevice *vbasedev = region->vbasedev;
> + struct vfio_device_feature_dma_buf *dma_buf;
> + size_t total_size;
> + int i, ret;
> +
> + g_assert(region->nr_mmaps);
> +
> + total_size = sizeof(*feature) + sizeof(*dma_buf) +
> + sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
> + feature = g_malloc0(total_size);
> + *feature = (struct vfio_device_feature) {
> + .argsz = total_size,
> + .flags = VFIO_DEVICE_FEATURE_GET |
> VFIO_DEVICE_FEATURE_DMA_BUF,
> + };
> +
> + dma_buf = (void *)feature->data;
> + *dma_buf = (struct vfio_device_feature_dma_buf) {
> + .region_index = region->nr,
> + .open_flags = O_RDWR,
> + .nr_ranges = region->nr_mmaps,
> + };
> +
> + for (i = 0; i < region->nr_mmaps; i++) {
> + dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
> + dma_buf->dma_ranges[i].length = region->mmaps[i].size;
> + }
> +
> + ret = vbasedev->io_ops->device_feature(vbasedev, feature);
> + for (i = 0; i < region->nr_mmaps; i++) {
> + trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
> + region->mem->name, region->mmaps[i].offset,
> + region->mmaps[i].size);
> + }
> + return ret;
> +}
> +
> int vfio_region_mmap(VFIORegion *region)
> {
> int i, ret, prot = 0;
> char *name;
> int fd;
>
> - if (!region->mem) {
> + if (!region->mem || !region->nr_mmaps) {
> return 0;
> }
>
> @@ -305,6 +345,21 @@ int vfio_region_mmap(VFIORegion *region)
> region->mmaps[i].size - 1);
> }
>
> + ret = vfio_region_create_dma_buf(region);
> + if (ret < 0) {
> + if (ret == -ENOTTY) {
> + warn_report_once("VFIO dmabuf not supported in kernel");
> + } else {
> + error_report("%s: failed to create dmabuf: %s",
> + memory_region_name(region->mem), strerror(errno));
> + }
> + } else {
> + MemoryRegion *mr = ®ion->mmaps[0].mem;
> + RAMBlock *ram_block = mr->ram_block;
> +
> + ram_block->fd = ret;
> + }
> +
> return 0;
>
> no_mmap:
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 1e895448cd..592a0349d4 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -117,6 +117,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
> vfio_region_write(const char *name, int index, uint64_t addr, uint64_t
> data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
> vfio_region_read(char *name, int index, uint64_t addr, unsigned size,
> uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
> vfio_region_setup(const char *dev, int index, const char *name, unsigned
> long flags, unsigned long offset, unsigned long size) "Device %s, region %d
> \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
> +vfio_region_dmabuf(const char *dev, int fd, int index, const char *name,
> unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region
> %d \"%s\", offset: 0x%lx, size: 0x%lx"
> vfio_region_mmap_fault(const char *name, int index, unsigned long offset,
> unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault:
> %d"
> vfio_region_mmap(const char *name, unsigned long offset, unsigned long
> end) "Region %s [0x%lx - 0x%lx]"
> vfio_region_exit(const char *name, int index) "Device %s, region %d"
> --
> 2.43.0
>