On Thu, 11 Sep 2025 14:33:07 +0300
Leon Romanovsky <[email protected]> wrote:

> From: Leon Romanovsky <[email protected]>
> 
> Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA
> functionality from the optional memory allocation layer. This creates a
> two-tier architecture:
> 
> The core layer provides P2P mapping functionality for physical addresses
> based on PCI device MMIO BARs and integrates with the DMA API for
> mapping operations. This layer is required for all P2PDMA users.
> 
> The optional upper layer provides memory allocation capabilities
> including gen_pool allocator, struct page support, and sysfs interface
> for user space access.
> 
> This separation allows subsystems like VFIO to use only the core P2P
> mapping functionality without the overhead of memory allocation features
> they don't need. The core functionality is now available through the
> new pci_p2pdma_enable() function that returns a p2pdma_provider
> structure.
> 
> Signed-off-by: Leon Romanovsky <[email protected]>
> ---
>  drivers/pci/p2pdma.c       | 129 +++++++++++++++++++++++++++----------
>  include/linux/pci-p2pdma.h |   5 ++
>  2 files changed, 100 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 176a99232fdca..c22cbb3a26030 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -25,11 +25,12 @@ struct pci_p2pdma {
>       struct gen_pool *pool;
>       bool p2pmem_published;
>       struct xarray map_types;
> +     struct p2pdma_provider mem[PCI_STD_NUM_BARS];
>  };
>  
>  struct pci_p2pdma_pagemap {
>       struct dev_pagemap pgmap;
> -     struct p2pdma_provider mem;
> +     struct p2pdma_provider *mem;
>  };
>  
>  static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
> @@ -204,7 +205,7 @@ static void p2pdma_page_free(struct page *page)
>       struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
>       /* safe to dereference while a reference is held to the percpu ref */
>       struct pci_p2pdma *p2pdma = rcu_dereference_protected(
> -             to_pci_dev(pgmap->mem.owner)->p2pdma, 1);
> +             to_pci_dev(pgmap->mem->owner)->p2pdma, 1);
>       struct percpu_ref *ref;
>  
>       gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
> @@ -227,44 +228,93 @@ static void pci_p2pdma_release(void *data)
>  
>       /* Flush and disable pci_alloc_p2p_mem() */
>       pdev->p2pdma = NULL;
> -     synchronize_rcu();
> +     if (p2pdma->pool)
> +             synchronize_rcu();
> +     xa_destroy(&p2pdma->map_types);
> +
> +     if (!p2pdma->pool)
> +             return;
>  
>       gen_pool_destroy(p2pdma->pool);
>       sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
> -     xa_destroy(&p2pdma->map_types);
>  }
>  
> -static int pci_p2pdma_setup(struct pci_dev *pdev)
> +/**
> + * pcim_p2pdma_enable - Enable peer-to-peer DMA support for a PCI device
> + * @pdev: The PCI device to enable P2PDMA for
> + * @bar: BAR index to get provider
> + *
> + * This function initializes the peer-to-peer DMA infrastructure for a PCI
> + * device. It allocates and sets up the necessary data structures to support
> + * P2PDMA operations, including mapping type tracking.
> + */
> +struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev *pdev, int bar)
>  {
> -     int error = -ENOMEM;
>       struct pci_p2pdma *p2p;
> +     int i, ret;
> +
> +     p2p = rcu_dereference_protected(pdev->p2pdma, 1);
> +     if (p2p)
> +             /* PCI device was "rebound" to the driver */
> +             return &p2p->mem[bar];
>  

This seems like two separate functions rolled into one, an 'initialize
providers' and a 'get provider for BAR'.  The comment above even makes
it sound like only a driver re-probing a device would encounter this
branch, but the use case later in vfio-pci shows it to be the common
case to iterate BARs for a device.

But then later in patch 8/ and again in 10/ why exactly do we cache
the provider on the vfio_pci_core_device rather than ask for it on
demand from the p2pdma?

It also seems like the coordination of a valid provider is ad-hoc
between p2pdma and vfio-pci.  For example, this only fills providers
for MMIO BARs and vfio-pci validates that dmabuf operations are for
MMIO BARs, but it would be more consistent if vfio-pci relied on p2pdma
to give it a valid provider for a given BAR.  Thanks,

Alex

>       p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
>       if (!p2p)
> -             return -ENOMEM;
> +             return ERR_PTR(-ENOMEM);
>  
>       xa_init(&p2p->map_types);
> +     /*
> +      * Iterate over all standard PCI BARs and record only those that
> +      * correspond to MMIO regions. Skip non-memory resources (e.g. I/O
> +      * port BARs) since they cannot be used for peer-to-peer (P2P)
> +      * transactions.
> +      */
> +     for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> +             if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
> +                     continue;
>  
> -     p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
> -     if (!p2p->pool)
> -             goto out;
> +             p2p->mem[i].owner = &pdev->dev;
> +             p2p->mem[i].bus_offset =
> +                     pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
> +     }
>  
> -     error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> -     if (error)
> -             goto out_pool_destroy;
> +     ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> +     if (ret)
> +             goto out_p2p;
>  
> -     error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
> -     if (error)
> +     rcu_assign_pointer(pdev->p2pdma, p2p);
> +     return &p2p->mem[bar];
> +
> +out_p2p:
> +     devm_kfree(&pdev->dev, p2p);
> +     return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL_GPL(pcim_p2pdma_enable);
> +
> +static int pci_p2pdma_setup_pool(struct pci_dev *pdev)
> +{
> +     struct pci_p2pdma *p2pdma;
> +     int ret;
> +
> +     p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> +     if (p2pdma->pool)
> +             /* We already setup pools, do nothing, */
> +             return 0;
> +
> +     p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
> +     if (!p2pdma->pool)
> +             return -ENOMEM;
> +
> +     ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
> +     if (ret)
>               goto out_pool_destroy;
>  
> -     rcu_assign_pointer(pdev->p2pdma, p2p);
>       return 0;
>  
>  out_pool_destroy:
> -     gen_pool_destroy(p2p->pool);
> -out:
> -     devm_kfree(&pdev->dev, p2p);
> -     return error;
> +     gen_pool_destroy(p2pdma->pool);
> +     p2pdma->pool = NULL;
> +     return ret;
>  }
>  
>  static void pci_p2pdma_unmap_mappings(void *data)
> @@ -276,7 +326,7 @@ static void pci_p2pdma_unmap_mappings(void *data)
>        * unmap_mapping_range() on the inode, teardown any existing userspace
>        * mappings and prevent new ones from being created.
>        */
> -     sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj,
> +     sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj,
>                                    &p2pmem_alloc_attr.attr,
>                                    p2pmem_group.name);
>  }
> @@ -295,6 +345,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int 
> bar, size_t size,
>                           u64 offset)
>  {
>       struct pci_p2pdma_pagemap *p2p_pgmap;
> +     struct p2pdma_provider *mem;
>       struct dev_pagemap *pgmap;
>       struct pci_p2pdma *p2pdma;
>       void *addr;
> @@ -312,15 +363,25 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int 
> bar, size_t size,
>       if (size + offset > pci_resource_len(pdev, bar))
>               return -EINVAL;
>  
> -     if (!pdev->p2pdma) {
> -             error = pci_p2pdma_setup(pdev);
> +     p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> +     if (!p2pdma) {
> +             mem = pcim_p2pdma_enable(pdev, bar);
> +             if (IS_ERR(mem))
> +                     return PTR_ERR(mem);
> +
> +             error = pci_p2pdma_setup_pool(pdev);
>               if (error)
>                       return error;
> -     }
> +
> +             p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> +     } else
> +             mem = &p2pdma->mem[bar];
>  
>       p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
> -     if (!p2p_pgmap)
> -             return -ENOMEM;
> +     if (!p2p_pgmap) {
> +             error = -ENOMEM;
> +             goto free_pool;
> +     }
>  
>       pgmap = &p2p_pgmap->pgmap;
>       pgmap->range.start = pci_resource_start(pdev, bar) + offset;
> @@ -328,9 +389,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int 
> bar, size_t size,
>       pgmap->nr_range = 1;
>       pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
>       pgmap->ops = &p2pdma_pgmap_ops;
> -     p2p_pgmap->mem.owner = &pdev->dev;
> -     p2p_pgmap->mem.bus_offset =
> -             pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar);
> +     p2p_pgmap->mem = mem;
>  
>       addr = devm_memremap_pages(&pdev->dev, pgmap);
>       if (IS_ERR(addr)) {
> @@ -343,7 +402,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int 
> bar, size_t size,
>       if (error)
>               goto pages_free;
>  
> -     p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
>       error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
>                       pci_bus_address(pdev, bar) + offset,
>                       range_len(&pgmap->range), dev_to_node(&pdev->dev),
> @@ -359,7 +417,10 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int 
> bar, size_t size,
>  pages_free:
>       devm_memunmap_pages(&pdev->dev, pgmap);
>  pgmap_free:
> -     devm_kfree(&pdev->dev, pgmap);
> +     devm_kfree(&pdev->dev, p2p_pgmap);
> +free_pool:
> +     sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
> +     gen_pool_destroy(p2pdma->pool);
>       return error;
>  }
>  EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
> @@ -1008,11 +1069,11 @@ void __pci_p2pdma_update_state(struct 
> pci_p2pdma_map_state *state,
>  {
>       struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
>  
> -     if (state->mem == &p2p_pgmap->mem)
> +     if (state->mem == p2p_pgmap->mem)
>               return;
>  
> -     state->mem = &p2p_pgmap->mem;
> -     state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev);
> +     state->mem = p2p_pgmap->mem;
> +     state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev);
>  }
>  
>  /**
> diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
> index eef96636c67e6..888ad7b0c54cf 100644
> --- a/include/linux/pci-p2pdma.h
> +++ b/include/linux/pci-p2pdma.h
> @@ -27,6 +27,7 @@ struct p2pdma_provider {
>  };
>  
>  #ifdef CONFIG_PCI_P2PDMA
> +struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev *pdev, int bar);
>  int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>               u64 offset);
>  int pci_p2pdma_distance_many(struct pci_dev *provider, struct device 
> **clients,
> @@ -45,6 +46,10 @@ int pci_p2pdma_enable_store(const char *page, struct 
> pci_dev **p2p_dev,
>  ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
>                              bool use_p2pdma);
>  #else /* CONFIG_PCI_P2PDMA */
> +static inline struct p2pdma_provider *pcim_p2pdma_enable(struct pci_dev 
> *pdev, int bar)
> +{
> +     return ERR_PTR(-EOPNOTSUPP);
> +}
>  static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
>               size_t size, u64 offset)
>  {

Reply via email to