Query dma-buf TPH metadata when registering a dma-buf MR for peer-to-peer access and translate the returned steering tag into an mlx5 ST index. Keep the DMAH path as the first priority and only fall back to dma-buf metadata when no DMAH is supplied.
Split the existing mlx5_st_alloc_index() into mlx5_st_alloc_index_by_tag() plus a wrapper that derives the tag from the CPU, so the dma-buf path can allocate an ST index directly from a raw steering tag without going through the per-CPU table. mlx5_st_alloc_index_by_tag() explicitly initialises 'ret' so that the duplicate-tag fast path doesn't return an uninitialised value; callers would otherwise observe that value when an MR re-uses a tag that already has an ST index allocated. For TPH-backed FRMRs, the extra ST-table reference belongs to the hardware mkey handle, not to the transient MR object. Add mlx5_st_get_index() and extend the FRMR pool API so that ib_frmr_pool_pop() reports whether a handle was reused and destroy_frmrs() receives the pool key. The DMAH and dma-buf paths take a provisional ST reference before the pool lookup; reuse drops that provisional reference immediately, while newly created handles keep it and release it only when the FRMR handle is actually destroyed, either directly or through FRMR pool aging/cleanup. Also decode the PH bits stored in kernel_vendor_key when recreating pooled mkeys so that the programmed requester hint matches the pool key.
Signed-off-by: Zhiping Zhang <[email protected]> --- drivers/infiniband/core/frmr_pools.c | 20 ++- drivers/infiniband/hw/mlx5/mr.c | 124 +++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/lib/st.c | 49 +++++-- include/linux/mlx5/driver.h | 12 ++ include/rdma/frmr_pools.h | 5 +- 5 files changed, 191 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 5e992ff3d7cf..61a77847118e 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -92,7 +92,8 @@ static void destroy_all_handles_in_queue(struct ib_device *device, u32 count; while (pop_frmr_handles_page(pool, queue, &page, &count)) { - pools->pool_ops->destroy_frmrs(device, page->handles, count); + pools->pool_ops->destroy_frmrs(device, &pool->key, + page->handles, count); kfree(page); } } @@ -136,7 +137,8 @@ static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool) spin_unlock(&pool->lock); if (destroyed) - pools->pool_ops->destroy_frmrs(device, handles, destroyed); + pools->pool_ops->destroy_frmrs(device, &pool->key, handles, + destroyed); kfree(handles); return has_work; } @@ -453,9 +455,11 @@ int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, } static int get_frmr_from_pool(struct ib_device *device, - struct ib_frmr_pool *pool, struct ib_mr *mr) + struct ib_frmr_pool *pool, struct ib_mr *mr, + bool *reused) { struct ib_frmr_pools *pools = device->frmr_pools; + bool local_reused = false; u32 handle; int err; @@ -464,6 +468,7 @@ static int get_frmr_from_pool(struct ib_device *device, if (pool->inactive_queue.ci > 0) { handle = pop_handle_from_queue_locked( &pool->inactive_queue); + local_reused = true; } else { spin_unlock(&pool->lock); err = pools->pool_ops->create_frmrs(device, &pool->key, @@ -474,6 +479,7 @@ static int get_frmr_from_pool(struct ib_device *device, } } else { handle = pop_handle_from_queue_locked(&pool->queue); + local_reused = 
true; } pool->in_use++; @@ -484,6 +490,8 @@ static int get_frmr_from_pool(struct ib_device *device, mr->frmr.pool = pool; mr->frmr.handle = handle; + if (reused) + *reused = local_reused; return 0; } @@ -493,10 +501,12 @@ static int get_frmr_from_pool(struct ib_device *device, * * @device: The device to pop the FRMR handle from. * @mr: The MR to pop the FRMR handle from. + * @reused: Optional output that reports whether the returned handle was + * reused from the pool instead of freshly created. * * Returns 0 on success, negative error code on failure. */ -int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr, bool *reused) { struct ib_frmr_pools *pools = device->frmr_pools; struct ib_frmr_pool *pool; @@ -509,7 +519,7 @@ int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) return PTR_ERR(pool); } - return get_frmr_from_pool(device, pool, mr); + return get_frmr_from_pool(device, pool, mr, reused); } EXPORT_SYMBOL(ib_frmr_pool_pop); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3b6da45061a5..b56df39d3385 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -38,6 +38,7 @@ #include <linux/delay.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> +#include <linux/pci-tph.h> #include <rdma/frmr_pools.h> #include <rdma/ib_umem_odp.h> #include "dm.h" @@ -167,12 +168,39 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, #define MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK 0xFF0000 #define MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK 0xFFFF +static int mlx5_ib_get_frmr_st_handle_ref(struct mlx5_ib_dev *dev, + u16 st_index) +{ + if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + return 0; + + return mlx5_st_get_index(dev->mdev, st_index); +} + +static void mlx5_ib_put_st_index_ref(struct mlx5_ib_dev *dev, u16 st_index) +{ + if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + return; + + 
mlx5_st_dealloc_index(dev->mdev, st_index); +} + +static void mlx5_ib_put_frmr_st_handle_ref(struct mlx5_ib_dev *dev, + u64 kernel_vendor_key) +{ + u16 st_index = kernel_vendor_key & + MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK; + + mlx5_ib_put_st_index_ref(dev, st_index); +} + static struct mlx5_ib_mr * _mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem, int access_flags, int access_mode, unsigned long page_size, u16 st_index, u8 ph) { struct mlx5_ib_mr *mr; + bool reused = false; int err; mr = kzalloc_obj(*mr); @@ -195,11 +223,14 @@ _mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem, mr->ibmr.frmr.key.kernel_vendor_key = st_index | (ph << MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT); - err = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr); + err = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr, &reused); if (err) { kfree(mr); return ERR_PTR(err); } + if (reused) + mlx5_ib_put_frmr_st_handle_ref( + dev, mr->ibmr.frmr.key.kernel_vendor_key); mr->mmkey.key = mr->ibmr.frmr.handle; init_waitqueue_head(&mr->mmkey.wait); @@ -229,7 +260,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, init_waitqueue_head(&mr->mmkey.wait); mr->ibmr.frmr.key = key; - ret = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr); + ret = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr, NULL); if (ret) { kfree(mr); return ERR_PTR(ret); @@ -273,7 +304,8 @@ static int mlx5r_create_mkeys(struct ib_device *device, struct ib_frmr_key *key, st_index = key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK; - ph = key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK; + ph = (key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK) >> + MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT; if (ph) { /* Normalize ph: swap MLX5_IB_NO_PH for 0 */ if (ph == MLX5_IB_NO_PH) @@ -299,7 +331,8 @@ static int mlx5r_create_mkeys(struct ib_device *device, struct ib_frmr_key *key, return err; } -static void mlx5r_destroy_mkeys(struct ib_device *device, u32 *handles, +static void 
mlx5r_destroy_mkeys(struct ib_device *device, + const struct ib_frmr_key *key, u32 *handles, unsigned int count) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -311,6 +344,9 @@ static void mlx5r_destroy_mkeys(struct ib_device *device, u32 *handles, pr_warn_ratelimited( "mlx5_ib: failed to destroy mkey %d: %d", handles[i], err); + else + mlx5_ib_put_frmr_st_handle_ref(dev, + key->kernel_vendor_key); } } @@ -333,6 +369,7 @@ static int mlx5r_build_frmr_key(struct ib_device *device, get_unchangeable_access_flags(dev, in->access_flags); out->vendor_key = in->vendor_key; out->num_dma_blocks = in->num_dma_blocks; + out->kernel_vendor_key = in->kernel_vendor_key; return 0; } @@ -753,6 +790,12 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { + err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index); + if (err) { + ib_umem_release(umem); + return ERR_PTR(err); + } + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, MLX5_MKC_ACCESS_MODE_MTT, st_index, ph); @@ -767,6 +810,8 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { + if (xlt_with_umr) + mlx5_ib_put_st_index_ref(dev, st_index); ib_umem_release(umem); return ERR_CAST(mr); } @@ -899,6 +944,65 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb, }; +/* + * Query TPH metadata from @dmabuf and translate the raw steering tag into + * an mlx5 ST index. On success *@st_index is updated with a provisional + * reference for a candidate FRMR handle and *@ph is updated to the dma-buf's + * processing hint. Callers that fail to allocate a handle, or that reuse an + * existing pooled handle, must drop the provisional ST reference. On any + * failure *@st_index and *@ph are left untouched, so the caller's no-TPH + * defaults stand. 
+ * + * @dmabuf must already be referenced by the caller (e.g. via the umem's + * attachment) so we don't re-resolve the user's fd here and avoid a + * dup2() TOCTOU between umem creation and TPH lookup. + */ +static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf, + u16 *st_index, u8 *ph) +{ + u16 local_st_index; + u16 steering_tag; + u8 local_ph; + bool extended; + int ret; + + if (!dmabuf->ops->get_tph) + return; + + switch (pcie_tph_enabled_req_type(dev->mdev->pdev)) { + case PCI_TPH_REQ_TPH_ONLY: + extended = false; + break; + case PCI_TPH_REQ_EXT_TPH: + extended = true; + break; + default: + return; + } + + ret = dmabuf->ops->get_tph(dmabuf, extended, &steering_tag, &local_ph); + if (ret) { + mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret); + return; + } + + ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, + &local_st_index); + if (ret) { + mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret); + return; + } + + *st_index = local_st_index; + *ph = local_ph; +} + +static void mlx5_ib_mr_put_frmr_st_handle_ref(struct mlx5_ib_mr *mr) +{ + mlx5_ib_put_frmr_st_handle_ref(mr_to_mdev(mr), + mr->ibmr.frmr.key.kernel_vendor_key); +} + static struct ib_mr * reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, u64 offset, u64 length, u64 virt_addr, @@ -941,12 +1045,22 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, ph = dmah->ph; if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) st_index = mdmah->st_index; + + err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index); + if (err) { + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); + } + } else { + get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf, + &st_index, &ph); } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, access_flags, access_mode, st_index, ph); if (IS_ERR(mr)) { + mlx5_ib_put_st_index_ref(dev, st_index); ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); } @@ -1400,6 +1514,8 @@ static int 
mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) dma_resv_unlock( to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); } + if (!ret) + mlx5_ib_mr_put_frmr_st_handle_ref(mr); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c index 7cedc348790d..877b37b4e639 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c @@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev) kfree(st); } -int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, - unsigned int cpu_uid, u16 *st_index) +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag, + u16 *st_index) { struct mlx5_st_idx_data *idx_data; struct mlx5_st *st = dev->st; unsigned long index; u32 xa_id; - u16 tag; - int ret; + int ret = 0; if (!st) return -EOPNOTSUPP; - ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); - if (ret) - return ret; - if (st->direct_mode) { *st_index = tag; return 0; @@ -152,8 +147,46 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, mutex_unlock(&st->lock); return ret; } +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag); + +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + u16 tag; + int ret; + + ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); + if (ret) + return ret; + + return mlx5_st_alloc_index_by_tag(dev, tag, st_index); +} EXPORT_SYMBOL_GPL(mlx5_st_alloc_index); +int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index) +{ + struct mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + int ret = 0; + + if (!st) + return -EOPNOTSUPP; + + if (st->direct_mode) + return 0; + + mutex_lock(&st->lock); + idx_data = xa_load(&st->idx_xa, st_index); + if (WARN_ON_ONCE(!idx_data)) + ret = -EINVAL; + else + refcount_inc(&idx_data->usecount); + mutex_unlock(&st->lock); + + return ret; 
+} +EXPORT_SYMBOL_GPL(mlx5_st_get_index); + int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) { struct mlx5_st_idx_data *idx_data; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04b96c5abb57..0480b5c4f189 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1166,10 +1166,22 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type u64 length, u16 uid, phys_addr_t addr, u32 obj_id); #ifdef CONFIG_PCIE_TPH +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag, + u16 *st_index); +int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index); int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, unsigned int cpu_uid, u16 *st_index); int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index); #else +static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, + u16 tag, u16 *st_index) +{ + return -EOPNOTSUPP; +} +static inline int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index) +{ + return -EOPNOTSUPP; +} static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, unsigned int cpu_uid, u16 *st_index) diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h index af1b88801fa4..a08d2b2cf9f3 100644 --- a/include/rdma/frmr_pools.h +++ b/include/rdma/frmr_pools.h @@ -24,7 +24,8 @@ struct ib_frmr_key { struct ib_frmr_pool_ops { int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key, u32 *handles, u32 count); - void (*destroy_frmrs)(struct ib_device *device, u32 *handles, + void (*destroy_frmrs)(struct ib_device *device, + const struct ib_frmr_key *key, u32 *handles, u32 count); int (*build_key)(struct ib_device *device, const struct ib_frmr_key *in, struct ib_frmr_key *out); @@ -33,7 +34,7 @@ struct ib_frmr_pool_ops { int ib_frmr_pools_init(struct ib_device *device, const struct ib_frmr_pool_ops *pool_ops); void ib_frmr_pools_cleanup(struct ib_device 
*device); -int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr); +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr, bool *reused); int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr); #endif /* FRMR_POOLS_H */ -- 2.53.0-Meta
