On Mon, May 18, 2026 at 8:36 AM Clément Léger <[email protected]> wrote:
>
> From: Pavel Begunkov <[email protected]>
>
> There are currently no easy ways for the user to know if zcrx is out of
> buffers and page pool fails to allocate. Add uapi for zcrx to communicate
> it back.
>
> It's implemented as a separate CQE, which for now is posted to the creator
> ctx. To use it, on registration the user space needs to pass an instance
> of struct zcrx_notification_desc, which tells the kernel the user_data
> for resulting CQEs and which event types are expected / allowed.
>
> When an allowed event happens, zcrx will post a CQE containing the
> specified user_data, and lower bits of cqe->res will be set to the event
> mask. Before the kernel could post another notification of the given
> type, the user needs to acknowledge that it processed the previous one
> by issuing IORING_REGISTER_ZCRX_CTRL with ZCRX_CTRL_ARM_NOTIFICATION.
>
> The only notification type the patch implements is
> ZCRX_NOTIF_NO_BUFFERS, but we'll need more of them in the future.
>
> Co-developed-by: Vishwanath Seshagiri <[email protected]>
> Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: Vishwanath Seshagiri <[email protected]>
> ---
> include/uapi/linux/io_uring/zcrx.h | 24 ++++++++-
> io_uring/io_uring.c | 2 +-
> io_uring/io_uring.h | 1 +
> io_uring/zcrx.c | 86 +++++++++++++++++++++++++++++-
> io_uring/zcrx.h | 7 ++-
> 5 files changed, 115 insertions(+), 5 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring/zcrx.h
> b/include/uapi/linux/io_uring/zcrx.h
> index 5ce02c7a6096..67185566ad3c 100644
> --- a/include/uapi/linux/io_uring/zcrx.h
> +++ b/include/uapi/linux/io_uring/zcrx.h
> @@ -65,6 +65,20 @@ enum zcrx_features {
> * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
> */
> ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
> + ZCRX_FEATURE_NOTIFICATION = 1 << 1,
> +};
> +
> +enum zcrx_notification_type {
> + ZCRX_NOTIF_NO_BUFFERS,
> +
> + __ZCRX_NOTIF_TYPE_LAST,
> +};
> +
> +struct zcrx_notification_desc {
> + __u64 user_data;
> + __u32 type_mask;
> + __u32 __resv1;
> + __u64 __resv2[10];
> };
>
> /*
> @@ -82,12 +96,14 @@ struct io_uring_zcrx_ifq_reg {
> struct io_uring_zcrx_offsets offsets;
> __u32 zcrx_id;
> __u32 rx_buf_len;
> - __u64 __resv[3];
> + __u64 notif_desc; /* see struct zcrx_notification_desc */
> + __u64 __resv[2];
> };
>
> enum zcrx_ctrl_op {
> ZCRX_CTRL_FLUSH_RQ,
> ZCRX_CTRL_EXPORT,
> + ZCRX_CTRL_ARM_NOTIFICATION,
>
> __ZCRX_CTRL_LAST,
> };
> @@ -101,6 +117,11 @@ struct zcrx_ctrl_export {
> __u32 __resv1[11];
> };
>
> +struct zcrx_ctrl_arm_notif {
> + __u32 notif_type;
> + __u32 __resv[11];
> +};
> +
> struct zcrx_ctrl {
> __u32 zcrx_id;
> __u32 op; /* see enum zcrx_ctrl_op */
> @@ -109,6 +130,7 @@ struct zcrx_ctrl {
> union {
> struct zcrx_ctrl_export zc_export;
> struct zcrx_ctrl_flush_rq zc_flush;
> + struct zcrx_ctrl_arm_notif zc_arm_notif;
> };
> };
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 2ebb0ba37c4f..c5972274cce1 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -160,7 +160,7 @@ static void io_poison_cached_req(struct io_kiocb *req)
> req->apoll = IO_URING_PTR_POISON;
> }
>
> -static void io_poison_req(struct io_kiocb *req)
> +void io_poison_req(struct io_kiocb *req)
> {
> io_poison_cached_req(req);
> req->async_data = IO_URING_PTR_POISON;
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index e612a66ee80e..de0a3bed58d1 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -213,6 +213,7 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
>
> void io_activate_pollwq(struct io_ring_ctx *ctx);
> void io_restriction_clone(struct io_restriction *dst, struct io_restriction
> *src);
> +void io_poison_req(struct io_kiocb *req);
>
> static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
> {
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index 34faf90423f4..463fbaead35b 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -768,6 +768,8 @@ static int import_zcrx(struct io_ring_ctx *ctx,
> return -EINVAL;
> if (reg->if_rxq || reg->rq_entries || reg->area_ptr ||
> reg->region_ptr)
> return -EINVAL;
> + if (reg->notif_desc)
> + return -EINVAL;
> if (reg->flags & ~ZCRX_REG_IMPORT)
> return -EINVAL;
>
> @@ -856,6 +858,7 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
> int io_register_zcrx(struct io_ring_ctx *ctx,
> struct io_uring_zcrx_ifq_reg __user *arg)
> {
> + struct zcrx_notification_desc notif;
> struct io_uring_zcrx_area_reg area;
> struct io_uring_zcrx_ifq_reg reg;
> struct io_uring_region_desc rd;
> @@ -899,10 +902,22 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
> if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr),
> sizeof(area)))
> return -EFAULT;
>
> + memset(&notif, 0, sizeof(notif));
> + if (reg.notif_desc && copy_from_user(&notif,
> u64_to_user_ptr(reg.notif_desc),
> + sizeof(notif)))
> + return -EFAULT;
> + if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
> + return -EINVAL;
> + if (notif.__resv1 || !mem_is_zero(&notif.__resv2,
> sizeof(notif.__resv2)))
> + return -EINVAL;
> +
> ifq = io_zcrx_ifq_alloc(ctx);
> if (!ifq)
> return -ENOMEM;
>
> + ifq->notif_data = notif.user_data;
> + ifq->allowed_notif_mask = notif.type_mask;
> +
> if (ctx->user) {
> get_uid(ctx->user);
> ifq->user = ctx->user;
> @@ -954,7 +969,8 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
> goto err;
> }
>
> - zcrx_set_ring_ctx(ifq, ctx);
> + if (notif.type_mask)
> + zcrx_set_ring_ctx(ifq, ctx);
> return 0;
> err:
> scoped_guard(mutex, &ctx->mmap_lock)
> @@ -1127,6 +1143,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool
> *pp, struct io_zcrx_ifq *if
> return allocated;
> }
>
> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
> +{
> + struct io_kiocb *req = tw_req.req;
> + struct io_ring_ctx *ctx = req->ctx;
> +
> + io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
> + percpu_ref_put(&ctx->refs);
> + io_poison_req(req);
> + kmem_cache_free(req_cachep, req);
> +}
> +
> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
> +{
> + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
> + u32 type_mask = 1 << type;
> + struct io_kiocb *req;
> +
> + if (!(type_mask & ifq->allowed_notif_mask))
> + return;
> +
> + guard(spinlock_bh)(&ifq->ctx_lock);
> + if (!ifq->master_ctx)
> + return;
> + if (type_mask & ifq->fired_notifs)
> + return;
> +
> + req = kmem_cache_alloc(req_cachep, gfp);
> + if (unlikely(!req))
> + return;
> +
> + ifq->fired_notifs |= type_mask;
> +
> + req->opcode = IORING_OP_NOP;
> + req->cqe.user_data = ifq->notif_data;
> + req->cqe.res = type;
> + req->ctx = ifq->master_ctx;
> + percpu_ref_get(&req->ctx->refs);
> + req->tctx = NULL;
> + req->io_task_work.func = zcrx_notif_tw;
> + io_req_task_work_add(req);
> +}
> +
> static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
> {
> struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
> @@ -1143,8 +1201,10 @@ static netmem_ref io_pp_zc_alloc_netmems(struct
> page_pool *pp, gfp_t gfp)
> goto out_return;
>
> allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
> - if (!allocated)
> + if (!allocated) {
> + zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS);
> return 0;
> + }
> out_return:
> zcrx_sync_for_device(pp, ifq, netmems, allocated);
> allocated--;
> @@ -1293,12 +1353,32 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx,
> struct io_zcrx_ifq *zcrx,
> return 0;
> }
>
> +static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
> + struct zcrx_ctrl *ctrl)
> +{
> + const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif;
> + unsigned type_mask;
> +
> + if (an->notif_type >= __ZCRX_NOTIF_TYPE_LAST)
> + return -EINVAL;
> + if (!mem_is_zero(&an->__resv, sizeof(an->__resv)))
> + return -EINVAL;
> +
> + guard(spinlock_bh)(&zcrx->ctx_lock);
> + type_mask = 1U << an->notif_type;
> + if (type_mask & ~zcrx->fired_notifs)
> + return -EINVAL;
> + zcrx->fired_notifs &= ~type_mask;
> + return 0;
> +}
> +
> int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
> {
> struct zcrx_ctrl ctrl;
> struct io_zcrx_ifq *zcrx;
>
> BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
> + BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif));
>
> if (nr_args)
> return -EINVAL;
> @@ -1316,6 +1396,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user
> *arg, unsigned nr_args)
> return zcrx_flush_rq(ctx, zcrx, &ctrl);
> case ZCRX_CTRL_EXPORT:
> return zcrx_export(ctx, zcrx, &ctrl, arg);
> + case ZCRX_CTRL_ARM_NOTIFICATION:
> + return zcrx_arm_notif(ctx, zcrx, &ctrl);
> }
>
> return -EOPNOTSUPP;
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
> index 6b565d0bf6da..cca10d0d02ac 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -9,7 +9,9 @@
> #include <net/net_trackers.h>
>
> #define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
> -#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE)
> +#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE |\
> + ZCRX_FEATURE_NOTIFICATION)
> +#define ZCRX_NOTIF_TYPE_MASK (1U << ZCRX_NOTIF_NO_BUFFERS)
>
> struct io_zcrx_mem {
> unsigned long size;
> @@ -76,6 +78,9 @@ struct io_zcrx_ifq {
>
> spinlock_t ctx_lock;
> struct io_ring_ctx *master_ctx;
> + u32 allowed_notif_mask;
> + u32 fired_notifs;
> + u64 notif_data;
> };
>
> #if defined(CONFIG_IO_URING_ZCRX)
> --
> 2.53.0-Meta
>