On Mon, May 18, 2026 at 8:36 AM Clément Léger <[email protected]> wrote:
>
> From: Pavel Begunkov <[email protected]>
>
> There is currently no easy way for the user to know that zcrx is out of
> buffers and the page pool is failing to allocate. Add uapi for zcrx to
> communicate this back.
>
> It's implemented as a separate CQE, which for now is posted to the creator
> ctx. To use it, user space needs to pass an instance of
> struct zcrx_notification_desc at registration time, which tells the kernel
> the user_data for resulting CQEs and which event types are expected / allowed.
>
> When an allowed event happens, zcrx will post a CQE containing the
> specified user_data, and the lower bits of cqe->res will be set to the event
> mask. Before the kernel can post another notification of a given
> type, the user needs to acknowledge that it has processed the previous one
> by issuing IORING_REGISTER_ZCRX_CTRL with ZCRX_CTRL_ARM_NOTIFICATION.
>
> The only notification type the patch implements is
> ZCRX_NOTIF_NO_BUFFERS, but we'll need more of them in the future.
>
> Co-developed-by: Vishwanath Seshagiri <[email protected]>
> Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: Vishwanath Seshagiri <[email protected]>
> ---
>  include/uapi/linux/io_uring/zcrx.h | 24 ++++++++-
>  io_uring/io_uring.c                |  2 +-
>  io_uring/io_uring.h                |  1 +
>  io_uring/zcrx.c                    | 86 +++++++++++++++++++++++++++++-
>  io_uring/zcrx.h                    |  7 ++-
>  5 files changed, 115 insertions(+), 5 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
> index 5ce02c7a6096..67185566ad3c 100644
> --- a/include/uapi/linux/io_uring/zcrx.h
> +++ b/include/uapi/linux/io_uring/zcrx.h
> @@ -65,6 +65,20 @@ enum zcrx_features {
>          * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
>          */
>         ZCRX_FEATURE_RX_PAGE_SIZE       = 1 << 0,
> +       ZCRX_FEATURE_NOTIFICATION       = 1 << 1,
> +};
> +
> +enum zcrx_notification_type {
> +       ZCRX_NOTIF_NO_BUFFERS,
> +
> +       __ZCRX_NOTIF_TYPE_LAST,
> +};
> +
> +struct zcrx_notification_desc {
> +       __u64   user_data;
> +       __u32   type_mask;
> +       __u32   __resv1;
> +       __u64   __resv2[10];
>  };
>
>  /*
> @@ -82,12 +96,14 @@ struct io_uring_zcrx_ifq_reg {
>         struct io_uring_zcrx_offsets offsets;
>         __u32   zcrx_id;
>         __u32   rx_buf_len;
> -       __u64   __resv[3];
> +       __u64   notif_desc; /* see struct zcrx_notification_desc */
> +       __u64   __resv[2];
>  };
>
>  enum zcrx_ctrl_op {
>         ZCRX_CTRL_FLUSH_RQ,
>         ZCRX_CTRL_EXPORT,
> +       ZCRX_CTRL_ARM_NOTIFICATION,
>
>         __ZCRX_CTRL_LAST,
>  };
> @@ -101,6 +117,11 @@ struct zcrx_ctrl_export {
>         __u32           __resv1[11];
>  };
>
> +struct zcrx_ctrl_arm_notif {
> +       __u32           notif_type;
> +       __u32           __resv[11];
> +};
> +
>  struct zcrx_ctrl {
>         __u32   zcrx_id;
>         __u32   op; /* see enum zcrx_ctrl_op */
> @@ -109,6 +130,7 @@ struct zcrx_ctrl {
>         union {
>                 struct zcrx_ctrl_export         zc_export;
>                 struct zcrx_ctrl_flush_rq       zc_flush;
> +               struct zcrx_ctrl_arm_notif      zc_arm_notif;
>         };
>  };
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 2ebb0ba37c4f..c5972274cce1 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -160,7 +160,7 @@ static void io_poison_cached_req(struct io_kiocb *req)
>         req->apoll = IO_URING_PTR_POISON;
>  }
>
> -static void io_poison_req(struct io_kiocb *req)
> +void io_poison_req(struct io_kiocb *req)
>  {
>         io_poison_cached_req(req);
>         req->async_data = IO_URING_PTR_POISON;
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index e612a66ee80e..de0a3bed58d1 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -213,6 +213,7 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
>
>  void io_activate_pollwq(struct io_ring_ctx *ctx);
>  void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
> +void io_poison_req(struct io_kiocb *req);
>
>  static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
>  {
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index 34faf90423f4..463fbaead35b 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -768,6 +768,8 @@ static int import_zcrx(struct io_ring_ctx *ctx,
>                 return -EINVAL;
>         if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
>                 return -EINVAL;
> +       if (reg->notif_desc)
> +               return -EINVAL;
>         if (reg->flags & ~ZCRX_REG_IMPORT)
>                 return -EINVAL;
>
> @@ -856,6 +858,7 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
>  int io_register_zcrx(struct io_ring_ctx *ctx,
>                      struct io_uring_zcrx_ifq_reg __user *arg)
>  {
> +       struct zcrx_notification_desc notif;
>         struct io_uring_zcrx_area_reg area;
>         struct io_uring_zcrx_ifq_reg reg;
>         struct io_uring_region_desc rd;
> @@ -899,10 +902,22 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
>         if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
>                 return -EFAULT;
>
> +       memset(&notif, 0, sizeof(notif));
> +       if (reg.notif_desc && copy_from_user(&notif, u64_to_user_ptr(reg.notif_desc),
> +                                            sizeof(notif)))
> +               return -EFAULT;
> +       if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
> +               return -EINVAL;
> +       if (notif.__resv1 || !mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
> +               return -EINVAL;
> +
>         ifq = io_zcrx_ifq_alloc(ctx);
>         if (!ifq)
>                 return -ENOMEM;
>
> +       ifq->notif_data = notif.user_data;
> +       ifq->allowed_notif_mask = notif.type_mask;
> +
>         if (ctx->user) {
>                 get_uid(ctx->user);
>                 ifq->user = ctx->user;
> @@ -954,7 +969,8 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
>                 goto err;
>         }
>
> -       zcrx_set_ring_ctx(ifq, ctx);
> +       if (notif.type_mask)
> +               zcrx_set_ring_ctx(ifq, ctx);
>         return 0;
>  err:
>         scoped_guard(mutex, &ctx->mmap_lock)
> @@ -1127,6 +1143,48 @@ static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *if
>         return allocated;
>  }
>
> +static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw)
> +{
> +       struct io_kiocb *req = tw_req.req;
> +       struct io_ring_ctx *ctx = req->ctx;
> +
> +       io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0);
> +       percpu_ref_put(&ctx->refs);
> +       io_poison_req(req);
> +       kmem_cache_free(req_cachep, req);
> +}
> +
> +static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type)
> +{
> +       gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
> +       u32 type_mask = 1 << type;
> +       struct io_kiocb *req;
> +
> +       if (!(type_mask & ifq->allowed_notif_mask))
> +               return;
> +
> +       guard(spinlock_bh)(&ifq->ctx_lock);
> +       if (!ifq->master_ctx)
> +               return;
> +       if (type_mask & ifq->fired_notifs)
> +               return;
> +
> +       req = kmem_cache_alloc(req_cachep, gfp);
> +       if (unlikely(!req))
> +               return;
> +
> +       ifq->fired_notifs |= type_mask;
> +
> +       req->opcode = IORING_OP_NOP;
> +       req->cqe.user_data = ifq->notif_data;
> +       req->cqe.res = type;
> +       req->ctx = ifq->master_ctx;
> +       percpu_ref_get(&req->ctx->refs);
> +       req->tctx = NULL;
> +       req->io_task_work.func = zcrx_notif_tw;
> +       io_req_task_work_add(req);
> +}
> +
>  static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
>  {
>         struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
> @@ -1143,8 +1201,10 @@ static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
>                 goto out_return;
>
>         allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
> -       if (!allocated)
> +       if (!allocated) {
> +               zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS);
>                 return 0;
> +       }
>  out_return:
>         zcrx_sync_for_device(pp, ifq, netmems, allocated);
>         allocated--;
> @@ -1293,12 +1353,32 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
>         return 0;
>  }
>
> +static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
> +                         struct zcrx_ctrl *ctrl)
> +{
> +       const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif;
> +       unsigned type_mask;
> +
> +       if (an->notif_type >= __ZCRX_NOTIF_TYPE_LAST)
> +               return -EINVAL;
> +       if (!mem_is_zero(&an->__resv, sizeof(an->__resv)))
> +               return -EINVAL;
> +
> +       guard(spinlock_bh)(&zcrx->ctx_lock);
> +       type_mask = 1U << an->notif_type;
> +       if (type_mask & ~zcrx->fired_notifs)
> +               return -EINVAL;
> +       zcrx->fired_notifs &= ~type_mask;
> +       return 0;
> +}
> +
>  int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
>  {
>         struct zcrx_ctrl ctrl;
>         struct io_zcrx_ifq *zcrx;
>
>         BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
> +       BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif));
>
>         if (nr_args)
>                 return -EINVAL;
> @@ -1316,6 +1396,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
>                 return zcrx_flush_rq(ctx, zcrx, &ctrl);
>         case ZCRX_CTRL_EXPORT:
>                 return zcrx_export(ctx, zcrx, &ctrl, arg);
> +       case ZCRX_CTRL_ARM_NOTIFICATION:
> +               return zcrx_arm_notif(ctx, zcrx, &ctrl);
>         }
>
>         return -EOPNOTSUPP;
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
> index 6b565d0bf6da..cca10d0d02ac 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -9,7 +9,9 @@
>  #include <net/net_trackers.h>
>
>  #define ZCRX_SUPPORTED_REG_FLAGS       (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
> -#define ZCRX_FEATURES                  (ZCRX_FEATURE_RX_PAGE_SIZE)
> +#define ZCRX_FEATURES                  (ZCRX_FEATURE_RX_PAGE_SIZE |\
> +                                        ZCRX_FEATURE_NOTIFICATION)
> +#define ZCRX_NOTIF_TYPE_MASK           (1U << ZCRX_NOTIF_NO_BUFFERS)
>
>  struct io_zcrx_mem {
>         unsigned long                   size;
> @@ -76,6 +78,9 @@ struct io_zcrx_ifq {
>
>         spinlock_t                      ctx_lock;
>         struct io_ring_ctx              *master_ctx;
> +       u32                             allowed_notif_mask;
> +       u32                             fired_notifs;
> +       u64                             notif_data;
>  };
>
>  #if defined(CONFIG_IO_URING_ZCRX)
> --
> 2.53.0-Meta
>

Reply via email to