Refactor set_rx_mode into two stages: snapshotting the rx_mode
configuration and performing the actual I/O. When __dev_set_rx_mode()
is called, the core takes a snapshot of the current rx_mode config
and commits it to hardware later via a work item.

In this model, ndo_set_rx_mode() is responsible for customizing the
rx_mode snapshot and deciding whether the write work should run,
while ndo_write_rx_mode() applies the snapshot to hardware.
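
For illustration, a minimal sketch of what a converted driver might
look like (hypothetical: the foo_*() names and the register accessors
below are made up and not part of this series):

  /* Hypothetical register accessors, assumed to sleep. */
  static void foo_hw_set_promisc(struct net_device *dev, bool on);
  static void foo_hw_add_uc_filter(struct net_device *dev, const char *addr);

  static void foo_set_rx_mode(struct net_device *dev)
  {
  	/* Stage 1: called under netif_addr_lock_bh(); no I/O here,
  	 * only customize the pending snapshot.
  	 */
  	netif_rx_mode_set_cfg(dev, NETIF_RX_MODE_CFG_PROMISC,
  			      !!(dev->flags & IFF_PROMISC));
  	netif_rx_mode_set_cfg(dev, NETIF_RX_MODE_CFG_ALLMULTI,
  			      !!(dev->flags & IFF_ALLMULTI));
  }

  static void foo_write_rx_mode(struct net_device *dev)
  {
  	/* Stage 2: runs from the work item with RTNL held, so
  	 * sleeping I/O is fine.
  	 */
  	char *ha;
  	int i;

  	foo_hw_set_promisc(dev, netif_rx_mode_get_cfg(dev,
  				NETIF_RX_MODE_CFG_PROMISC));
  	netif_rx_mode_for_each_uc_addr(ha, dev, i)
  		foo_hw_add_uc_filter(dev, ha);
  }

  static const struct net_device_ops foo_netdev_ops = {
  	.ndo_set_rx_mode	= foo_set_rx_mode,
  	.ndo_write_rx_mode	= foo_write_rx_mode,
  };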

Suggested-by: Jakub Kicinski <[email protected]>
Signed-off-by: I Viswanath <[email protected]>
---
 include/linux/netdevice.h | 112 +++++++++++++++-
 net/core/dev.c            | 265 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 370 insertions(+), 7 deletions(-)
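
A short usage note (illustrative only; the bar_*() call sites are
hypothetical): drivers that used to flush a private rx_mode work item
on close or suspend call netif_flush_rx_mode_work() instead, and paths
that need to re-commit the filters (e.g. after a reset) call
netif_schedule_rx_mode_work():

  static void bar_reset_task(struct net_device *dev)
  {
  	/* Re-stage the snapshot and schedule the deferred write. */
  	netif_schedule_rx_mode_work(dev);
  }

  static int bar_suspend(struct net_device *dev)
  {
  	/* RTNL must not be held; the work item acquires it itself. */
  	netif_flush_rx_mode_work(dev);
  	return 0;
  }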

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d99b0fbc1942..5f9268ac7b75 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1062,6 +1062,45 @@ struct netdev_net_notifier {
        struct notifier_block *nb;
 };
 
+struct netif_cleanup_work {
+       struct work_struct work;
+       struct net_device *dev;
+};
+
+enum netif_rx_mode_cfg {
+       NETIF_RX_MODE_CFG_ALLMULTI,
+       NETIF_RX_MODE_CFG_PROMISC,
+       NETIF_RX_MODE_CFG_VLAN,
+       NETIF_RX_MODE_CFG_BROADCAST
+};
+
+enum netif_rx_mode_flags {
+       NETIF_RX_MODE_READY,
+
+       /* if set, rx_mode set work will be skipped */
+       NETIF_RX_MODE_SET_SKIP,
+
+       /* if set, uc/mc lists will not be part of rx_mode config */
+       NETIF_RX_MODE_UC_SKIP,
+       NETIF_RX_MODE_MC_SKIP
+};
+
+struct netif_rx_mode_config {
+       char    *uc_addrs;
+       char    *mc_addrs;
+       int     uc_count;
+       int     mc_count;
+       int     cfg;
+};
+
+struct netif_rx_mode_ctx {
+       struct netif_rx_mode_config *pending;
+       struct netif_rx_mode_config *ready;
+       struct work_struct work;
+       struct net_device *dev;
+       int flags;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1114,9 +1153,14 @@ struct netdev_net_notifier {
  *     changes to configuration when multicast or promiscuous is enabled.
  *
  * void (*ndo_set_rx_mode)(struct net_device *dev);
- *     This function is called device changes address list filtering.
+ *     This function is called when device changes address list filtering.
  *     If driver handles unicast address filtering, it should set
- *     IFF_UNICAST_FLT in its priv_flags.
+ *     IFF_UNICAST_FLT in its priv_flags. This is used to configure
+ *     the rx_mode snapshot that will be written to the hardware.
+ *
+ * void (*ndo_write_rx_mode)(struct net_device *dev);
+ *     This function is scheduled after set_rx_mode and is responsible for
+ *     writing the rx_mode snapshot to the hardware.
  *
  * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
  *     This function  is called when the Media Access Control address
@@ -1437,6 +1481,7 @@ struct net_device_ops {
        void                    (*ndo_change_rx_flags)(struct net_device *dev,
                                                       int flags);
        void                    (*ndo_set_rx_mode)(struct net_device *dev);
+       void                    (*ndo_write_rx_mode)(struct net_device *dev);
        int                     (*ndo_set_mac_address)(struct net_device *dev,
                                                       void *addr);
        int                     (*ndo_validate_addr)(struct net_device *dev);
@@ -1939,7 +1984,7 @@ enum netdev_reg_state {
  *     @ingress_queue:         XXX: need comments on this one
  *     @nf_hooks_ingress:      netfilter hooks executed for ingress packets
  *     @broadcast:             hw bcast address
- *
+ *     @rx_mode_ctx:           rx_mode work context
  *     @rx_cpu_rmap:   CPU reverse-mapping for RX completion interrupts,
  *                     indexed by RX queue number. Assigned by driver.
  *                     This must only be set if the ndo_rx_flow_steer
@@ -1971,6 +2016,8 @@ enum netdev_reg_state {
  *     @link_watch_list:       XXX: need comments on this one
  *
  *     @reg_state:             Register/unregister state machine
+ *     @needs_cleanup_work:    Should dev_close schedule the cleanup work?
+ *     @cleanup_work:          Cleanup work context
  *     @dismantle:             Device is going to be freed
  *     @needs_free_netdev:     Should unregister perform free_netdev?
  *     @priv_destructor:       Called from unregister
@@ -2350,6 +2397,7 @@ struct net_device {
 #endif
 
        unsigned char           broadcast[MAX_ADDR_LEN];
+       struct netif_rx_mode_ctx *rx_mode_ctx;
 #ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap         *rx_cpu_rmap;
 #endif
@@ -2387,6 +2435,10 @@ struct net_device {
 
        u8 reg_state;
 
+       bool needs_cleanup_work;
+
+       struct netif_cleanup_work *cleanup_work;
+
        bool dismantle;
 
        /** @moving_ns: device is changing netns, protected by @lock */
@@ -3373,6 +3425,60 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
 
+/* Helpers to be used in the set_rx_mode implementation */
+static inline void netif_rx_mode_set_cfg(struct net_device *dev, int b,
+                                        bool val)
+{
+       if (val)
+               dev->rx_mode_ctx->pending->cfg |= BIT(b);
+       else
+               dev->rx_mode_ctx->pending->cfg &= ~BIT(b);
+}
+
+static inline void netif_rx_mode_set_flag(struct net_device *dev, int b,
+                                         bool val)
+{
+       if (val)
+               dev->rx_mode_ctx->flags |= BIT(b);
+       else
+               dev->rx_mode_ctx->flags &= ~BIT(b);
+}
+
+/* Helpers to be used in the write_rx_mode implementation */
+static inline int netif_rx_mode_get_cfg(struct net_device *dev, int b)
+{
+       return !!(dev->rx_mode_ctx->ready->cfg & BIT(b));
+}
+
+static inline int netif_rx_mode_get_flag(struct net_device *dev, int b)
+{
+       return !!(dev->rx_mode_ctx->flags & BIT(b));
+}
+
+static inline int netif_rx_mode_mc_count(struct net_device *dev)
+{
+       return dev->rx_mode_ctx->ready->mc_count;
+}
+
+static inline int netif_rx_mode_uc_count(struct net_device *dev)
+{
+       return dev->rx_mode_ctx->ready->uc_count;
+}
+
+void netif_schedule_rx_mode_work(struct net_device *dev);
+
+void netif_flush_rx_mode_work(struct net_device *dev);
+
+#define netif_rx_mode_for_each_uc_addr(ha_addr, dev, __i) \
+       for (__i = 0, ha_addr = (dev)->rx_mode_ctx->ready->uc_addrs; \
+            __i < (dev)->rx_mode_ctx->ready->uc_count; \
+            __i++, ha_addr += (dev)->addr_len)
+
+#define netif_rx_mode_for_each_mc_addr(ha_addr, dev, __i) \
+       for (__i = 0, ha_addr = (dev)->rx_mode_ctx->ready->mc_addrs; \
+            __i < (dev)->rx_mode_ctx->ready->mc_count; \
+            __i++, ha_addr += (dev)->addr_len)
+
 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
 int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c711da335510..072da874a958 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1586,6 +1586,197 @@ void netif_state_change(struct net_device *dev)
        }
 }
 
+/* Attempt to copy the device's current address lists into pending,
+ * reallocating its buffers as needed. On failure the snapshot's
+ * contents are left unchanged, though a buffer may have been resized.
+ */
+static int __netif_prepare_rx_mode(struct net_device *dev)
+{
+       struct netif_rx_mode_config *pending = dev->rx_mode_ctx->pending;
+       bool skip_uc = false, skip_mc = false;
+       int uc_count = 0, mc_count = 0;
+       struct netdev_hw_addr *ha;
+       char *tmp;
+       int i;
+
+       skip_uc = netif_rx_mode_get_flag(dev, NETIF_RX_MODE_UC_SKIP);
+       skip_mc = netif_rx_mode_get_flag(dev, NETIF_RX_MODE_MC_SKIP);
+
+       /* The allocations need to be atomic since this will be called under
+        * netif_addr_lock_bh()
+        */
+       if (!skip_uc) {
+               uc_count = netdev_uc_count(dev);
+               tmp = krealloc(pending->uc_addrs, uc_count * dev->addr_len,
+                              GFP_ATOMIC);
+               if (!tmp)
+                       return -ENOMEM;
+               pending->uc_addrs = tmp;
+       }
+
+       if (!skip_mc) {
+               mc_count = netdev_mc_count(dev);
+               tmp = krealloc(pending->mc_addrs, mc_count * dev->addr_len,
+                              GFP_ATOMIC);
+               if (!tmp)
+                       return -ENOMEM;
+               pending->mc_addrs = tmp;
+       }
+
+       /* This function cannot fail after this point */
+       i = 0;
+       if (!skip_uc) {
+               pending->uc_count = uc_count;
+               netdev_for_each_uc_addr(ha, dev)
+                       memcpy(pending->uc_addrs + (i++) * dev->addr_len,
+                              ha->addr, dev->addr_len);
+       }
+
+       i = 0;
+       if (!skip_mc) {
+               pending->mc_count = mc_count;
+               netdev_for_each_mc_addr(ha, dev)
+                       memcpy(pending->mc_addrs + (i++) * dev->addr_len,
+                              ha->addr, dev->addr_len);
+       }
+       return 0;
+}
+
+static void netif_prepare_rx_mode(struct net_device *dev)
+{
+       int rc;
+
+       lockdep_assert_held(&dev->addr_list_lock);
+
+       rc = __netif_prepare_rx_mode(dev);
+       if (rc)
+               return;
+
+       netif_rx_mode_set_flag(dev, NETIF_RX_MODE_READY, true);
+}
+
+static void netif_write_rx_mode(struct work_struct *param)
+{
+       struct netif_rx_mode_ctx *ctx;
+       struct net_device *dev;
+
+       rtnl_lock();
+       ctx = container_of(param, struct netif_rx_mode_ctx, work);
+       dev = ctx->dev;
+
+       if (!netif_running(dev)) {
+               rtnl_unlock();
+               return;
+       }
+
+       /* Paranoia. */
+       if (WARN_ON(!dev->netdev_ops->ndo_write_rx_mode)) {
+               rtnl_unlock();
+               return;
+       }
+
+       /* We could introduce a new lock for this but reusing the addr
+        * lock works well enough
+        */
+       netif_addr_lock_bh(dev);
+
+       /* There's no point continuing if the pending config is not ready */
+       if (!netif_rx_mode_get_flag(dev, NETIF_RX_MODE_READY)) {
+               netif_addr_unlock_bh(dev);
+               rtnl_unlock();
+               return;
+       }
+
+       swap(ctx->ready, ctx->pending);
+       netif_rx_mode_set_flag(dev, NETIF_RX_MODE_READY, false);
+       netif_addr_unlock_bh(dev);
+
+       dev->netdev_ops->ndo_write_rx_mode(dev);
+       rtnl_unlock();
+}
+
+static int netif_alloc_rx_mode_ctx(struct net_device *dev)
+{
+       dev->rx_mode_ctx = kzalloc(sizeof(*dev->rx_mode_ctx), GFP_KERNEL);
+       if (!dev->rx_mode_ctx)
+               goto fail_all;
+
+       dev->rx_mode_ctx->ready = kzalloc(sizeof(*dev->rx_mode_ctx->ready),
+                                         GFP_KERNEL);
+       if (!dev->rx_mode_ctx->ready)
+               goto fail_ready;
+
+       dev->rx_mode_ctx->pending = kzalloc(sizeof(*dev->rx_mode_ctx->pending),
+                                           GFP_KERNEL);
+       if (!dev->rx_mode_ctx->pending)
+               goto fail_pending;
+
+       dev->rx_mode_ctx->dev = dev;
+       INIT_WORK(&dev->rx_mode_ctx->work, netif_write_rx_mode);
+       return 0;
+
+fail_pending:
+       kfree(dev->rx_mode_ctx->ready);
+
+fail_ready:
+       kfree(dev->rx_mode_ctx);
+
+fail_all:
+       return -ENOMEM;
+}
+
+static void netif_free_rx_mode_ctx(struct net_device *dev)
+{
+       if (!dev->rx_mode_ctx)
+               return;
+
+       cancel_work_sync(&dev->rx_mode_ctx->work);
+
+       kfree(dev->rx_mode_ctx->ready->uc_addrs);
+       kfree(dev->rx_mode_ctx->ready->mc_addrs);
+       kfree(dev->rx_mode_ctx->ready);
+
+       kfree(dev->rx_mode_ctx->pending->uc_addrs);
+       kfree(dev->rx_mode_ctx->pending->mc_addrs);
+       kfree(dev->rx_mode_ctx->pending);
+
+       kfree(dev->rx_mode_ctx);
+       dev->rx_mode_ctx = NULL;
+}
+
+static void netif_cleanup_work_fn(struct work_struct *param)
+{
+       struct netif_cleanup_work *ctx;
+       struct net_device *dev;
+
+       ctx = container_of(param, struct netif_cleanup_work, work);
+       dev = ctx->dev;
+
+       if (dev->netdev_ops->ndo_write_rx_mode)
+               netif_free_rx_mode_ctx(dev);
+}
+
+static int netif_alloc_cleanup_work(struct net_device *dev)
+{
+       dev->cleanup_work = kzalloc(sizeof(*dev->cleanup_work), GFP_KERNEL);
+       if (!dev->cleanup_work)
+               return -ENOMEM;
+
+       dev->cleanup_work->dev = dev;
+       INIT_WORK(&dev->cleanup_work->work, netif_cleanup_work_fn);
+       return 0;
+}
+
+static void netif_free_cleanup_work(struct net_device *dev)
+{
+       if (!dev->cleanup_work)
+               return;
+
+       flush_work(&dev->cleanup_work->work);
+       kfree(dev->cleanup_work);
+       dev->cleanup_work = NULL;
+}
+
 /**
  * __netdev_notify_peers - notify network peers about existence of @dev,
  * to be called when rtnl lock is already held.
@@ -1678,6 +1869,16 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);
 
+       if (!ret && dev->needs_cleanup_work) {
+               if (!dev->cleanup_work)
+                       ret = netif_alloc_cleanup_work(dev);
+               else
+                       flush_work(&dev->cleanup_work->work);
+       }
+
+       if (!ret && ops->ndo_write_rx_mode)
+               ret = netif_alloc_rx_mode_ctx(dev);
+
        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);
 
@@ -1754,6 +1955,9 @@ static void __dev_close_many(struct list_head *head)
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);
 
+               if (dev->needs_cleanup_work)
+                       schedule_work(&dev->cleanup_work->work);
+
                netif_set_up(dev, false);
                netpoll_poll_enable(dev);
        }
@@ -9622,6 +9826,57 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
        return 0;
 }
 
+/* __netif_schedule_rx_mode_work - Set up the rx_mode snapshot and
+ * schedule the deferred I/O. Called with the addr list lock held.
+ */
+static void __netif_schedule_rx_mode_work(struct net_device *dev)
+{
+       const struct net_device_ops *ops = dev->netdev_ops;
+
+       if (ops->ndo_set_rx_mode)
+               ops->ndo_set_rx_mode(dev);
+
+       if (!ops->ndo_write_rx_mode)
+               return;
+
+       /* This part is only for drivers that implement ndo_write_rx_mode */
+
+       /* If rx_mode set is to be skipped, we don't schedule the work */
+       if (netif_rx_mode_get_flag(dev, NETIF_RX_MODE_SET_SKIP))
+               return;
+
+       netif_prepare_rx_mode(dev);
+       schedule_work(&dev->rx_mode_ctx->work);
+}
+
+void netif_schedule_rx_mode_work(struct net_device *dev)
+{
+       if (WARN_ON(!netif_running(dev)))
+               return;
+
+       netif_addr_lock_bh(dev);
+       __netif_schedule_rx_mode_work(dev);
+       netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(netif_schedule_rx_mode_work);
+
+/* Drivers that defer rx_mode handling to a work item flush that work
+ * when closing or suspending. This is the substitute for those calls.
+ */
+void netif_flush_rx_mode_work(struct net_device *dev)
+{
+       /* Calling this function with RTNL held will result in a deadlock. */
+       if (WARN_ON(rtnl_is_locked()))
+               return;
+
+       /* Doing nothing is enough to "flush" work on a closed interface */
+       if (!netif_running(dev))
+               return;
+
+       flush_work(&dev->rx_mode_ctx->work);
+}
+EXPORT_SYMBOL(netif_flush_rx_mode_work);
+
 /*
  *     Upload unicast and multicast address lists to device and
  *     configure RX filtering. When the device doesn't support unicast
@@ -9630,8 +9885,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
  */
 void __dev_set_rx_mode(struct net_device *dev)
 {
-       const struct net_device_ops *ops = dev->netdev_ops;
-
        /* dev_open will call this function so the list will stay sane. */
        if (!(dev->flags&IFF_UP))
                return;
@@ -9652,8 +9905,7 @@ void __dev_set_rx_mode(struct net_device *dev)
                }
        }
 
-       if (ops->ndo_set_rx_mode)
-               ops->ndo_set_rx_mode(dev);
+       __netif_schedule_rx_mode_work(dev);
 }
 
 void dev_set_rx_mode(struct net_device *dev)
@@ -11324,6 +11576,9 @@ int register_netdevice(struct net_device *dev)
                }
        }
 
+       if (dev->netdev_ops->ndo_write_rx_mode)
+               dev->needs_cleanup_work = true;
+
        if (((dev->hw_features | dev->features) &
             NETIF_F_HW_VLAN_CTAG_FILTER) &&
            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
@@ -12067,6 +12322,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->real_num_rx_queues = rxqs;
        if (netif_alloc_rx_queues(dev))
                goto free_all;
+
        dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
        if (!dev->ethtool)
                goto free_all;
@@ -12150,6 +12406,7 @@ void free_netdev(struct net_device *dev)
        kfree(dev->ethtool);
        netif_free_tx_queues(dev);
        netif_free_rx_queues(dev);
+       netif_free_cleanup_work(dev);
 
        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 
-- 
2.47.3