On Tue, 2007-05-15 at 14:32 -0700, David Miller wrote:
> An efficient qdisc-->driver transfer during netif_wake_queue()
> could help solve some of that, as is being discussed here.
Ok, here's the approach I discussed at netconf. It needs net-2.6 and the
patch I posted earlier to clean up qdisc_restart() [1]. I haven't ported
over all the bits from 2.6.18, but this works. Krishna and I have
colluded privately on working together; I just need to reproduce the
patches, so here is the core. A lot of the code in the core could be
aggregated later - right now I am worried about correctness.

I will post a patch for the tun device in a few minutes that I use to
test on my laptop (I need to remove some debugs) to show an example. I
also plan to post a patch for e1000, but that will take more than a few
minutes; the e1000 driver has changed quite a bit since 2.6.18, so it is
time-consuming.

What does a driver need to do to get batched-to?

1) On initialization (probably at probe time):
   a) set NETIF_F_BTX in its dev->features, i.e.
      dev->features |= NETIF_F_BTX
   b) initialize the batch queue, i.e. something like
      skb_queue_head_init(&dev->blist);
   c) set dev->xmit_win to something reasonable, e.g. half the DMA ring
      size or tx_queue_len

2) Create a new method for batch transmit. This loops on dev->blist and
   stashes packets onto the hardware. All return codes like NETDEV_TX_OK
   etc. still apply.

3) Keep dev->xmit_win updated; it hints to the core how much data to
   send to the driver. Some suggestions:
   a) on doing a netif_stop_queue, set it to 1
   b) on netif_wake_queue, set it to the max available space

Of course, for all this to work, the driver needs a threshold for waking
up the tx path, as drivers such as e1000 or tg3 already do before
invoking netif_wake_queue (for an example, look at the TX_WAKE_THRESHOLD
usage in e1000).

Feedback welcome (preferably in the form of patches). Anyone with a
really nice tool to measure CPU improvement would help a great deal in
quantifying things. As I have said earlier, I never saw any throughput
improvement. But like TSO/GSO, the win may be purely CPU savings (as was
suggested at netconf).

cheers,
jamal

[1] http://marc.info/?l=linux-netdev&m=117914954911959&w=2
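To make 1) through 3) above concrete, here is a rough, untested sketch
of the driver side. Everything named "myeth" (struct myeth_priv,
myeth_tx_room(), myeth_tx_one(), MYETH_TX_RING_SIZE) is invented for
illustration; only the dev->features/blist/xmit_win/hard_batch_xmit
hooks come from the patch below.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MYETH_TX_RING_SIZE	256	/* made-up ring size */

struct myeth_priv;	/* the driver's private state */
/* stash one skb onto the DMA ring; assumed to exist in the driver */
int myeth_tx_one(struct myeth_priv *priv, struct sk_buff *skb);
/* number of free tx descriptors; assumed to exist in the driver */
int myeth_tx_room(struct myeth_priv *priv);

/* 2) the batch transmit method: drain dev->blist onto the hardware */
static int myeth_batch_xmit(struct sk_buff_head *list,
			    struct net_device *dev)
{
	struct myeth_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(list)) != NULL) {
		if (myeth_tx_room(priv) < 1) {
			/* ring full: put the skb back, close the tx
			 * path and shrink the window per 3a) */
			__skb_queue_head(list, skb);
			netif_stop_queue(dev);
			dev->xmit_win = 1;
			return NETDEV_TX_BUSY;
		}
		myeth_tx_one(priv, skb);
	}

	/* hint how much we can take next round */
	dev->xmit_win = myeth_tx_room(priv);
	return NETDEV_TX_OK;
}

/* 1) done once at probe time */
static void myeth_setup_batching(struct net_device *dev)
{
	dev->features |= NETIF_F_BTX;
	skb_queue_head_init(&dev->blist);
	dev->xmit_win = MYETH_TX_RING_SIZE / 2;
	dev->hard_batch_xmit = myeth_batch_xmit;
}

Note that in this scheme the batch method, not the core, decides when to
call netif_stop_queue(); the core only consults netif_queue_stopped()
and xmit_win.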
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f671cd2..7205748 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -325,6 +325,7 @@ struct net_device
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
 #define NETIF_F_GSO		2048	/* Enable software GSO. */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
+#define NETIF_F_BTX		8192	/* Capable of batch tx */
 
 /* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -450,6 +451,11 @@ struct net_device
 	void			*priv;	/* pointer to private data */
 	int			(*hard_start_xmit) (struct sk_buff *skb,
 						    struct net_device *dev);
+	int			(*hard_batch_xmit) (struct sk_buff_head *list,
+						    struct net_device *dev);
+	int			(*hard_prep_xmit) (struct sk_buff *skb,
+						    struct net_device *dev);
+	int			xmit_win;
 	/* These may be needed for future network-power-down code. */
 	unsigned long		trans_start;	/* Time (in jiffies) of last Tx	*/
 
@@ -466,6 +472,10 @@ struct net_device
 	struct list_head	todo_list;
 	/* device index hash chain */
 	struct hlist_node	index_hlist;
+	/* XXX: Fix eventually to not allocate if device not
+	 * batch capable
+	 */
+	struct sk_buff_head	blist;
 
 	struct net_device	*link_watch_next;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ed80054..61fa301 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -85,10 +85,12 @@ static inline int
 do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
 {
-	if (unlikely(skb->next))
-		dev->gso_skb = skb;
-	else
-		q->ops->requeue(skb, q);
+	if (skb) {
+		if (unlikely(skb->next))
+			dev->gso_skb = skb;
+		else
+			q->ops->requeue(skb, q);
+	}
 	/* XXX: Could netif_schedule fail? Or is that fact we are
 	 * requeueing imply the hardware path is closed
 	 * and even if we fail, some interupt will wake us
@@ -116,7 +118,10 @@ tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
 	int ret = handle_dev_cpu_collision(dev);
 
 	if (ret == SCHED_TX_DROP) {
-		kfree_skb(skb);
+		if (skb) /* we are not batching */
+			kfree_skb(skb);
+		else if (!skb_queue_empty(&dev->blist))
+			skb_queue_purge(&dev->blist);
 		return qdisc_qlen(q);
 	}
 
@@ -195,10 +200,99 @@ static inline int qdisc_restart(struct net_device *dev)
 	return do_dev_requeue(skb, dev, q);
 }
 
+static int try_get_tx_pkts(struct net_device *dev, struct Qdisc *q, int count)
+{
+	struct sk_buff *skb;
+	struct sk_buff_head *skbs = &dev->blist;
+	int tdq = count;
+
+	/*
+	 * very unlikely, but who knows ..
+	 * If this happens we don't try to grab more pkts
+	 */
+	if (!skb_queue_empty(&dev->blist))
+		return skb_queue_len(&dev->blist);
+
+	if (dev->gso_skb) {
+		count--;
+		__skb_queue_head(skbs, dev->gso_skb);
+		dev->gso_skb = NULL;
+	}
+
+	while (count) {
+		skb = q->dequeue(q);
+		if (!skb)
+			break;
+		count--;
+		__skb_queue_head(skbs, skb);
+	}
+
+	return tdq - count;
+}
+
+static inline int try_tx_pkts(struct net_device *dev)
+{
+	return dev->hard_batch_xmit(&dev->blist, dev);
+}
+
+/* same comments as in qdisc_restart apply;
+ * at some point use shared code with qdisc_restart
+ */
+int batch_qdisc_restart(struct net_device *dev)
+{
+	struct Qdisc *q = dev->qdisc;
+	unsigned lockless = (dev->features & NETIF_F_LLTX);
+	int count = dev->xmit_win;
+	int ret = 0;
+
+	ret = try_get_tx_pkts(dev, q, count);
+	if (ret == 0)
+		return qdisc_qlen(q);
+
+	/* we have packets to send! */
+	if (!lockless) {
+		if (!netif_tx_trylock(dev))
+			return tx_islocked(NULL, dev, q);
+	}
+
+	/* all clear .. */
+	spin_unlock(&dev->queue_lock);
+
+	ret = NETDEV_TX_BUSY;
+	if (!netif_queue_stopped(dev))
+		ret = try_tx_pkts(dev);
+
+	if (!lockless)
+		netif_tx_unlock(dev);
+
+	spin_lock(&dev->queue_lock);
+
+	q = dev->qdisc;
+
+	/* most likely result, packet went ok */
+	if (ret == NETDEV_TX_OK)
+		return qdisc_qlen(q);
+
+	/* only for lockless drivers .. */
+	if (ret == NETDEV_TX_LOCKED && lockless)
+		return tx_islocked(NULL, dev, q);
+
+	if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
+		printk(KERN_WARNING " BUG %s code %d qlen %d\n",
+		       dev->name, ret, q->q.qlen);
+
+	return do_dev_requeue(NULL, dev, q);
+}
+
 void __qdisc_run(struct net_device *dev)
 {
+	unsigned batching = (dev->features & NETIF_F_BTX);
+
 	do {
-		if (!qdisc_restart(dev))
+		if (!batching) {
+			if (!qdisc_restart(dev))
+				break;
+		} else if (!batch_qdisc_restart(dev))
 			break;
 	} while (!netif_queue_stopped(dev));
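
And the other half of 3): a matching untested sketch of the
tx-completion path, modeled loosely on what e1000 does with
TX_WAKE_THRESHOLD. MYETH_TX_WAKE_THRESHOLD and myeth_clean_tx_ring()
are again invented names, and myeth_tx_room() is the same assumed
helper as in the sketch above.

#define MYETH_TX_WAKE_THRESHOLD	32	/* made-up threshold */

/* reclaim completed descriptors; assumed to exist in the driver */
int myeth_clean_tx_ring(struct myeth_priv *priv);

/* called from the driver's tx interrupt/clean path */
static void myeth_tx_complete(struct net_device *dev)
{
	struct myeth_priv *priv = netdev_priv(dev);
	int room;

	myeth_clean_tx_ring(priv);
	room = myeth_tx_room(priv);

	/* only reopen the tx path once a decent chunk of the ring is
	 * free, so the core can hand us a real batch */
	if (netif_queue_stopped(dev) && room >= MYETH_TX_WAKE_THRESHOLD) {
		dev->xmit_win = room;	/* 3b) max available space */
		netif_wake_queue(dev);
	}
}

The point of the threshold is the same as in e1000/tg3 today: waking the
queue for a single free descriptor just bounces us between stop and
wake; waiting for a chunk of space is what gives the core something
worth batching.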