Hi folks,
I'd like to start some discussion on SMP optimizations for the networking
stack. The patch below is one such example: it changes the loopback device
to share more of the work with the other CPU on an HT system, which helps
on workloads like netperf. Basically, if the other CPU is idle, we punt
the netif_rx onto it. Using a kernel thread for this is fairly
inefficient, so I am wondering if it makes more sense to do it at the
softirq level. This particular patch improves netperf over localhost by
~600Mbit/s (from ~9874Mbit/s to ~10475Mbit/s, while raising CPU usage from
~92% to ~95%, although it varies quite a bit) on a 3GHz P4 with HT (this
is on top of a pile of other patches that optimize task switching on
x86-64).
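For anyone who wants to poke at the idea outside the kernel: the handoff
below is just a single-producer/single-consumer ring. A rough userspace
sketch of the same scheme (C11 atomics standing in for the kernel's
wmb()/barrier(); all names here are made up for illustration) looks like:

#include <stdatomic.h>
#include <stdio.h>

#define RING_SIZE 256

/* One producer (the transmitting CPU), one consumer (the sibling). */
static void *ring[RING_SIZE];
static _Atomic unsigned int ring_head;	/* written only by the producer */
static _Atomic unsigned int ring_tail;	/* written only by the consumer */

/* Producer: returns 0 if the ring is full and the caller must drop. */
static int ring_put(void *item)
{
	unsigned int head = atomic_load_explicit(&ring_head, memory_order_relaxed);
	unsigned int next = (head + 1) % RING_SIZE;

	if (next == atomic_load_explicit(&ring_tail, memory_order_acquire))
		return 0;		/* full */

	ring[head] = item;
	/* Publish the slot before moving head (the wmb() in the patch). */
	atomic_store_explicit(&ring_head, next, memory_order_release);
	return 1;
}

/* Consumer: returns NULL when the ring is empty. */
static void *ring_get(void)
{
	unsigned int tail = atomic_load_explicit(&ring_tail, memory_order_relaxed);
	void *item;

	if (tail == atomic_load_explicit(&ring_head, memory_order_acquire))
		return NULL;		/* empty */

	item = ring[tail];
	atomic_store_explicit(&ring_tail, (tail + 1) % RING_SIZE,
			      memory_order_release);
	return item;
}

int main(void)
{
	int x = 42;

	ring_put(&x);
	printf("got %d\n", *(int *)ring_get());
	return 0;
}

The release store on the head index is what keeps the consumer from seeing
the new index before the slot contents are visible, which is exactly what
the wmb() in the loopback patch is for.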
The bigger part of the discussion is probably how we can make the network
stack scale with multicore CPUs. For workloads like routing lots of small
packets, a single CPU can easily be overwhelmed. The question becomes:
where does partitioning the work make sense? At the very least we probably
need to do some preprocessing of incoming packets so that a series of
packets belonging to a particular flow ends up on the same CPU. This sort
of preprocessing probably makes sense for other reasons too: by processing
a group of packets for a particular socket in one go, we can avoid the
overhead of repeatedly locking and unlocking the socket (which is fairly
expensive because of the memory barriers that locks imply).
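To make the flow-steering part concrete, the per-packet decision could be
as simple as hashing the 4-tuple and using the result to pick a CPU. A
rough userspace sketch (the struct and hash constants here are only for
illustration, not a proposal for the actual in-kernel code):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical flow key: the usual IPv4/TCP 4-tuple. */
struct flow_key {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
};

/* Simple multiplicative hash over the 4-tuple; any reasonable hash works,
 * as long as it is stable so a given flow always maps to the same CPU. */
static unsigned int flow_hash(const struct flow_key *k)
{
	uint32_t h = k->saddr * 0x9e3779b1u;

	h ^= k->daddr * 0x85ebca6bu;
	h ^= (((uint32_t)k->sport << 16) | k->dport) * 0xc2b2ae35u;
	h ^= h >> 16;
	return h;
}

static unsigned int flow_to_cpu(const struct flow_key *k, unsigned int nr_cpus)
{
	return flow_hash(k) % nr_cpus;
}

int main(void)
{
	struct flow_key k = { 0x0a000001, 0x0a000002, 12345, 80 };

	/* All packets of this flow would be queued to the same CPU. */
	printf("flow -> cpu %u of 4\n", flow_to_cpu(&k, 4));
	return 0;
}

The exact hash doesn't matter much; the important property is that every
packet of a flow lands on one CPU, so the socket stays warm in a single
cache and only ever gets locked from one place.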
At this point I'd just like to stir up some discussion, so please comment
away with any ideas and concerns.
-ben
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <[EMAIL PROTECTED]>.
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 690a1aa..ef283a3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -58,6 +58,69 @@
#include <linux/tcp.h>
#include <linux/percpu.h>
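+
+/*
+ * Single-producer/single-consumer ring used to hand skbs off to the
+ * kernel thread running on the HT sibling.
+ */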
+#define LOOP_NR_SKBS 256
+static struct sk_buff *loop_skb_ring[LOOP_NR_SKBS];
+static unsigned loop_skb_head, loop_skb_tail;
+static struct task_struct *loop_task;
+
+static void smp_loop_netif_rx(struct sk_buff *skb)
+{
+ unsigned int next = (loop_skb_head + 1) % LOOP_NR_SKBS;
+
+ if (next == loop_skb_tail) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ loop_skb_ring[loop_skb_head] = skb;
+ wmb();
+ loop_skb_head = next;
+}
+
+static void smp_loop_wake(void)
+{
+ if (loop_task && loop_task->state != TASK_RUNNING)
+ wake_up_process(loop_task);
+}
+
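+/*
+ * Consumer side: runs as a kernel thread, drains the ring into
+ * netif_rx() and periodically runs softirqs so the backlog on this
+ * CPU actually gets processed.
+ */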
+static int loop_netif_rx_thread(void *data)
+{
+ loop_task = current;
+
+ for (;;) {
+ int nr = 0;
+ while (loop_skb_tail != loop_skb_head) {
+ unsigned next;
+ struct sk_buff *skb = loop_skb_ring[loop_skb_tail];
+ loop_skb_ring[loop_skb_tail] = NULL;
+ next = (loop_skb_tail + 1) % LOOP_NR_SKBS;
+ barrier();
+ loop_skb_tail = next;
+ netif_rx(skb);
+ if (nr++ >= 96) {
+ do_softirq();
+ nr = 0;
+ }
+ }
+
+ do_softirq();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (loop_skb_tail == loop_skb_head)
+ schedule();
+ set_current_state(TASK_RUNNING);
+ }
+}
+
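+/*
+ * Peek at the HT sibling's current task via the x86-64 PDA; "idle"
+ * here also covers the sibling already running the loopback thread.
+ */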
+static inline int sibling_is_idle(void)
+{
+ int cpu = smp_processor_id() ^ 1;
+ struct x8664_pda *pda = cpu_pda(cpu);
+ if (pda->pcurrent == idle_task(cpu) || pda->pcurrent == loop_task)
+ return 1;
+ return 0;
+}
+
static DEFINE_PER_CPU(struct net_device_stats, loopback_stats);
#define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
@@ -69,6 +132,7 @@ static DEFINE_PER_CPU(struct net_device_
*/
#ifdef LOOPBACK_TSO
+static void smp_loop_netif_rx(struct sk_buff *skb);
static void emulate_large_send_offload(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
@@ -76,6 +140,7 @@ static void emulate_large_send_offload(s
unsigned int doffset = (iph->ihl + th->doff) * 4;
unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
unsigned int offset = 0;
+ int use_sibling = sibling_is_idle();
u32 seq = ntohl(th->seq);
u16 id = ntohs(iph->id);
@@ -112,12 +177,21 @@ static void emulate_large_send_offload(s
th->seq = htonl(seq);
if (offset + doffset + frag_size < skb->len)
th->fin = th->psh = 0;
+#ifdef CONFIG_SMP
+ if (use_sibling)
+ smp_loop_netif_rx(nskb);
+ else
+ netif_rx(nskb);
+#else
netif_rx(nskb);
+#endif
offset += frag_size;
seq += frag_size;
id++;
}
+ if (use_sibling)
+ smp_loop_wake();
dev_kfree_skb(skb);
}
#endif /* LOOPBACK_TSO */
@@ -156,8 +230,15 @@ static int loopback_xmit(struct sk_buff
lb_stats->tx_packets = lb_stats->rx_packets;
put_cpu();
+#ifdef CONFIG_SMP
+ if (sibling_is_idle()) {
+ smp_loop_netif_rx(skb);
+ smp_loop_wake();
+ } else
+ netif_rx(skb);
+#else
netif_rx(skb);
-
+#endif
return(0);
}
@@ -225,6 +306,8 @@ int __init loopback_init(void)
{
struct net_device_stats *stats;
+ kernel_thread(loop_netif_rx_thread, NULL, 0);
+
/* Can survive without statistics */
stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
if (stats) {