From: Herbert Xu <[EMAIL PROTECTED]>
Date: Sat, 13 Aug 2005 11:32:39 +1000

> I actually had a play with the fast clone stuff.  However, eventually
> I gave up because of this dilemma: I needed to either introduce an extra
> atomic op on the __kfree_skb path, or add bloat to the inlined kfree_skb
> path.

Yes, you need an atomic in the fast path, but it's only done
for fast clones and their parent SKBs, and such an atomic is
_MUCH_ cheaper than a kmem_cache_alloc, especially since it
touches state that is already cache-local (the atomic counter
in the parent SKB, whose cache line is usually hot for the
first transmit of an SKB).
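
In rough terms (using the identifiers from the patch below), the
per-clone difference on the hot path is just this; a sketch of the
idea, not the actual code:

        /* today: every clone allocates a fresh skb head */
        n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);

        /* fast clone: the clone skb was pre-allocated right next to
         * its parent, so handing it out is one atomic increment on a
         * counter whose cache line the sender just touched anyway.
         */
        n = skb + 1;
        atomic_inc(&skb->fclone_ref);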

It might be possible to make use of skb->users, but the patch
Thomas and I are currently trying to get working adds a new
fclone_ref member instead.
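
To be explicit about what fclone_ref protects: the new slab object is
a parent/clone pair, laid out roughly like this (illustration only,
not part of the patch):

        /*
         * One object from skbuff_fclone_cache
         * (2 * sizeof(struct sk_buff), see skb_init() below):
         *
         *      +----------------------+
         *      | struct sk_buff       |  fclone = SKB_FCLONE_ORIG
         *      | (parent, "orig")     |  fclone_ref counts orig + live clone
         *      +----------------------+
         *      | struct sk_buff       |  fclone = SKB_FCLONE_UNAVAILABLE
         *      | (companion clone)    |  until skb_clone() hands it out
         *      +----------------------+
         *
         * skb_clone() reaches the companion via "skb + 1", and
         * kfree_skbmem() finds the parent from a clone via "skb - 1".
         */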

Here is the current patch.  It dies on the destruction of the
first TCP socket, while pruning the socket's write queue, so
something is very wrong in the implementation; I just haven't
had a chance to fully debug it yet.
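
One thing that already looks suspicious to me in kfree_skbmem()
below: the atomic_dec_and_test() results are acted on inverted.
atomic_dec_and_test() returns true when the counter hits zero, so as
written we free the fclone pair while a reference is still
outstanding, and never free it (or mark the clone slot reusable) once
the last reference really is gone.  If that turns out to be the
problem, the two cases probably need to read something like this
(untested sketch):

        case SKB_FCLONE_ORIG:
                /* Free the pair only when the last reference goes away. */
                if (atomic_dec_and_test(&skb->fclone_ref))
                        kmem_cache_free(skbuff_fclone_cache, skb);
                break;

        case SKB_FCLONE_CLONE:
                other = skb - 1;

                /* The clone slot becomes available for fast-cloning
                 * again if the parent is still around; otherwise this
                 * was the last reference and the pair can be freed.
                 */
                skb->fclone = SKB_FCLONE_UNAVAILABLE;
                if (atomic_dec_and_test(&other->fclone_ref))
                        kmem_cache_free(skbuff_fclone_cache, other);
                break;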

Testing, once it works, will prove my theory one way or the
other, since with a TSO sender we should see much lower CPU
utilization.  Thinking about what could make Microsoft's TSO
sender CPU utilization lower than ours, the extra memory
allocations were the only thing that stuck out to me as a real
possibility.

This patch is against net-2.6.14:

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -155,6 +155,13 @@ struct skb_shared_info {
 #define SKB_DATAREF_SHIFT 16
 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
 
+
+enum {
+       SKB_FCLONE_UNAVAILABLE,
+       SKB_FCLONE_ORIG,
+       SKB_FCLONE_CLONE,
+};
+
 /** 
  *     struct sk_buff - socket buffer
  *     @next: Next buffer in list
@@ -248,8 +255,10 @@ struct sk_buff {
                                ip_summed:2,
                                nohdr:1,
                                nfctinfo:3;
-       __u8                    pkt_type;
+       __u8                    pkt_type:3,
+                               fclone:2;
        __u16                   protocol;
+       atomic_t                fclone_ref;
 
        void                    (*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
@@ -288,8 +297,20 @@ struct sk_buff {
 #include <asm/system.h>
 
 extern void           __kfree_skb(struct sk_buff *skb);
-extern struct sk_buff *alloc_skb(unsigned int size,
-                                unsigned int __nocast priority);
+extern struct sk_buff *__alloc_skb(unsigned int size,
+                                  unsigned int __nocast priority, int fclone);
+static inline struct sk_buff *alloc_skb(unsigned int size,
+                                       unsigned int __nocast priority)
+{
+       return __alloc_skb(size, priority, 0);
+}
+
+static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
+                                              unsigned int __nocast priority)
+{
+       return __alloc_skb(size, priority, 1);
+}
+
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
                                            unsigned int size,
                                            unsigned int __nocast priority);
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1195,7 +1195,7 @@ static inline struct sk_buff *sk_stream_
        int hdr_len;
 
        hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
-       skb = alloc_skb(size + hdr_len, gfp);
+       skb = alloc_skb_fclone(size + hdr_len, gfp);
        if (skb) {
                skb->truesize += mem;
                if (sk->sk_forward_alloc >= (int)skb->truesize ||
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -69,6 +69,7 @@
 #include <asm/system.h>
 
 static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_fclone_cache;
 
 /*
  *     Keep out-of-line to prevent kernel bloat.
@@ -118,7 +119,7 @@ void skb_under_panic(struct sk_buff *skb
  */
 
 /**
- *     alloc_skb       -       allocate a network buffer
+ *     __alloc_skb     -       allocate a network buffer
  *     @size: size to allocate
  *     @gfp_mask: allocation mask
  *
@@ -129,14 +130,20 @@ void skb_under_panic(struct sk_buff *skb
  *     Buffers may only be allocated from interrupts using a @gfp_mask of
  *     %GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+                           int fclone)
 {
        struct sk_buff *skb;
        u8 *data;
 
        /* Get the HEAD */
-       skb = kmem_cache_alloc(skbuff_head_cache,
-                              gfp_mask & ~__GFP_DMA);
+       if (fclone)
+               skb = kmem_cache_alloc(skbuff_fclone_cache,
+                                      gfp_mask & ~__GFP_DMA);
+       else
+               skb = kmem_cache_alloc(skbuff_head_cache,
+                                      gfp_mask & ~__GFP_DMA);
+
        if (!skb)
                goto out;
 
@@ -154,6 +161,11 @@ struct sk_buff *alloc_skb(unsigned int s
        skb->tail = data;
        skb->end  = data + size;
 
+       if (fclone) {
+               skb->fclone = SKB_FCLONE_ORIG;
+               atomic_set(&skb->fclone_ref, 1);
+       }
+
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags  = 0;
        skb_shinfo(skb)->tso_size = 0;
@@ -266,8 +278,33 @@ void skb_release_data(struct sk_buff *sk
  */
 void kfree_skbmem(struct sk_buff *skb)
 {
+       struct sk_buff *other;
+
        skb_release_data(skb);
-       kmem_cache_free(skbuff_head_cache, skb);
+
+       switch (skb->fclone) {
+       case SKB_FCLONE_UNAVAILABLE:
+               kmem_cache_free(skbuff_head_cache, skb);
+               break;
+
+       case SKB_FCLONE_ORIG:
+               if (!atomic_dec_and_test(&skb->fclone_ref))
+                       kmem_cache_free(skbuff_fclone_cache, skb);
+               break;
+
+       case SKB_FCLONE_CLONE:
+               other = skb - 1;
+
+               if (!atomic_dec_and_test(&other->fclone_ref)) {
+                       kmem_cache_free(skbuff_fclone_cache, other);
+               } else {
+                       /* The clone portion is available for
+                        * fast-cloning again.
+                        */
+                       skb->fclone = SKB_FCLONE_UNAVAILABLE;
+               }
+               break;
+       };
 }
 
 /**
@@ -322,10 +359,19 @@ void __kfree_skb(struct sk_buff *skb)
 
 struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
-       struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+       struct sk_buff *n;
 
-       if (!n) 
-               return NULL;
+       n = skb + 1;
+       if (skb->fclone == SKB_FCLONE_ORIG &&
+           n->fclone == SKB_FCLONE_UNAVAILABLE) {
+               n->fclone = SKB_FCLONE_CLONE;
+               atomic_inc(&skb->fclone_ref);
+       } else {
+               n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+               if (!n)
+                       return NULL;
+               n->fclone = SKB_FCLONE_UNAVAILABLE;
+       }
 
 #define C(x) n->x = skb->x
 
@@ -407,6 +453,7 @@ static void copy_skb_header(struct sk_bu
        new->mac.raw    = old->mac.raw + offset;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        new->local_df   = old->local_df;
+       new->fclone     = SKB_FCLONE_UNAVAILABLE;
        new->pkt_type   = old->pkt_type;
        new->stamp      = old->stamp;
        new->destructor = NULL;
@@ -1645,12 +1692,20 @@ void __init skb_init(void)
                                              NULL, NULL);
        if (!skbuff_head_cache)
                panic("cannot create skbuff cache");
+
+       skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+                                               2*sizeof(struct sk_buff),
+                                               0,
+                                               SLAB_HWCACHE_ALIGN,
+                                               NULL, NULL);
+       if (!skbuff_fclone_cache)
+               panic("cannot create skbuff cache");
 }
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1579,7 +1579,7 @@ void tcp_send_fin(struct sock *sk)
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
-                       skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+                       skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        yield();
@@ -1801,7 +1801,7 @@ int tcp_connect(struct sock *sk)
 
        tcp_connect_init(sk);
 
-       buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+       buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;
 