* David S. Miller <[EMAIL PROTECTED]> 2005-08-14 17:56
> From: Herbert Xu <[EMAIL PROTECTED]>
> Date: Mon, 15 Aug 2005 10:42:44 +1000
>
> > Well I don't think I understand the new skb->user solution yet :)
> >
> > For a start, skb->users will prevent the skb->data area from being
> > freed as well as the skb itself. We'll also need to audit the TCP
> > code paths to make sure that nobody is doing a skb_shared() check
> > since that either leads to another clone or a BUG.
>
> That's a good point. I had intended to work out the implementation
> details, then present it for comment, if it's doable at all.
I just tried to solve this issue by teaching skb_shared() about fclone,
but we have a race: marking the clone portion as no longer used and the
atomic_dec() of the parent's users are not done atomically. This means
that we would declare skbs as shared when they actually aren't (never
vice versa); the question is whether this is a problem.

We currently have the following transitions:

      *FREE*
         ^
         | free(a)
         |
              clone()            free(a)
   ORIG [1] --------> ORIG [2] --------> ORIG [1]
   --------  free(b)  --------           --------  free(b)
   UNAVAIL <--------    CLONE              CLONE --------> *FREE*

free(a) is a kfree_skb() on the parent and free(b) is a kfree_skb() on
the clone portion. The syntax is:

   <parent state> [refcnt]
   -----------------------
        <clone state>

*FREE* is when we actually free the skb.

The critical point is still the async call to kfree_skb() for the clone
portion: we potentially race against the cloneable check in clone(), but
that's not a problem, because if we get interrupted between the
state = UNAVAIL store and the atomic_dec(users) we simply don't reuse
the clone when we could. So except for the skb_shared() issue this looks
pretty good.
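To make the window concrete, here's a minimal userland model of it
(ordinary C11 atomics instead of the kernel primitives; mini_skb,
mini_skb_shared and the pair[] layout are made up for illustration,
this is not the patch itself):

/* Userland model of the kfree_skb() vs. skb_shared() window.
 * Not kernel code: mini_skb stands in for struct sk_buff. */
#include <stdatomic.h>
#include <stdio.h>

enum { UNAVAILABLE, ORIG, CLONE };

struct mini_skb {
	int fclone;		/* models skb->fclone */
	atomic_int users;	/* models skb->users */
};

/* pair[0] is the parent, pair[1] the clone portion, as if they came
 * from the two-skb fclone cache.  Initial state: just cloned. */
static struct mini_skb pair[2] = {
	{ .fclone = ORIG,  .users = 2 },	/* owner ref + clone ref */
	{ .fclone = CLONE, .users = 1 },
};

/* Mirrors the patched skb_shared(): a live clone portion accounts
 * for one extra reference on the parent. */
static int mini_skb_shared(struct mini_skb *skb)
{
	if (skb->fclone == ORIG && (skb + 1)->fclone == CLONE)
		return atomic_load(&skb->users) > 2;
	return atomic_load(&skb->users) != 1;
}

int main(void)
{
	/* Freeing the clone portion takes two separate steps.  Step 1,
	 * as in kfree_skbmem(): mark the slot reusable. */
	pair[1].fclone = UNAVAILABLE;

	/* The window: users is still 2 but the fclone test fails, so
	 * the else branch reports "shared" -- prints 1. */
	printf("in the window: shared = %d\n", mini_skb_shared(&pair[0]));

	/* Step 2: drop the clone's reference on the parent. */
	atomic_fetch_sub(&pair[0].users, 1);

	/* Consistent again -- prints 0. */
	printf("after the dec: shared = %d\n", mini_skb_shared(&pair[0]));
	return 0;
}

Between the two steps the check falls through to the users != 1 branch
and momentarily reports "shared" although the only other reference is
the dying clone, which is exactly the false positive described above.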
Dave, I found another problem in the earlier patch of mine: when we free
the clone portion while the parent is still alive, we used to set UNAVAIL
in the else branch, but at that point the skb could already have been
gone. I fixed this in this patch.

Index: net-2.6.14/include/linux/skbuff.h
===================================================================
--- net-2.6.14.orig/include/linux/skbuff.h
+++ net-2.6.14/include/linux/skbuff.h
@@ -155,6 +155,12 @@ struct skb_shared_info {
 #define SKB_DATAREF_SHIFT 16
 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
 
+enum {
+	SKB_FCLONE_UNAVAILABLE,
+	SKB_FCLONE_ORIG,
+	SKB_FCLONE_CLONE,
+};
+
 /**
  *	struct sk_buff - socket buffer
  *	@next: Next buffer in list
@@ -248,7 +254,8 @@ struct sk_buff {
 				ip_summed:2,
 				nohdr:1,
 				nfctinfo:3;
-	__u8			pkt_type;
+	__u8			pkt_type:3,
+				fclone:2;
 	__u16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
@@ -288,8 +295,20 @@ struct sk_buff {
 #include <asm/system.h>
 
 extern void	       __kfree_skb(struct sk_buff *skb);
-extern struct sk_buff *alloc_skb(unsigned int size,
-				 unsigned int __nocast priority);
+extern struct sk_buff *__alloc_skb(unsigned int size,
+				   unsigned int __nocast priority, int fclone);
+static inline struct sk_buff *alloc_skb(unsigned int size,
+					unsigned int __nocast priority)
+{
+	return __alloc_skb(size, priority, 0);
+}
+
+static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
+					       unsigned int __nocast priority)
+{
+	return __alloc_skb(size, priority, 1);
+}
+
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 					    unsigned int size,
 					    unsigned int __nocast priority);
@@ -442,7 +461,11 @@ static inline void skb_header_release(st
  */
 static inline int skb_shared(const struct sk_buff *skb)
 {
-	return atomic_read(&skb->users) != 1;
+	if (skb->fclone == SKB_FCLONE_ORIG &&
+	    (skb + 1)->fclone == SKB_FCLONE_CLONE)
+		return atomic_read(&skb->users) > 2;
+	else
+		return atomic_read(&skb->users) != 1;
 }
 
 /**
Index: net-2.6.14/include/net/sock.h
===================================================================
--- net-2.6.14.orig/include/net/sock.h
+++ net-2.6.14/include/net/sock.h
@@ -1195,7 +1195,7 @@ static inline struct sk_buff *sk_stream_
 		int hdr_len;
 
 		hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
-		skb = alloc_skb(size + hdr_len, gfp);
+		skb = alloc_skb_fclone(size + hdr_len, gfp);
 		if (skb) {
 			skb->truesize += mem;
 			if (sk->sk_forward_alloc >= (int)skb->truesize ||
Index: net-2.6.14/net/core/skbuff.c
===================================================================
--- net-2.6.14.orig/net/core/skbuff.c
+++ net-2.6.14/net/core/skbuff.c
@@ -69,6 +69,7 @@
 #include <asm/system.h>
 
 static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_fclone_cache;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -118,7 +119,7 @@ void skb_under_panic(struct sk_buff *skb
  */
 
 /**
- *	alloc_skb	-	allocate a network buffer
+ *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *
@@ -129,14 +130,20 @@ void skb_under_panic(struct sk_buff *skb
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
-struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+			    int fclone)
 {
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	skb = kmem_cache_alloc(skbuff_head_cache,
-			       gfp_mask & ~__GFP_DMA);
+	if (fclone)
+		skb = kmem_cache_alloc(skbuff_fclone_cache,
+				       gfp_mask & ~__GFP_DMA);
+	else
+		skb = kmem_cache_alloc(skbuff_head_cache,
+				       gfp_mask & ~__GFP_DMA);
+
 	if (!skb)
 		goto out;
@@ -154,6 +161,9 @@ struct sk_buff *alloc_skb(unsigned int s
 	skb->tail = data;
 	skb->end  = data + size;
 
+	if (fclone)
+		skb->fclone = SKB_FCLONE_ORIG;
+
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->tso_size = 0;
@@ -266,8 +276,31 @@ void skb_release_data(struct sk_buff *sk
  */
 void kfree_skbmem(struct sk_buff *skb)
 {
+	struct sk_buff *other;
+
 	skb_release_data(skb);
-	kmem_cache_free(skbuff_head_cache, skb);
+
+	switch (skb->fclone) {
+	case SKB_FCLONE_UNAVAILABLE:
+		kmem_cache_free(skbuff_head_cache, skb);
+		break;
+
+	case SKB_FCLONE_ORIG:
+		/* We can only reach this point if the clone portion
+		 * is no longer in use */
+		kmem_cache_free(skbuff_fclone_cache, skb);
+		break;
+
+	case SKB_FCLONE_CLONE:
+		other = skb - 1;
+
+		/* The clone portion is available for fast-cloning again. */
+		skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+		if (atomic_dec_and_test(&other->users))
+			kmem_cache_free(skbuff_fclone_cache, other);
+		break;
+	}
 }
@@ -322,10 +355,21 @@ void __kfree_skb(struct sk_buff *skb)
 struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
-	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+	struct sk_buff *n = skb + 1;
 
-	if (!n)
-		return NULL;
+	if (skb->fclone == SKB_FCLONE_ORIG &&
+	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
+		n->fclone = SKB_FCLONE_CLONE;
+		/* Get a reference on ourselves to be released
+		 * by the clone portion.
+		 */
+		skb_get(skb);
+	} else {
+		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+		if (!n)
+			return NULL;
+
+		n->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
 
 #define C(x) n->x = skb->x
@@ -407,6 +451,7 @@ static void copy_skb_header(struct sk_bu
 	new->mac.raw	= old->mac.raw + offset;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->local_df	= old->local_df;
+	new->fclone	= SKB_FCLONE_UNAVAILABLE;
 	new->pkt_type	= old->pkt_type;
 	new->stamp	= old->stamp;
 	new->destructor = NULL;
@@ -1645,12 +1690,20 @@ void __init skb_init(void)
 					      NULL, NULL);
 	if (!skbuff_head_cache)
 		panic("cannot create skbuff cache");
+
+	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+						2*sizeof(struct sk_buff),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+	if (!skbuff_fclone_cache)
+		panic("cannot create skbuff cache");
 }
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);
Index: net-2.6.14/net/ipv4/tcp_output.c
===================================================================
--- net-2.6.14.orig/net/ipv4/tcp_output.c
+++ net-2.6.14/net/ipv4/tcp_output.c
@@ -1579,7 +1579,7 @@ void tcp_send_fin(struct sock *sk)
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
 			if (skb)
 				break;
 			yield();
@@ -1801,7 +1801,7 @@ int tcp_connect(struct sock *sk)
 
 	tcp_connect_init(sk);
 
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
 	if (unlikely(buff == NULL))
 		return -ENOBUFS;
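For what it's worth, the lifetime rules the patch implements can be
modeled in userland C as well (malloc() standing in for the slab cache,
kfree_skb() and kfree_skbmem() collapsed into a single free_one(); all
names here are illustrative, not the kernel API):

/* Userland model of the fclone lifetime rules: one allocation holds
 * parent + clone slot, like the 2*sizeof(struct sk_buff) objects in
 * skbuff_fclone_cache.  Illustrative only, not kernel code. */
#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>

enum { UNAVAILABLE, ORIG, CLONE };

struct mini_skb {
	int fclone;
	atomic_int users;
};

static struct mini_skb *alloc_fclone_pair(void)
{
	struct mini_skb *skb = calloc(2, sizeof(*skb));

	if (skb) {
		skb[0].fclone = ORIG;	/* slot starts out UNAVAILABLE (0) */
		atomic_init(&skb[0].users, 1);
	}
	return skb;
}

static struct mini_skb *clone_fast(struct mini_skb *skb)
{
	struct mini_skb *n = skb + 1;

	if (skb->fclone != ORIG || n->fclone != UNAVAILABLE)
		return NULL;	/* slow path (separate allocation) elided */
	n->fclone = CLONE;
	atomic_init(&n->users, 1);
	atomic_fetch_add(&skb->users, 1);	/* the clone pins the pair */
	return n;
}

static void free_one(struct mini_skb *skb)
{
	if (atomic_fetch_sub(&skb->users, 1) != 1)
		return;		/* somebody still holds a reference */

	switch (skb->fclone) {
	case ORIG:
		free(skb);	/* only reachable once the slot is unused */
		break;
	case CLONE:
		/* Mark the slot reusable first, while the reference we
		 * are about to drop still keeps the pair alive. */
		skb->fclone = UNAVAILABLE;
		free_one(skb - 1);
		break;
	}
}

int main(void)
{
	struct mini_skb *skb = alloc_fclone_pair();
	struct mini_skb *n;

	if (!skb)
		return 1;
	n = clone_fast(skb);
	assert(n == skb + 1);	/* adjacent slot, no second allocation */
	free_one(n);		/* slot reusable, pair still alive */
	n = clone_fast(skb);
	assert(n == skb + 1);	/* fast-clone works again */
	free_one(n);
	free_one(skb);		/* last reference, pair freed as a whole */
	return 0;
}

The order of the two statements in the CLONE case is the fix mentioned
above: the slot is marked reusable while the clone's reference still
pins the pair, and only then is that reference dropped.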