Hi, Thank you for the update!
On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote: > I did revert to the basic implementation using an extra ref > for the function call, similar to TCP, as you suggested. > > On top of that as a separate optimization patch I have a > variant that uses refcnt zero by replacing refcount_inc with > refcount_set(.., refcount_read(..) + 1). Not very pretty. If the skb/uarg is not shared (no other threads can touch the refcnt) before ip*_append_data() completes, how about something like the following (incremental diff on top of patch 1/2, untested, uncompiled, just to give the idea): --- diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04f52e719571..1e3d195ffdfb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg) refcount_inc(&uarg->refcnt); } +/* use only before uarg is actually shared */ +static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt) +{ + if (uarg) + refcount_set(&uarg->refcnt, cnt); +} + void sock_zerocopy_put(struct ubuf_info *uarg); void sock_zerocopy_put_abort(struct ubuf_info *uarg); @@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? 
skb_uarg(skb) : NULL; } -static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) { if (skb && uarg && !skb_zcopy(skb)) { - sock_zerocopy_get(uarg); skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + return 1; } + return 0; +} + +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +{ + if (__skb_zcopy_set(skb, uarg)) + sock_zerocopy_get(uarg); } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2179ef84bb44..435bac91d293 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0); + refcount_set(&uarg->refcnt, 1); sock_hold(sk); return uarg; @@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - /* Stream socks hold a ref for the syscall, as skbs can be sent - * and freed inside the loop, dropping refcnt to 0 inbetween. - * Datagrams do not need this, but sock_zerocopy_put expects it. 
- */ - if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt)) - refcount_set(&uarg->refcnt, 1); - sock_zerocopy_put(uarg); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7504da2f33d6..d3285613d87a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk, struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; + int uarg_refs = 0; bool paged; skb = skb_peek_tail(queue); @@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk, if (flags & MSG_ZEROCOPY && length) { uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg_refs = 1; if (!uarg) return -ENOBUFS; if (rt->dst.dev->features & NETIF_F_SG && @@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk, paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + uarg_refs += __skb_zcopy_set(skb, uarg); } } @@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk, cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; - skb_zcopy_set(skb, uarg); + uarg_refs += __skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes. @@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk, length -= copy; } + __sock_zerocopy_init(uarg, uarg_refs); if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; @@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk, error_efault: err = -EFAULT; error: + __sock_zerocopy_init(uarg, uarg_refs); sock_zerocopy_put_abort(uarg); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); --- The basic idea is to use the same scheme currently used for wmem accounting: do the book-keeping inside the loop and set the atomic reference counter only once at the end of the loop. WDYT? Thanks, Paolo