Hi,

Thank you for the update!

On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
> 
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.

If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):

---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
        refcount_inc(&uarg->refcnt);
 }
 
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+       if (uarg)
+               refcount_set(&uarg->refcnt, cnt);
+}
+
 void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
        return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 {
        if (skb && uarg && !skb_zcopy(skb)) {
-               sock_zerocopy_get(uarg);
                skb_shinfo(skb)->destructor_arg = uarg;
                skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+               return 1;
        }
+       return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+       if (__skb_zcopy_set(skb, uarg))
+               sock_zerocopy_get(uarg);
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
        uarg->len = 1;
        uarg->bytelen = size;
        uarg->zerocopy = 1;
-       refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+       refcount_set(&uarg->refcnt, 1);
        sock_hold(sk);
 
        return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
                atomic_dec(&sk->sk_zckey);
                uarg->len--;
 
-               /* Stream socks hold a ref for the syscall, as skbs can be sent
-                * and freed inside the loop, dropping refcnt to 0 inbetween.
-                * Datagrams do not need this, but sock_zerocopy_put expects it.
-                */
-               if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
-                       refcount_set(&uarg->refcnt, 1);
-
                sock_zerocopy_put(uarg);
        }
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
        struct rtable *rt = (struct rtable *)cork->dst;
        unsigned int wmem_alloc_delta = 0;
        u32 tskey = 0;
+       int uarg_refs = 0;
        bool paged;
 
        skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
 
        if (flags & MSG_ZEROCOPY && length) {
                uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
                if (!uarg)
                        return -ENOBUFS;
+               uarg_refs = 1;
                if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
                        paged = true;
                } else {
                        uarg->zerocopy = 0;
-                       skb_zcopy_set(skb, uarg);
+                       uarg_refs += __skb_zcopy_set(skb, uarg);
                }
        }
 
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
-                       skb_zcopy_set(skb, uarg);
+                       uarg_refs += __skb_zcopy_set(skb, uarg);
 
                        /*
                         *      Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
                length -= copy;
        }
 
+       __sock_zerocopy_init(uarg, uarg_refs);
        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
        err = -EFAULT;
 error:
+       __sock_zerocopy_init(uarg, uarg_refs);
        sock_zerocopy_put_abort(uarg);
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme already in place for wmem
accounting: do the book-keeping with a plain local counter inside the
loop, and touch the atomic reference count only once, at the end of the
loop.
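
For comparison, the existing wmem accounting in __ip_append_data() has
roughly this shape (simplified sketch, not the literal kernel code):

        unsigned int wmem_alloc_delta = 0;

        while (length > 0) {
                /* ... allocate skb, copy user data into it ... */

                /* plain, non-atomic book-keeping inside the loop */
                wmem_alloc_delta += skb->truesize;
        }

        /* single atomic update once the loop is done */
        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

In the diff above, uarg_refs plays the same role for the uarg refcnt,
and __sock_zerocopy_init() is the single atomic store at the end.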

WDYT?

Thanks,

Paolo
