On Tue, Nov 15, 2016 at 12:51 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
>
> From: Eric Dumazet <eduma...@google.com>
>
> In commit 2331ccc5b323 ("tcp: enhance tcp collapsing"),
> we made a first step allowing copying right skb to left skb head.
>
> Since all skbs in the socket write queue are headless (except possibly the
> very first one), this strategy often does not work.
>
> This patch extends tcp_collapse_retrans() to perform frag shifting,
> thanks to skb_shift() helper.
>
> This helper needs to not BUG on non headless skbs, as callers are ok
> with that.
>
> Tested:
>
> Following packetdrill test now passes :
>
> 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
>    +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
>    +0 bind(3, ..., ...) = 0
>    +0 listen(3, 1) = 0
>
>    +0 < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 8>
>    +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
> +.100 < . 1:1(0) ack 1 win 257
>    +0 accept(3, ..., ...) = 4
>
>    +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
>    +0 write(4, ..., 200) = 200
>    +0 > P. 1:201(200) ack 1
> +.001 write(4, ..., 200) = 200
>    +0 > P. 201:401(200) ack 1
> +.001 write(4, ..., 200) = 200
>    +0 > P. 401:601(200) ack 1
> +.001 write(4, ..., 200) = 200
>    +0 > P. 601:801(200) ack 1
> +.001 write(4, ..., 200) = 200
>    +0 > P. 801:1001(200) ack 1
> +.001 write(4, ..., 100) = 100
>    +0 > P. 1001:1101(100) ack 1
> +.001 write(4, ..., 100) = 100
>    +0 > P. 1101:1201(100) ack 1
> +.001 write(4, ..., 100) = 100
>    +0 > P. 1201:1301(100) ack 1
> +.001 write(4, ..., 100) = 100
>    +0 > P. 1301:1401(100) ack 1
>
> +.099 < . 1:1(0) ack 201 win 257
> +.001 < . 1:1(0) ack 201 win 257 <nop,nop,sack 1001:1401>
>    +0 > P. 201:1001(800) ack 1
>
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Cc: Neal Cardwell <ncardw...@google.com>
> Cc: Yuchung Cheng <ych...@google.com>
Acked-by: Yuchung Cheng <ych...@google.com>

Nice follow-up patch. This also works well with RACK loss detection:
RACK only cares about time (skb_mstamp), not sequence, so
collapsing sequences is not a problem.

> ---
>  net/core/skbuff.c     |    4 +++-
>  net/ipv4/tcp_output.c |   22 +++++++++++-----------
>  2 files changed, 14 insertions(+), 12 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 0b2a6e94af2de73ed638634c47a0fb71e2cbc1cb..a9cb81a10c4ba895587727aa4cf098e9a38424ea 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2656,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
>         struct skb_frag_struct *fragfrom, *fragto;
>
>         BUG_ON(shiftlen > skb->len);
> -       BUG_ON(skb_headlen(skb));       /* Would corrupt stream */
> +
> +       if (skb_headlen(skb))
> +               return 0;
>
>         todo = shiftlen;
>         from = 0;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index f57b5aa51b59cf0a58975fe34a7dcdb886ea8c50..19105b46a30436ebb85fe97ee43089e77aa028bb 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2514,7 +2514,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
>  }
>
>  /* Collapses two adjacent SKB's during retransmission. */
> -static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
> +static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
> @@ -2525,14 +2525,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
>
>         BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
>
> +       if (next_skb_size) {
> +               if (next_skb_size <= skb_availroom(skb))
> +                       skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
> +                                     next_skb_size);
> +               else if (!skb_shift(skb, next_skb, next_skb_size))
> +                       return false;
> +       }
>         tcp_highest_sack_combine(sk, next_skb, skb);
>
>         tcp_unlink_write_queue(next_skb, sk);
>
> -       if (next_skb_size)
> -               skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
> -                             next_skb_size);
> -
>         if (next_skb->ip_summed == CHECKSUM_PARTIAL)
>                 skb->ip_summed = CHECKSUM_PARTIAL;
>
> @@ -2561,6 +2564,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
>         tcp_skb_collapse_tstamp(skb, next_skb);
>
>         sk_wmem_free_skb(sk, next_skb);
> +       return true;
>  }
>
>  /* Check if coalescing SKBs is legal. */
> @@ -2610,16 +2614,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
>
>                 if (space < 0)
>                         break;
> -               /* Punt if not enough space exists in the first SKB for
> -                * the data in the second
> -                */
> -               if (skb->len > skb_availroom(to))
> -                       break;
>
>                 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
>                         break;
>
> -               tcp_collapse_retrans(sk, to);
> +               if (!tcp_collapse_retrans(sk, to))
> +                       break;
>         }
>  }
>
>
>

Reply via email to