On Thu, Sep 27, 2018 at 11:21 AM, Yuchung Cheng <ych...@google.com> wrote: > Previously the TCP initial receive buffer was ~87KB by default and > the initial receive window was ~29KB (20 MSS). This patch changes > the two numbers to 128KB and ~64KB (rounding down to a multiple > of MSS) respectively. The patch also simplifies the calculations so that > the two numbers are directly controlled by sysctl tcp_rmem[1]: > > 1) Initial receiver buffer budget (sk_rcvbuf): while this should > be configured via sysctl tcp_rmem[1], previously tcp_fixup_rcvbuf() > always overrode it and set a larger size when a new connection > was established. > > 2) Initial receive window in SYN: previously it was set to 20 > packets if MSS <= 1460. The number 20 was based on the initial > congestion window of 10: the receiver needs twice that amount to > avoid being limited by the receive window upon out-of-order > delivery in the first window burst. But since this only > applies if the receiving MSS <= 1460, connections using a large MTU > (e.g. to utilize receiver zero-copy) may be limited by the > receive window. > > With this patch TCP memory configuration is more straightforward and > more properly sized for modern high-speed networks by default. Several > popular stacks have been announcing 64KB rwin in SYNs as well. Sorry, please ignore this patch for now.
We need to adjust rbuf autotuning as well; otherwise, with a larger initial rbuf, it may increase too slowly during slow start. Will submit a v2. > > Signed-off-by: Yuchung Cheng <ych...@google.com> > Signed-off-by: Wei Wang <wei...@google.com> > Signed-off-by: Neal Cardwell <ncardw...@google.com> > Signed-off-by: Eric Dumazet <eduma...@google.com> > Reviewed-by: Soheil Hassas Yeganeh <soh...@google.com> > --- > net/ipv4/tcp.c | 4 ++-- > net/ipv4/tcp_input.c | 25 ++----------------------- > net/ipv4/tcp_output.c | 25 ++++--------------------- > 3 files changed, 8 insertions(+), 46 deletions(-) > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index 69c236943f56..dcf51fbf5ec7 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -3896,8 +3896,8 @@ void __init tcp_init(void) > init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); > > init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; > - init_net.ipv4.sysctl_tcp_rmem[1] = 87380; > - init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); > + init_net.ipv4.sysctl_tcp_rmem[1] = 131072; > + init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); > > pr_info("Hash tables configured (established %u bind %u)\n", > tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index d703a0b3b6a2..7a59f6a96212 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const > struct sk_buff *skb) > } > } > > -/* 3. Tuning rcvbuf, when connection enters established state. 
*/ > -static void tcp_fixup_rcvbuf(struct sock *sk) > -{ > - u32 mss = tcp_sk(sk)->advmss; > - int rcvmem; > - > - rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * > - tcp_default_init_rwnd(mss); > - > - /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency > - * Allow enough cushion so that sender is not limited by our window > - */ > - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) > - rcvmem <<= 2; > - > - if (sk->sk_rcvbuf < rcvmem) > - sk->sk_rcvbuf = min(rcvmem, > sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); > -} > - > -/* 4. Try to fixup all. It is made immediately after connection enters > +/* 3. Try to fixup all. It is made immediately after connection enters > * established state. > */ > void tcp_init_buffer_space(struct sock *sk) > @@ -454,8 +435,6 @@ void tcp_init_buffer_space(struct sock *sk) > struct tcp_sock *tp = tcp_sk(sk); > int maxwin; > > - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) > - tcp_fixup_rcvbuf(sk); > if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) > tcp_sndbuf_expand(sk); > > @@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) > tp->snd_cwnd_stamp = tcp_jiffies32; > } > > -/* 5. Recalculate window clamp after socket hit its memory bounds. */ > +/* 4. Recalculate window clamp after socket hit its memory bounds. */ > static void tcp_clamp_window(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c > index fe7855b090e4..059b67af28b1 100644 > --- a/net/ipv4/tcp_output.c > +++ b/net/ipv4/tcp_output.c > @@ -195,21 +195,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, > unsigned int pkts, > inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); > } > > - > -u32 tcp_default_init_rwnd(u32 mss) > -{ > - /* Initial receive window should be twice of TCP_INIT_CWND to > - * enable proper sending of new unsent data during fast recovery > - * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a > - * limit when mss is larger than 1460. 
> - */ > - u32 init_rwnd = TCP_INIT_CWND * 2; > - > - if (mss > 1460) > - init_rwnd = max((1460 * init_rwnd) / mss, 2U); > - return init_rwnd; > -} > - > /* Determine a window scaling and initial window to offer. > * Based on the assumption that the given amount of space > * will be offered. Store the results in the tp structure. > @@ -244,7 +229,10 @@ void tcp_select_initial_window(const struct sock *sk, > int __space, __u32 mss, > if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) > (*rcv_wnd) = min(space, MAX_TCP_WINDOW); > else > - (*rcv_wnd) = space; > + (*rcv_wnd) = min_t(u32, space, U16_MAX); > + > + if (init_rcv_wnd) > + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); > > (*rcv_wscale) = 0; > if (wscale_ok) { > @@ -257,11 +245,6 @@ void tcp_select_initial_window(const struct sock *sk, > int __space, __u32 mss, > (*rcv_wscale)++; > } > } > - > - if (!init_rcv_wnd) /* Use default unless specified otherwise */ > - init_rcv_wnd = tcp_default_init_rwnd(mss); > - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); > - > /* Set the clamp no higher than max representable value */ > (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), > *window_clamp); > } > -- > 2.19.0.605.g01d371f741-goog >