TCP performance with TSO over networks with delay is awful. On a 100 Mbit/sec link with 150 ms of delay, we get 4 Mbit/sec with TSO enabled and 50 Mbit/sec without TSO.
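Rough back-of-the-envelope numbers (editorial arithmetic, assuming the 150 ms is the round-trip time):

	bandwidth-delay product:  100 Mbit/s * 0.150 s = 15 Mbit  ~ 1.9 MB ~ 1250 segments of 1500 bytes
	observed with TSO:          4 Mbit/s * 0.150 s = 0.6 Mbit ~  75 KB ~   50 segments in flight
	observed without TSO:      50 Mbit/s * 0.150 s = 7.5 Mbit ~ 940 KB ~  625 segments in flight

In other words, with TSO the connection keeps only a few dozen packets in flight and runs at roughly 4% of what the pipe can carry.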
The problem is that with TSO we intentionally do not keep the maximum number of packets in flight to fill the window; we hold back until we can send a full MSS-sized chunk. The following change compensates for this deferral. Please apply this one (even if the rest are too experimental).

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

--- net-2.6.orig/include/net/tcp.h
+++ net-2.6/include/net/tcp.h
@@ -678,8 +678,7 @@ struct tcp_congestion_ops {
 	/* lower bound for congestion window (optional) */
 	u32 (*min_cwnd)(struct sock *sk);
 	/* do new cwnd calculation (required) */
-	void (*cong_avoid)(struct sock *sk, u32 ack,
-			   u32 rtt, u32 in_flight, int good_ack);
+	void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, int good_ack);
 	/* round trip time sample per acked packet (optional) */
 	void (*rtt_sample)(struct sock *sk, u32 usrtt);
 	/* call before changing ca_state (optional) */
@@ -708,8 +707,7 @@ extern int tcp_set_congestion_control(st
 extern struct tcp_congestion_ops tcp_init_congestion_ops;
 
 extern u32 tcp_reno_ssthresh(struct sock *sk);
-extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack,
-				u32 rtt, u32 in_flight, int flag);
+extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int flag);
 extern u32 tcp_reno_min_cwnd(struct sock *sk);
 extern struct tcp_congestion_ops tcp_reno;
 
--- net-2.6.orig/net/ipv4/tcp_bic.c
+++ net-2.6/net/ipv4/tcp_bic.c
@@ -209,17 +209,13 @@ static inline void bictcp_low_utilizatio
 }
 
 
-static void bictcp_cong_avoid(struct sock *sk, u32 ack,
-			      u32 seq_rtt, u32 in_flight, int data_acked)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, int data_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
 	bictcp_low_utilization(sk, data_acked);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		/* In "safe" area, increase. */
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
--- net-2.6.orig/net/ipv4/tcp_cong.c
+++ net-2.6/net/ipv4/tcp_cong.c
@@ -181,14 +181,10 @@ int tcp_set_congestion_control(struct so
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
-			 int flag)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		/* In "safe" area, increase. */
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
--- net-2.6.orig/net/ipv4/tcp_highspeed.c
+++ net-2.6/net/ipv4/tcp_highspeed.c
@@ -110,15 +110,11 @@ static void hstcp_init(struct sock *sk)
 		tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
-			     u32 in_flight, int good)
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, int good)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hstcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
 			tp->snd_cwnd++;
--- net-2.6.orig/net/ipv4/tcp_htcp.c
+++ net-2.6/net/ipv4/tcp_htcp.c
@@ -201,15 +201,11 @@ static u32 htcp_recalc_ssthresh(struct s
 	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
 }
 
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
-			    u32 in_flight, int data_acked)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int data_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		/* In "safe" area, increase. */
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
--- net-2.6.orig/net/ipv4/tcp_hybla.c
+++ net-2.6/net/ipv4/tcp_hybla.c
@@ -86,8 +86,7 @@ static inline u32 hybla_fraction(u32 odd
  *     o Give cwnd a new value based on the model proposed
  *     o remember increments <1
  */
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
-			    u32 in_flight, int flag)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hybla *ca = inet_csk_ca(sk);
@@ -101,10 +100,7 @@ static void hybla_cong_avoid(struct sock
 	}
 
 	if (!ca->hybla_en)
-		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
-
-	if (in_flight < tp->snd_cwnd)
-		return;
+		return tcp_reno_cong_avoid(sk, ack, rtt, flag);
 
 	if (ca->rho == 0)
 		hybla_recalc_param(sk);
--- net-2.6.orig/net/ipv4/tcp_input.c
+++ net-2.6/net/ipv4/tcp_input.c
@@ -1977,11 +1977,10 @@ static inline void tcp_ack_update_rtt(st
 		tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
 }
 
-static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
-				  u32 in_flight, int good)
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int good)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, good);
 	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -2204,6 +2203,29 @@ static inline int tcp_may_raise_cwnd(con
 		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
 }
 
+/* RFC2861 Check whether we are limited by application or congestion window
+ * This is the inverse of cwnd check in tcp_tso_should_defer
+ */
+static inline int tcp_may_update_cwnd(const struct sock *sk, u32 in_flight)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 left;
+
+	if (in_flight >= tp->snd_cwnd)
+		return 1;
+
+	/* TCP vegas updates based on rtt data, not cwnd */
+	if (icsk->icsk_ca_ops->rtt_sample)
+		return 1;
+
+	left = tp->snd_cwnd - in_flight;
+	if (sysctl_tcp_tso_win_divisor)
+		return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
+	else
+		return left <= tcp_max_burst(tp);
+}
+
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
@@ -2360,12 +2382,14 @@ static int tcp_ack(struct sock *sk, stru
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
-			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)
+		    && tcp_may_update_cwnd(sk, prior_in_flight))
+			tcp_cong_avoid(sk, ack, seq_rtt, 0);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag & FLAG_DATA_ACKED))
-			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
+		if ((flag & FLAG_DATA_ACKED)
+		    && tcp_may_update_cwnd(sk, prior_in_flight))
+			tcp_cong_avoid(sk, ack, seq_rtt, 1);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
--- net-2.6.orig/net/ipv4/tcp_scalable.c
+++ net-2.6/net/ipv4/tcp_scalable.c
@@ -16,12 +16,9 @@
 #define TCP_SCALABLE_AI_CNT	50U
 #define TCP_SCALABLE_MD_SCALE	3
 
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
-				    u32 in_flight, int flag)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
 
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		tp->snd_cwnd++;
--- net-2.6.orig/net/ipv4/tcp_vegas.c
+++ net-2.6/net/ipv4/tcp_vegas.c
@@ -162,14 +162,13 @@ static void tcp_vegas_cwnd_event(struct
 		tcp_vegas_init(sk);
 }
 
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
-				 u32 seq_rtt, u32 in_flight, int flag)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct vegas *vegas = inet_csk_ca(sk);
 
 	if (!vegas->doing_vegas_now)
-		return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+		return tcp_reno_cong_avoid(sk, ack, seq_rtt, flag);
 
 	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
 	 *

-- 
Stephen Hemminger <[EMAIL PROTECTED]>
OSDL http://developer.osdl.org/~shemminger