tcp_output.c:918

David S. Miller Sun, 31 Jul 2005 21:49:20 -0700

From: Herbert Xu <[EMAIL PROTECTED]>
Date: Mon, 1 Aug 2005 09:36:57 +1000


> On Sun, Jul 31, 2005 at 04:21:53PM -0700, David S. Miller wrote:
> >
> > However, if that actually occurs, cwnd_quota would decrement past zero
> > in tcp_write_xmit() and hit negative, something we also BUG() on way
> > before the next tcp_tso_should_defer() call.
> 
> Not necessarily.  We could hit tso_fragment first so that cwnd_quota
> doesn't go negative just yet until we come around again to the top
> of the loop.
> 
> I suggest that we unconditionally do tcp_set_skb_tso_segs at the top of
> the function.

I've tried to avoid doing that, but I may need to capitulate for now.
My concern was that the divide tcp_set_skb_tso_segs() does has a
non-trivial cost.

It would certainly fix this, because only the sk->sk_send_head
can ever have its tso_segs already set.

Wait... that's not true, multiple SKBs can have it set already
if we tso_fragment() or tcp_fragment() and then the tcp_transmit_skb()
fails (clone allocation failure, for example).

In that kind of case, the first two SKBs will have their tso_segs
set already, defeating your fix.

Another idea is to make tcp_init_tso_segs() reset the values if
the MSS doesn't match up.  This should work and points out another
inconsistency.  We use tp->mss_cache to set the SKB tso seg count
and MSS, but that's not right in the presence of SACK blocks.

I'll probably end up pondering all of this on my long flight to
UKUUG2005 tomorrow. :-)

Anyways, the following compile-tested-only patch shows my idea.
What do you think about this, Herbert?

[TCP]: Fix two TSO sizing bugs

MSS changes can be lost since we preemptively initialize
the tso_segs count for an SKB before we 100% commit
to sending it out.

So, by the time we send it out, the tso_size information
can be stale due to PMTU events.  This mucks up all of the
logic in our send engine, and can even result in the BUG()
triggering in tcp_tso_should_defer().

Another problem we have is that we're storing the tp->mss_cache,
not the SACK block normalized MSS, as the tso_size.  That's wrong
too.

Signed-off-by: David S. Miller <[EMAIL PROTECTED]>

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -403,11 +403,9 @@ static void tcp_queue_skb(struct sock *s
                sk->sk_send_head = skb;
 }
 
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-
-       if (skb->len <= tp->mss_cache ||
+       if (skb->len <= mss_now ||
            !(sk->sk_route_caps & NETIF_F_TSO)) {
                /* Avoid the costly divide in the normal
                 * non-TSO case.
@@ -417,10 +415,10 @@ static void tcp_set_skb_tso_segs(struct 
        } else {
                unsigned int factor;
 
-               factor = skb->len + (tp->mss_cache - 1);
-               factor /= tp->mss_cache;
+               factor = skb->len + (mss_now - 1);
+               factor /= mss_now;
                skb_shinfo(skb)->tso_segs = factor;
-               skb_shinfo(skb)->tso_size = tp->mss_cache;
+               skb_shinfo(skb)->tso_size = mss_now;
        }
 }
 
@@ -429,7 +427,7 @@ static void tcp_set_skb_tso_segs(struct 
  * packet to the list.  This won't be called frequently, I hope. 
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
@@ -492,8 +490,8 @@ static int tcp_fragment(struct sock *sk,
        }
 
        /* Fix up tso_factor for both original and new SKB.  */
-       tcp_set_skb_tso_segs(sk, skb);
-       tcp_set_skb_tso_segs(sk, buff);
+       tcp_set_skb_tso_segs(sk, skb, mss_now);
+       tcp_set_skb_tso_segs(sk, buff, mss_now);
 
        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
                tp->lost_out += tcp_skb_pcount(skb);
@@ -569,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struc
         * factor and mss.
         */
        if (tcp_skb_pcount(skb) > 1)
-               tcp_set_skb_tso_segs(sk, skb);
+               tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
 
        return 0;
 }
@@ -734,12 +732,14 @@ static inline unsigned int tcp_cwnd_test
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
        int tso_segs = tcp_skb_pcount(skb);
 
-       if (!tso_segs) {
-               tcp_set_skb_tso_segs(sk, skb);
+       if (!tso_segs ||
+           (tso_segs > 1 &&
+            skb_shinfo(skb)->tso_size != mss_now)) {
+               tcp_set_skb_tso_segs(sk, skb, mss_now);
                tso_segs = tcp_skb_pcount(skb);
        }
        return tso_segs;
@@ -817,7 +817,7 @@ static unsigned int tcp_snd_test(struct 
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int cwnd_quota;
 
-       tcp_init_tso_segs(sk, skb);
+       tcp_init_tso_segs(sk, skb, cur_mss);
 
        if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
                return 0;
@@ -854,7 +854,7 @@ int tcp_may_send_now(struct sock *sk, st
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
 {
        struct sk_buff *buff;
        int nlen = skb->len - len;
@@ -887,8 +887,8 @@ static int tso_fragment(struct sock *sk,
        skb_split(skb, buff, len);
 
        /* Fix up tso_factor for both original and new SKB.  */
-       tcp_set_skb_tso_segs(sk, skb);
-       tcp_set_skb_tso_segs(sk, buff);
+       tcp_set_skb_tso_segs(sk, skb, mss_now);
+       tcp_set_skb_tso_segs(sk, buff, mss_now);
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
@@ -976,7 +976,7 @@ static int tcp_write_xmit(struct sock *s
        if (unlikely(!skb))
                return 0;
 
-       tso_segs = tcp_init_tso_segs(sk, skb);
+       tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        cwnd_quota = tcp_cwnd_test(tp, skb);
        if (unlikely(!cwnd_quota))
                goto out;
@@ -1006,11 +1006,11 @@ static int tcp_write_xmit(struct sock *s
                                        limit = skb->len - trim;
                        }
                        if (skb->len > limit) {
-                               if (tso_fragment(sk, skb, limit))
+                               if (tso_fragment(sk, skb, limit, mss_now))
                                        break;
                        }
                } else if (unlikely(skb->len > mss_now)) {
-                       if (unlikely(tcp_fragment(sk, skb,  mss_now)))
+                       if (unlikely(tcp_fragment(sk, skb,  mss_now, mss_now)))
                                break;
                }
 
@@ -1039,7 +1039,7 @@ static int tcp_write_xmit(struct sock *s
                skb = sk->sk_send_head;
                if (!skb)
                        break;
-               tso_segs = tcp_init_tso_segs(sk, skb);
+               tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        }
 
        if (likely(sent_pkts)) {
@@ -1076,7 +1076,7 @@ void tcp_push_one(struct sock *sk, unsig
 
        BUG_ON(!skb || skb->len < mss_now);
 
-       tso_segs = tcp_init_tso_segs(sk, skb);
+       tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
        cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
 
        if (likely(cwnd_quota)) {
@@ -1093,11 +1093,11 @@ void tcp_push_one(struct sock *sk, unsig
                                        limit = skb->len - trim;
                        }
                        if (skb->len > limit) {
-                               if (unlikely(tso_fragment(sk, skb, limit)))
+                               if (unlikely(tso_fragment(sk, skb, limit, mss_now)))
                                        return;
                        }
                } else if (unlikely(skb->len > mss_now)) {
-                       if (unlikely(tcp_fragment(sk, skb, mss_now)))
+                       if (unlikely(tcp_fragment(sk, skb, mss_now, mss_now)))
                                return;
                }
 
@@ -1388,7 +1388,7 @@ int tcp_retransmit_skb(struct sock *sk, 
                int old_factor = tcp_skb_pcount(skb);
                int new_factor;
 
-               if (tcp_fragment(sk, skb, cur_mss))
+               if (tcp_fragment(sk, skb, cur_mss, cur_mss))
                        return -ENOMEM; /* We'll try again later. */
 
                /* New SKB created, account for it. */
@@ -1991,7 +1991,7 @@ int tcp_write_wakeup(struct sock *sk)
                            skb->len > mss) {
                                seg_size = min(seg_size, mss);
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-                               if (tcp_fragment(sk, skb, seg_size))
+                               if (tcp_fragment(sk, skb, seg_size, mss))
                                        return -1;
                                /* SWS override triggered forced fragmentation.
                                 * Disable TSO, the connection is too sick. */
@@ -2000,7 +2000,7 @@ int tcp_write_wakeup(struct sock *sk)
                                        sk->sk_route_caps &= ~NETIF_F_TSO;
                                }
                        } else if (!tcp_skb_pcount(skb))
-                               tcp_set_skb_tso_segs(sk, skb);
+                               tcp_set_skb_tso_segs(sk, skb, mss);
 
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 2.6.13-rc4 - kernel panic - BUG at net/ipv4/tcp_output.c:918

Reply via email to