Complete rewrite for update_scoreboard and mark_head_lost. Couple
of hints became unnecessary because of this change, NewReno code
was greatly simplified. I changed !TCPCB_TAGBITS check from the
original to a !(S|L) check but it shouldn't make a difference,
and if there ever is a R only skb TCP will mark it as LOST too
that is a correct behavior (IMHO). The algorithm uses some ideas
presented by David Miller and Baruch Even.

Seqno lookups require fast lookups that are provided using
RB-tree patch (which is nowadays included into the latest
net-2.6.22).

Only compile testing after v1, I'll probably try v3 on Friday
as I should have spare time in the testnet work then.

v2-to-v3 changes:
 - Replace of skipping made worries in it obsolete and now really
   one skb less is traversed (was promised in v2)

v1-to-v2 changes:
 - Changes non-fack SACK to something less broken; not a complete
   set of changes (other functions should also be fixed)
 - Fixed NewReno bugs
 - More comments & added RFCs to places I'm unsure of
 - Delayed reord_count increment (traverses now one skb less)
 - Copied retransmit_hint clearing from the original but I
   think it's superfluous
 - Separated highest_sack to own patch
 - Fixed off-by-one bug in skipping

Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]>
---
 include/linux/tcp.h  |    4 -
 include/net/tcp.h    |   11 ++-
 net/ipv4/tcp_input.c |  213 +++++++++++++++++++++++++++++++++++---------------
 3 files changed, 156 insertions(+), 72 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 807fc96..9e706bd 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -323,15 +323,11 @@ #endif
        u32     highest_sack;   /* Start seq of globally highest revd SACK 
(valid only in slowpath) */
 
        /* from STCP, retrans queue hinting */
-       struct sk_buff* lost_skb_hint;
-
-       struct sk_buff *scoreboard_skb_hint;
        struct sk_buff *retransmit_skb_hint;
        struct sk_buff *forward_skb_hint;
        struct sk_buff *fastpath_skb_hint;
 
        int     fastpath_cnt_hint;
-       int     lost_cnt_hint;
        int     retransmit_cnt_hint;
        int     forward_cnt_hint;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2599d98..73ae872 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1047,8 +1047,6 @@ static inline void tcp_mib_init(void)
 
 /*from STCP */
 static inline void clear_all_retrans_hints(struct tcp_sock *tp){
-       tp->lost_skb_hint = NULL;
-       tp->scoreboard_skb_hint = NULL;
        tp->retransmit_skb_hint = NULL;
        tp->forward_skb_hint = NULL;
        tp->fastpath_skb_hint = NULL;
@@ -1194,6 +1192,11 @@ static inline struct sk_buff *tcp_write_
        return skb->next;
 }
 
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct 
sk_buff *skb)
+{
+       return skb->prev;
+}
+
 #define tcp_for_write_queue(skb, sk)                                   \
                for (skb = (sk)->sk_write_queue.next;                   \
                     (skb != (struct sk_buff *)&(sk)->sk_write_queue);  \
@@ -1203,6 +1206,10 @@ #define tcp_for_write_queue_from(skb, sk
                for (; (skb != (struct sk_buff *)&(sk)->sk_write_queue);\
                     skb = skb->next)
 
+#define tcp_for_write_queue_backwards_from(skb, sk)                    \
+               for (; (skb != (struct sk_buff *)&(sk)->sk_write_queue);\
+                    skb = skb->prev)
+
 static inline struct sk_buff *tcp_send_head(struct sock *sk)
 {
        return sk->sk_send_head;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d738f8e..3d7bd08 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1730,98 +1730,179 @@ static inline void tcp_reset_reno_sack(s
        tp->left_out = tp->lost_out;
 }
 
-/* Mark head of queue up as lost. */
-static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
-                              int packets, u32 high_seq)
+/* RFC: This is from the original, I doubt that this is necessary at all:
+ * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
+ * retransmitted past LOST markings in the first place? I'm not fully sure
+ * about undo and end of connection cases, which can cause R without L?
+ */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
+                                      struct sk_buff *skb)
 {
-       struct sk_buff *skb;
-       int cnt;
+       if (tp->retransmit_skb_hint &&
+           before(TCP_SKB_CB(skb)->seq,
+           TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+               tp->retransmit_skb_hint = skb;
+}
 
-       BUG_TRAP(packets <= tp->packets_out);
-       if (tp->lost_skb_hint) {
-               skb = tp->lost_skb_hint;
-               cnt = tp->lost_cnt_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
-               cnt = 0;
-       }
+/* Forward walk starting from until a not timedout skb is encountered, timeout
+ * everything on the way
+ */
+static void tcp_timedout_mark_forward(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
 
        tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-               /* TODO: do this better */
-               /* this is not the most efficient way to do this... */
-               tp->lost_skb_hint = skb;
-               tp->lost_cnt_hint = cnt;
-               cnt += tcp_skb_pcount(skb);
-               if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+               if (skb == tcp_send_head(sk) || !tcp_skb_timedout(sk, skb))
                        break;
-               if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+               /*
+                * Could be lost already from a previous timedout check? I
+                * think R skbs should never be encountered here.
+                */
+               if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
-
-                       /* clear xmit_retransmit_queue hints
-                        *  if this is beyond hint */
-                       if(tp->retransmit_skb_hint != NULL &&
-                          before(TCP_SKB_CB(skb)->seq,
-                                 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
-
-                               tp->retransmit_skb_hint = NULL;
-                       }
+                       tcp_verify_retransmit_hint(tp, skb);
                }
        }
-       tcp_sync_left_out(tp);
 }
 
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+/* Algorithm details
+ *
+ *  I)  walk backwards until the first previously LOST marked skb is found (or
+ *      snd_una is encountered) starting from highest_sack->seq
+ *       1)  Skip skbs until reord_count > tp->reordering, then continue from
+ *           min(skb_now->seq, tp->high_seq)
+ *       2a) Mark non-SACKed skbs LOST
+ *       2b) If TCP can timeout non-SACKed skb safely, mark it LOST
+ *       3)  If LOST was not set for a non-SACKed skb, it's possible an
+ *           entrypoint for the step II (store it as such).
+ *
+ *  II) call forward walk that marks timedout skbs as LOST starting from an
+ *      entrypoint provided by the backwards walk, that is, min(LOST_edge,
+ *      tp->high_seq)
+ *
+ * Implements both FACK and RFC3517 SACK. Only FACK does per skb timeouts.
+ * Key difference is that FACK uses both SACK blocks and holes, while RFC3517
+ * is using only SACK blocks to mark (TODO: RFC3517 SACK requires a changes to
+ * elsewhere too).
+ */
+static void tcp_update_scoreboard_fack(struct sock *sk)
 {
-       if (IsFack(tp)) {
-               int lost = tp->fackets_out - tp->reordering;
-               if (lost <= 0)
-                       lost = 1;
-               tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
-       } else {
-               tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
-       }
-
-       /* New heuristics: it is possible only after we switched
-        * to restart timer each time when something is ACKed.
-        * Hence, we can detect timed out packets during fast
-        * retransmit without falling to slow start.
-        */
-       if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
-               struct sk_buff *skb;
+       struct tcp_sock *tp = tcp_sk(sk);
+       int reord_count = 0;
+       int ts_linear = 0;
+       struct sk_buff *skb, *reentry_skb = NULL;
 
-               skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
-                       : tcp_write_queue_head(sk);
+       skb = tcp_write_queue_find(sk, tp->highest_sack);
+       reentry_skb = tcp_write_queue_next(sk, skb);
 
-               tcp_for_write_queue_from(skb, sk) {
-                       if (skb == tcp_send_head(sk))
-                               break;
+       tcp_for_write_queue_backwards_from(skb, sk) {
+               /*
+                * BUG-to-solve: Sadly enough, still we might not know here
+                * what timedout thingie must do something (not necessarily
+                * encountered the highest SACKED_RETRANS).
+                * RFC: there is a contradiction between the original code and
+                * DaveM's interpretation of the skb_timedout loop which would
+                * be nice to resolve first before fixing this. What should be
+                * do with the discontinuity of timestamps at SACKED_RETRANS
+                * edge when LOST is set for it, should timedout of a later
+                * skb be postponed due to it?
+                */
+               if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+                       break;
+               /*
+                * Could have S|R skb, or thus checking this here.
+                * BUG: EVER_RETRANS edge somewhere when !tp->retrans
+                * (significance is minor, requires undo or so to occur).
+                */
+               if (IsFack(tp) && !ts_linear &&
+                   (!tp->retrans_out ||
+                    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))) {
+                       ts_linear = 1;
+                       /* At SACKED_RETRANS edge, nothing to do past it? */
                        if (!tcp_skb_timedout(sk, skb))
-                               break;
+                               reentry_skb = NULL;
+               }
 
-                       if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+               if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+                       if (((reord_count >= tp->reordering) ||
+                            (ts_linear && tcp_skb_timedout(sk, skb)))) {
+                               /*
+                                * RFC: Might have to handle skb fragmentation
+                                * here if the pcount > 1? Original does not
+                                * handle it, btw.
+                                */
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tp->lost_out += tcp_skb_pcount(skb);
+                               tcp_verify_retransmit_hint(tp, skb);
 
-                               /* clear xmit_retrans hint */
-                               if (tp->retransmit_skb_hint &&
-                                   before(TCP_SKB_CB(skb)->seq,
-                                          
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-
-                                       tp->retransmit_skb_hint = NULL;
+                       } else if (reentry_skb != NULL) {
+                               /*
+                                * ...wasn't marked, thus a possible entry
+                                * point (possibly last skb before
+                                * TCPCB_LOST edge comes).
+                                */
+                               reentry_skb = skb;
                        }
                }
 
-               tp->scoreboard_skb_hint = skb;
+               /* For RFC3517, do increment only if SACKED_ACKED is set */
+               if (IsFack(tp) || (TCP_SKB_CB(skb)->sacked & 
TCPCB_SACKED_ACKED))
+                       reord_count += tcp_skb_pcount(skb);
+               /*
+                * Skip to high_seq if enough SACK blocks gathered (and no
+                * timedout work has begun)
+                */
+               if (!ts_linear && (reord_count >= tp->reordering) &&
+                   after(TCP_SKB_CB(skb)->seq, tp->high_seq)) {
+                       /*
+                        * reord_count will be out-of-sync after this but
+                        * that's an non-issue because it is definately
+                        * larger than tp->reordering still!
+                        * Search sends us too far, the loop will fix that
+                        */
+                       skb = tcp_write_queue_find(sk, tp->high_seq);
+                       if (reentry_skb != NULL)
+                               reentry_skb = skb;
+               }
+       }
+       /*
+        * RFC: Might have to handle the lost <= 0 condition here if nothing
+        * got marked in the loop
+        */
+       
+       /* Continue with timedout work */
+       if (IsFack(tp) && (reentry_skb != NULL))
+               tcp_timedout_mark_forward(sk, reentry_skb);
+       
+       tcp_sync_left_out(tp);
+}
 
+/* Simple NewReno thing: Mark head LOST if it wasn't yet and it's below
+ * high_seq, stop. That's all.
+ */
+static void tcp_update_scoreboard_newreno(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb = tcp_write_queue_head(sk);
+       
+       if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST) &&
+           before(tp->snd_una, tp->high_seq)) {
+               TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+               tp->lost_out += tcp_skb_pcount(skb);
                tcp_sync_left_out(tp);
        }
 }
 
+/* Account newly detected lost packet(s) */
+static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+{
+       if (!IsReno(tp))
+               tcp_update_scoreboard_fack(sk);
+       else
+               tcp_update_scoreboard_newreno(sk);
+}
+
 /* CWND moderation, preventing bursts due to too big ACKs
  * in dubious situations.
  */
@@ -2115,7 +2196,7 @@ tcp_fastretrans_alert(struct sock *sk, u
            before(tp->snd_una, tp->high_seq) &&
            icsk->icsk_ca_state != TCP_CA_Open &&
            tp->fackets_out > tp->reordering) {
-               tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, 
tp->high_seq);
+               tcp_update_scoreboard_fack(sk);
                NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
        }
 
-- 
1.4.2

Reply via email to