Hi,

> > > @@ -924,7 +926,9 @@ cbq_dequeue_prio(struct Qdisc *sch, int 
> > >                           cl->xstats.borrows += skb->len;
> > >  #endif
> > >                   }
> > > -                 q->tx_len = skb->len;
> > > +                 q->tx_segs = skb_shinfo(skb)->gso_segs ? :
> > > +                   skb_shinfo(skb)->gso_size ? 
> > > skb->len/skb_shinfo(skb)->gso_size + 1 : 1;
> > > +                 q->tx_len = (skb->len - 1)/q->tx_segs + 1;
> > 
> > This isn't safe for Xen (and potentially other virtualisation
> > environments) since qdisc code runs before dev_hard_start_xmit
> > which is where we verify the sanity of gso_segs.  So you could
> > be using some arbitrary value from an untrusted source.
> > 
> > If you really want to use it, you should test for SKB_GSO_DODGY
> > on the packet which will be set if gso_segs can't be trusted.
> 
> Yep, you have a point that some sanity check should be added.
> I think a simple check would be enough not to crash CBQ
> as the accurate checking will be done in dev_hard_start_xmit or
> device drivers.

I updated the patch so that a temporary index is used to calculate
the transmission time if the index derived from gso_size exceeds
the size of the R_tab->data table; see the definition of L2T().

It is intended just to avoid causing any trouble in CBQ
with a broken gso_size, which guests on a Xen hypervisor or other
virtualisation environments could possibly set.

I couldn't come up with a better idea than this. What do you think of it?


Thanks,
Hirokazu Takahashi.


--- linux-2.6.21/net/sched/sch_cbq.c.ORG        2007-05-14 20:53:06.000000000 
+0900
+++ linux-2.6.21/net/sched/sch_cbq.c    2007-05-21 21:07:48.000000000 +0900
@@ -176,6 +176,7 @@ struct cbq_sched_data
        struct cbq_class        *tx_class;
        struct cbq_class        *tx_borrowed;
        int                     tx_len;
+       unsigned int            tx_segs;
        psched_time_t           now;            /* Cached timestamp */
        psched_time_t           now_rt;         /* Cached real time */
        unsigned                pmask;
@@ -191,7 +192,15 @@ struct cbq_sched_data
 };
 
 
-#define L2T(cl,len)    ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
+inline psched_tdiff_t
+L2T(struct cbq_class *cl, int len) {
+       int nent = sizeof(cl->R_tab->data)/sizeof(cl->R_tab->data[0]);
+       int index =  len >> cl->R_tab->rate.cell_log;
+       if (index < nent)
+               return cl->R_tab->data[index];
+       else
+               return cl->R_tab->data[nent - 1] * (index/nent + 1);
+}
 
 
 static __inline__ unsigned cbq_hash(u32 h)
@@ -753,6 +762,7 @@ cbq_update(struct cbq_sched_data *q)
        struct cbq_class *this = q->tx_class;
        struct cbq_class *cl = this;
        int len = q->tx_len;
+       unsigned int segs = q->tx_segs;
 
        q->tx_class = NULL;
 
@@ -761,7 +771,7 @@ cbq_update(struct cbq_sched_data *q)
                long idle;
 
                cl->bstats.packets++;
-               cl->bstats.bytes += len;
+               cl->bstats.bytes += len*segs;
 
                /*
                   (now - last) is total time between packet right edges.
@@ -774,7 +784,7 @@ cbq_update(struct cbq_sched_data *q)
                if ((unsigned long)idle > 128*1024*1024) {
                        avgidle = cl->maxidle;
                } else {
-                       idle -= L2T(cl, len);
+                       idle -= L2T(cl, len) * segs;
 
                /* true_avgidle := (1-W)*true_avgidle + W*idle,
                   where W=2^{-ewma_log}. But cl->avgidle is scaled:
@@ -811,8 +821,8 @@ cbq_update(struct cbq_sched_data *q)
                           to the moment of cbq_update)
                         */
 
-                       idle -= L2T(&q->link, len);
-                       idle += L2T(cl, len);
+                       idle -= L2T(&q->link, len) * segs;
+                       idle += L2T(cl, len) * segs;
 
                        PSCHED_AUDIT_TDIFF(idle);
 
@@ -924,7 +934,9 @@ cbq_dequeue_prio(struct Qdisc *sch, int 
                                cl->xstats.borrows += skb->len;
 #endif
                        }
-                       q->tx_len = skb->len;
+                       q->tx_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs ? :
+                               skb->len/skb_shinfo(skb)->gso_size + 1 : 1;
+                       q->tx_len = (skb->len - 1)/q->tx_segs + 1;
 
                        if (cl->deficit <= 0) {
                                q->active[prio] = cl;
@@ -1013,7 +1025,7 @@ cbq_dequeue(struct Qdisc *sch)
 
                   cbq_time = max(real_time, work);
                 */
-               incr2 = L2T(&q->link, q->tx_len);
+               incr2 = L2T(&q->link, q->tx_len) * q->tx_segs;
                PSCHED_TADD(q->now, incr2);
                cbq_update(q);
                if ((incr -= incr2) < 0)
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to