hi,

i've always found it unfortunate that sfq includes the destination port in 
the hash because so-called "download accelerators" which issue a bunch of 
simultaneous http range-requests with non-overlapping ranges end up 
getting an unfair share of the bandwidth.  so i've modified sfq with an 
option to ignore the ports in the hash function.

i know this has a negative effect on people with shared NAT for example, 
but i feel the probability of unfair treatment of shared NAT users is 
lower than the probability of unfair bandwidth allocation to download 
accelerators on pipes of lower bandwidth.

i also prefer all outbound traffic for a particular destination to be in 
the same bucket...

i've been using a patch much like this one on pipes <10mbit for a few
years now and it works great.

this patch adds "noports" and "nosrcip" flags to tc and sch_sfq.

as part of testing these changes i noticed the hash function in sfq is 
weak.  it's not weak enough to be noticed when the destination port is 
part of the hash, which is probably why it hasn't been noticed yet.
there's a few problems:

- the src/dst ip addresses are used in network endian form, but the hash
  output uses only the host-endian low 20 bits.  given the assumption that
  destination IP addresses vary in the (network endian) low bits this
  means a little endian host ends up using the least varying bits of the
  destination IP address.  this is especially noticable for hosts on the
  same network.

- there's a comment about a rotation in the source but the code isn't
  a rotation :)  it uses 0x1f - pert and shoudl be 0x20 - pert to get
  a rotation.

- the perturbation affects bits from the source address only, and so two
  destinations which collide will still collide under another
  perturbation.  this is really obvious when the destination ports are
  removed from the hash.

- even when the perturbation works it tends to cause collisions between
  the same hosts 1/32nd of the time, about 3%.

rather than invent a new hash i've switched to using jhash_3words.
this has the advantage of using all bits of the inputs, and lets us use
a full 32-bit perturbation... which hopefully reduces the incidence of
collision even further.

the only disadvantage i can see to using jhash as opposed to an improvised
hash is the cycle cost.  note that iptables hashlimit module uses jhash,
so i think it should be ok to use jhash in sfq as well...  but if people
are worried about the perf impact i have an improvised hash idea using
only 4 shifts which would probably be sufficient.

given that jhash uses all bits of the inputs i didn't see any reason to
insert ntohl... so this patch produces different results on big/little
endian hosts.  i doubt it's an issue.

note in the patch i had to grow struct tc_sfq_qopt to include a flags
field.  in the kernel patch i test the size of the incoming structure in
order to handle older versions of tc which didn't include a flags field.

one other thing i considered rather than "noports" and "nosrcip" was
a single "dstonly" option... let me know if you'd prefer that.

let me know what you think... i'd like to get something like this patch
included upstream so i can eliminate a patch from several of my kernels.

thanks
-dean

p.s. sorry for the attachments, but there's two packages being patched
here...

Signed-off-by: dean gaudet <[EMAIL PROTECTED]>
diff -pru -X linux-2.6.14.4/Documentation/dontdiff 
linux-2.6.14.4.orig/include/linux/pkt_sched.h 
linux-2.6.14.4/include/linux/pkt_sched.h
--- linux-2.6.14.4.orig/include/linux/pkt_sched.h       2005-10-27 
17:02:08.000000000 -0700
+++ linux-2.6.14.4/include/linux/pkt_sched.h    2005-12-17 17:21:52.000000000 
-0800
@@ -136,6 +136,9 @@ struct tc_sfq_qopt
        __u32           limit;          /* Maximal packets in queue */
        unsigned        divisor;        /* Hash divisor  */
        unsigned        flows;          /* Maximal number of flows  */
+       unsigned char   flags;
+#define TC_SFQ_NOPORTS (1)             /* ignore src/dst ports in hash */
+#define TC_SFQ_NOSRCIP (2)             /* ignore src ip address in hash */
 };
 
 /*
diff -pru -X linux-2.6.14.4/Documentation/dontdiff 
linux-2.6.14.4.orig/net/sched/sch_sfq.c linux-2.6.14.4/net/sched/sch_sfq.c
--- linux-2.6.14.4.orig/net/sched/sch_sfq.c     2005-10-27 17:02:08.000000000 
-0700
+++ linux-2.6.14.4/net/sched/sch_sfq.c  2005-12-17 17:23:49.000000000 -0800
@@ -36,6 +36,7 @@
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
+#include <linux/jhash.h>
 
 
 /*     Stochastic Fairness Queuing algorithm.
@@ -106,10 +107,11 @@ struct sfq_sched_data
        int             perturb_period;
        unsigned        quantum;        /* Allotment per round: MUST BE >= MTU 
*/
        int             limit;
+       unsigned        flags;
 
 /* Variables */
        struct timer_list perturb_timer;
-       int             perturbation;
+       u32             perturbation;
        sfq_index       tail;           /* Index of current slot in round */
        sfq_index       max_depth;      /* Maximal depth */
 
@@ -121,49 +123,47 @@ struct sfq_sched_data
        struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, 
indexed by depth */
 };
 
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 
h1)
-{
-       int pert = q->perturbation;
-
-       /* Have we any rotation primitives? If not, WHY? */
-       h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
-       h ^= h>>10;
-       return h & 0x3FF;
-}
-
 static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 {
-       u32 h, h2;
+       u32 h1, h2, h3;
 
+       h3 = 0;
        switch (skb->protocol) {
        case __constant_htons(ETH_P_IP):
        {
                struct iphdr *iph = skb->nh.iph;
-               h = iph->daddr;
-               h2 = iph->saddr^iph->protocol;
-               if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
+               h1 = iph->daddr;
+               h2 = (q->flags & TC_SFQ_NOSRCIP)
+                       ? 0 : (iph->saddr^iph->protocol);
+               if (!(q->flags & TC_SFQ_NOPORTS) &&
+                   !(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
                    (iph->protocol == IPPROTO_TCP ||
                     iph->protocol == IPPROTO_UDP ||
                     iph->protocol == IPPROTO_ESP))
-                       h2 ^= *(((u32*)iph) + iph->ihl);
+                       h3 ^= *(((u32*)iph) + iph->ihl);
                break;
        }
        case __constant_htons(ETH_P_IPV6):
        {
                struct ipv6hdr *iph = skb->nh.ipv6h;
-               h = iph->daddr.s6_addr32[3];
-               h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
-               if (iph->nexthdr == IPPROTO_TCP ||
-                   iph->nexthdr == IPPROTO_UDP ||
-                   iph->nexthdr == IPPROTO_ESP)
-                       h2 ^= *(u32*)&iph[1];
+               h1 = iph->daddr.s6_addr32[3];
+               h2 = (q->flags & TC_SFQ_NOSRCIP)
+                       ? 0 : (iph->saddr.s6_addr32[3]^iph->nexthdr);
+               if (!(q->flags & TC_SFQ_NOPORTS) &&
+                   (iph->nexthdr == IPPROTO_TCP ||
+                    iph->nexthdr == IPPROTO_UDP ||
+                    iph->nexthdr == IPPROTO_ESP))
+                       h3 ^= *(u32*)&iph[1];
                break;
        }
        default:
-               h = (u32)(unsigned long)skb->dst^skb->protocol;
-               h2 = (u32)(unsigned long)skb->sk;
+               h1 = (u32)(unsigned long)skb->dst;
+               h2 = (q->flags & TC_SFQ_NOSRCIP)
+                       ? 0 : (u32)(unsigned long)skb->sk;
+               h3 = skb->protocol;
        }
-       return sfq_fold_hash(q, h, h2);
+       return jhash_3words(h1, h2, h3, q->perturbation)
+               & (SFQ_HASH_DIVISOR-1);
 }
 
 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
@@ -373,7 +373,7 @@ static void sfq_perturbation(unsigned lo
        struct Qdisc *sch = (struct Qdisc*)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);
 
-       q->perturbation = net_random()&0x1F;
+       q->perturbation = net_random();
 
        if (q->perturb_period) {
                q->perturb_timer.expires = jiffies + q->perturb_period;
@@ -386,7 +386,8 @@ static int sfq_change(struct Qdisc *sch,
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = RTA_DATA(opt);
 
-       if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+       /* support older tc_sfq_qopt without a flags field */
+       if (opt->rta_len < RTA_LENGTH(offsetof(struct tc_sfq_qopt, flags)))
                return -EINVAL;
 
        sch_tree_lock(sch);
@@ -403,6 +404,14 @@ static int sfq_change(struct Qdisc *sch,
                q->perturb_timer.expires = jiffies + q->perturb_period;
                add_timer(&q->perturb_timer);
        }
+
+       /* must check if flags field is present */
+       if (opt->rta_len >= RTA_LENGTH(sizeof(struct tc_sfq_qopt))) {
+               q->flags = ctl->flags;
+       }
+       else {
+               q->flags = 0;
+       }
        sch_tree_unlock(sch);
        return 0;
 }
@@ -457,6 +466,7 @@ static int sfq_dump(struct Qdisc *sch, s
        opt.limit = q->limit;
        opt.divisor = SFQ_HASH_DIVISOR;
        opt.flows = q->limit;
+       opt.flags = q->flags;
 
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/include/linux/pkt_sched.h 
iproute2-2.6.14-051107/include/linux/pkt_sched.h
--- iproute2-2.6.14-051107.orig/include/linux/pkt_sched.h       2005-08-08 
13:24:41.000000000 -0700
+++ iproute2-2.6.14-051107/include/linux/pkt_sched.h    2005-12-17 
17:33:23.000000000 -0800
@@ -136,6 +136,9 @@ struct tc_sfq_qopt
        __u32           limit;          /* Maximal packets in queue */
        unsigned        divisor;        /* Hash divisor  */
        unsigned        flows;          /* Maximal number of flows  */
+       unsigned        flags;
+#define TC_SFQ_NOPORTS (1)             /* ignore src/dst ports in hash */
+#define TC_SFQ_NOSRCIP (2)             /* ignore src ip address in hash */
 };
 
 /*
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/man/man8/tc-sfq.8 
iproute2-2.6.14-051107/man/man8/tc-sfq.8
--- iproute2-2.6.14-051107.orig/man/man8/tc-sfq.8       2004-06-08 
13:34:17.000000000 -0700
+++ iproute2-2.6.14-051107/man/man8/tc-sfq.8    2005-12-17 17:45:26.000000000 
-0800
@@ -6,6 +6,11 @@ sfq \- Stochastic Fairness Queueing
 seconds
 .B quantum
 bytes
+.B [ limit
+packets
+.B ]
+.B [ noports ]
+.B [ nosrcip ]
 
 .SH DESCRIPTION
 
@@ -22,21 +27,13 @@ This may in fact have some effect in mit
 
 SFQ is work-conserving and therefore always delivers a packet if it has one 
available.
 .SH ALGORITHM
-On enqueueing, each packet is assigned to a hash bucket, based on
-.TP
-(i)
-Source address
-.TP
-(ii)
-Destination address
-.TP
-(iii)
-Source port
-.P
-If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and 
ESP. 
-Packets with other protocols are hashed based on the 32bits representation of 
their 
-destination and the socket they belong to. A flow corresponds mostly to a 
TCP/IP 
-connection.
+On enqueueing, each packet is assigned to a hash bucket, based on a function of
+source address, source port, destination address, and destination port.
+
+SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP.  Packets with
+other protocols are hashed based on the 32bits representation of their
+destination and the socket they belong to. A flow corresponds mostly to
+a TCP/IP connection.
 
 Each of these buckets should represent a unique flow. Because multiple flows 
may
 get hashed to the same bucket, the hashing algorithm is perturbed at 
configurable 
@@ -59,6 +56,25 @@ reordering. Advised value: 10
 quantum
 Amount of bytes a flow is allowed to dequeue during a round of the round robin 
process.
 Defaults to the MTU of the interface which is also the advised value and the 
minimum value.
+.TP
+limit
+Number of packets permitted in the SFQ at one time.  Must be at least 1, and at
+most 128.
+.TP
+noports
+If specified then the source/destination ports are not included in the hash
+function.  This will force all flows for a particular source/destination 
address
+pair to be placed in the same bucket.  It may be useful for defeating so-called
+"download accelerators".  These "accelerators" use HTTP range-requests in order
+to issue multiple simultaneous requests to a webserver in an attempt to get
+an unfair share of available bandwidth.  Note that this option does unfairly
+penalize hosts sharing an IP address, such as hosts behind a shared NAT.
+.TP
+nosrcip
+If specified then the source address is not considered in the hash function.
+This is useful for when you wish all outbound traffic for a particular
+destination to be considered without consideration for which source address
+generated it.
 
 .SH EXAMPLE & USAGE
 
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/tc/q_sfq.c 
iproute2-2.6.14-051107/tc/q_sfq.c
--- iproute2-2.6.14-051107.orig/tc/q_sfq.c      2004-09-28 11:35:49.000000000 
-0700
+++ iproute2-2.6.14-051107/tc/q_sfq.c   2005-12-17 17:26:52.000000000 -0800
@@ -25,7 +25,7 @@
 
 static void explain(void)
 {
-       fprintf(stderr, "Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [ 
quantum BYTES ]\n");
+       fprintf(stderr, "Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [ 
quantum BYTES ] [ noports ] [ nosrcip ]\n");
 }
 
 #define usage() return(-1)
@@ -63,6 +63,12 @@ static int sfq_parse_opt(struct qdisc_ut
                                return -1;
                        }
                        ok++;
+               } else if (strcmp(*argv, "noports") == 0) {
+                       opt.flags |= TC_SFQ_NOPORTS;
+                       ok++;
+               } else if (strcmp(*argv, "nosrcip") == 0) {
+                       opt.flags |= TC_SFQ_NOSRCIP;
+                       ok++;
                } else if (strcmp(*argv, "help") == 0) {
                        explain();
                        return -1;
@@ -97,6 +103,10 @@ static int sfq_print_opt(struct qdisc_ut
        }
        if (qopt->perturb_period)
                fprintf(f, "perturb %dsec ", qopt->perturb_period);
+       if (qopt->flags & TC_SFQ_NOPORTS)
+               fprintf(f, "noports ");
+       if (qopt->flags & TC_SFQ_NOSRCIP)
+               fprintf(f, "nosrcip ");
        return 0;
 }
 

Reply via email to