hi,
i've always found it unfortunate that sfq includes the destination port in
the hash because so-called "download accelerators" which issue a bunch of
simultaneous http range-requests with non-overlapping ranges end up
getting an unfair share of the bandwidth. so i've modified sfq with an
option to ignore the ports in the hash function.
i know this has a negative effect on people with shared NAT for example,
but i feel the probability of unfair treatment of shared NAT users is
lower than the probability of unfair bandwidth allocation to download
accelerators on pipes of lower bandwidth.
i also prefer all outbound traffic for a particular destination to be in
the same bucket...
i've been using a patch much like this one on pipes <10mbit for a few
years now and it works great.
this patch adds "noports" and "nosrcip" flags to tc and sch_sfq.
as part of testing these changes i noticed the hash function in sfq is
weak. it's not weak enough to be noticed when the destination port is
part of the hash, which is probably why it hasn't been noticed yet.
there's a few problems:
- the src/dst ip addresses are used in network endian form, but the hash
output uses only the host-endian low 20 bits. given the assumption that
destination IP addresses vary in the (network endian) low bits this
means a little endian host ends up using the least varying bits of the
destination IP address. this is especially noticable for hosts on the
same network.
- there's a comment about a rotation in the source but the code isn't
a rotation :) it uses 0x1f - pert and shoudl be 0x20 - pert to get
a rotation.
- the perturbation affects bits from the source address only, and so two
destinations which collide will still collide under another
perturbation. this is really obvious when the destination ports are
removed from the hash.
- even when the perturbation works it tends to cause collisions between
the same hosts 1/32nd of the time, about 3%.
rather than invent a new hash i've switched to using jhash_3words.
this has the advantage of using all bits of the inputs, and lets us use
a full 32-bit perturbation... which hopefully reduces the incidence of
collision even further.
the only disadvantage i can see to using jhash as opposed to an improvised
hash is the cycle cost. note that iptables hashlimit module uses jhash,
so i think it should be ok to use jhash in sfq as well... but if people
are worried about the perf impact i have an improvised hash idea using
only 4 shifts which would probably be sufficient.
given that jhash uses all bits of the inputs i didn't see any reason to
insert ntohl... so this patch produces different results on big/little
endian hosts. i doubt it's an issue.
note in the patch i had to grow struct tc_sfq_qopt to include a flags
field. in the kernel patch i test the size of the incoming structure in
order to handle older versions of tc which didn't include a flags field.
one other thing i considered rather than "noports" and "nosrcip" was
a single "dstonly" option... let me know if you'd prefer that.
let me know what you think... i'd like to get something like this patch
included upstream so i can eliminate a patch from several of my kernels.
thanks
-dean
p.s. sorry for the attachments, but there's two packages being patched
here...
Signed-off-by: dean gaudet <[EMAIL PROTECTED]>
diff -pru -X linux-2.6.14.4/Documentation/dontdiff
linux-2.6.14.4.orig/include/linux/pkt_sched.h
linux-2.6.14.4/include/linux/pkt_sched.h
--- linux-2.6.14.4.orig/include/linux/pkt_sched.h 2005-10-27
17:02:08.000000000 -0700
+++ linux-2.6.14.4/include/linux/pkt_sched.h 2005-12-17 17:21:52.000000000
-0800
@@ -136,6 +136,9 @@ struct tc_sfq_qopt
__u32 limit; /* Maximal packets in queue */
unsigned divisor; /* Hash divisor */
unsigned flows; /* Maximal number of flows */
+ unsigned char flags;
+#define TC_SFQ_NOPORTS (1) /* ignore src/dst ports in hash */
+#define TC_SFQ_NOSRCIP (2) /* ignore src ip address in hash */
};
/*
diff -pru -X linux-2.6.14.4/Documentation/dontdiff
linux-2.6.14.4.orig/net/sched/sch_sfq.c linux-2.6.14.4/net/sched/sch_sfq.c
--- linux-2.6.14.4.orig/net/sched/sch_sfq.c 2005-10-27 17:02:08.000000000
-0700
+++ linux-2.6.14.4/net/sched/sch_sfq.c 2005-12-17 17:23:49.000000000 -0800
@@ -36,6 +36,7 @@
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
+#include <linux/jhash.h>
/* Stochastic Fairness Queuing algorithm.
@@ -106,10 +107,11 @@ struct sfq_sched_data
int perturb_period;
unsigned quantum; /* Allotment per round: MUST BE >= MTU
*/
int limit;
+ unsigned flags;
/* Variables */
struct timer_list perturb_timer;
- int perturbation;
+ u32 perturbation;
sfq_index tail; /* Index of current slot in round */
sfq_index max_depth; /* Maximal depth */
@@ -121,49 +123,47 @@ struct sfq_sched_data
struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots,
indexed by depth */
};
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32
h1)
-{
- int pert = q->perturbation;
-
- /* Have we any rotation primitives? If not, WHY? */
- h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
- h ^= h>>10;
- return h & 0x3FF;
-}
-
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
- u32 h, h2;
+ u32 h1, h2, h3;
+ h3 = 0;
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
{
struct iphdr *iph = skb->nh.iph;
- h = iph->daddr;
- h2 = iph->saddr^iph->protocol;
- if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
+ h1 = iph->daddr;
+ h2 = (q->flags & TC_SFQ_NOSRCIP)
+ ? 0 : (iph->saddr^iph->protocol);
+ if (!(q->flags & TC_SFQ_NOPORTS) &&
+ !(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
(iph->protocol == IPPROTO_TCP ||
iph->protocol == IPPROTO_UDP ||
iph->protocol == IPPROTO_ESP))
- h2 ^= *(((u32*)iph) + iph->ihl);
+ h3 ^= *(((u32*)iph) + iph->ihl);
break;
}
case __constant_htons(ETH_P_IPV6):
{
struct ipv6hdr *iph = skb->nh.ipv6h;
- h = iph->daddr.s6_addr32[3];
- h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
- if (iph->nexthdr == IPPROTO_TCP ||
- iph->nexthdr == IPPROTO_UDP ||
- iph->nexthdr == IPPROTO_ESP)
- h2 ^= *(u32*)&iph[1];
+ h1 = iph->daddr.s6_addr32[3];
+ h2 = (q->flags & TC_SFQ_NOSRCIP)
+ ? 0 : (iph->saddr.s6_addr32[3]^iph->nexthdr);
+ if (!(q->flags & TC_SFQ_NOPORTS) &&
+ (iph->nexthdr == IPPROTO_TCP ||
+ iph->nexthdr == IPPROTO_UDP ||
+ iph->nexthdr == IPPROTO_ESP))
+ h3 ^= *(u32*)&iph[1];
break;
}
default:
- h = (u32)(unsigned long)skb->dst^skb->protocol;
- h2 = (u32)(unsigned long)skb->sk;
+ h1 = (u32)(unsigned long)skb->dst;
+ h2 = (q->flags & TC_SFQ_NOSRCIP)
+ ? 0 : (u32)(unsigned long)skb->sk;
+ h3 = skb->protocol;
}
- return sfq_fold_hash(q, h, h2);
+ return jhash_3words(h1, h2, h3, q->perturbation)
+ & (SFQ_HASH_DIVISOR-1);
}
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
@@ -373,7 +373,7 @@ static void sfq_perturbation(unsigned lo
struct Qdisc *sch = (struct Qdisc*)arg;
struct sfq_sched_data *q = qdisc_priv(sch);
- q->perturbation = net_random()&0x1F;
+ q->perturbation = net_random();
if (q->perturb_period) {
q->perturb_timer.expires = jiffies + q->perturb_period;
@@ -386,7 +386,8 @@ static int sfq_change(struct Qdisc *sch,
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = RTA_DATA(opt);
- if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+ /* support older tc_sfq_qopt without a flags field */
+ if (opt->rta_len < RTA_LENGTH(offsetof(struct tc_sfq_qopt, flags)))
return -EINVAL;
sch_tree_lock(sch);
@@ -403,6 +404,14 @@ static int sfq_change(struct Qdisc *sch,
q->perturb_timer.expires = jiffies + q->perturb_period;
add_timer(&q->perturb_timer);
}
+
+ /* must check if flags field is present */
+ if (opt->rta_len >= RTA_LENGTH(sizeof(struct tc_sfq_qopt))) {
+ q->flags = ctl->flags;
+ }
+ else {
+ q->flags = 0;
+ }
sch_tree_unlock(sch);
return 0;
}
@@ -457,6 +466,7 @@ static int sfq_dump(struct Qdisc *sch, s
opt.limit = q->limit;
opt.divisor = SFQ_HASH_DIVISOR;
opt.flows = q->limit;
+ opt.flags = q->flags;
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/include/linux/pkt_sched.h
iproute2-2.6.14-051107/include/linux/pkt_sched.h
--- iproute2-2.6.14-051107.orig/include/linux/pkt_sched.h 2005-08-08
13:24:41.000000000 -0700
+++ iproute2-2.6.14-051107/include/linux/pkt_sched.h 2005-12-17
17:33:23.000000000 -0800
@@ -136,6 +136,9 @@ struct tc_sfq_qopt
__u32 limit; /* Maximal packets in queue */
unsigned divisor; /* Hash divisor */
unsigned flows; /* Maximal number of flows */
+ unsigned flags;
+#define TC_SFQ_NOPORTS (1) /* ignore src/dst ports in hash */
+#define TC_SFQ_NOSRCIP (2) /* ignore src ip address in hash */
};
/*
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/man/man8/tc-sfq.8
iproute2-2.6.14-051107/man/man8/tc-sfq.8
--- iproute2-2.6.14-051107.orig/man/man8/tc-sfq.8 2004-06-08
13:34:17.000000000 -0700
+++ iproute2-2.6.14-051107/man/man8/tc-sfq.8 2005-12-17 17:45:26.000000000
-0800
@@ -6,6 +6,11 @@ sfq \- Stochastic Fairness Queueing
seconds
.B quantum
bytes
+.B [ limit
+packets
+.B ]
+.B [ noports ]
+.B [ nosrcip ]
.SH DESCRIPTION
@@ -22,21 +27,13 @@ This may in fact have some effect in mit
SFQ is work-conserving and therefore always delivers a packet if it has one
available.
.SH ALGORITHM
-On enqueueing, each packet is assigned to a hash bucket, based on
-.TP
-(i)
-Source address
-.TP
-(ii)
-Destination address
-.TP
-(iii)
-Source port
-.P
-If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and
ESP.
-Packets with other protocols are hashed based on the 32bits representation of
their
-destination and the socket they belong to. A flow corresponds mostly to a
TCP/IP
-connection.
+On enqueueing, each packet is assigned to a hash bucket, based on a function of
+source address, source port, destination address, and destination port.
+
+SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP. Packets with
+other protocols are hashed based on the 32bits representation of their
+destination and the socket they belong to. A flow corresponds mostly to
+a TCP/IP connection.
Each of these buckets should represent a unique flow. Because multiple flows
may
get hashed to the same bucket, the hashing algorithm is perturbed at
configurable
@@ -59,6 +56,25 @@ reordering. Advised value: 10
quantum
Amount of bytes a flow is allowed to dequeue during a round of the round robin
process.
Defaults to the MTU of the interface which is also the advised value and the
minimum value.
+.TP
+limit
+Number of packets permitted in the SFQ at one time. Must be at least 1, and at
+most 128.
+.TP
+noports
+If specified then the source/destination ports are not included in the hash
+function. This will force all flows for a particular source/destination
address
+pair to be placed in the same bucket. It may be useful for defeating so-called
+"download accelerators". These "accelerators" use HTTP range-requests in order
+to issue multiple simultaneous requests to a webserver in an attempt to get
+an unfair share of available bandwidth. Note that this option does unfairly
+penalize hosts sharing an IP address, such as hosts behind a shared NAT.
+.TP
+nosrcip
+If specified then the source address is not considered in the hash function.
+This is useful for when you wish all outbound traffic for a particular
+destination to be considered without consideration for which source address
+generated it.
.SH EXAMPLE & USAGE
diff -pru -x '*.o' iproute2-2.6.14-051107.orig/tc/q_sfq.c
iproute2-2.6.14-051107/tc/q_sfq.c
--- iproute2-2.6.14-051107.orig/tc/q_sfq.c 2004-09-28 11:35:49.000000000
-0700
+++ iproute2-2.6.14-051107/tc/q_sfq.c 2005-12-17 17:26:52.000000000 -0800
@@ -25,7 +25,7 @@
static void explain(void)
{
- fprintf(stderr, "Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [
quantum BYTES ]\n");
+ fprintf(stderr, "Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [
quantum BYTES ] [ noports ] [ nosrcip ]\n");
}
#define usage() return(-1)
@@ -63,6 +63,12 @@ static int sfq_parse_opt(struct qdisc_ut
return -1;
}
ok++;
+ } else if (strcmp(*argv, "noports") == 0) {
+ opt.flags |= TC_SFQ_NOPORTS;
+ ok++;
+ } else if (strcmp(*argv, "nosrcip") == 0) {
+ opt.flags |= TC_SFQ_NOSRCIP;
+ ok++;
} else if (strcmp(*argv, "help") == 0) {
explain();
return -1;
@@ -97,6 +103,10 @@ static int sfq_print_opt(struct qdisc_ut
}
if (qopt->perturb_period)
fprintf(f, "perturb %dsec ", qopt->perturb_period);
+ if (qopt->flags & TC_SFQ_NOPORTS)
+ fprintf(f, "noports ");
+ if (qopt->flags & TC_SFQ_NOSRCIP)
+ fprintf(f, "nosrcip ");
return 0;
}