Hi, Eric, On 30.03.2018 08:22, Eric Dumazet wrote: > Some applications still rely on IP fragmentation, and to be fair linux > reassembly unit is not working under any serious load. > > It uses static hash tables of 1024 buckets, and up to 128 items per bucket > (!!!) > > A work queue is supposed to garbage collect items when host is under memory > pressure, and doing a hash rebuild, changing seed used in hash computations. > > This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, > occurring every 5 seconds if host is under fire. > > Then there is the problem of sharing this hash table for all netns. > > It is time to switch to rhashtables, and allocate one of them per netns > to speedup netns dismantle, since this is a critical metric these days. > > Lookup is now using RCU. A followup patch will even remove > the refcount hold/release left from prior implementation and save > a couple of atomic operations. > > Before this patch, 16 cpus (16 RX queue NIC) could not handle more > than 1 Mpps frags DDOS. > > After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB > of storage for the fragments.
Great results! Please, see some comments below. > $ grep FRAG /proc/net/sockstat > FRAG: inuse 1966916 memory 2140004608 > > A followup patch will change the limits for 64bit arches. > > Signed-off-by: Eric Dumazet <eduma...@google.com> > Cc: Florian Westphal <f...@strlen.de> > Cc: Nikolay Aleksandrov <niko...@redhat.com> > Cc: Jesper Dangaard Brouer <bro...@redhat.com> > Cc: Alexander Aring <alex.ar...@gmail.com> > Cc: Stefan Schmidt <ste...@osg.samsung.com> > --- > Documentation/networking/ip-sysctl.txt | 7 +- > include/net/inet_frag.h | 99 +++--- > include/net/ipv6.h | 20 +- > net/ieee802154/6lowpan/6lowpan_i.h | 26 +- > net/ieee802154/6lowpan/reassembly.c | 108 +++---- > net/ipv4/inet_fragment.c | 399 +++++------------------- > net/ipv4/ip_fragment.c | 165 +++++----- > net/ipv6/netfilter/nf_conntrack_reasm.c | 62 ++-- > net/ipv6/reassembly.c | 152 +++++---- > 9 files changed, 344 insertions(+), 694 deletions(-) > > diff --git a/Documentation/networking/ip-sysctl.txt > b/Documentation/networking/ip-sysctl.txt > index > 1d1120753ae82d0aee3e934a3d9c074b70dcbca6..c3b65f24e58aa72b720861d816fb76f9956800f0 > 100644 > --- a/Documentation/networking/ip-sysctl.txt > +++ b/Documentation/networking/ip-sysctl.txt > @@ -134,13 +134,10 @@ min_adv_mss - INTEGER > IP Fragmentation: > > ipfrag_high_thresh - INTEGER > - Maximum memory used to reassemble IP fragments. When > - ipfrag_high_thresh bytes of memory is allocated for this purpose, > - the fragment handler will toss packets until ipfrag_low_thresh > - is reached. This also serves as a maximum limit to namespaces > - different from the initial one. > + Maximum memory used to reassemble IP fragments. > > ipfrag_low_thresh - INTEGER > + (Obsolete since linux-4.17) > Maximum memory used to reassemble IP fragments before the kernel > begins to remove incomplete fragment queues to free up resources. > The kernel still accepts new fragments for defragmentation. 
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h > index > 69e531ed81894393e07cac9e953825fcb55ef42a..05099f9f980e2384c0c8cd7e74659656b585cd22 > 100644 > --- a/include/net/inet_frag.h > +++ b/include/net/inet_frag.h > @@ -2,15 +2,20 @@ > #ifndef __NET_FRAG_H__ > #define __NET_FRAG_H__ > > +#include <linux/rhashtable.h> > + > struct netns_frags { > - /* Keep atomic mem on separate cachelines in structs that include it */ > - atomic_t mem ____cacheline_aligned_in_smp; > /* sysctls */ > int timeout; > int high_thresh; > int low_thresh; > int max_dist; > struct inet_frags *f; > + > + /* Keep atomic mem on separate cachelines in structs that include it */ > + atomic_t mem ____cacheline_aligned_in_smp; The patch is big, and it seems it's possible to extract refactorings like this in separate patch/patches. Here just two lines moved down. > + > + struct rhashtable rhashtable ____cacheline_aligned_in_smp; > }; > > /** > @@ -26,12 +31,31 @@ enum { > INET_FRAG_COMPLETE = BIT(2), > }; > > +struct frag_v4_compare_key { > + __be32 saddr; > + __be32 daddr; > + u32 user; > + u32 vif; > + __be16 id; > + u16 protocol; > +}; > + > +struct frag_v6_compare_key { > + struct in6_addr saddr; > + struct in6_addr daddr; > + u32 user; > + __be32 id; > + u32 iif; > +}; > + > /** > * struct inet_frag_queue - fragment queue > * > - * @lock: spinlock protecting the queue > + * @node: rhash node > + * @key: keys identifying this frag. > * @timer: queue expiration timer > - * @list: hash bucket list > + * @net: namespace that this frag belongs to > + * @lock: spinlock protecting this frag > * @refcnt: reference count of the queue > * @fragments: received fragments head > * @fragments_tail: received fragments tail > @@ -40,66 +64,38 @@ enum { > * @meat: length of received fragments so far > * @flags: fragment queue flags > * @max_size: maximum received fragment size > - * @net: namespace that this frag belongs to > - * @list_evictor: list of queues to forcefully evict (e.g. 
due to low memory) > + * @rcu: rcu head for freeing deferall > */ > struct inet_frag_queue { > - spinlock_t lock; > + struct rhash_head node; > + union { > + struct frag_v4_compare_key v4; > + struct frag_v6_compare_key v6; > + } key; > struct timer_list timer; > - struct hlist_node list; > + struct netns_frags *net; > + spinlock_t lock; Here lock and net just change their position in struct { }. > refcount_t refcnt; > struct sk_buff *fragments; > struct sk_buff *fragments_tail; > ktime_t stamp; > int len; > int meat; > - __u8 flags; > + u8 flags; Here just type is changed. > u16 max_size; > - struct netns_frags *net; > - struct hlist_node list_evictor; > -}; > - > -#define INETFRAGS_HASHSZ 1024 > - > -/* averaged: > - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / > - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or > - * struct frag_queue)) > - */ > -#define INETFRAGS_MAXDEPTH 128 > - > -struct inet_frag_bucket { > - struct hlist_head chain; > - spinlock_t chain_lock; > + struct rcu_head rcu; > }; > > struct inet_frags { > - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; > - > - struct work_struct frags_work; > - unsigned int next_bucket; > - unsigned long last_rebuild_jiffies; > - bool rebuild; > - > - /* The first call to hashfn is responsible to initialize > - * rnd. This is best done with net_get_random_once. > - * > - * rnd_seqlock is used to let hash insertion detect > - * when it needs to re-lookup the hash chain to use. > - */ > - u32 rnd; > - seqlock_t rnd_seqlock; > unsigned int qsize; > > - unsigned int (*hashfn)(const struct inet_frag_queue *); > - bool (*match)(const struct inet_frag_queue *q, > - const void *arg); > - void (*constructor)(struct inet_frag_queue *q, > + void (*constructor)(struct inet_frag_queue *fq, Here just parameter name is changed > const void *arg); > - void (*destructor)(struct inet_frag_queue *); > + void (*destructor)(struct inet_frag_queue *fq); The same as above. 
> void (*frag_expire)(struct timer_list *t); > struct kmem_cache *frags_cachep; > const char *frags_cache_name; > + struct rhashtable_params rhash_params; > }; > > int inet_frags_init(struct inet_frags *); > @@ -108,17 +104,13 @@ void inet_frags_fini(struct inet_frags *); > static inline int inet_frags_init_net(struct netns_frags *nf) > { > atomic_set(&nf->mem, 0); > - return 0; > + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); > } > void inet_frags_exit_net(struct netns_frags *nf); > > void inet_frag_kill(struct inet_frag_queue *q); > void inet_frag_destroy(struct inet_frag_queue *q); > -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, > - struct inet_frags *f, void *key, unsigned int hash); > - > -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, > - const char *prefix); > +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); > > static inline void inet_frag_put(struct inet_frag_queue *q) > { > @@ -126,11 +118,6 @@ static inline void inet_frag_put(struct inet_frag_queue > *q) > inet_frag_destroy(q); > } > > -static inline bool inet_frag_evicting(struct inet_frag_queue *q) > -{ > - return !hlist_unhashed(&q->list_evictor); > -} > - > /* Memory Tracking Functions. 
*/ > > static inline int frag_mem_limit(struct netns_frags *nf) > diff --git a/include/net/ipv6.h b/include/net/ipv6.h > index > 5c18836672e9d1c560cdce15f5b34928c337abfd..76f84d4be91b92761fb9a26e7f52e2101ee34c0a > 100644 > --- a/include/net/ipv6.h > +++ b/include/net/ipv6.h > @@ -579,36 +579,20 @@ enum ip6_defrag_users { > __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + > USHRT_MAX, > }; > > -struct ip6_create_arg { > - __be32 id; > - u32 user; > - const struct in6_addr *src; > - const struct in6_addr *dst; > - int iif; > - u8 ecn; > -}; > - > void ip6_frag_init(struct inet_frag_queue *q, const void *a); > -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); > > /* > - * Equivalent of ipv4 struct ip > + * Equivalent of ipv4 struct ipq > */ > struct frag_queue { > struct inet_frag_queue q; > > - __be32 id; /* fragment id */ > - u32 user; > - struct in6_addr saddr; > - struct in6_addr daddr; > - > int iif; > __u16 nhoffset; > u8 ecn; > }; > > -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, > - struct inet_frags *frags); > +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); > > static inline bool ipv6_addr_any(const struct in6_addr *a) > { > diff --git a/net/ieee802154/6lowpan/6lowpan_i.h > b/net/ieee802154/6lowpan/6lowpan_i.h > index > d8de3bcfb1032a1133402cb2a4c50a2448133846..b8d95cb71c25dd69c8a88b2c886a3f0d2ce1174f > 100644 > --- a/net/ieee802154/6lowpan/6lowpan_i.h > +++ b/net/ieee802154/6lowpan/6lowpan_i.h > @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; > #define LOWPAN_DISPATCH_FRAG1 0xc0 > #define LOWPAN_DISPATCH_FRAGN 0xe0 > > -struct lowpan_create_arg { > +struct frag_lowpan_compare_key { > u16 tag; > u16 d_size; > - const struct ieee802154_addr *src; > - const struct ieee802154_addr *dst; > + const struct ieee802154_addr src; > + const struct ieee802154_addr dst; > }; > > -/* Equivalent of ipv4 struct ip > +/* Equivalent of ipv4 struct ipq > */ > struct 
lowpan_frag_queue { > struct inet_frag_queue q; > - > - u16 tag; > - u16 d_size; > - struct ieee802154_addr saddr; > - struct ieee802154_addr daddr; > }; > > -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) > -{ > - switch (a->mode) { > - case IEEE802154_ADDR_LONG: > - return (((__force u64)a->extended_addr) >> 32) ^ > - (((__force u64)a->extended_addr) & 0xffffffff); > - case IEEE802154_ADDR_SHORT: > - return (__force u32)(a->short_addr + (a->pan_id << 16)); > - default: > - return 0; > - } > -} > - > int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); > void lowpan_net_frag_exit(void); > int lowpan_net_frag_init(void); > diff --git a/net/ieee802154/6lowpan/reassembly.c > b/net/ieee802154/6lowpan/reassembly.c > index > 6badc055555b7baedac2051a1aaea15f9e9b180c..9ee4d22666c26d6d9796d0f484bb4beb265dea42 > 100644 > --- a/net/ieee802154/6lowpan/reassembly.c > +++ b/net/ieee802154/6lowpan/reassembly.c > @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; > static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, > struct sk_buff *prev, struct net_device *ldev); > > -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, > - const struct ieee802154_addr *saddr, > - const struct ieee802154_addr *daddr) > -{ > - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); > - return jhash_3words(ieee802154_addr_hash(saddr), > - ieee802154_addr_hash(daddr), > - (__force u32)(tag + (d_size << 16)), > - lowpan_frags.rnd); > -} > - > -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) > -{ > - const struct lowpan_frag_queue *fq; > - > - fq = container_of(q, struct lowpan_frag_queue, q); > - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); > -} > - > -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) > -{ > - const struct lowpan_frag_queue *fq; > - const struct lowpan_create_arg *arg = a; > - > - fq = container_of(q, struct lowpan_frag_queue, q); > - return 
fq->tag == arg->tag && fq->d_size == arg->d_size && > - ieee802154_addr_equal(&fq->saddr, arg->src) && > - ieee802154_addr_equal(&fq->daddr, arg->dst); > -} > - > static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) > { > - const struct lowpan_create_arg *arg = a; > + const struct frag_lowpan_compare_key *key = a; > struct lowpan_frag_queue *fq; > > fq = container_of(q, struct lowpan_frag_queue, q); > > - fq->tag = arg->tag; > - fq->d_size = arg->d_size; > - fq->saddr = *arg->src; > - fq->daddr = *arg->dst; > + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); > + memcpy(&q->key, key, sizeof(*key)); > } > > static void lowpan_frag_expire(struct timer_list *t) > @@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb > *cb, > const struct ieee802154_addr *src, > const struct ieee802154_addr *dst) > { > - struct inet_frag_queue *q; > - struct lowpan_create_arg arg; > - unsigned int hash; > struct netns_ieee802154_lowpan *ieee802154_lowpan = > net_ieee802154_lowpan(net); > + struct frag_lowpan_compare_key key = { > + .tag = cb->d_tag, > + .d_size = cb->d_size, > + .src = *src, > + .dst = *dst, > + }; > + struct inet_frag_queue *q; > > - arg.tag = cb->d_tag; > - arg.d_size = cb->d_size; > - arg.src = src; > - arg.dst = dst; > - > - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); > - > - q = inet_frag_find(&ieee802154_lowpan->frags, > - &lowpan_frags, &arg, hash); > - if (IS_ERR_OR_NULL(q)) { > - inet_frag_maybe_warn_overflow(q, pr_fmt()); > + q = inet_frag_find(&ieee802154_lowpan->frags, &key); > + if (IS_ERR_OR_NULL(q)) > return NULL; > - } > + > return container_of(q, struct lowpan_frag_queue, q); > } > > @@ -588,6 +551,7 @@ static int __net_init lowpan_frags_init_net(struct net > *net) > ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; > ieee802154_lowpan->frags.f = &lowpan_frags; > > + ieee802154_lowpan->frags.f = &lowpan_frags; > res = inet_frags_init_net(&ieee802154_lowpan->frags); > if (res < 0) > return res; > @@ 
-611,6 +575,36 @@ static struct pernet_operations lowpan_frags_ops = { > .exit = lowpan_frags_exit_net, > }; > > +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) > +{ > + return jhash2(data, > + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), > seed); > +} > + > +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) > +{ > + const struct inet_frag_queue *fq = data; > + > + return jhash2((const u32 *)&fq->key, > + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), > seed); > +} > + > +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void > *ptr) > +{ > + const struct frag_lowpan_compare_key *key = arg->key; > + const struct inet_frag_queue *fq = ptr; > + > + return !!memcmp(&fq->key, key, sizeof(*key)); > +} > + > +const struct rhashtable_params lowpan_rhash_params = { > + .head_offset = offsetof(struct inet_frag_queue, node), > + .hashfn = lowpan_key_hashfn, > + .obj_hashfn = lowpan_obj_hashfn, > + .obj_cmpfn = lowpan_obj_cmpfn, > + .automatic_shrinking = true, > +}; > + > int __init lowpan_net_frag_init(void) > { > int ret; > @@ -619,22 +613,24 @@ int __init lowpan_net_frag_init(void) > if (ret) > return ret; > > - ret = register_pernet_subsys(&lowpan_frags_ops); > - if (ret) > - goto err_pernet; > - > - lowpan_frags.hashfn = lowpan_hashfn; > lowpan_frags.constructor = lowpan_frag_init; > lowpan_frags.destructor = NULL; > lowpan_frags.qsize = sizeof(struct frag_queue); > - lowpan_frags.match = lowpan_frag_match; > lowpan_frags.frag_expire = lowpan_frag_expire; > lowpan_frags.frags_cache_name = lowpan_frags_cache_name; > + lowpan_frags.rhash_params = lowpan_rhash_params; > ret = inet_frags_init(&lowpan_frags); > if (ret) > goto err_pernet; > > + ret = register_pernet_subsys(&lowpan_frags_ops); > + if (ret) > + goto err_pernet_frags; > + Can't we move this register_pernet_subsys() in separate patch? 
> return ret; > + > +err_pernet_frags: > + inet_frags_fini(&lowpan_frags); > err_pernet: > lowpan_frags_sysctl_unregister(); > return ret; > diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c > index > 1ac69f65d0dee600d0ab4db20ff5942952932c40..8ccaf605630f14270996ee1b5a37376299d78661 > 100644 > --- a/net/ipv4/inet_fragment.c > +++ b/net/ipv4/inet_fragment.c > @@ -25,12 +25,6 @@ > #include <net/inet_frag.h> > #include <net/inet_ecn.h> > > -#define INETFRAGS_EVICT_BUCKETS 128 > -#define INETFRAGS_EVICT_MAX 512 > - > -/* don't rebuild inetfrag table with new secret more often than this */ > -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) > - > /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements > * Value : 0xff if frame should be dropped. > * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field > @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { > }; > EXPORT_SYMBOL(ip_frag_ecn_table); > > -static unsigned int > -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) > -{ > - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); > -} > - > -static bool inet_frag_may_rebuild(struct inet_frags *f) > -{ > - return time_after(jiffies, > - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); > -} > - > -static void inet_frag_secret_rebuild(struct inet_frags *f) > -{ > - int i; > - > - write_seqlock_bh(&f->rnd_seqlock); > - > - if (!inet_frag_may_rebuild(f)) > - goto out; > - > - get_random_bytes(&f->rnd, sizeof(u32)); > - > - for (i = 0; i < INETFRAGS_HASHSZ; i++) { > - struct inet_frag_bucket *hb; > - struct inet_frag_queue *q; > - struct hlist_node *n; > - > - hb = &f->hash[i]; > - spin_lock(&hb->chain_lock); > - > - hlist_for_each_entry_safe(q, n, &hb->chain, list) { > - unsigned int hval = inet_frag_hashfn(f, q); > - > - if (hval != i) { > - struct inet_frag_bucket *hb_dest; > - > - hlist_del(&q->list); > - > - /* Relink to new hash chain. 
*/ > - hb_dest = &f->hash[hval]; > - > - /* This is the only place where we take > - * another chain_lock while already holding > - * one. As this will not run concurrently, > - * we cannot deadlock on hb_dest lock below, if > its > - * already locked it will be released soon since > - * other caller cannot be waiting for hb lock > - * that we've taken above. > - */ > - spin_lock_nested(&hb_dest->chain_lock, > - SINGLE_DEPTH_NESTING); > - hlist_add_head(&q->list, &hb_dest->chain); > - spin_unlock(&hb_dest->chain_lock); > - } > - } > - spin_unlock(&hb->chain_lock); > - } > - > - f->rebuild = false; > - f->last_rebuild_jiffies = jiffies; > -out: > - write_sequnlock_bh(&f->rnd_seqlock); > -} > - > -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) > -{ > - if (!hlist_unhashed(&q->list_evictor)) > - return false; > - > - return q->net->low_thresh == 0 || > - frag_mem_limit(q->net) >= q->net->low_thresh; > -} > - > -static unsigned int > -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) > -{ > - struct inet_frag_queue *fq; > - struct hlist_node *n; > - unsigned int evicted = 0; > - HLIST_HEAD(expired); > - > - spin_lock(&hb->chain_lock); > - > - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { > - if (!inet_fragq_should_evict(fq)) > - continue; > - > - if (!del_timer(&fq->timer)) > - continue; > - > - hlist_add_head(&fq->list_evictor, &expired); > - ++evicted; > - } > - > - spin_unlock(&hb->chain_lock); > - > - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) > - f->frag_expire(&fq->timer); > - > - return evicted; > -} > - > -static void inet_frag_worker(struct work_struct *work) > -{ > - unsigned int budget = INETFRAGS_EVICT_BUCKETS; > - unsigned int i, evicted = 0; > - struct inet_frags *f; > - > - f = container_of(work, struct inet_frags, frags_work); > - > - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); > - > - local_bh_disable(); > - > - for (i = READ_ONCE(f->next_bucket); budget; --budget) { > - 
evicted += inet_evict_bucket(f, &f->hash[i]); > - i = (i + 1) & (INETFRAGS_HASHSZ - 1); > - if (evicted > INETFRAGS_EVICT_MAX) > - break; > - } > - > - f->next_bucket = i; > - > - local_bh_enable(); > - > - if (f->rebuild && inet_frag_may_rebuild(f)) > - inet_frag_secret_rebuild(f); > -} > - > -static void inet_frag_schedule_worker(struct inet_frags *f) > -{ > - if (unlikely(!work_pending(&f->frags_work))) > - schedule_work(&f->frags_work); > -} > - > int inet_frags_init(struct inet_frags *f) > { > - int i; > - > - INIT_WORK(&f->frags_work, inet_frag_worker); > - > - for (i = 0; i < INETFRAGS_HASHSZ; i++) { > - struct inet_frag_bucket *hb = &f->hash[i]; > - > - spin_lock_init(&hb->chain_lock); > - INIT_HLIST_HEAD(&hb->chain); > - } > - > - seqlock_init(&f->rnd_seqlock); > - f->last_rebuild_jiffies = 0; > f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, > NULL); > if (!f->frags_cachep) > @@ -214,93 +59,80 @@ EXPORT_SYMBOL(inet_frags_init); > > void inet_frags_fini(struct inet_frags *f) > { > - cancel_work_sync(&f->frags_work); > + rcu_barrier(); What does this barrier wait for? This should have a comment. 
> kmem_cache_destroy(f->frags_cachep); > + f->frags_cachep = NULL; > } > EXPORT_SYMBOL(inet_frags_fini); > > void inet_frags_exit_net(struct netns_frags *nf) > { > - struct inet_frags *f =nf->f; > - unsigned int seq; > - int i; > - > - nf->low_thresh = 0; > + struct rhashtable_iter hti; > + struct inet_frag_queue *fq; > > -evict_again: > - local_bh_disable(); > - seq = read_seqbegin(&f->rnd_seqlock); > + nf->low_thresh = 0; /* prevent creation of new frags */ > > - for (i = 0; i < INETFRAGS_HASHSZ ; i++) > - inet_evict_bucket(f, &f->hash[i]); > + rhashtable_walk_enter(&nf->rhashtable, &hti); > + do { > + rhashtable_walk_start(&hti); > > - local_bh_enable(); > - cond_resched(); > + while ((fq = rhashtable_walk_next(&hti)) && !IS_ERR(fq)) { > + if (refcount_inc_not_zero(&fq->refcnt)) { > + spin_lock_bh(&fq->lock); > + inet_frag_kill(fq); > + spin_unlock_bh(&fq->lock); > + inet_frag_put(fq); > + } > + } > > - if (read_seqretry(&f->rnd_seqlock, seq) || > - sum_frag_mem_limit(nf)) > - goto evict_again; > + rhashtable_walk_stop(&hti); > + } while (cond_resched(), fq == ERR_PTR(-EAGAIN)); > + rhashtable_walk_exit(&hti); > + rhashtable_destroy(&nf->rhashtable); > } > EXPORT_SYMBOL(inet_frags_exit_net); > > -static struct inet_frag_bucket * > -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) > -__acquires(hb->chain_lock) > -{ > - struct inet_frag_bucket *hb; > - unsigned int seq, hash; > - > - restart: > - seq = read_seqbegin(&f->rnd_seqlock); > - > - hash = inet_frag_hashfn(f, fq); > - hb = &f->hash[hash]; > - > - spin_lock(&hb->chain_lock); > - if (read_seqretry(&f->rnd_seqlock, seq)) { > - spin_unlock(&hb->chain_lock); > - goto restart; > - } > - > - return hb; > -} > - > -static inline void fq_unlink(struct inet_frag_queue *fq) > -{ > - struct inet_frag_bucket *hb; > - > - hb = get_frag_bucket_locked(fq, fq->net->f); > - hlist_del(&fq->list); > - fq->flags |= INET_FRAG_COMPLETE; > - spin_unlock(&hb->chain_lock); > -} > - > void 
inet_frag_kill(struct inet_frag_queue *fq) > { > if (del_timer(&fq->timer)) > refcount_dec(&fq->refcnt); > > if (!(fq->flags & INET_FRAG_COMPLETE)) { > - fq_unlink(fq); > + struct netns_frags *nf = fq->net; > + > + fq->flags |= INET_FRAG_COMPLETE; > + rhashtable_remove_fast(&nf->rhashtable, &fq->node, > nf->f->rhash_params); > refcount_dec(&fq->refcnt); > } > } > EXPORT_SYMBOL(inet_frag_kill); > > -void inet_frag_destroy(struct inet_frag_queue *q) > +static void inet_frag_destroy_rcu(struct rcu_head *head) > { > - struct sk_buff *fp; > + struct inet_frag_queue *fq = container_of(head, struct inet_frag_queue, > + rcu); > + struct inet_frags *f = fq->net->f; > + > + if (f->destructor) > + f->destructor(fq); > + kmem_cache_free(f->frags_cachep, fq); > +} > + > +void inet_frag_destroy(struct inet_frag_queue *fq) > +{ > + unsigned int sum_truesize; > struct netns_frags *nf; > - unsigned int sum, sum_truesize = 0; > struct inet_frags *f; > + struct sk_buff *fp; > > - WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); > - WARN_ON(del_timer(&q->timer) != 0); > + WARN_ON(!(fq->flags & INET_FRAG_COMPLETE)); > + WARN_ON(del_timer(&fq->timer) != 0); This is actually a result of renaming the variable. The type of variable remains the same, while the name has changed. So, this should go in separate patch. > > /* Release all fragment data. 
*/ > - fp = q->fragments; > - nf = q->net; > + fp = fq->fragments; > + nf = fq->net; The same here > f = nf->f; > + sum_truesize = f->qsize; > while (fp) { > struct sk_buff *xp = fp->next; > > @@ -308,136 +140,63 @@ void inet_frag_destroy(struct inet_frag_queue *q) > kfree_skb(fp); > fp = xp; > } > - sum = sum_truesize + f->qsize; > > - if (f->destructor) > - f->destructor(q); > - kmem_cache_free(f->frags_cachep, q); > + call_rcu(&fq->rcu, inet_frag_destroy_rcu); > > - sub_frag_mem_limit(nf, sum); > + sub_frag_mem_limit(nf, sum_truesize); > } > EXPORT_SYMBOL(inet_frag_destroy); > > -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, > - struct inet_frag_queue *qp_in, > - struct inet_frags *f, > +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, > void *arg) > { > - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); > - struct inet_frag_queue *qp; > - > -#ifdef CONFIG_SMP > - /* With SMP race we have to recheck hash table, because > - * such entry could have been created on other cpu before > - * we acquired hash bucket lock. 
> - */ > - hlist_for_each_entry(qp, &hb->chain, list) { > - if (qp->net == nf && f->match(qp, arg)) { > - refcount_inc(&qp->refcnt); > - spin_unlock(&hb->chain_lock); > - qp_in->flags |= INET_FRAG_COMPLETE; > - inet_frag_put(qp_in); > - return qp; > - } > - } > -#endif > - qp = qp_in; > - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) > - refcount_inc(&qp->refcnt); > - > - refcount_inc(&qp->refcnt); > - hlist_add_head(&qp->list, &hb->chain); > - > - spin_unlock(&hb->chain_lock); > - > - return qp; > -} > - > -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, > - struct inet_frags *f, > - void *arg) > -{ > - struct inet_frag_queue *q; > + struct inet_frags *f = nf->f; > + struct inet_frag_queue *fq; > + int err; > > - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { > - inet_frag_schedule_worker(f); > + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) > return NULL; > - } > > - q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); > - if (!q) > + fq = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); > + if (!fq) > return NULL; Here we also renamed a variable and merged two functions: inet_frag_alloc() into inet_frag_create(). Can't we do that in a separate patch? Also, note that git (at least mine) generates a better diff for this hunk with the following options in .gitconfig: [diff "default"] algorithm = patience [diff] algorithm = patience > > - q->net = nf; > - f->constructor(q, arg); > - add_frag_mem_limit(nf, f->qsize); > - > - timer_setup(&q->timer, f->frag_expire, 0); > - spin_lock_init(&q->lock); > - refcount_set(&q->refcnt, 1); > - > - return q; > -} > + fq->net = nf; > + f->constructor(fq, arg); Also renaming. 
> > -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, > - struct inet_frags *f, > - void *arg) > -{ > - struct inet_frag_queue *q; > + timer_setup(&fq->timer, f->frag_expire, 0); > + spin_lock_init(&fq->lock); > + refcount_set(&fq->refcnt, 3); > + mod_timer(&fq->timer, jiffies + nf->timeout); > > - q = inet_frag_alloc(nf, f, arg); > - if (!q) > + err = rhashtable_insert_fast(&nf->rhashtable, &fq->node, > + f->rhash_params); > + add_frag_mem_limit(nf, f->qsize); > + if (err < 0) { > + fq->flags |= INET_FRAG_COMPLETE; > + inet_frag_kill(fq); > + inet_frag_destroy(fq); > return NULL; > - > - return inet_frag_intern(nf, q, f, arg); > + } > + return fq; > } > > -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, > - struct inet_frags *f, void *key, > - unsigned int hash) > +/* TODO : call from rcu_read_lock() and no longer use > refcount_inc_not_zero() */ > +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) > { > - struct inet_frag_bucket *hb; > - struct inet_frag_queue *q; > - int depth = 0; > - > - if (frag_mem_limit(nf) > nf->low_thresh) > - inet_frag_schedule_worker(f); > - > - hash &= (INETFRAGS_HASHSZ - 1); > - hb = &f->hash[hash]; > - > - spin_lock(&hb->chain_lock); > - hlist_for_each_entry(q, &hb->chain, list) { > - if (q->net == nf && f->match(q, key)) { > - refcount_inc(&q->refcnt); > - spin_unlock(&hb->chain_lock); > - return q; > - } > - depth++; > - } > - spin_unlock(&hb->chain_lock); > + struct inet_frag_queue *fq; > > - if (depth <= INETFRAGS_MAXDEPTH) > - return inet_frag_create(nf, f, key); > + rcu_read_lock(); > > - if (inet_frag_may_rebuild(f)) { > - if (!f->rebuild) > - f->rebuild = true; > - inet_frag_schedule_worker(f); > + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); > + if (fq) { > + if (!refcount_inc_not_zero(&fq->refcnt)) > + fq = NULL; > + rcu_read_unlock(); > + return fq; > } > + rcu_read_unlock(); > > - return ERR_PTR(-ENOBUFS); > + return inet_frag_create(nf, 
key); > } > EXPORT_SYMBOL(inet_frag_find); > - > -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, > - const char *prefix) > -{ > - static const char msg[] = "inet_frag_find: Fragment hash bucket" > - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) > - ". Dropping fragment.\n"; > - > - if (PTR_ERR(q) == -ENOBUFS) > - net_dbg_ratelimited("%s%s", prefix, msg); > -} > -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); > diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c > index > cd2b4c9419fc1552d367b572926e314b11cb6c00..1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0 > 100644 > --- a/net/ipv4/ip_fragment.c > +++ b/net/ipv4/ip_fragment.c > @@ -69,15 +69,9 @@ struct ipfrag_skb_cb > struct ipq { > struct inet_frag_queue q; > > - u32 user; > - __be32 saddr; > - __be32 daddr; > - __be16 id; > - u8 protocol; > u8 ecn; /* RFC3168 support */ > u16 max_df_size; /* largest frag with DF set seen */ > int iif; > - int vif; /* L3 master device index */ > unsigned int rid; > struct inet_peer *peer; > }; > @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) > static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, > struct net_device *dev); > > -struct ip4_create_arg { > - struct iphdr *iph; > - u32 user; > - int vif; > -}; > - > -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) > -{ > - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); > - return jhash_3words((__force u32)id << 16 | prot, > - (__force u32)saddr, (__force u32)daddr, > - ip4_frags.rnd); > -} > - > -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) > -{ > - const struct ipq *ipq; > - > - ipq = container_of(q, struct ipq, q); > - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); > -} > - > -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) > -{ > - const struct ipq *qp; > - const struct ip4_create_arg *arg = a; > - > - qp = container_of(q, struct ipq, q); > - return qp->id == arg->iph->id && 
> - qp->saddr == arg->iph->saddr && > - qp->daddr == arg->iph->daddr && > - qp->protocol == arg->iph->protocol && > - qp->user == arg->user && > - qp->vif == arg->vif; > -} > > static void ip4_frag_init(struct inet_frag_queue *q, const void *a) > { > @@ -140,37 +99,23 @@ static void ip4_frag_init(struct inet_frag_queue *q, > const void *a) > frags); > struct net *net = container_of(ipv4, struct net, ipv4); > > - const struct ip4_create_arg *arg = a; > + const struct frag_v4_compare_key *key = a; > > - qp->protocol = arg->iph->protocol; > - qp->id = arg->iph->id; > - qp->ecn = ip4_frag_ecn(arg->iph->tos); > - qp->saddr = arg->iph->saddr; > - qp->daddr = arg->iph->daddr; > - qp->vif = arg->vif; > - qp->user = arg->user; > + q->key.v4 = *key; > + qp->ecn = 0; > qp->peer = q->net->max_dist ? > - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : > + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : > NULL; > } > > -static void ip4_frag_free(struct inet_frag_queue *q) > +static void ip4_frag_destructor(struct inet_frag_queue *q) This just renames the function > { > - struct ipq *qp; > + struct ipq *qp = container_of(q, struct ipq, q); > > - qp = container_of(q, struct ipq, q); This is also just a refactoring > if (qp->peer) > inet_putpeer(qp->peer); > } > > - > -/* Destruction primitives. */ > - > -static void ipq_put(struct ipq *ipq) > -{ > - inet_frag_put(&ipq->q); > -} > - > /* Kill ipq entry. It is not destroyed immediately, > * because caller (and someone more) holds reference count. 
> */ > @@ -198,25 +143,25 @@ static void ip_expire(struct timer_list *t) > struct net *net; > > qp = container_of(frag, struct ipq, q); > - net = container_of(qp->q.net, struct net, ipv4.frags); > + net = container_of(frag->net, struct net, ipv4.frags); > > rcu_read_lock(); > - spin_lock(&qp->q.lock); > + spin_lock(&frag->lock); > > - if (qp->q.flags & INET_FRAG_COMPLETE) > + if (frag->flags & INET_FRAG_COMPLETE) > goto out; > > ipq_kill(qp); > __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); > > - if (!inet_frag_evicting(&qp->q)) { > - struct sk_buff *clone, *head = qp->q.fragments; > + if (true) { This does not look good to me. It would be better to move this into a separate function. > + struct sk_buff *clone, *head = frag->fragments; > const struct iphdr *iph; > int err; > > __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); > > - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) > + if (!(frag->flags & INET_FRAG_FIRST_IN) || !frag->fragments) > goto out; > > head->dev = dev_get_by_index_rcu(net, qp->iif); > @@ -234,7 +179,7 @@ static void ip_expire(struct timer_list *t) > /* Only an end host needs to send an ICMP > * "Fragment Reassembly Timeout" message, per RFC792. > */ > - if (frag_expire_skip_icmp(qp->user) && > + if (frag_expire_skip_icmp(frag->key.v4.user) && > (skb_rtable(head)->rt_type != RTN_LOCAL)) > goto out; > > @@ -242,7 +187,7 @@ static void ip_expire(struct timer_list *t) > > /* Send an ICMP "Fragment Reassembly Timeout" message. 
*/ > if (clone) { > - spin_unlock(&qp->q.lock); > + spin_unlock(&frag->lock); > icmp_send(clone, ICMP_TIME_EXCEEDED, > ICMP_EXC_FRAGTIME, 0); > consume_skb(clone); > @@ -250,33 +195,32 @@ static void ip_expire(struct timer_list *t) > } > } > out: > - spin_unlock(&qp->q.lock); > + spin_unlock(&frag->lock); > out_rcu_unlock: > rcu_read_unlock(); > - ipq_put(qp); > + inet_frag_put(frag); > } > > /* Find the correct entry in the "incomplete datagrams" queue for > * this IP datagram, and create new one, if nothing is found. > */ > -static struct ipq *ip_find(struct net *net, struct iphdr *iph, > +static struct ipq *ip_find(struct net *net, const struct iphdr *iph, > u32 user, int vif) > { > + struct frag_v4_compare_key key = { > + .saddr = iph->saddr, > + .daddr = iph->daddr, > + .user = user, > + .vif = vif, > + .id = iph->id, > + .protocol = iph->protocol, > + }; > struct inet_frag_queue *q; > - struct ip4_create_arg arg; > - unsigned int hash; > > - arg.iph = iph; > - arg.user = user; > - arg.vif = vif; > - > - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); > - > - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); > - if (IS_ERR_OR_NULL(q)) { > - inet_frag_maybe_warn_overflow(q, pr_fmt()); > + q = inet_frag_find(&net->ipv4.frags, &key); > + if (!q) > return NULL; > - } > + > return container_of(q, struct ipq, q); > } > > @@ -310,8 +254,8 @@ static int ip_frag_too_far(struct ipq *qp) > > static int ip_frag_reinit(struct ipq *qp) > { > - struct sk_buff *fp; > unsigned int sum_truesize = 0; > + struct sk_buff *fp; This just moves the line... 
> > if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { > refcount_inc(&qp->q.refcnt); > @@ -652,11 +596,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff > *prev, > return 0; > > out_nomem: > - net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); > + net_dbg_ratelimited("queue_glue: no memory for gluing queue\n"); Since this is not a parameter type change but a complete removal of the parameter, we may do that in a refactoring patch (together with the above). > err = -ENOMEM; > goto out_fail; > out_oversize: > - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); > + net_info_ratelimited("Oversized IP packet from %pI4\n", > &qp->q.key.v4.saddr); > out_fail: > __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); > return err; > @@ -682,7 +626,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 > user) > ret = ip_frag_queue(qp, skb); > > spin_unlock(&qp->q.lock); > - ipq_put(qp); > + inet_frag_put(&qp->q); > return ret; > } > > @@ -894,17 +838,52 @@ static struct pernet_operations ip4_frags_ops = { > .exit = ipv4_frags_exit_net, > }; > > + > +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) > +{ > + return jhash2(data, > + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); > +} > + > +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) > +{ > + const struct inet_frag_queue *fq = data; > + > + return jhash2((const u32 *)&fq->key.v4, > + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); > +} > + > +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) > +{ > + const struct frag_v4_compare_key *key = arg->key; > + const struct inet_frag_queue *fq = ptr; > + > + return !!memcmp(&fq->key, key, sizeof(*key)); > +} > + > +static const struct rhashtable_params ip4_rhash_params = { > + .head_offset = offsetof(struct inet_frag_queue, node), > + .key_offset = offsetof(struct inet_frag_queue, key), > + .key_len = sizeof(struct frag_v4_compare_key), > + .hashfn = 
ip4_key_hashfn, > + .obj_hashfn = ip4_obj_hashfn, > + .obj_cmpfn = ip4_obj_cmpfn, > + .automatic_shrinking = true, > +}; > + > void __init ipfrag_init(void) > { > - ip4_frags_ctl_register(); > - register_pernet_subsys(&ip4_frags_ops); > - ip4_frags.hashfn = ip4_hashfn; > ip4_frags.constructor = ip4_frag_init; > - ip4_frags.destructor = ip4_frag_free; > + ip4_frags.destructor = ip4_frag_destructor; This just reflects the fact that we removed the function > ip4_frags.qsize = sizeof(struct ipq); > - ip4_frags.match = ip4_frag_match; > ip4_frags.frag_expire = ip_expire; > ip4_frags.frags_cache_name = ip_frag_cache_name; > + ip4_frags.rhash_params = ip4_rhash_params; > + if (inet_frags_init(&ip4_frags)) > panic("IP: failed to allocate ip4_frags cache\n"); > + > + ip4_frags_ctl_register(); > + register_pernet_subsys(&ip4_frags_ops); We may consider doing this move in a refactoring patch. > + > } > diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c > b/net/ipv6/netfilter/nf_conntrack_reasm.c > index > f69b7ca52727c814eb2887c9deb9f356c56e5442..53859311dea96c03fa5ae8456de32de25009efbe > 100644 > --- a/net/ipv6/netfilter/nf_conntrack_reasm.c > +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c > @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr > *ipv6h) > return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); > } > > -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, > - const struct in6_addr *daddr) > -{ > - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); > - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), > - (__force u32)id, nf_frags.rnd); > -} > - > - > -static unsigned int nf_hashfn(const struct inet_frag_queue *q) > -{ > - const struct frag_queue *nq; > - > - nq = container_of(q, struct frag_queue, q); > - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); > -} > - > static void nf_ct_frag6_expire(struct timer_list *t) > { > struct inet_frag_queue *frag = from_timer(frag, t, timer); > @@ -178,34 
+161,26 @@ static void nf_ct_frag6_expire(struct timer_list *t) > fq = container_of(frag, struct frag_queue, q); > net = container_of(fq->q.net, struct net, nf_frag.frags); > > - ip6_expire_frag_queue(net, fq, &nf_frags); > + ip6_expire_frag_queue(net, fq); > } > > /* Creation primitives. */ > -static inline struct frag_queue *fq_find(struct net *net, __be32 id, > - u32 user, struct in6_addr *src, > - struct in6_addr *dst, int iif, u8 ecn) > +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, > + const struct ipv6hdr *hdr, int iif) > { > + struct frag_v6_compare_key key = { > + .id = id, > + .saddr = hdr->saddr, > + .daddr = hdr->daddr, > + .user = user, > + .iif = iif, > + }; > struct inet_frag_queue *q; > - struct ip6_create_arg arg; > - unsigned int hash; > - > - arg.id = id; > - arg.user = user; > - arg.src = src; > - arg.dst = dst; > - arg.iif = iif; > - arg.ecn = ecn; > - > - local_bh_disable(); > - hash = nf_hash_frag(id, src, dst); > - > - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); > - local_bh_enable(); > - if (IS_ERR_OR_NULL(q)) { > - inet_frag_maybe_warn_overflow(q, pr_fmt()); > + > + q = inet_frag_find(&net->nf_frag.frags, &key); > + if (!q) > return NULL; > - } > + > return container_of(q, struct frag_queue, q); > } > > @@ -593,8 +568,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff > *skb, u32 user) > fhdr = (struct frag_hdr *)skb_transport_header(skb); > > skb_orphan(skb); > - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, > - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); > + fq = fq_find(net, fhdr->identification, user, hdr, > + skb->dev ? skb->dev->ifindex : 0); > if (fq == NULL) { > pr_debug("Can't find and can't create new queue\n"); > return -ENOMEM; > @@ -656,17 +631,18 @@ static struct pernet_operations nf_ct_net_ops = { > .exit = nf_ct_net_exit, > }; > > +extern const struct rhashtable_params ip6_rhash_params; Can't we do this declaration in H-file? 
> + > int nf_ct_frag6_init(void) > { > int ret = 0; > > - nf_frags.hashfn = nf_hashfn; > nf_frags.constructor = ip6_frag_init; > nf_frags.destructor = NULL; > nf_frags.qsize = sizeof(struct frag_queue); > - nf_frags.match = ip6_frag_match; > nf_frags.frag_expire = nf_ct_frag6_expire; > nf_frags.frags_cache_name = nf_frags_cache_name; > + nf_frags.rhash_params = ip6_rhash_params; > ret = inet_frags_init(&nf_frags); > if (ret) > goto out; > diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c > index > 8cfea13a179c6f048177ac91fe26c8a5565e5820..737b0921ab0c9af198fefdf06d8f4ede91c7f3f6 > 100644 > --- a/net/ipv6/reassembly.c > +++ b/net/ipv6/reassembly.c > @@ -79,59 +79,19 @@ static struct inet_frags ip6_frags; > static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, > struct net_device *dev); > > -/* > - * callers should be careful not to use the hash value outside the > ipfrag_lock > - * as doing so could race with ipfrag_hash_rnd being recalculated. > - */ > -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, > - const struct in6_addr *daddr) > -{ > - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); > - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), > - (__force u32)id, ip6_frags.rnd); > -} > - > -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) > -{ > - const struct frag_queue *fq; > - > - fq = container_of(q, struct frag_queue, q); > - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); > -} > - > -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) > -{ > - const struct frag_queue *fq; > - const struct ip6_create_arg *arg = a; > - > - fq = container_of(q, struct frag_queue, q); > - return fq->id == arg->id && > - fq->user == arg->user && > - ipv6_addr_equal(&fq->saddr, arg->src) && > - ipv6_addr_equal(&fq->daddr, arg->dst) && > - (arg->iif == fq->iif || > - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | > - IPV6_ADDR_LINKLOCAL))); > -} > 
-EXPORT_SYMBOL(ip6_frag_match); > - > void ip6_frag_init(struct inet_frag_queue *q, const void *a) > { > struct frag_queue *fq = container_of(q, struct frag_queue, q); > - const struct ip6_create_arg *arg = a; > + const struct frag_v6_compare_key *key = a; > > - fq->id = arg->id; > - fq->user = arg->user; > - fq->saddr = *arg->src; > - fq->daddr = *arg->dst; > - fq->ecn = arg->ecn; > + q->key.v6 = *key; > + fq->ecn = 0; > } > EXPORT_SYMBOL(ip6_frag_init); > > -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, > - struct inet_frags *frags) > +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) > { > - struct net_device *dev = NULL; > + struct net_device *dev; > > spin_lock(&fq->q.lock); > > @@ -146,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct > frag_queue *fq, > goto out_rcu_unlock; > > __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); > - > - if (inet_frag_evicting(&fq->q)) > - goto out_rcu_unlock; > - > __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); > > /* Don't send error if the first segment did not arrive. 
*/ > @@ -179,31 +135,29 @@ static void ip6_frag_expire(struct timer_list *t) > fq = container_of(frag, struct frag_queue, q); > net = container_of(fq->q.net, struct net, ipv6.frags); > > - ip6_expire_frag_queue(net, fq, &ip6_frags); > + ip6_expire_frag_queue(net, fq); > } > > static struct frag_queue * > -fq_find(struct net *net, __be32 id, const struct in6_addr *src, > - const struct in6_addr *dst, int iif, u8 ecn) > +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) > { > + struct frag_v6_compare_key key = { > + .id = id, > + .saddr = hdr->saddr, > + .daddr = hdr->daddr, > + .user = IP6_DEFRAG_LOCAL_DELIVER, > + .iif = iif, > + }; > struct inet_frag_queue *q; > - struct ip6_create_arg arg; > - unsigned int hash; > > - arg.id = id; > - arg.user = IP6_DEFRAG_LOCAL_DELIVER; > - arg.src = src; > - arg.dst = dst; > - arg.iif = iif; > - arg.ecn = ecn; > + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | > + IPV6_ADDR_LINKLOCAL))) > + key.iif = 0; > > - hash = inet6_hash_frag(id, src, dst); > - > - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); > - if (IS_ERR_OR_NULL(q)) { > - inet_frag_maybe_warn_overflow(q, pr_fmt()); > + q = inet_frag_find(&net->ipv6.frags, &key); > + if (!q) > return NULL; > - } > + > return container_of(q, struct frag_queue, q); > } > > @@ -527,10 +481,11 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct > sk_buff *prev, > > static int ipv6_frag_rcv(struct sk_buff *skb) > { > + struct net *net = dev_net(skb_dst(skb)->dev); > + const struct ipv6hdr *hdr = ipv6_hdr(skb); > struct frag_hdr *fhdr; > struct frag_queue *fq; > - const struct ipv6hdr *hdr = ipv6_hdr(skb); > - struct net *net = dev_net(skb_dst(skb)->dev); > + int iif; > > if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) > goto fail_hdr; > @@ -559,13 +514,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb) > return 1; > } > > - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, > - skb->dev ? 
skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); > + iif = skb->dev ? skb->dev->ifindex : 0; > + fq = fq_find(net, fhdr->identification, hdr, iif); > if (fq) { > int ret; > > spin_lock(&fq->q.lock); > > + fq->iif = iif; > ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); > > spin_unlock(&fq->q.lock); > @@ -718,6 +674,7 @@ static int __net_init ipv6_frags_init_net(struct net *net) > net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; > net->ipv6.frags.f = &ip6_frags; > > + net->ipv6.frags.f = &ip6_frags; This assignment is already done above; we shouldn't do it twice... > res = inet_frags_init_net(&net->ipv6.frags); > if (res < 0) > return res; > @@ -739,14 +696,55 @@ static struct pernet_operations ip6_frags_ops = { > .exit = ipv6_frags_exit_net, > }; > > +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) > +{ > + return jhash2(data, > + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); > +} > + > +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) > +{ > + const struct inet_frag_queue *fq = data; > + > + return jhash2((const u32 *)&fq->key.v6, > + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); > +} > + > +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) > +{ > + const struct frag_v6_compare_key *key = arg->key; > + const struct inet_frag_queue *fq = ptr; > + > + return !!memcmp(&fq->key, key, sizeof(*key)); > +} > + > +const struct rhashtable_params ip6_rhash_params = { > + .head_offset = offsetof(struct inet_frag_queue, node), > + .hashfn = ip6_key_hashfn, > + .obj_hashfn = ip6_obj_hashfn, > + .obj_cmpfn = ip6_obj_cmpfn, > + .automatic_shrinking = true, > +}; > +EXPORT_SYMBOL(ip6_rhash_params); > + > int __init ipv6_frag_init(void) > { > int ret; > > - ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); > + ip6_frags.constructor = ip6_frag_init; > + ip6_frags.destructor = NULL; > + ip6_frags.qsize = sizeof(struct frag_queue); > + ip6_frags.frag_expire = ip6_frag_expire; > + 
ip6_frags.frags_cache_name = ip6_frag_cache_name; > + ip6_frags.rhash_params = ip6_rhash_params; > + ret = inet_frags_init(&ip6_frags); > if (ret) > goto out; > > + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); > + if (ret) > + goto err_protocol; > + > ret = ip6_frags_sysctl_register(); > if (ret) > goto err_sysctl; > @@ -755,16 +753,6 @@ int __init ipv6_frag_init(void) > if (ret) > goto err_pernet; > > - ip6_frags.hashfn = ip6_hashfn; > - ip6_frags.constructor = ip6_frag_init; > - ip6_frags.destructor = NULL; > - ip6_frags.qsize = sizeof(struct frag_queue); > - ip6_frags.match = ip6_frag_match; > - ip6_frags.frag_expire = ip6_frag_expire; > - ip6_frags.frags_cache_name = ip6_frag_cache_name; > - ret = inet_frags_init(&ip6_frags); > - if (ret) > - goto err_pernet; > out: > return ret; > > @@ -772,6 +760,8 @@ int __init ipv6_frag_init(void) > ip6_frags_sysctl_unregister(); > err_sysctl: > inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); > +err_protocol: > + inet_frags_fini(&ip6_frags); > goto out; > } Thanks, Kirill