[IPV4] route: Dynamic hash table sizing.

The sizing algorithm itself is deliberately stupid for now; this
changeset is about the infrastructure.

Currently the table starts at 16 entries (or at whatever rhash_entries=
was specified on the command line) and is allowed to grow up to 8MB.
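
For reference, the limit is expressed as a bucket count rather than in
bytes (ip_rt_hashsz_limit below); assuming a 64-bit build, where a
bucket is a single chain pointer, the arithmetic comes back to the same
8MB:

    ip_rt_hashsz_limit = (8 * 1024 * 1024) / sizeof(void *)
                       = 8388608 / 8
                       = 1048576 buckets
    table size at limit = 1048576 * sizeof(struct rt_hash_bucket) = 8MB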

The code can handle both growing and shrinking just fine; the only
tweaks necessary are to the rthash_new_size() function and to the
places where rtcache_work is scheduled.
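
As an illustration only (a sketch, not something this patch adds), a
shrink-capable rthash_new_size() could key the decision off the current
entry count; the 1/4 threshold and the 16-bucket floor below are
made-up values:

	/* Hypothetical sketch only: pick the next table size based on
	 * how loaded the current table is.  Grows by 2x when the cache
	 * is crowded, shrinks by 2x when it is mostly empty.
	 */
	static unsigned long rthash_new_size(void)
	{
		unsigned int nbuckets = __rt_hash_mask + 1;
		unsigned int entries = atomic_read(&ipv4_dst_ops.entries);

		if (entries < nbuckets / 4 && nbuckets > 16)
			nbuckets >>= 1;		/* shrink */
		else
			nbuckets <<= 1;		/* grow */

		return (unsigned long)nbuckets * sizeof(struct rt_hash_bucket);
	}

The rest of the machinery (rtcache_resize(), rtcache_transfer()) does
not care which direction the size moves, which is the point of doing
the infrastructure first.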

hashdist is now consulted at run time (by rthash_alloc(), added below),
so the __initdata tag on it has to be dropped.

Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
---
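A note on the boot parameter: rhash_entries= still sets the initial
table size, but with this patch the value must be a power of two or it
is silently ignored.  A hypothetical example:

    rhash_entries=2048

would start the cache at 2048 buckets instead of the default 16, and
growth proceeds from there as usual.
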
 mm/page_alloc.c  |    2 -
 net/ipv4/route.c |  179 +++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 158 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
        return 0;
 }
 
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a7b4ca2..897e67c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -94,6 +94,9 @@ #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/seqlock.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -120,6 +123,7 @@ #define RT_GC_TIMEOUT (300*HZ)
 static int ip_rt_min_delay             = 2 * HZ;
 static int ip_rt_max_delay             = 10 * HZ;
 static int ip_rt_max_size;
+static int ip_rt_hashsz_limit          = (8 * 1024 * 1024) / sizeof(void *);
 static int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
 static int ip_rt_gc_interval           = 60 * HZ;
 static int ip_rt_gc_min_interval       = HZ / 2;
@@ -308,6 +312,135 @@ static void rt_hash_snapshot(struct rt_h
        } while (read_seqretry(&rt_hash_seq, seq));
 }
 
+static struct rt_hash_bucket *rthash_alloc(unsigned int sz)
+{
+       struct rt_hash_bucket *n;
+
+       if (sz <= PAGE_SIZE)
+               n = kmalloc(sz, GFP_KERNEL);
+       else if (hashdist)
+               n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+       else
+               n = (struct rt_hash_bucket *)
+                       __get_free_pages(GFP_KERNEL, get_order(sz));
+
+       if (n)
+               memset(n, 0, sz);
+
+       return n;
+}
+
+static void rthash_free(struct rt_hash_bucket *r, unsigned int sz)
+{
+       if (sz <= PAGE_SIZE)
+               kfree(r);
+       else if (hashdist)
+               vfree(r);
+       else
+               free_pages((unsigned long)r, get_order(sz));
+}
+
+static void rtcache_transfer(struct rtable *list, struct rt_hash_bucket *new_table, unsigned int nhashmask)
+{
+       while (list) {
+               struct rtable *next = list->u.rt_next;
+               struct rt_hash_bucket *ent;
+               int iface = list->fl.iif;
+               unsigned int hash;
+
+               if (!iface)
+                       iface = list->fl.oif;
+               hash = __rt_hash_code(list->fl.fl4_dst,
+                                     list->fl.fl4_src ^
+                                     (iface << 5),
+                                     nhashmask);
+               ent = &new_table[hash];
+               list->u.rt_next = ent->chain;
+               ent->chain = list;
+
+               list = next;
+       }
+}
+
+static unsigned long rthash_new_size(void)
+{
+       return ((__rt_hash_mask + 1) << 1) *
+               sizeof(struct rt_hash_bucket);
+}
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+       multipath_remove(rt);
+       call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void rtcache_resize(void *__unused)
+{
+       struct rt_hash_bucket *new, *old;
+       unsigned long nsize;
+       unsigned int nhashmask, ohashmask;
+       int i;
+
+       mutex_lock(&hash_resize_mutex);
+
+       nsize = rthash_new_size();
+       new = rthash_alloc(nsize);
+       if (!new)
+               goto out_unlock;
+
+       write_seqlock_bh(&rt_hash_seq);
+
+       nhashmask = (nsize / sizeof(struct rt_hash_bucket)) - 1U;
+       for (i = __rt_hash_mask; i >= 0; i--) {
+               struct rtable *rth;
+
+               spin_lock_bh(rt_hash_lock_addr(i));
+               rth = __rt_hash_table[i].chain;
+               if (rth)
+                       __rt_hash_table[i].chain = NULL;
+               spin_unlock_bh(rt_hash_lock_addr(i));
+
+               rtcache_transfer(rth, new, nhashmask);
+       }
+
+       old = __rt_hash_table;
+       ohashmask = __rt_hash_mask;
+
+       __rt_hash_table = new;
+       __rt_hash_mask = nhashmask;
+
+       /* XXX Do something more intelligent with these things.  */
+       ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
+       ip_rt_max_size = (__rt_hash_mask + 1) * 16;
+
+       write_sequnlock_bh(&rt_hash_seq);
+
+       synchronize_net();
+
+       /* It is possible that some entries got hashed into the old
+        * table, free any such remnants.  No locking is necessary on
+        * the chains as this table is no longer viewable by other
+        * processors.
+        */
+       for (i = ohashmask; i >= 0; i--) {
+               struct rtable *rth, *next;
+
+               for (rth = old[i].chain; rth; rth = next) {
+                       next = rth->u.rt_next;
+                       rt_free(rth);
+               }
+       }
+
+       rthash_free(old, (ohashmask + 1) * sizeof(struct rt_hash_bucket));
+
+out_unlock:
+       mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(rtcache_work, rtcache_resize, NULL);
+
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
        struct rt_hash_bucket *table;
@@ -540,12 +673,6 @@ static struct file_operations rt_cpu_seq
 
 #endif /* CONFIG_PROC_FS */
   
-static __inline__ void rt_free(struct rtable *rt)
-{
-       multipath_remove(rt);
-       call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
-}
-
 static __inline__ void rt_drop(struct rtable *rt)
 {
        multipath_remove(rt);
@@ -676,7 +803,7 @@ static void rt_check_expire(unsigned lon
 
        rt_hash_snapshot(&table, &hmask);
 
-       mult = ((u64)ip_rt_gc_interval) << long_log2(hmask + 1);
+       mult = ((u64)ip_rt_gc_interval) * (u64)(hmask + 1);
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
@@ -857,7 +984,7 @@ static int rt_garbage_collect(void)
 
        /* Calculate number of entries, which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
-               (ip_rt_gc_elasticity << long_log2(hmask + 1));
+               (ip_rt_gc_elasticity * (hmask + 1));
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
@@ -971,7 +1098,11 @@ #if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
 #endif
-out:   return 0;
+out:
+       if (atomic_read(&ipv4_dst_ops.entries) >= (hmask + 1) &&
+           (hmask + 1) < ip_rt_hashsz_limit)
+               schedule_work(&rtcache_work);
+       return 0;
 }
 
 static int rt_intern_hash(struct rt_hash_bucket *r, unsigned int hash, struct rtable *rt, struct rtable **rp)
@@ -3201,15 +3332,23 @@ #endif /* CONFIG_NET_CLS_ROUTE */
 static __initdata unsigned long rhash_entries;
 static int __init set_rhash_entries(char *str)
 {
+       unsigned long val;
+
        if (!str)
                return 0;
-       rhash_entries = simple_strtoul(str, &str, 0);
+       val = simple_strtoul(str, &str, 0);
+
+       /* Only use it if it's a power-of-2. */
+       if (!(val & (val - 1)))
+               rhash_entries = val;
+
        return 1;
 }
 __setup("rhash_entries=", set_rhash_entries);
 
 int __init ip_rt_init(void)
 {
+       unsigned long sz;
        int rc = 0;
 
        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
@@ -3236,22 +3375,18 @@ #endif
        if (!ipv4_dst_ops.kmem_cachep)
                panic("IP: failed to allocate ip_dst_cache\n");
 
-       __rt_hash_table = (struct rt_hash_bucket *)
-               alloc_large_system_hash("IP route cache",
-                                       sizeof(struct rt_hash_bucket),
-                                       rhash_entries,
-                                       (num_physpages >= 128 * 1024) ?
-                                       15 : 17,
-                                       HASH_HIGHMEM,
-                                       NULL,
-                                       &__rt_hash_mask,
-                                       0);
-       memset(__rt_hash_table, 0, (__rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-       rt_hash_lock_init();
+       sz = (rhash_entries ? rhash_entries : 16);
+       sz *= sizeof(struct rt_hash_bucket);
 
+       __rt_hash_table = rthash_alloc(sz);
+       if (!__rt_hash_table)
+               panic("IP: failed to allocate routing cache hash table");
+       __rt_hash_mask = (sz / sizeof(struct rt_hash_bucket)) - 1;
        ipv4_dst_ops.gc_thresh = (__rt_hash_mask + 1);
        ip_rt_max_size = (__rt_hash_mask + 1) * 16;
 
+       rt_hash_lock_init();
+
        devinet_init();
        ip_fib_init();
 
-- 
1.4.2.rc2.g3e042
