When multiple nexthops are available for a given route, the routing engine
chooses a nexthop by computing the flow hash through get_hash_from_flowi6
and by taking that value modulo the number of nexthops. The resulting value
indexes the nexthop to select. This method causes issues when a new nexthop
is added or one is removed (e.g. link failure). In that case, the number
of nexthops changes and potentially all the flows get re-routed to another
nexthop.

This patch implements a consistent hash method to select the nexthop in
case of ECMP. The idea is to generate K slices (or intervals) for each
route with multiple nexthops. The nexthops are randomly assigned to those
slices, in a uniform manner. The number K is configurable through a sysctl
net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
nexthop, the algorithm takes the flow hash and computes an index which is
the flow hash modulo K. As K = 2^x, the modulo can be computed using a
simple binary AND operation (idx = hash & (K - 1)). The resulting index
references the selected nexthop. The lookup time complexity is thus O(1).

When a nexthop is added, it steals K/N slices from the other nexthops,
where N is the new number of nexthops. The slices are stolen randomly and
uniformly from the other nexthops. When a nexthop is removed, the orphan
slices are randomly reassigned to the other nexthops.

The number of slices for a route also fixes the maximum number of nexthops
possible for that route.

Signed-off-by: David Lebrun <david.leb...@uclouvain.be>
---
 include/net/ip6_fib.h    |  25 +++++++
 include/net/netns/ipv6.h |   1 +
 net/ipv6/ip6_fib.c       | 174 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c         |  58 ++++++++--------
 4 files changed, 229 insertions(+), 29 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa..29594cc 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -93,6 +93,30 @@ struct rt6key {
 
 struct fib6_table;
 
+/* Multipath nexthop.
+ * @nh is the actual nexthop
+ * @nslices is the number of slices assigned to this nexthop
+ */
+struct rt6_multi_nh {
+       struct rt6_info *nh;
+       unsigned int    nslices;
+};
+
+/* Multipath routes map.
+ * @nhs is an array of available nexthops
+ * @size is the number of elements in @nhs
+ * @nslices is the number of slices and the number of elements in @index
+ *         and is always in the form 2^x to prevent using a division for
+ *         the computation of the modulo.
+ * @index is an array mapping a slice index to a nexthop index in @nhs
+ */
+struct rt6_multi_map {
+       struct rt6_multi_nh     *nhs;
+       unsigned int            size;
+       unsigned int            nslices;
+       __u8                    *index;
+};
+
 struct rt6_info {
        struct dst_entry                dst;
 
@@ -113,6 +137,7 @@ struct rt6_info {
         */
        struct list_head                rt6i_siblings;
        unsigned int                    rt6i_nsiblings;
+       struct rt6_multi_map            *rt6i_nh_map;
 
        atomic_t                        rt6i_ref;
 
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index de7745e..4967846 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
        int idgen_retries;
        int idgen_delay;
        int flowlabel_state_ranges;
+       int ip6_rt_ecmp_slices;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef54852..b6b1895 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct 
fib6_node *fn,
        }
 }
 
+static void fib6_mp_free(struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
+       struct rt6_info *sibling;
+
+       if (nh_map) {
+               list_for_each_entry(sibling, &rt->rt6i_siblings,
+                                   rt6i_siblings) {
+                       sibling->rt6i_nh_map = NULL;
+               }
+
+               rt->rt6i_nh_map = NULL;
+
+               kfree(nh_map->nhs);
+               kfree(nh_map->index);
+               kfree(nh_map);
+       }
+}
+
+static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
+                         struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+       struct rt6_multi_nh *tmp_nhs, *old_nhs;
+       unsigned int kslices;
+       unsigned int i, j;
+
+       if (!nh_map) {
+               __u8 *index;
+
+               nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
+               if (!nh_map)
+                       return -ENOMEM;
+
+               nh_map->size = 1;
+               nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
+
+               tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
+               if (!tmp_nhs) {
+                       kfree(nh_map);
+                       return -ENOMEM;
+               }
+
+               tmp_nhs[0].nh = sref;
+               tmp_nhs[0].nslices = nh_map->nslices;
+               nh_map->nhs = tmp_nhs;
+
+               index = kmalloc_array(nh_map->nslices, sizeof(*index),
+                                     GFP_ATOMIC);
+               if (!index) {
+                       kfree(nh_map->nhs);
+                       kfree(nh_map);
+                       return -ENOMEM;
+               }
+
+               for (i = 0; i < nh_map->nslices; i++)
+                       index[i] = 0;
+
+               nh_map->index = index;
+               sref->rt6i_nh_map = nh_map;
+       }
+
+       if (nh_map->size == nh_map->nslices)
+               return -ENOBUFS;
+
+       nh_map->size++;
+
+       tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
+       if (!tmp_nhs)
+               return -ENOMEM;
+
+       for (i = 0; i < nh_map->size - 1; i++)
+               tmp_nhs[i] = nh_map->nhs[i];
+
+       tmp_nhs[nh_map->size - 1].nh = rt;
+       tmp_nhs[nh_map->size - 1].nslices = 0;
+
+       kslices = nh_map->nslices / nh_map->size;
+
+       /* Loop over nexthops and steal a random slice until we have kslices for
+        * the new nexthop. This algorithm ensures that flows are taken as
+        * equally as possible from current nexthops.
+        *
+        * At each iteration, @j is the index in tmp_nhs of the nexthop
+        * to steal from and @idx is the index of the slice to steal
+        * among @j's slices.
+        */
+       for (i = 0, j = 0; i < kslices; i++) {
+               unsigned int idx, k = 0;
+
+               if (tmp_nhs[j].nslices == 1)
+                       continue;
+
+               idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
+               do {
+                       if (nh_map->index[k++] == j)
+                               idx--;
+               } while (idx);
+
+               nh_map->index[k - 1] = nh_map->size - 1;
+               tmp_nhs[nh_map->size - 1].nslices++;
+               tmp_nhs[j].nslices--;
+
+               j = (j + 1) % (nh_map->size - 1);
+       }
+
+       WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
+
+       old_nhs = nh_map->nhs;
+       nh_map->nhs = tmp_nhs;
+       kfree(old_nhs);
+
+       rt->rt6i_nh_map = nh_map;
+
+       return 0;
+}
+
+static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+       struct rt6_multi_nh *tmp_nhs, *old_nhs;
+       unsigned int i, j, idx = 0;
+
+       tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
+       if (!tmp_nhs)
+               return -ENOMEM;
+
+       for (i = 0, j = 0; i < nh_map->size; i++) {
+               if (nh_map->nhs[i].nh != rt)
+                       tmp_nhs[j++] = nh_map->nhs[i];
+               else
+                       idx = i;
+       }
+
+       nh_map->size--;
+       old_nhs = nh_map->nhs;
+       nh_map->nhs = tmp_nhs;
+       kfree(old_nhs);
+
+       rt->rt6i_nh_map = NULL;
+
+       /* Update the slices index. For each slice mapping to the removed
+        * nexthop, assign another random nexthop. For each slice mapping
+        * to a nexthop that was after the removed nexthop, decrement the
+        * nexthop index to reflect the shift. Nexthops that were before
+        * the removed nexthop are unaffected.
+        */
+       for (i = 0; i < nh_map->nslices; i++) {
+               if (nh_map->index[i] == idx) {
+                       j = prandom_u32() % nh_map->size;
+                       nh_map->index[i] = j;
+                       nh_map->nhs[j].nslices++;
+               } else if (nh_map->index[i] > idx) {
+                       nh_map->index[i]--;
+               }
+       }
+
+       return 0;
+}
+
 /*
  *     Insert routing information in a node.
  */
@@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
                        }
                        sibling = sibling->dst.rt6_next;
                }
+               err = fib6_mp_extend(info->nl_net, sibling, rt);
+               if (err < 0)
+                       return err;
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
                 * is broken!
@@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct 
rt6_info *rt,
                        fn->fn_flags |= RTN_RTINFO;
                }
                nsiblings = iter->rt6i_nsiblings;
+               if (nsiblings)
+                       fib6_mp_free(iter);
                fib6_purge_rt(iter, fn, info->nl_net);
                rt6_release(iter);
 
@@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct 
rt6_info **rtp,
 
        /* Remove this entry from other siblings */
        if (rt->rt6i_nsiblings) {
-               struct rt6_info *sibling, *next_sibling;
+               struct rt6_info *sibling, *next_sibling, *sref;
+
+               sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
+                                       rt6i_siblings);
 
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings)
                        sibling->rt6i_nsiblings--;
                rt->rt6i_nsiblings = 0;
                list_del_init(&rt->rt6i_siblings);
+               if (!sref->rt6i_nsiblings)
+                       fib6_mp_free(sref);
+               else
+                       fib6_mp_shrink(sref, rt);
        }
 
        /* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb1..e9f13e0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
        return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-                              const struct flowi6 *fl6)
-{
-       return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
                                             struct flowi6 *fl6, int oif,
                                             int strict)
 {
-       struct rt6_info *sibling, *next_sibling;
-       int route_choosen;
+       struct rt6_multi_map *nh_map = match->rt6i_nh_map;
+       __u32 hash = get_hash_from_flowi6(fl6);
+       unsigned int slice, idx;
+       struct rt6_info *res;
 
-       route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
-       /* Don't change the route, if route_choosen == 0
-        * (siblings does not include ourself)
-        */
-       if (route_choosen)
-               list_for_each_entry_safe(sibling, next_sibling,
-                               &match->rt6i_siblings, rt6i_siblings) {
-                       route_choosen--;
-                       if (route_choosen == 0) {
-                               if (rt6_score_route(sibling, oif, strict) < 0)
-                                       break;
-                               match = sibling;
-                               break;
-                       }
-               }
-       return match;
+       WARN_ON(!nh_map);
+       if (!nh_map)
+               return match;
+
+       slice = hash & (nh_map->nslices - 1);
+       idx = nh_map->index[slice];
+       res = nh_map->nhs[idx].nh;
+
+       if (rt6_score_route(res, oif, strict) < 0)
+               res = match;
+
+       return res;
 }
 
 /*
@@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int 
write,
        return 0;
 }
 
+static int one = 1;
+static int thirtyone = 31;
+
 struct ctl_table ipv6_route_table_template[] = {
        {
                .procname       =       "flush",
@@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
+       {
+               .procname       =       "ecmp_slices",
+               .data           =       
&init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
+               .maxlen         =       sizeof(int),
+               .mode           =       0644,
+               .proc_handler   =       proc_dointvec_minmax,
+               .extra1         =       &one,
+               .extra2         =       &thirtyone,
+       },
        { }
 };
 
@@ -3644,6 +3644,7 @@ struct ctl_table * __net_init 
ipv6_route_sysctl_init(struct net *net)
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+               table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;
 
                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
@@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+       net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;
 
        net->ipv6.ip6_rt_gc_expire = 30*HZ;
 
-- 
2.7.3

Reply via email to