On 07/29/2015 09:02 AM, Tom Herbert wrote: > On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing > <richard.la...@alliedtelesis.co.nz> wrote: >> From: Richard Laing <richard.la...@alliedtelesis.co.nz> >> >> Enable flow-based ECMP. >> >> Currently if equal-cost multipath is enabled the kernel chooses between >> equal cost paths for each matching packet, essentially packets are >> round-robined between the routes. This means that packets from a single >> flow can traverse different routes. If one of the routes experiences >> congestion this can result in delayed or out of order packets arriving >> at the destination. >> > Richard, someone was complaining to me just last week about the > weakness of the round robin algorithm. Thanks for looking into this! > >> This patch allows packets to be routed based on their >> flow - packets in the same flow will always use the same route. This >> prevents out of order packets. There are other issues with round-robin >> based ECMP routing related to variable path MTU handling and debugging. >> See RFC2991 for more details on the problems associated with packet >> based ECMP routing. >> >> This patch relies on the skb hash value to select between routes. The >> selection uses a hash-threshold algorithm (see RFC2992). >> >> Signed-off-by: Richard Laing <richard.la...@alliedtelesis.co.nz> >> --- >> >> include/net/flow.h | 14 ++++++++++++++ >> include/net/ip_fib.h | 9 +++++++++ >> include/net/route.h | 4 ++++ >> net/ipv4/Kconfig | 9 +++++++++ >> net/ipv4/fib_semantics.c | 38 ++++++++++++++++++++++++++++++++++++++ >> net/ipv4/route.c | 14 +++++++++++++- >> 6 files changed, 87 insertions(+), 1 deletion(-) >> >> diff --git a/include/net/flow.h b/include/net/flow.h >> index 8109a15..d1d933d 100644 >> --- a/include/net/flow.h >> +++ b/include/net/flow.h >> @@ -79,6 +79,10 @@ struct flowi4 { >> #define fl4_ipsec_spi uli.spi >> #define fl4_mh_type uli.mht.type >> #define fl4_gre_key uli.gre_key >> + >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH > Why bother making this a CONFIG, round robin is a miserable algorithm > anyway and nearly all the other packet steering mechanisms already use > a hash.
Fair enough, I will look at making it a sysctl option. I guess the default can be the current behaviour. > >> + __u32 flowi4_hash; >> +#endif >> } __attribute__((__aligned__(BITS_PER_LONG/8))); >> >> static inline void flowi4_init_output(struct flowi4 *fl4, int oif, >> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4 >> *fl4, int oif, >> fl4->saddr = saddr; >> fl4->fl4_dport = dport; >> fl4->fl4_sport = sport; >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + fl4->flowi4_hash = 0; >> +#endif >> } >> >> /* Reset some input parameters after previous lookup */ >> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct >> flowidn *fldn) >> return container_of(fldn, struct flowi, u.dn); >> } >> >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash) >> +{ >> + fl->flowi4_hash = hash; >> +} >> +#endif >> + >> typedef unsigned long flow_compare_t; >> >> static inline size_t flow_key_size(u16 family) >> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h >> index 5fa643b..c841698 100644 >> --- a/include/net/ip_fib.h >> +++ b/include/net/ip_fib.h >> @@ -117,6 +117,10 @@ struct fib_info { >> #ifdef CONFIG_IP_ROUTE_MULTIPATH >> int fib_power; >> #endif >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + /* Cache the number of live nexthops for flow based ECMP >> calculation. */ >> + int live_nexthops; >> +#endif >> struct rcu_head rcu; >> struct fib_nh fib_nh[0]; >> #define fib_dev fib_nh[0].nh_dev >> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local); >> int fib_sync_up(struct net_device *dev, unsigned int nh_flags); >> void fib_select_multipath(struct fib_result *res); >> >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> +void fib_select_multipath_for_flow(struct fib_result *res, >> + const struct flowi4 *fl4); >> +#endif >> + >> /* Exported by fib_trie.c */ >> void fib_trie_init(void); >> struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); >> diff --git a/include/net/route.h b/include/net/route.h >> index fe22d03..fdbbe7f 100644 >> --- a/include/net/route.h >> +++ b/include/net/route.h >> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct >> flowi4 *fl4, __be32 dst, __be32 >> >> flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, >> protocol, flow_flags, dst, src, dport, sport); >> + >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + flowi4_set_flow_hash(fl4, sk->sk_hash); > Should be sk_txhash I think. I will check, it looks likely! > >> +#endif >> } >> >> static inline struct rtable *ip_route_connect(struct flowi4 *fl4, >> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig >> index 6fb3c90..76714d0 100644 >> --- a/net/ipv4/Kconfig >> +++ b/net/ipv4/Kconfig >> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH >> equal "cost" and chooses one of them in a non-deterministic fashion >> if a matching packet arrives. >> >> +config IP_FLOW_BASED_MULTIPATH >> + bool "IP: flow based equal cost multipath" >> + depends on IP_ROUTE_MULTIPATH >> + help >> + Normally if equal cost multipath is enabled the router chooses >> between >> + equal "cost" paths for each matching packet, essentially >> packets are round- >> + robined between the routes. This option allows packets to be >> routed based on >> + their flow - packets in the same flow will always use the >> same route. >> + >> config IP_ROUTE_VERBOSE >> bool "IP: verbose route monitoring" >> depends on IP_ADVANCED_ROUTER >> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c >> index 3a06586..e4750e6 100644 >> --- a/net/ipv4/fib_semantics.c >> +++ b/net/ipv4/fib_semantics.c >> @@ -981,6 +981,9 @@ link_it: >> head = &fib_info_devhash[hash]; >> hlist_add_head(&nexthop_nh->nh_hash, head); >> } endfor_nexthops(fi) >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + fi->live_nexthops = fi->fib_nhs; >> +#endif >> spin_unlock_bh(&fib_info_lock); >> return fi; >> >> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev, >> unsigned long event) >> } >> ret++; >> } >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + fi->live_nexthops = fi->fib_nhs - dead; >> +#endif >> } >> >> return ret; >> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned >> int nh_flags) >> if (alive > 0) { >> fi->fib_flags &= ~nh_flags; >> ret++; >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + fi->live_nexthops = alive; >> +#endif >> } >> } >> >> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res) >> res->nh_sel = 0; >> spin_unlock_bh(&fib_multipath_lock); >> } >> + >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> +void fib_select_multipath_for_flow(struct fib_result *res, >> + const struct flowi4 *fl4) >> +{ >> + struct fib_info *fi = res->fi; >> + int multipath_entry; >> + int region_size; >> + >> + if (fl4->flowi4_hash) { >> + /* Hash-threshold algorithm, see RFC2992. */ >> + region_size = U32_MAX / fi->live_nexthops; >> + multipath_entry = fl4->flowi4_hash / region_size; >> + >> + spin_lock_bh(&fib_multipath_lock); >> + for_nexthops(fi) { >> + if (!(nh->nh_flags & RTNH_F_DEAD)) { >> + res->nh_sel = nhsel; >> + if (multipath_entry == 0) >> + break; >> + multipath_entry--; >> + } >> + } endfor_nexthops(fi); >> + spin_unlock_bh(&fib_multipath_lock); >> + } else { >> + fib_select_multipath(res); >> + } >> +} >> +#endif >> #endif >> diff --git a/net/ipv4/route.c b/net/ipv4/route.c >> index e681b85..4629c04 100644 >> --- a/net/ipv4/route.c >> +++ b/net/ipv4/route.c >> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb, >> __be32 daddr, __be32 saddr, u32 tos) >> { >> #ifdef CONFIG_IP_ROUTE_MULTIPATH >> - if (res->fi && res->fi->fib_nhs > 1) >> + if (res->fi && res->fi->fib_nhs > 1) { >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + if (skb) >> + flowi4_set_flow_hash((struct flowi4 *)fl4, >> + skb_get_hash(skb)); >> + fib_select_multipath_for_flow(res, fl4); >> +#else >> fib_select_multipath(res); >> #endif >> + } >> +#endif >> >> /* create a routing cache entry */ >> return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); >> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net >> *net, struct flowi4 *fl4) >> >> #ifdef CONFIG_IP_ROUTE_MULTIPATH >> if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH >> + fib_select_multipath_for_flow(&res, fl4); >> +#else >> fib_select_multipath(&res); >> +#endif >> else >> #endif >> if (!res.prefixlen && -- Richard Laing Software Team Leader Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand Phone: +64 3 339 9248 Web: www.alliedtelesis.com N�����r��y����b�X��ǧv�^�){.n�+���z�^�)����w*jg��������ݢj/���z�ޖ��2�ޙ����&�)ߡ�a�����G���h��j:+v���w��٥