On route-servers at IXPs there is often the wish to not have path hiding because of output filters. On way of solving this is to use per peer RIBs but the cost (in memory) is very high. This solves this issue in a different way but with the same (or similar result).
In bgpd all prefixes/paths are sorted by their preference and by default the best path is put into the Adj-RIB-Out (after that prefix passed the output filter). This diff changes this behaviour by falling back to the next prefix in case the current prefix was filtered. This stops path hiding and announces the next best route. The cost for doing this is a bump in CPU usage. On the other hand the memory consumption remains the same. The feature can be enabled with 'rde evaluate all' globably or per peer. It is also to turn it off by using 'rde evaluate default' (in case a peer does not want this). -- :wq Claudio ? obj Index: bgpd.conf.5 =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v retrieving revision 1.208 diff -u -p -r1.208 bgpd.conf.5 --- bgpd.conf.5 16 Feb 2021 08:29:16 -0000 1.208 +++ bgpd.conf.5 16 Feb 2021 09:25:30 -0000 @@ -268,9 +268,18 @@ daemons, such as .Xr ospfd 8 . .Pp .It Xo -.Ic rde -.Ic med -.Ic compare +.Ic rde Ic evaluate +.Pq Ic default Ns | Ns Ic all +.Xc +If set to +.Ar all +keep evaluating alternative paths in case the selected path is filtered +out. +By default if a path is filtered by the output filters then no alternative +path is sent to this peer. +.Pp +.It Xo +.Ic rde Ic med Ic compare .Pq Ic always Ns | Ns Ic strict .Xc If set to @@ -1151,6 +1160,20 @@ setting. .Pp .It Ic remote-as Ar as-number Set the AS number of the remote system. +.Pp +.It Xo +.Ic rde Ic evaluate +.Pq Ic default Ns | Ns Ic all +.Xc +If set to +.Ar all +keep evaluating alternative paths in case the selected path is filtered +out. +By default if a path is filtered by the output filters then no alternative +path is sent to this peer. +The default is inherited from the global +.Ic rde Ic evaluate +setting. .Pp .It Ic rib Ar name Bind the neighbor to the specified RIB. Index: bgpd.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v retrieving revision 1.412 diff -u -p -r1.412 bgpd.h --- bgpd.h 16 Feb 2021 08:29:16 -0000 1.412 +++ bgpd.h 16 Feb 2021 09:25:30 -0000 @@ -65,7 +65,8 @@ #define BGPD_FLAG_DECISION_ROUTEAGE 0x0100 #define BGPD_FLAG_DECISION_TRANS_AS 0x0200 #define BGPD_FLAG_DECISION_MED_ALWAYS 0x0400 -#define BGPD_FLAG_NO_AS_SET 0x0800 +#define BGPD_FLAG_DECISION_ALL_PATHS 0x0800 +#define BGPD_FLAG_NO_AS_SET 0x1000 #define BGPD_LOG_UPDATES 0x0001 @@ -408,7 +409,8 @@ struct peer_config { #define PEERFLAG_TRANS_AS 0x01 #define PEERFLAG_LOG_UPDATES 0x02 -#define PEERFLAG_NO_AS_SET 0x04 +#define PEERFLAG_EVALUATE_ALL 0x04 +#define PEERFLAG_NO_AS_SET 0x08 enum network_type { NETWORK_DEFAULT, /* from network statements */ Index: parse.y =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v retrieving revision 1.413 diff -u -p -r1.413 parse.y --- parse.y 16 Feb 2021 08:29:16 -0000 1.413 +++ parse.y 16 Feb 2021 09:25:30 -0000 @@ -200,7 +200,7 @@ typedef struct { %token AS ROUTERID HOLDTIME YMIN LISTEN ON FIBUPDATE FIBPRIORITY RTABLE %token NONE UNICAST VPN RD EXPORT EXPORTTRGT IMPORTTRGT DEFAULTROUTE -%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT +%token RDE RIB EVALUATE DEFAULT ALL IGNORE COMPARE RTR PORT %token GROUP NEIGHBOR NETWORK %token EBGP IBGP %token LOCALAS REMOTEAS DESCR LOCALADDR MULTIHOP PASSIVE MAXPREFIX RESTART @@ -791,6 +791,12 @@ conf_main : AS as4number { } free($4); } + | RDE EVALUATE ALL { + conf->flags |= BGPD_FLAG_DECISION_ALL_PATHS; + } + | RDE EVALUATE DEFAULT { + conf->flags &= ~BGPD_FLAG_DECISION_ALL_PATHS; + } | NEXTHOP QUALIFY VIA STRING { if (!strcmp($4, "bgp")) conf->flags |= BGPD_FLAG_NEXTHOP_BGP; @@ -1727,6 +1733,12 @@ peeropts : REMOTEAS as4number { else curpeer->conf.flags &= ~PEERFLAG_NO_AS_SET; } + | RDE EVALUATE ALL { + curpeer->conf.flags |= PEERFLAG_EVALUATE_ALL; + } + | RDE EVALUATE DEFAULT { + curpeer->conf.flags &= ~PEERFLAG_EVALUATE_ALL; + } ; restart : /* nada */ { $$ = 0; } @@ -2850,6 +2862,7 @@ lookup(char *s) { "IPv4", IPV4}, { "IPv6", IPV6}, { "ah", AH}, + { "all", ALL}, { "allow", ALLOW}, { "announce", ANNOUNCE}, { "any", ANY}, @@ -2862,6 +2875,7 @@ lookup(char *s) { "compare", COMPARE}, { "connect-retry", CONNECTRETRY}, { "connected", CONNECTED}, + { "default", DEFAULT}, { "default-route", DEFAULTROUTE}, { "delete", DELETE}, { "demote", DEMOTE}, @@ -3888,6 +3902,8 @@ alloc_peer(void) if (conf->flags & BGPD_FLAG_DECISION_TRANS_AS) p->conf.flags |= PEERFLAG_TRANS_AS; + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) + p->conf.flags |= PEERFLAG_EVALUATE_ALL; if (conf->flags & BGPD_FLAG_NO_AS_SET) p->conf.flags |= PEERFLAG_NO_AS_SET; @@ -3910,8 +3926,6 @@ new_peer(void) sizeof(p->conf.descr)) >= sizeof(p->conf.descr)) fatalx("new_peer descr strlcpy"); p->conf.groupid = curgroup->conf.id; - p->conf.local_as = curgroup->conf.local_as; - p->conf.local_short_as = curgroup->conf.local_short_as; } return (p); } Index: printconf.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v retrieving revision 1.146 diff -u -p -r1.146 printconf.c --- printconf.c 16 Feb 2021 08:29:16 -0000 1.146 +++ printconf.c 16 Feb 2021 09:25:30 -0000 @@ -390,9 +390,10 @@ print_mainconf(struct bgpd_config *conf) if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE) printf("rde route-age evaluate\n"); - if (conf->flags & BGPD_FLAG_DECISION_MED_ALWAYS) printf("rde med compare always\n"); + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) + printf("rde evaluate all\n"); if (conf->flags & BGPD_FLAG_NO_AS_SET) printf("reject as-set yes\n"); @@ -681,6 +682,14 @@ print_peer(struct peer_config *p, struct printf("%s\tdepend on \"%s\"\n", c, p->if_depend); if (p->flags & PEERFLAG_TRANS_AS) printf("%s\ttransparent-as yes\n", c); + + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) { + if (!(p->flags & PEERFLAG_EVALUATE_ALL)) + printf("%s\trde evaluate default\n", c); + } else { + if (p->flags & PEERFLAG_EVALUATE_ALL) + printf("%s\trde evaluate all\n", c); + } if (conf->flags & BGPD_FLAG_NO_AS_SET) { if (!(p->flags & PEERFLAG_NO_AS_SET)) Index: rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v retrieving revision 1.515 diff -u -p -r1.515 rde.c --- rde.c 16 Feb 2021 08:29:16 -0000 1.515 +++ rde.c 16 Feb 2021 09:25:31 -0000 @@ -2875,8 +2875,17 @@ rde_send_kroute(struct rib *rib, struct /* * update specific functions */ +static int rde_eval_all; + +int +rde_evaluate_all(void) +{ + return rde_eval_all; +} + void -rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old) +rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old, + int eval_all) { struct rde_peer *peer; u_int8_t aid; @@ -2889,7 +2898,7 @@ rde_generate_updates(struct rib *rib, st if (old == NULL && new == NULL) return; - if ((rib->flags & F_RIB_NOFIB) == 0) + if (!eval_all && (rib->flags & F_RIB_NOFIB) == 0) rde_send_kroute(rib, new, old); if (new) @@ -2897,13 +2906,22 @@ rde_generate_updates(struct rib *rib, st else aid = old->pt->aid; + rde_eval_all = 0; LIST_FOREACH(peer, &peerlist, peer_l) { - if (peer->conf.id == 0) - continue; - if (peer->loc_rib_id != rib->id) + /* skip ourself */ + if (peer == peerself) continue; if (peer->state != PEER_UP) continue; + /* handle evaluate all, keep track if it is needed */ + if (peer->conf.flags & PEERFLAG_EVALUATE_ALL) + rde_eval_all = 1; + else if (eval_all) + /* skip default peers if the best path didn't change */ + continue; + /* skip peers using a different rib */ + if (peer->loc_rib_id != rib->id) + continue; /* check if peer actually supports the address family */ if (peer->capa.mp[aid] == 0) continue; @@ -3556,7 +3574,7 @@ rde_softreconfig_sync_reeval(struct rib_ nexthop_unlink(p); } if (re->active) { - rde_generate_updates(rib, NULL, re->active); + rde_generate_updates(rib, NULL, re->active, 0); re->active = NULL; } return; Index: rde.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v retrieving revision 1.236 diff -u -p -r1.236 rde.h --- rde.h 13 Jan 2021 11:34:01 -0000 1.236 +++ rde.h 16 Feb 2021 09:25:31 -0000 @@ -371,8 +371,9 @@ void rde_send_nexthop(struct bgpd_addr void rde_pftable_add(u_int16_t, struct prefix *); void rde_pftable_del(u_int16_t, struct prefix *); +int rde_evaluate_all(void); void rde_generate_updates(struct rib *, struct prefix *, - struct prefix *); + struct prefix *, int); u_int32_t rde_local_as(void); int rde_decisionflags(void); int rde_as4byte(struct rde_peer *); @@ -486,6 +487,7 @@ communities_unref(struct rde_community * int community_to_rd(struct community *, u_int64_t *); /* rde_decide.c */ +int prefix_eligible(struct prefix *); void prefix_evaluate(struct rib_entry *, struct prefix *, struct prefix *); /* rde_filter.c */ Index: rde_decide.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v retrieving revision 1.81 diff -u -p -r1.81 rde_decide.c --- rde_decide.c 2 Feb 2021 15:24:43 -0000 1.81 +++ rde_decide.c 16 Feb 2021 09:25:31 -0000 @@ -91,8 +91,8 @@ void prefix_remove(struct prefix *, stru /* * Decision Engine OUR implementation: - * Our implementation has only one RIB. The filtering is done first. The - * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1). + * The filtering is done first. The filtering calculates the preference and + * stores it in LOCAL_PREF (Phase 1). * Ineligible routes are flagged as ineligible via nexthop_add(). * Phase 3 is done together with Phase 2. * In following cases a prefix needs to be reevaluated: @@ -284,6 +284,13 @@ prefix_cmp(struct prefix *p1, struct pre fatalx("Uh, oh a politician in the decision process"); } +/* + * Insert a prefix keeping the total order of the list. For routes + * that may depend on a MED selection the set is scanned until the + * condition is cleared. If a MED inversion is detected the respective + * prefix is taken of the rib list and put onto a redo queue. All + * prefixes on the redo queue are re-inserted at the end. + */ void prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) { @@ -351,6 +358,15 @@ prefix_insert(struct prefix *new, struct } } +/* + * Remove a prefix from the RIB list ensuring that the total order of the + * list remains intact. All routes that differ in the MED are taken of the + * list and put on the redo list. To figure out if a route could cause a + * resort because of a MED check the next prefix of the to-remove prefix + * is compared with the old prefix. A full scan is only done if the next + * route differs because of the MED or later checks. + * Again at the end all routes on the redo queue are reinserted. + */ void prefix_remove(struct prefix *old, struct rib_entry *re) { @@ -397,6 +413,23 @@ prefix_remove(struct prefix *old, struct } } +/* helper function to check if a prefix is valid to be selected */ +int +prefix_eligible(struct prefix *p) +{ + struct rde_aspath *asp = prefix_aspath(p); + struct nexthop *nh = prefix_nexthop(p); + + /* The aspath needs to be loop and error free */ + if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR)) + return 0; + /* The nexthop needs to exist and be reachable */ + if (nh == NULL || nh->state != NEXTHOP_REACH) + return 0; + + return 1; +} + /* * Find the correct place to insert the prefix in the prefix list. * If the active prefix has changed we need to send an update. @@ -420,7 +453,7 @@ prefix_evaluate(struct rib_entry *re, st * active. Clean up now to ensure that the RIB * is consistant. */ - rde_generate_updates(re_rib(re), NULL, re->active); + rde_generate_updates(re_rib(re), NULL, re->active, 0); re->active = NULL; } return; @@ -433,15 +466,8 @@ prefix_evaluate(struct rib_entry *re, st prefix_insert(new, NULL, re); xp = LIST_FIRST(&re->prefix_h); - if (xp != NULL) { - struct rde_aspath *xasp = prefix_aspath(xp); - if (xasp == NULL || - xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || - (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state != - NEXTHOP_REACH)) - /* xp is ineligible */ - xp = NULL; - } + if (xp != NULL && !prefix_eligible(xp)) + xp = NULL; /* * If the active prefix changed or the active prefix was removed @@ -453,7 +479,17 @@ prefix_evaluate(struct rib_entry *re, st * but remember that xp may be NULL aka ineligible. * Additional decision may be made by the called functions. */ - rde_generate_updates(re_rib(re), xp, re->active); + rde_generate_updates(re_rib(re), xp, re->active, 0); re->active = xp; + return; } + + /* + * If there are peers with 'rde evaluate all' every update needs + * to be passed on (not only a change of the best prefix). + * rde_generate_updates() will then take care of distribution. + */ + if (rde_evaluate_all()) + if ((new != NULL && prefix_eligible(new)) || old != NULL) + rde_generate_updates(re_rib(re), re->active, NULL, 1); } Index: rde_update.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v retrieving revision 1.124 diff -u -p -r1.124 rde_update.c --- rde_update.c 9 Jan 2021 16:49:41 -0000 1.124 +++ rde_update.c 16 Feb 2021 09:25:31 -0000 @@ -23,6 +23,7 @@ #include <stdlib.h> #include <string.h> #include <siphash.h> +#include <stdio.h> #include "bgpd.h" #include "rde.h" @@ -49,26 +50,22 @@ up_test_update(struct rde_peer *peer, st { struct rde_aspath *asp; struct rde_community *comm; - struct rde_peer *prefp; + struct rde_peer *frompeer; - if (p == NULL) - /* no prefix available */ - return (0); - - prefp = prefix_peer(p); + frompeer = prefix_peer(p); asp = prefix_aspath(p); comm = prefix_communities(p); - if (peer == prefp) - /* Do not send routes back to sender */ - return (0); - if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR) fatalx("try to send out a botched path"); if (asp->flags & F_ATTR_LOOP) fatalx("try to send out a looped path"); - if (!prefp->conf.ebgp && !peer->conf.ebgp) { + if (peer == frompeer) + /* Do not send routes back to sender */ + return (0); + + if (!frompeer->conf.ebgp && !peer->conf.ebgp) { /* * route reflector redistribution rules: * 1. if announce is set -> announce @@ -77,7 +74,7 @@ up_test_update(struct rde_peer *peer, st * 4. old non-client, new client -> yes * 5. old client, new client -> yes */ - if (prefp->conf.reflector_client == 0 && + if (frompeer->conf.reflector_client == 0 && peer->conf.reflector_client == 0 && (asp->flags & F_PREFIX_ANNOUNCED) == 0) /* Do not redistribute updates to ibgp peers */ @@ -101,14 +98,12 @@ void up_generate_updates(struct filter_head *rules, struct rde_peer *peer, struct prefix *new, struct prefix *old) { - struct filterstate state; - struct bgpd_addr addr; - - if (peer->state != PEER_UP) - return; + struct filterstate state; + struct bgpd_addr addr; + int need_withdraw; +again: if (new == NULL) { -withdraw: if (old == NULL) /* no prefix to withdraw */ return; @@ -121,8 +116,28 @@ withdraw: peer->up_wcnt++; } } else { - if (!up_test_update(peer, new)) { - goto withdraw; + need_withdraw = 0; + /* + * up_test_update() needs to run before the output filters + * else the well known communities wont work properly. + * The output filters would not be able to add well known + * communities. + */ + if (!up_test_update(peer, new)) + need_withdraw = 1; + + /* + * if 'rde evaluate all' is set for this peer then + * delay the the withdraw because of up_test_update(). + * The filters may actually skip this prefix and so this + * decision needs to be delayed. + * For the default mode we can just give up here and + * skip the filters. + */ + if (need_withdraw && + !(peer->conf.flags & PEERFLAG_EVALUATE_ALL)) { + new = NULL; + goto again; } rde_filterstate_prep(&state, prefix_aspath(new), @@ -133,7 +148,18 @@ withdraw: new->pt->prefixlen, prefix_vstate(new), &state) == ACTION_DENY) { rde_filterstate_clean(&state); - goto withdraw; + if (peer->conf.flags & PEERFLAG_EVALUATE_ALL) + new = LIST_NEXT(new, entry.list.rib); + else + new = NULL; + if (new != NULL && !prefix_eligible(new)) + new = NULL; + goto again; + } + + if (need_withdraw) { + new = NULL; + goto again; } /* only send update if path changed */