On route-servers at IXPs there is often the wish to not have path hiding
because of output filters. On way of solving this is to use per peer RIBs
but the cost (in memory) is very high. This solves this issue in a
different way but with the same (or similar result).

In bgpd all prefixes/paths are sorted by their preference and by default
the best path is put into the Adj-RIB-Out (after that prefix passed the
output filter). This diff changes this behaviour by falling back to the
next prefix in case the current prefix was filtered. This stops path
hiding and announces the next best route. The cost for doing this is a
bump in CPU usage. On the other hand the memory consumption remains the
same.

The feature can be enabled with 'rde evaluate all' globably or per peer.
It is also to turn it off by using 'rde evaluate default' (in case a peer
does not want this).

-- 
:wq Claudio

? obj
Index: bgpd.conf.5
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v
retrieving revision 1.208
diff -u -p -r1.208 bgpd.conf.5
--- bgpd.conf.5 16 Feb 2021 08:29:16 -0000      1.208
+++ bgpd.conf.5 16 Feb 2021 09:25:30 -0000
@@ -268,9 +268,18 @@ daemons, such as
 .Xr ospfd 8 .
 .Pp
 .It Xo
-.Ic rde
-.Ic med
-.Ic compare
+.Ic rde Ic evaluate
+.Pq Ic default Ns | Ns Ic all
+.Xc
+If set to
+.Ar all
+keep evaluating alternative paths in case the selected path is filtered
+out.
+By default if a path is filtered by the output filters then no alternative
+path is sent to this peer.
+.Pp
+.It Xo
+.Ic rde Ic med Ic compare
 .Pq Ic always Ns | Ns Ic strict
 .Xc
 If set to
@@ -1151,6 +1160,20 @@ setting.
 .Pp
 .It Ic remote-as Ar as-number
 Set the AS number of the remote system.
+.Pp
+.It Xo
+.Ic rde Ic evaluate
+.Pq Ic default Ns | Ns Ic all
+.Xc
+If set to
+.Ar all
+keep evaluating alternative paths in case the selected path is filtered
+out.
+By default if a path is filtered by the output filters then no alternative
+path is sent to this peer.
+The default is inherited from the global
+.Ic rde Ic evaluate
+setting.
 .Pp
 .It Ic rib Ar name
 Bind the neighbor to the specified RIB.
Index: bgpd.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
retrieving revision 1.412
diff -u -p -r1.412 bgpd.h
--- bgpd.h      16 Feb 2021 08:29:16 -0000      1.412
+++ bgpd.h      16 Feb 2021 09:25:30 -0000
@@ -65,7 +65,8 @@
 #define        BGPD_FLAG_DECISION_ROUTEAGE     0x0100
 #define        BGPD_FLAG_DECISION_TRANS_AS     0x0200
 #define        BGPD_FLAG_DECISION_MED_ALWAYS   0x0400
-#define        BGPD_FLAG_NO_AS_SET             0x0800
+#define        BGPD_FLAG_DECISION_ALL_PATHS    0x0800
+#define        BGPD_FLAG_NO_AS_SET             0x1000
 
 #define        BGPD_LOG_UPDATES                0x0001
 
@@ -408,7 +409,8 @@ struct peer_config {
 
 #define PEERFLAG_TRANS_AS      0x01
 #define PEERFLAG_LOG_UPDATES   0x02
-#define PEERFLAG_NO_AS_SET     0x04
+#define PEERFLAG_EVALUATE_ALL  0x04
+#define PEERFLAG_NO_AS_SET     0x08
 
 enum network_type {
        NETWORK_DEFAULT,        /* from network statements */
Index: parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
retrieving revision 1.413
diff -u -p -r1.413 parse.y
--- parse.y     16 Feb 2021 08:29:16 -0000      1.413
+++ parse.y     16 Feb 2021 09:25:30 -0000
@@ -200,7 +200,7 @@ typedef struct {
 
 %token AS ROUTERID HOLDTIME YMIN LISTEN ON FIBUPDATE FIBPRIORITY RTABLE
 %token NONE UNICAST VPN RD EXPORT EXPORTTRGT IMPORTTRGT DEFAULTROUTE
-%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT
+%token RDE RIB EVALUATE DEFAULT ALL IGNORE COMPARE RTR PORT
 %token GROUP NEIGHBOR NETWORK
 %token EBGP IBGP
 %token LOCALAS REMOTEAS DESCR LOCALADDR MULTIHOP PASSIVE MAXPREFIX RESTART
@@ -791,6 +791,12 @@ conf_main  : AS as4number          {
                        }
                        free($4);
                }
+               | RDE EVALUATE ALL {
+                       conf->flags |= BGPD_FLAG_DECISION_ALL_PATHS;
+               }
+               | RDE EVALUATE DEFAULT {
+                       conf->flags &= ~BGPD_FLAG_DECISION_ALL_PATHS;
+               }
                | NEXTHOP QUALIFY VIA STRING    {
                        if (!strcmp($4, "bgp"))
                                conf->flags |= BGPD_FLAG_NEXTHOP_BGP;
@@ -1727,6 +1733,12 @@ peeropts : REMOTEAS as4number    {
                        else
                                curpeer->conf.flags &= ~PEERFLAG_NO_AS_SET;
                }
+               | RDE EVALUATE ALL {
+                       curpeer->conf.flags |= PEERFLAG_EVALUATE_ALL;
+               }
+               | RDE EVALUATE DEFAULT {
+                       curpeer->conf.flags &= ~PEERFLAG_EVALUATE_ALL;
+               }
                ;
 
 restart                : /* nada */            { $$ = 0; }
@@ -2850,6 +2862,7 @@ lookup(char *s)
                { "IPv4",               IPV4},
                { "IPv6",               IPV6},
                { "ah",                 AH},
+               { "all",                ALL},
                { "allow",              ALLOW},
                { "announce",           ANNOUNCE},
                { "any",                ANY},
@@ -2862,6 +2875,7 @@ lookup(char *s)
                { "compare",            COMPARE},
                { "connect-retry",      CONNECTRETRY},
                { "connected",          CONNECTED},
+               { "default",            DEFAULT},
                { "default-route",      DEFAULTROUTE},
                { "delete",             DELETE},
                { "demote",             DEMOTE},
@@ -3888,6 +3902,8 @@ alloc_peer(void)
 
        if (conf->flags & BGPD_FLAG_DECISION_TRANS_AS)
                p->conf.flags |= PEERFLAG_TRANS_AS;
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS)
+               p->conf.flags |= PEERFLAG_EVALUATE_ALL;
        if (conf->flags & BGPD_FLAG_NO_AS_SET)
                p->conf.flags |= PEERFLAG_NO_AS_SET;
 
@@ -3910,8 +3926,6 @@ new_peer(void)
                    sizeof(p->conf.descr)) >= sizeof(p->conf.descr))
                        fatalx("new_peer descr strlcpy");
                p->conf.groupid = curgroup->conf.id;
-               p->conf.local_as = curgroup->conf.local_as;
-               p->conf.local_short_as = curgroup->conf.local_short_as;
        }
        return (p);
 }
Index: printconf.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v
retrieving revision 1.146
diff -u -p -r1.146 printconf.c
--- printconf.c 16 Feb 2021 08:29:16 -0000      1.146
+++ printconf.c 16 Feb 2021 09:25:30 -0000
@@ -390,9 +390,10 @@ print_mainconf(struct bgpd_config *conf)
 
        if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE)
                printf("rde route-age evaluate\n");
-
        if (conf->flags & BGPD_FLAG_DECISION_MED_ALWAYS)
                printf("rde med compare always\n");
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS)
+               printf("rde evaluate all\n");
 
        if (conf->flags & BGPD_FLAG_NO_AS_SET)
                printf("reject as-set yes\n");
@@ -681,6 +682,14 @@ print_peer(struct peer_config *p, struct
                printf("%s\tdepend on \"%s\"\n", c, p->if_depend);
        if (p->flags & PEERFLAG_TRANS_AS)
                printf("%s\ttransparent-as yes\n", c);
+
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) {
+               if (!(p->flags & PEERFLAG_EVALUATE_ALL))
+                       printf("%s\trde evaluate default\n", c);
+       } else {
+               if (p->flags & PEERFLAG_EVALUATE_ALL)
+                       printf("%s\trde evaluate all\n", c);
+       }
 
        if (conf->flags & BGPD_FLAG_NO_AS_SET) {
                if (!(p->flags & PEERFLAG_NO_AS_SET))
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.515
diff -u -p -r1.515 rde.c
--- rde.c       16 Feb 2021 08:29:16 -0000      1.515
+++ rde.c       16 Feb 2021 09:25:31 -0000
@@ -2875,8 +2875,17 @@ rde_send_kroute(struct rib *rib, struct 
 /*
  * update specific functions
  */
+static int rde_eval_all;
+
+int
+rde_evaluate_all(void)
+{
+       return rde_eval_all;
+}
+
 void
-rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old)
+rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old,
+    int eval_all)
 {
        struct rde_peer *peer;
        u_int8_t         aid;
@@ -2889,7 +2898,7 @@ rde_generate_updates(struct rib *rib, st
        if (old == NULL && new == NULL)
                return;
 
-       if ((rib->flags & F_RIB_NOFIB) == 0)
+       if (!eval_all && (rib->flags & F_RIB_NOFIB) == 0)
                rde_send_kroute(rib, new, old);
 
        if (new)
@@ -2897,13 +2906,22 @@ rde_generate_updates(struct rib *rib, st
        else
                aid = old->pt->aid;
 
+       rde_eval_all = 0;
        LIST_FOREACH(peer, &peerlist, peer_l) {
-               if (peer->conf.id == 0)
-                       continue;
-               if (peer->loc_rib_id != rib->id)
+               /* skip ourself */
+               if (peer == peerself)
                        continue;
                if (peer->state != PEER_UP)
                        continue;
+               /* handle evaluate all, keep track if it is needed */
+               if (peer->conf.flags & PEERFLAG_EVALUATE_ALL)
+                       rde_eval_all = 1;
+               else if (eval_all)
+                       /* skip default peers if the best path didn't change */
+                       continue;
+               /* skip peers using a different rib */
+               if (peer->loc_rib_id != rib->id)
+                       continue;
                /* check if peer actually supports the address family */
                if (peer->capa.mp[aid] == 0)
                        continue;
@@ -3556,7 +3574,7 @@ rde_softreconfig_sync_reeval(struct rib_
                                nexthop_unlink(p);
                }
                if (re->active) {
-                       rde_generate_updates(rib, NULL, re->active);
+                       rde_generate_updates(rib, NULL, re->active, 0);
                        re->active = NULL;
                }
                return;
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.236
diff -u -p -r1.236 rde.h
--- rde.h       13 Jan 2021 11:34:01 -0000      1.236
+++ rde.h       16 Feb 2021 09:25:31 -0000
@@ -371,8 +371,9 @@ void                rde_send_nexthop(struct bgpd_addr 
 void           rde_pftable_add(u_int16_t, struct prefix *);
 void           rde_pftable_del(u_int16_t, struct prefix *);
 
+int            rde_evaluate_all(void);
 void           rde_generate_updates(struct rib *, struct prefix *,
-                   struct prefix *);
+                   struct prefix *, int);
 u_int32_t      rde_local_as(void);
 int            rde_decisionflags(void);
 int            rde_as4byte(struct rde_peer *);
@@ -486,6 +487,7 @@ communities_unref(struct rde_community *
 int    community_to_rd(struct community *, u_int64_t *);
 
 /* rde_decide.c */
+int    prefix_eligible(struct prefix *);
 void   prefix_evaluate(struct rib_entry *, struct prefix *, struct prefix *);
 
 /* rde_filter.c */
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.81
diff -u -p -r1.81 rde_decide.c
--- rde_decide.c        2 Feb 2021 15:24:43 -0000       1.81
+++ rde_decide.c        16 Feb 2021 09:25:31 -0000
@@ -91,8 +91,8 @@ void  prefix_remove(struct prefix *, stru
 
 /*
  * Decision Engine OUR implementation:
- * Our implementation has only one RIB. The filtering is done first. The
- * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1).
+ * The filtering is done first. The filtering calculates the preference and
+ * stores it in LOCAL_PREF (Phase 1).
  * Ineligible routes are flagged as ineligible via nexthop_add().
  * Phase 3 is done together with Phase 2.
  * In following cases a prefix needs to be reevaluated:
@@ -284,6 +284,13 @@ prefix_cmp(struct prefix *p1, struct pre
        fatalx("Uh, oh a politician in the decision process");
 }
 
+/*
+ * Insert a prefix keeping the total order of the list. For routes
+ * that may depend on a MED selection the set is scanned until the
+ * condition is cleared. If a MED inversion is detected the respective
+ * prefix is taken of the rib list and put onto a redo queue. All
+ * prefixes on the redo queue are re-inserted at the end.
+ */
 void
 prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
 {
@@ -351,6 +358,15 @@ prefix_insert(struct prefix *new, struct
        }
 }
 
+/*
+ * Remove a prefix from the RIB list ensuring that the total order of the
+ * list remains intact. All routes that differ in the MED are taken of the
+ * list and put on the redo list. To figure out if a route could cause a
+ * resort because of a MED check the next prefix of the to-remove prefix
+ * is compared with the old prefix. A full scan is only done if the next
+ * route differs because of the MED or later checks.
+ * Again at the end all routes on the redo queue are reinserted.
+ */
 void
 prefix_remove(struct prefix *old, struct rib_entry *re)
 {
@@ -397,6 +413,23 @@ prefix_remove(struct prefix *old, struct
        }
 }
 
+/* helper function to check if a prefix is valid to be selected */
+int
+prefix_eligible(struct prefix *p)
+{
+       struct rde_aspath *asp = prefix_aspath(p);
+       struct nexthop *nh = prefix_nexthop(p);
+
+       /* The aspath needs to be loop and error free */
+       if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR))
+               return 0;
+       /* The nexthop needs to exist and be reachable */
+       if (nh == NULL || nh->state != NEXTHOP_REACH)
+               return 0;
+
+       return 1;
+}
+
 /*
  * Find the correct place to insert the prefix in the prefix list.
  * If the active prefix has changed we need to send an update.
@@ -420,7 +453,7 @@ prefix_evaluate(struct rib_entry *re, st
                         * active. Clean up now to ensure that the RIB
                         * is consistant.
                         */
-                       rde_generate_updates(re_rib(re), NULL, re->active);
+                       rde_generate_updates(re_rib(re), NULL, re->active, 0);
                        re->active = NULL;
                }
                return;
@@ -433,15 +466,8 @@ prefix_evaluate(struct rib_entry *re, st
                prefix_insert(new, NULL, re);
 
        xp = LIST_FIRST(&re->prefix_h);
-       if (xp != NULL) {
-               struct rde_aspath *xasp = prefix_aspath(xp);
-               if (xasp == NULL ||
-                   xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) ||
-                   (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state !=
-                   NEXTHOP_REACH))
-                       /* xp is ineligible */
-                       xp = NULL;
-       }
+       if (xp != NULL && !prefix_eligible(xp))
+               xp = NULL;
 
        /*
         * If the active prefix changed or the active prefix was removed
@@ -453,7 +479,17 @@ prefix_evaluate(struct rib_entry *re, st
                 * but remember that xp may be NULL aka ineligible.
                 * Additional decision may be made by the called functions.
                 */
-               rde_generate_updates(re_rib(re), xp, re->active);
+               rde_generate_updates(re_rib(re), xp, re->active, 0);
                re->active = xp;
+               return;
        }
+
+       /*
+        * If there are peers with 'rde evaluate all' every update needs
+        * to be passed on (not only a change of the best prefix).
+        * rde_generate_updates() will then take care of distribution.
+        */
+       if (rde_evaluate_all())
+               if ((new != NULL && prefix_eligible(new)) || old != NULL)
+                       rde_generate_updates(re_rib(re), re->active, NULL, 1);
 }
Index: rde_update.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v
retrieving revision 1.124
diff -u -p -r1.124 rde_update.c
--- rde_update.c        9 Jan 2021 16:49:41 -0000       1.124
+++ rde_update.c        16 Feb 2021 09:25:31 -0000
@@ -23,6 +23,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <siphash.h>
+#include <stdio.h>
 
 #include "bgpd.h"
 #include "rde.h"
@@ -49,26 +50,22 @@ up_test_update(struct rde_peer *peer, st
 {
        struct rde_aspath       *asp;
        struct rde_community    *comm;
-       struct rde_peer         *prefp;
+       struct rde_peer         *frompeer;
 
-       if (p == NULL)
-               /* no prefix available */
-               return (0);
-
-       prefp = prefix_peer(p);
+       frompeer = prefix_peer(p);
        asp = prefix_aspath(p);
        comm = prefix_communities(p);
 
-       if (peer == prefp)
-               /* Do not send routes back to sender */
-               return (0);
-
        if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR)
                fatalx("try to send out a botched path");
        if (asp->flags & F_ATTR_LOOP)
                fatalx("try to send out a looped path");
 
-       if (!prefp->conf.ebgp && !peer->conf.ebgp) {
+       if (peer == frompeer)
+               /* Do not send routes back to sender */
+               return (0);
+
+       if (!frompeer->conf.ebgp && !peer->conf.ebgp) {
                /*
                 * route reflector redistribution rules:
                 * 1. if announce is set                -> announce
@@ -77,7 +74,7 @@ up_test_update(struct rde_peer *peer, st
                 * 4. old non-client, new client        -> yes
                 * 5. old client, new client            -> yes
                 */
-               if (prefp->conf.reflector_client == 0 &&
+               if (frompeer->conf.reflector_client == 0 &&
                    peer->conf.reflector_client == 0 &&
                    (asp->flags & F_PREFIX_ANNOUNCED) == 0)
                        /* Do not redistribute updates to ibgp peers */
@@ -101,14 +98,12 @@ void
 up_generate_updates(struct filter_head *rules, struct rde_peer *peer,
     struct prefix *new, struct prefix *old)
 {
-       struct filterstate              state;
-       struct bgpd_addr                addr;
-
-       if (peer->state != PEER_UP)
-               return;
+       struct filterstate      state;
+       struct bgpd_addr        addr;
+       int                     need_withdraw;
 
+again:
        if (new == NULL) {
-withdraw:
                if (old == NULL)
                        /* no prefix to withdraw */
                        return;
@@ -121,8 +116,28 @@ withdraw:
                        peer->up_wcnt++;
                }
        } else {
-               if (!up_test_update(peer, new)) {
-                       goto withdraw;
+               need_withdraw = 0;
+               /*
+                * up_test_update() needs to run before the output filters
+                * else the well known communities wont work properly.
+                * The output filters would not be able to add well known
+                * communities.
+                */
+               if (!up_test_update(peer, new))
+                       need_withdraw = 1;
+
+               /*
+                * if 'rde evaluate all' is set for this peer then
+                * delay the the withdraw because of up_test_update().
+                * The filters may actually skip this prefix and so this
+                * decision needs to be delayed.
+                * For the default mode we can just give up here and
+                * skip the filters.
+                */
+               if (need_withdraw &&
+                   !(peer->conf.flags & PEERFLAG_EVALUATE_ALL)) {
+                       new = NULL;
+                       goto again;
                }
 
                rde_filterstate_prep(&state, prefix_aspath(new),
@@ -133,7 +148,18 @@ withdraw:
                    new->pt->prefixlen, prefix_vstate(new), &state) ==
                    ACTION_DENY) {
                        rde_filterstate_clean(&state);
-                       goto withdraw;
+                       if (peer->conf.flags & PEERFLAG_EVALUATE_ALL)
+                               new = LIST_NEXT(new, entry.list.rib);
+                       else
+                               new = NULL;
+                       if (new != NULL && !prefix_eligible(new))
+                               new = NULL;
+                       goto again;
+               }
+
+               if (need_withdraw) {
+                       new = NULL;
+                       goto again;
                }
 
                /* only send update if path changed */

Reply via email to