On 2018/08/21 17:16, Remi Locherer wrote: > Hi tech, > > recently we had a short outage in our network. A script started an additional > ospfd instance because the -n flag for config test was missing.
This is a problem with bgpd as well, last time I did this it killed one of the *other* routers on the network (i.e. not just the one where I accidentally ran 2x bgpd...). > What then happend was not nice: > - The new ospfd unlinked the control socket of the first ospfd > - The new ospfd removed all routes from the first ospfd > - The new ospfd was not able to build up an adjacency and therefore could > not install the routes needed for a recovery. > - Both ospfd instances were running but non-functional. > > Of course the faulty script is fixed by now. ;-) > > It would be nice if ospfd could prevent such a situation. > > Below diff does these things: > - Detect a running ospfd by first doing a connect on the control socket. > - Do not delete the control socket on exit. > - This could delete the socket of another instance. > - Unlinking the socket on shutdown will be in the way once we add pledge > to the main process. It was removed recently from various daemons. This all sounds very sensible. > - Do not delete routes added by another process even if they have > prio RTP_OSPF. Without this the new ospfd will remove all the routes > of the first one. I'm unsure about this, the above changes stop the new ospfd from running don't they, so that shouldn't be a problem? If an ospfd blows up for whatever reason, it would be quite inconvenient if it needs manual route tweaks rather than just 'rcctl start ospfd' to fix it .. > A side effect of this is that alien OSPF routes are now only logged but > not removed anymore. Should a crashed ospfd leave some routes behind the > next ospfd does not clean them up anymore. The admin would need to check > the logs and remove them manually with the route command. > > Does this make sense? > > Comments? OK? > > Remi > > > Index: control.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ospfd/control.c,v > retrieving revision 1.44 > diff -u -p -r1.44 control.c > --- control.c 24 Jan 2017 04:24:25 -0000 1.44 > +++ control.c 17 Aug 2018 22:41:43 -0000 > @@ -42,19 +42,29 @@ int > control_init(char *path) > { > struct sockaddr_un sun; > - int fd; > + int fd, fd_check; > mode_t old_umask; > > + bzero(&sun, sizeof(sun)); > + sun.sun_family = AF_UNIX; > + strlcpy(sun.sun_path, path, sizeof(sun.sun_path)); > + > + if ((fd_check = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { > + log_warn("control_init: socket check"); > + return (-1); > + } > + if (connect(fd_check, (struct sockaddr *)&sun, sizeof(sun)) == 0) { > + log_warnx("control_init: socket in use"); > + return (-1); > + } > + close(fd_check); > + > if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, > 0)) == -1) { > log_warn("control_init: socket"); > return (-1); > } > > - bzero(&sun, sizeof(sun)); > - sun.sun_family = AF_UNIX; > - strlcpy(sun.sun_path, path, sizeof(sun.sun_path)); > - > if (unlink(path) == -1) > if (errno != ENOENT) { > log_warn("control_init: unlink %s", path); > @@ -98,16 +108,6 @@ control_listen(void) > evtimer_set(&control_state.evt, control_accept, NULL); > > return (0); > -} > - > -void > -control_cleanup(char *path) > -{ > - if (path == NULL) > - return; > - event_del(&control_state.ev); > - event_del(&control_state.evt); > - unlink(path); > } > > /* ARGSUSED */ > Index: control.h > =================================================================== > RCS file: /cvs/src/usr.sbin/ospfd/control.h,v > retrieving revision 1.6 > diff -u -p -r1.6 control.h > --- control.h 10 Feb 2015 05:24:48 -0000 1.6 > +++ control.h 17 Aug 2018 21:02:36 -0000 > @@ -39,6 +39,5 @@ int control_listen(void); > void control_accept(int, short, void *); > void control_dispatch_imsg(int, short, void *); > int control_imsg_relay(struct imsg *); > -void control_cleanup(char *); > > #endif /* _CONTROL_H_ */ > Index: kroute.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ospfd/kroute.c,v > retrieving revision 1.111 > diff -u -p -r1.111 kroute.c > --- kroute.c 10 Jul 2018 11:49:04 -0000 1.111 > +++ kroute.c 21 Aug 2018 14:13:27 -0000 > @@ -263,6 +263,7 @@ kr_change_fib(struct kroute_node *kr, st > kn->r.nexthop.s_addr = kroute[i].nexthop.s_addr; > kn->r.flags = kroute[i].flags | F_OSPFD_INSERTED; > kn->r.priority = RTP_OSPF; > + kn->r.pid = kr_state.pid; > kn->r.ext_tag = kroute[i].ext_tag; > rtlabel_unref(kn->r.rtlabel); /* for RTM_CHANGE */ > kn->r.rtlabel = kroute[i].rtlabel; > @@ -365,7 +366,7 @@ kr_fib_decouple(void) > return; > > RB_FOREACH(kr, kroute_tree, &krt) > - if (kr->r.priority == RTP_OSPF) > + if (kr->r.priority == RTP_OSPF && kr->r.pid == kr_state.pid) > for (kn = kr; kn != NULL; kn = kn->next) > send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r); > > @@ -1365,7 +1366,7 @@ rtmsg_process(char *buf, size_t len) > u_int8_t prefixlen, prio; > int flags, mpath; > u_short ifindex = 0; > - int rv, delay; > + int delay; > > size_t offset; > char *next; > @@ -1529,14 +1530,15 @@ add: > kr->r.ifindex = ifindex; > kr->r.priority = prio; > > + if (rtm->rtm_pid == kr_state.pid) > + kr->r.pid = 0; > + else > + kr->r.pid = rtm->rtm_pid; > + > if (rtm->rtm_priority == RTP_OSPF) { > - log_warnx("alien OSPF route %s/%d", > + log_warnx("alien OSPF route %s/%d " > + "- ignoring it", > inet_ntoa(prefix), prefixlen); > - rv = send_rtmsg(kr_state.fd, > - RTM_DELETE, &kr->r); > - free(kr); > - if (rv == -1) > - return (-1); > } else { > if ((label = (struct sockaddr_rtlabel *) > rti_info[RTAX_LABEL]) != NULL) { > Index: ospfd.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ospfd/ospfd.c,v > retrieving revision 1.99 > diff -u -p -r1.99 ospfd.c > --- ospfd.c 11 Jul 2018 12:09:34 -0000 1.99 > +++ ospfd.c 17 Aug 2018 21:02:20 -0000 > @@ -300,7 +300,6 @@ ospfd_shutdown(void) > msgbuf_clear(&iev_rde->ibuf.w); > close(iev_rde->ibuf.fd); > > - control_cleanup(ospfd_conf->csock); > while ((r = SIMPLEQ_FIRST(&ospfd_conf->redist_list)) != NULL) { > SIMPLEQ_REMOVE_HEAD(&ospfd_conf->redist_list, entry); > free(r); > Index: ospfd.h > =================================================================== > RCS file: /cvs/src/usr.sbin/ospfd/ospfd.h,v > retrieving revision 1.101 > diff -u -p -r1.101 ospfd.h > --- ospfd.h 25 Jun 2018 22:16:53 -0000 1.101 > +++ ospfd.h 17 Aug 2018 22:10:59 -0000 > @@ -416,6 +416,7 @@ struct kroute { > u_short ifindex; > u_int8_t prefixlen; > u_int8_t priority; > + pid_t pid; > }; > > struct kif_addr { > >