On 2018/08/21 17:16, Remi Locherer wrote:
> Hi tech,
> 
> recently we had a short outage in our network. A script started an additional
> ospfd instance because the -n flag for config test was missing.

This is a problem with bgpd as well, last time I did this it killed one of the
*other* routers on the network (i.e. not just the one where I accidentally ran
2x bgpd...).

> What then happend was not nice:
> - The new ospfd unlinked the control socket of the first ospfd
> - The new ospfd removed all routes from the first ospfd
> - The new ospfd was not able to build up an adjacency and therefore could
>   not install the routes needed for a recovery.
> - Both ospfd instances were running but non-functional.
> 
> Of course the faulty script is fixed by now. ;-)
> 
> It would be nice if ospfd could prevent such a situation.
> 
> Below diff does these things:
> - Detect a running ospfd by first doing a connect on the control socket.
> - Do not delete the control socket on exit.
>   - This could delete the socket of another instance.
>   - Unlinking the socket on shutdown will be in the way once we add pledge
>     to the main process. It was removed recently from various daemons.

This all sounds very sensible.

> - Do not delete routes added by another process even if they have
>   prio RTP_OSPF. Without this the new ospfd will remove all the routes
>   of the first one.

I'm unsure about this, the above changes stop the new ospfd from running
don't they, so that shouldn't be a problem?

If an ospfd blows up for whatever reason, it would be quite inconvenient
if it needs manual route tweaks rather than just 'rcctl start ospfd' to fix it 
..

> A side effect of this is that alien OSPF routes are now only logged but
> not removed anymore. Should a crashed ospfd leave some routes behind the
> next ospfd does not clean them up anymore. The admin would need to check
> the logs and remove them manually with the route command.
> 
> Does this make sense?
> 
> Comments? OK?
> 
> Remi
> 
> 
> Index: control.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/control.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 control.c
> --- control.c 24 Jan 2017 04:24:25 -0000      1.44
> +++ control.c 17 Aug 2018 22:41:43 -0000
> @@ -42,19 +42,29 @@ int
>  control_init(char *path)
>  {
>       struct sockaddr_un       sun;
> -     int                      fd;
> +     int                      fd, fd_check;
>       mode_t                   old_umask;
>  
> +     bzero(&sun, sizeof(sun));
> +     sun.sun_family = AF_UNIX;
> +     strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
> +
> +     if ((fd_check = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
> +             log_warn("control_init: socket check");
> +             return (-1);
> +     }
> +     if (connect(fd_check, (struct sockaddr *)&sun, sizeof(sun)) == 0) {
> +             log_warnx("control_init: socket in use");
> +             return (-1);
> +     }
> +     close(fd_check);
> +
>       if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
>           0)) == -1) {
>               log_warn("control_init: socket");
>               return (-1);
>       }
>  
> -     bzero(&sun, sizeof(sun));
> -     sun.sun_family = AF_UNIX;
> -     strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
> -
>       if (unlink(path) == -1)
>               if (errno != ENOENT) {
>                       log_warn("control_init: unlink %s", path);
> @@ -98,16 +108,6 @@ control_listen(void)
>       evtimer_set(&control_state.evt, control_accept, NULL);
>  
>       return (0);
> -}
> -
> -void
> -control_cleanup(char *path)
> -{
> -     if (path == NULL)
> -             return;
> -     event_del(&control_state.ev);
> -     event_del(&control_state.evt);
> -     unlink(path);
>  }
>  
>  /* ARGSUSED */
> Index: control.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/control.h,v
> retrieving revision 1.6
> diff -u -p -r1.6 control.h
> --- control.h 10 Feb 2015 05:24:48 -0000      1.6
> +++ control.h 17 Aug 2018 21:02:36 -0000
> @@ -39,6 +39,5 @@ int control_listen(void);
>  void control_accept(int, short, void *);
>  void control_dispatch_imsg(int, short, void *);
>  int  control_imsg_relay(struct imsg *);
> -void control_cleanup(char *);
>  
>  #endif       /* _CONTROL_H_ */
> Index: kroute.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/kroute.c,v
> retrieving revision 1.111
> diff -u -p -r1.111 kroute.c
> --- kroute.c  10 Jul 2018 11:49:04 -0000      1.111
> +++ kroute.c  21 Aug 2018 14:13:27 -0000
> @@ -263,6 +263,7 @@ kr_change_fib(struct kroute_node *kr, st
>               kn->r.nexthop.s_addr = kroute[i].nexthop.s_addr;
>               kn->r.flags = kroute[i].flags | F_OSPFD_INSERTED;
>               kn->r.priority = RTP_OSPF;
> +             kn->r.pid = kr_state.pid;
>               kn->r.ext_tag = kroute[i].ext_tag;
>               rtlabel_unref(kn->r.rtlabel);   /* for RTM_CHANGE */
>               kn->r.rtlabel = kroute[i].rtlabel;
> @@ -365,7 +366,7 @@ kr_fib_decouple(void)
>               return;
>  
>       RB_FOREACH(kr, kroute_tree, &krt)
> -             if (kr->r.priority == RTP_OSPF)
> +             if (kr->r.priority == RTP_OSPF && kr->r.pid == kr_state.pid)
>                       for (kn = kr; kn != NULL; kn = kn->next)
>                               send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r);
>  
> @@ -1365,7 +1366,7 @@ rtmsg_process(char *buf, size_t len)
>       u_int8_t                 prefixlen, prio;
>       int                      flags, mpath;
>       u_short                  ifindex = 0;
> -     int                      rv, delay;
> +     int                      delay;
>  
>       size_t                   offset;
>       char                    *next;
> @@ -1529,14 +1530,15 @@ add:
>                               kr->r.ifindex = ifindex;
>                               kr->r.priority = prio;
>  
> +                             if (rtm->rtm_pid == kr_state.pid)
> +                                     kr->r.pid = 0;
> +                             else
> +                                     kr->r.pid = rtm->rtm_pid;
> +
>                               if (rtm->rtm_priority == RTP_OSPF) {
> -                                     log_warnx("alien OSPF route %s/%d",
> +                                     log_warnx("alien OSPF route %s/%d "
> +                                         "- ignoring it",
>                                           inet_ntoa(prefix), prefixlen);
> -                                     rv = send_rtmsg(kr_state.fd,
> -                                         RTM_DELETE, &kr->r);
> -                                     free(kr);
> -                                     if (rv == -1)
> -                                             return (-1);
>                               } else {
>                                       if ((label = (struct sockaddr_rtlabel *)
>                                           rti_info[RTAX_LABEL]) != NULL) {
> Index: ospfd.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/ospfd.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 ospfd.c
> --- ospfd.c   11 Jul 2018 12:09:34 -0000      1.99
> +++ ospfd.c   17 Aug 2018 21:02:20 -0000
> @@ -300,7 +300,6 @@ ospfd_shutdown(void)
>       msgbuf_clear(&iev_rde->ibuf.w);
>       close(iev_rde->ibuf.fd);
>  
> -     control_cleanup(ospfd_conf->csock);
>       while ((r = SIMPLEQ_FIRST(&ospfd_conf->redist_list)) != NULL) {
>               SIMPLEQ_REMOVE_HEAD(&ospfd_conf->redist_list, entry);
>               free(r);
> Index: ospfd.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/ospfd.h,v
> retrieving revision 1.101
> diff -u -p -r1.101 ospfd.h
> --- ospfd.h   25 Jun 2018 22:16:53 -0000      1.101
> +++ ospfd.h   17 Aug 2018 22:10:59 -0000
> @@ -416,6 +416,7 @@ struct kroute {
>       u_short         ifindex;
>       u_int8_t        prefixlen;
>       u_int8_t        priority;
> +     pid_t           pid;
>  };
>  
>  struct kif_addr {
> 
> 

Reply via email to