On Tue, Jul 20, 2021 at 10:08:09AM +0200, Martin Pieuchot wrote:
> On 19/07/21(Mon) 17:53, Alexander Bluhm wrote:
> > Hi,
> > 
> > I found why the IPsec workaround did not work.
> > 
> > At init time we set ifiq->ifiq_softnet = net_tq(ifp->if_index +
> > idx), but the workaround modifies net_tq() at runtime.  Modifying
> > net_tq() at runtime is bad anyway as task_add() and task_del() could
> > be called with different task queues.
> > 
> > So better use exclusive lock if IPsec is in use.  For me this is
> > running stable.
> 
> Note that having multiple threads competing for an exclusive rwlock will
> generate unnecessary wakeup/sleep cycles every time the lock is released.  
> It is valuable to keep this in mind as it might add extra latency when
> processing packets.

Of course.  What do you recommend?

- Develop outside of the tree until all problems are fixed.
- Delay work on parallel forwarding until IPsec is MP safe.
- Accept a possible slowdown of IPsec.  In my measurements it gets
  faster even with the exclusive lock.
- Concentrate on making IPsec faster.  By removing the crypto
  queues you gain much more performance than the exclusive lock may
  cost.  Did you see the massive kernel locks in my graph?
  
http://bluhm.genua.de/perform/results/latest/patch-sys-ip-multiqueue.1/btrace/ssh_perform%40lt13_iperf3_-c10.4.56.36_-P10_-t10_-R-btrace-kstack.0.svg
- Make ARP MP safe.  Currently we need the kernel lock there or
  it crashes.  This creates latency for all kind of packets.
- Convert the rwlock in pf to a mutex.  I think your argument applies
  much more strongly there, but I cannot prove it.

My plan is to commit what we have and improve where the most pain is.
This makes incremental steps easier.

bluhm

> > Index: net/if.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> > retrieving revision 1.642
> > diff -u -p -r1.642 if.c
> > --- net/if.c        30 Jun 2021 13:23:33 -0000      1.642
> > +++ net/if.c        19 Jul 2021 14:51:31 -0000
> > @@ -109,6 +109,10 @@
> >  #include <netinet6/ip6_var.h>
> >  #endif
> >  
> > +#ifdef IPSEC
> > +#include <netinet/ip_ipsp.h>
> > +#endif
> > +
> >  #ifdef MPLS
> >  #include <netmpls/mpls.h>
> >  #endif
> > @@ -238,7 +242,7 @@ int     ifq_congestion;
> >  
> >  int                 netisr;
> >  
> > -#define    NET_TASKQ       1
> > +#define    NET_TASKQ       4
> >  struct taskq       *nettqmp[NET_TASKQ];
> >  
> >  struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
> > @@ -815,6 +819,7 @@ void
> >  if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
> >  {
> >     struct mbuf *m;
> > +   int exclusive_lock = 0;
> >  
> >     if (ml_empty(ml))
> >             return;
> > @@ -834,10 +839,25 @@ if_input_process(struct ifnet *ifp, stru
> >      * to PF globals, pipex globals, unicast and multicast addresses
> >      * lists and the socket layer.
> >      */
> > -   NET_LOCK();
> > +
> > +   /*
> > +    * XXXSMP IPsec data structures are not ready to be
> > +    * accessed by multiple Network threads in parallel.
> > +    */
> > +   if (ipsec_in_use)
> > +           exclusive_lock = 1;
> > +   if (exclusive_lock)
> > +           NET_LOCK();
> > +   else
> > +           NET_RLOCK_IN_SOFTNET();
> > +
> >     while ((m = ml_dequeue(ml)) != NULL)
> >             (*ifp->if_input)(ifp, m);
> > -   NET_UNLOCK();
> > +
> > +   if (exclusive_lock)
> > +           NET_UNLOCK();
> > +   else
> > +           NET_RUNLOCK_IN_SOFTNET();
> >  }
> >  
> >  void
> > @@ -895,6 +915,12 @@ if_netisr(void *unused)
> >                     KERNEL_UNLOCK();
> >             }
> >  #endif
> > +           if (n & (1 << NETISR_IP))
> > +                   ipintr();
> > +#ifdef INET6
> > +           if (n & (1 << NETISR_IPV6))
> > +                   ip6intr();
> > +#endif
> >  #if NPPP > 0
> >             if (n & (1 << NETISR_PPP)) {
> >                     KERNEL_LOCK();
> > @@ -3311,17 +3337,14 @@ unhandled_af(int af)
> >     panic("unhandled af %d", af);
> >  }
> >  
> > -/*
> > - * XXXSMP This tunable is here to work around the fact that IPsec
> > - * globals aren't ready to be accessed by multiple threads in
> > - * parallel.
> > - */
> > -int                 nettaskqs = NET_TASKQ;
> > -
> >  struct taskq *
> >  net_tq(unsigned int ifindex)
> >  {
> >     struct taskq *t = NULL;
> > +   static int nettaskqs;
> > +
> > +   if (nettaskqs == 0)
> > +           nettaskqs = min(NET_TASKQ, ncpus);
> >  
> >     t = nettqmp[ifindex % nettaskqs];
> >  
> > Index: net/if_ethersubr.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
> > retrieving revision 1.275
> > diff -u -p -r1.275 if_ethersubr.c
> > --- net/if_ethersubr.c      7 Jul 2021 20:19:01 -0000       1.275
> > +++ net/if_ethersubr.c      19 Jul 2021 14:32:48 -0000
> > @@ -222,7 +222,10 @@ ether_resolve(struct ifnet *ifp, struct 
> >  
> >     switch (af) {
> >     case AF_INET:
> > +           KERNEL_LOCK();
> > +           /* XXXSMP there is a MP race in arpresolve() */
> >             error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
> > +           KERNEL_UNLOCK();
> >             if (error)
> >                     return (error);
> >             eh->ether_type = htons(ETHERTYPE_IP);
> > @@ -245,7 +248,10 @@ ether_resolve(struct ifnet *ifp, struct 
> >             break;
> >  #ifdef INET6
> >     case AF_INET6:
> > +           KERNEL_LOCK();
> > +           /* XXXSMP there is a MP race in nd6_resolve() */
> >             error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
> > +           KERNEL_UNLOCK();
> >             if (error)
> >                     return (error);
> >             eh->ether_type = htons(ETHERTYPE_IPV6);
> > Index: net/ifq.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
> > retrieving revision 1.44
> > diff -u -p -r1.44 ifq.c
> > --- net/ifq.c       9 Jul 2021 01:22:05 -0000       1.44
> > +++ net/ifq.c       19 Jul 2021 14:32:48 -0000
> > @@ -243,7 +243,7 @@ void
> >  ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx)
> >  {
> >     ifq->ifq_if = ifp;
> > -   ifq->ifq_softnet = net_tq(ifp->if_index); /* + idx */
> > +   ifq->ifq_softnet = net_tq(ifp->if_index + idx);
> >     ifq->ifq_softc = NULL;
> >  
> >     mtx_init(&ifq->ifq_mtx, IPL_NET);
> > @@ -620,7 +620,7 @@ void
> >  ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx)
> >  {
> >     ifiq->ifiq_if = ifp;
> > -   ifiq->ifiq_softnet = net_tq(ifp->if_index); /* + idx */
> > +   ifiq->ifiq_softnet = net_tq(ifp->if_index + idx);
> >     ifiq->ifiq_softc = NULL;
> >  
> >     mtx_init(&ifiq->ifiq_mtx, IPL_NET);
> > Index: net/netisr.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/netisr.h,v
> > retrieving revision 1.55
> > diff -u -p -r1.55 netisr.h
> > --- net/netisr.h    5 Jan 2021 20:43:36 -0000       1.55
> > +++ net/netisr.h    19 Jul 2021 14:32:48 -0000
> > @@ -41,8 +41,10 @@
> >   * interrupt used for scheduling the network code to calls
> >   * on the lowest level routine of each protocol.
> >   */
> > +#define    NETISR_IP       2               /* same as AF_INET */
> >  #define    NETISR_PFSYNC   5               /* for pfsync "immediate" tx */
> >  #define    NETISR_ARP      18              /* same as AF_LINK */
> > +#define    NETISR_IPV6     24              /* same as AF_INET6 */
> >  #define    NETISR_PPP      28              /* for PPP processing */
> >  #define    NETISR_BRIDGE   29              /* for bridge processing */
> >  #define    NETISR_SWITCH   31              /* for switch dataplane */
> > @@ -57,6 +59,8 @@ extern int        netisr;                 /* scheduling 
> > bits 
> >  extern struct task if_input_task_locked;
> >  
> >  void       arpintr(void);
> > +void       ipintr(void);
> > +void       ip6intr(void);
> >  void       pppintr(void);
> >  void       bridgeintr(void);
> >  void       switchintr(void);
> > Index: net/pfkeyv2.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/pfkeyv2.c,v
> > retrieving revision 1.218
> > diff -u -p -r1.218 pfkeyv2.c
> > --- net/pfkeyv2.c   14 Jul 2021 22:39:26 -0000      1.218
> > +++ net/pfkeyv2.c   19 Jul 2021 14:48:34 -0000
> > @@ -2019,14 +2019,6 @@ pfkeyv2_send(struct socket *so, void *me
> >                     }
> >                     TAILQ_INSERT_HEAD(&ipsec_policy_head, ipo, ipo_list);
> >                     ipsec_in_use++;
> > -                   /*
> > -                    * XXXSMP IPsec data structures are not ready to be
> > -                    * accessed by multiple Network threads in parallel,
> > -                    * so force all packets to be processed by the first
> > -                    * one.
> > -                    */
> > -                   extern int nettaskqs;
> > -                   nettaskqs = 1;
> >             } else {
> >                     ipo->ipo_last_searched = ipo->ipo_flags = 0;
> >             }
> > Index: netinet/ip_input.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> > retrieving revision 1.363
> > diff -u -p -r1.363 ip_input.c
> > --- netinet/ip_input.c      21 Jun 2021 22:09:14 -0000      1.363
> > +++ netinet/ip_input.c      19 Jul 2021 14:32:48 -0000
> > @@ -130,6 +130,8 @@ const struct sysctl_bounded_args ipctl_v
> >     { IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
> >  };
> >  
> > +struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);
> > +
> >  struct pool ipqent_pool;
> >  struct pool ipq_pool;
> >  
> > @@ -143,6 +145,7 @@ static struct mbuf_queue        ipsendraw_mq;
> >  extern struct niqueue              arpinq;
> >  
> >  int        ip_ours(struct mbuf **, int *, int, int);
> > +int        ip_local(struct mbuf **, int *, int, int);
> >  int        ip_dooptions(struct mbuf *, struct ifnet *);
> >  int        in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
> >  
> > @@ -230,6 +233,43 @@ ip_init(void)
> >  }
> >  
> >  /*
> > + * Enqueue packet for local delivery.  Queuing is used as a boundary
> > + * between the network layer (input/forward path) running with shared
> > + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
> > + */
> > +int
> > +ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
> > +{
> > +   /* We are already in an IPv4/IPv6 local delivery loop. */
> > +   if (af != AF_UNSPEC)
> > +           return ip_local(mp, offp, nxt, af);
> > +
> > +   niq_enqueue(&ipintrq, *mp);
> > +   *mp = NULL;
> > +   return IPPROTO_DONE;
> > +}
> > +
> > +/*
> > + * Dequeue and process locally delivered packets.
> > + */
> > +void
> > +ipintr(void)
> > +{
> > +   struct mbuf *m;
> > +   int off, nxt;
> > +
> > +   while ((m = niq_dequeue(&ipintrq)) != NULL) {
> > +#ifdef DIAGNOSTIC
> > +           if ((m->m_flags & M_PKTHDR) == 0)
> > +                   panic("ipintr no HDR");
> > +#endif
> > +           off = 0;
> > +           nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC);
> > +           KASSERT(nxt == IPPROTO_DONE);
> > +   }
> > +}
> > +
> > +/*
> >   * IPv4 input routine.
> >   *
> >   * Checksum and byte swap header.  Process options. Forward or deliver.
> > @@ -514,7 +554,7 @@ ip_input_if(struct mbuf **mp, int *offp,
> >   * If fragmented try to reassemble.  Pass to next level.
> >   */
> >  int
> > -ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
> > +ip_local(struct mbuf **mp, int *offp, int nxt, int af)
> >  {
> >     struct mbuf *m = *mp;
> >     struct ip *ip = mtod(m, struct ip *);
> > @@ -1665,7 +1705,8 @@ ip_sysctl(int *name, u_int namelen, void
> >                 newlen));
> >  #endif
> >     case IPCTL_IFQUEUE:
> > -           return (EOPNOTSUPP);
> > +           return (sysctl_niq(name + 1, namelen - 1,
> > +               oldp, oldlenp, newp, newlen, &ipintrq));
> >     case IPCTL_ARPQUEUE:
> >             return (sysctl_niq(name + 1, namelen - 1,
> >                 oldp, oldlenp, newp, newlen, &arpinq));
> > Index: netinet/ip_var.h
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> > retrieving revision 1.88
> > diff -u -p -r1.88 ip_var.h
> > --- netinet/ip_var.h        30 Mar 2021 08:37:11 -0000      1.88
> > +++ netinet/ip_var.h        19 Jul 2021 14:32:48 -0000
> > @@ -248,7 +248,6 @@ void     ip_stripoptions(struct mbuf *);
> >  int         ip_sysctl(int *, u_int, void *, size_t *, void *, size_t);
> >  void        ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
> >         struct mbuf *);
> > -void        ipintr(void);
> >  int         ip_input_if(struct mbuf **, int *, int, int, struct ifnet *);
> >  int         ip_deliver(struct mbuf **, int *, int, int);
> >  void        ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, 
> > int);
> > Index: netinet6/ip6_input.c
> > ===================================================================
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
> > retrieving revision 1.237
> > diff -u -p -r1.237 ip6_input.c
> > --- netinet6/ip6_input.c    3 Jun 2021 04:47:54 -0000       1.237
> > +++ netinet6/ip6_input.c    19 Jul 2021 14:32:48 -0000
> > @@ -115,11 +115,14 @@
> >  #include <netinet/ip_carp.h>
> >  #endif
> >  
> > +struct niqueue ip6intrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IPV6);
> > +
> >  struct cpumem *ip6counters;
> >  
> >  uint8_t ip6_soiikey[IP6_SOIIKEY_LEN];
> >  
> >  int ip6_ours(struct mbuf **, int *, int, int);
> > +int ip6_local(struct mbuf **, int *, int, int);
> >  int ip6_check_rh0hdr(struct mbuf *, int *);
> >  int ip6_hbhchcheck(struct mbuf *, int *, int *, int *);
> >  int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
> > @@ -162,6 +165,43 @@ ip6_init(void)
> >     ip6counters = counters_alloc(ip6s_ncounters);
> >  }
> >  
> > +/*
> > + * Enqueue packet for local delivery.  Queuing is used as a boundary
> > + * between the network layer (input/forward path) running with shared
> > + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
> > + */
> > +int
> > +ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
> > +{
> > +   /* We are already in an IPv4/IPv6 local delivery loop. */
> > +   if (af != AF_UNSPEC)
> > +           return ip6_local(mp, offp, nxt, af);
> > +
> > +   niq_enqueue(&ip6intrq, *mp);
> > +   *mp = NULL;
> > +   return IPPROTO_DONE;
> > +}
> > +
> > +/*
> > + * Dequeue and process locally delivered packets.
> > + */
> > +void
> > +ip6intr(void)
> > +{
> > +   struct mbuf *m;
> > +   int off, nxt;
> > +
> > +   while ((m = niq_dequeue(&ip6intrq)) != NULL) {
> > +#ifdef DIAGNOSTIC
> > +           if ((m->m_flags & M_PKTHDR) == 0)
> > +                   panic("ip6intr no HDR");
> > +#endif
> > +           off = 0;
> > +           nxt = ip6_local(&m, &off, IPPROTO_IPV6, AF_UNSPEC);
> > +           KASSERT(nxt == IPPROTO_DONE);
> > +   }
> > +}
> > +
> >  void
> >  ipv6_input(struct ifnet *ifp, struct mbuf *m)
> >  {
> > @@ -544,7 +584,7 @@ ip6_input_if(struct mbuf **mp, int *offp
> >  }
> >  
> >  int
> > -ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
> > +ip6_local(struct mbuf **mp, int *offp, int nxt, int af)
> >  {
> >     if (ip6_hbhchcheck(*mp, offp, &nxt, NULL))
> >             return IPPROTO_DONE;
> > @@ -1470,7 +1510,8 @@ ip6_sysctl(int *name, u_int namelen, voi
> >             NET_UNLOCK();
> >             return (error);
> >     case IPV6CTL_IFQUEUE:
> > -           return (EOPNOTSUPP);
> > +           return (sysctl_niq(name + 1, namelen - 1,
> > +               oldp, oldlenp, newp, newlen, &ip6intrq));
> >     case IPV6CTL_SOIIKEY:
> >             return (ip6_sysctl_soiikey(oldp, oldlenp, newp, newlen));
> >     default:
> > 

Reply via email to