On Sat, Apr 09, 2022 at 01:51:05AM +0200, Alexandr Nedvedicky wrote:
> updated diff is below.

I am not sure what Hrvoje actually tested and what he did not.  My
impression was that he got a panic with the previous version of
this diff, but that the machine was stable with the code in current.

But maybe I got it wrong and we need this code to run pfsync with
IPsec in parallel.

In general it looks good, a few comments inline.

> --------8<---------------8<---------------8<------------------8<--------
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index cb0f3fbdf52..536c3f9cb70 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -181,6 +181,7 @@ void      pfsync_q_del(struct pf_state *);
>  
>  struct pfsync_upd_req_item {
>       TAILQ_ENTRY(pfsync_upd_req_item)        ur_entry;
> +     TAILQ_ENTRY(pfsync_upd_req_item)        ur_snap;

Do we really need two list entries?  My understanding is that the
element is either in the sc_upd_req_list or in sn_upd_req_list.  We
could use the same entry for both.  But of course with two entries
it is easier to see what is going on.

>       struct pfsync_upd_req                   ur_msg;
>  };
>  TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
> @@ -295,7 +296,7 @@ void      pfsync_bulk_update(void *);
>  void pfsync_bulk_fail(void *);
>  
>  void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> +void pfsync_drop_snapshot(struct pfsync_snapshot *);
>  
>  void pfsync_send_dispatch(void *);
>  void pfsync_send_pkt(struct mbuf *);
> @@ -422,8 +423,7 @@ pfsync_clone_destroy(struct ifnet *ifp)
>               sc->sc_deferred = 0;
>               mtx_leave(&sc->sc_deferrals_mtx);
>  
> -             while (!TAILQ_EMPTY(&deferrals)) {
> -                     pd = TAILQ_FIRST(&deferrals);
> +             while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
>                       TAILQ_REMOVE(&deferrals, pd, pd_entry);
>                       pfsync_undefer(pd, 0);
>               }
> @@ -1574,6 +1574,9 @@ void
>  pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
>  {
>       int q;
> +     struct pf_state *st;
> +     struct pfsync_upd_req_item *ur;
> +     struct tdb *tdb;
>  
>       sn->sn_sc = sc;
>  
> @@ -1583,14 +1586,36 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  
>       for (q = 0; q < PFSYNC_S_COUNT; q++) {
>               TAILQ_INIT(&sn->sn_qs[q]);
> -             TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> +
> +             while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
> +#ifdef PFSYNC_DEBUG
> +                     KASSERT(st->snapped == 0);
> +#endif

I see that there are other #ifdef PFSYNC_DEBUG.  But why would you
hide a cheap KASSERT behind another ifdef?  If something is wrong
I want to see the crash and not only while debugging.

I will send a diff that removes existing PFSYNC_DEBUG.

> +                     TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> +                     TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
> +                     st->snapped = 1;
> +             }
>       }
>  
>       TAILQ_INIT(&sn->sn_upd_req_list);
> -     TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
> +     while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
> +             TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
> +             TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
> +     }
>  
>       TAILQ_INIT(&sn->sn_tdb_q);
> -     TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> +     while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
> +             TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
> +             TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
> +
> +             mtx_enter(&tdb->tdb_mtx);
> +#ifdef PFSYNC_DEBUG
> +             KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +#endif
> +             /* SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); */

This comment looks like a debugging leftover.

> +             SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
> +             mtx_leave(&tdb->tdb_mtx);
> +     }
>  
>       sn->sn_len = sc->sc_len;
>       sc->sc_len = PFSYNC_MINPKT;
> @@ -1606,41 +1631,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  }
>  
>  void
> -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc)
> +pfsync_drop_snapshot(struct pfsync_snapshot *sn)
>  {
>       struct pf_state *st;
>       struct pfsync_upd_req_item *ur;
>       struct tdb *t;
>       int q;
>  
> -
>       for (q = 0; q < PFSYNC_S_COUNT; q++) {
>               if (TAILQ_EMPTY(&sn->sn_qs[q]))
>                       continue;
>  
>               while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
> -                     TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
>  #ifdef PFSYNC_DEBUG
>                       KASSERT(st->sync_state == q);
> +                     KASSERT(st->snapped == 1);
>  #endif
> +                     TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
>                       st->sync_state = PFSYNC_S_NONE;
> +                     st->snapped = 0;
>                       pf_state_unref(st);
>               }
>       }
>  
>       while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
> -             TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
> +             TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
>               pool_put(&sn->sn_sc->sc_pool, ur);
>       }
>  
> -     mtx_enter(&sc->sc_tdb_mtx);
>       while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
> -             TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
> +             TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
>               mtx_enter(&t->tdb_mtx);
> +#ifdef PFSYNC_DEBUG
> +             KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +#endif
> +             CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
>               CLR(t->tdb_flags, TDBF_PFSYNC);
>               mtx_leave(&t->tdb_mtx);
>       }
> -     mtx_leave(&sc->sc_tdb_mtx);
>  }
>  
>  int
> @@ -1667,7 +1695,7 @@ pfsync_drop(struct pfsync_softc *sc)
>       struct pfsync_snapshot  sn;
>  
>       pfsync_grab_snapshot(&sn, sc);
> -     pfsync_drop_snapshot(&sn, sc);
> +     pfsync_drop_snapshot(&sn);
>  }
>  
>  void
> @@ -1759,7 +1787,7 @@ pfsync_sendout(void)
>       if (m == NULL) {
>               sc->sc_if.if_oerrors++;
>               pfsyncstat_inc(pfsyncs_onomem);
> -             pfsync_drop_snapshot(&sn, sc);
> +             pfsync_drop_snapshot(&sn);
>               return;
>       }
>  
> @@ -1769,7 +1797,7 @@ pfsync_sendout(void)
>                       m_free(m);
>                       sc->sc_if.if_oerrors++;
>                       pfsyncstat_inc(pfsyncs_onomem);
> -                     pfsync_drop_snapshot(&sn, sc);
> +                     pfsync_drop_snapshot(&sn);
>                       return;
>               }
>       }
> @@ -1799,7 +1827,7 @@ pfsync_sendout(void)
>  
>               count = 0;
>               while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
> -                     TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
> +                     TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap);
>  
>                       bcopy(&ur->ur_msg, m->m_data + offset,
>                           sizeof(ur->ur_msg));
> @@ -1827,18 +1855,21 @@ pfsync_sendout(void)
>               subh = (struct pfsync_subheader *)(m->m_data + offset);
>               offset += sizeof(*subh);
>  
> -             mtx_enter(&sc->sc_tdb_mtx);
>               count = 0;
>               while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
> -                     TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
> +                     TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap);
>                       pfsync_out_tdb(t, m->m_data + offset);
>                       offset += sizeof(struct pfsync_tdb);
>                       mtx_enter(&t->tdb_mtx);
> +#ifdef PFSYNC_DEBUG
> +                     KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +#endif
> +                     CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
>                       CLR(t->tdb_flags, TDBF_PFSYNC);
>                       mtx_leave(&t->tdb_mtx);
> +                     tdb_unref(t);
>                       count++;
>               }
> -             mtx_leave(&sc->sc_tdb_mtx);
>  
>               bzero(subh, sizeof(*subh));
>               subh->action = PFSYNC_ACT_TDB;
> @@ -1856,11 +1887,13 @@ pfsync_sendout(void)
>  
>               count = 0;
>               while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
> -                     TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
> +                     TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
>  #ifdef PFSYNC_DEBUG
>                       KASSERT(st->sync_state == q);
> +                     KASSERT(st->snapped == 1);
>  #endif
>                       st->sync_state = PFSYNC_S_NONE;
> +                     st->snapped = 0;
>                       pfsync_qs[q].write(st, m->m_data + offset);
>                       offset += pfsync_qs[q].len;
>  
> @@ -2411,8 +2444,9 @@ pfsync_q_ins(struct pf_state *st, int q)
>               mtx_enter(&sc->sc_st_mtx);
>  
>               /*
> -              * If two threads are competing to insert the same state, then
> -              * there must be just single winner.
> +              * There are either two threads trying to update the
> +              * the same state, or the state is just being processed
> +              * (is on snapshot queue).
>                */
>               if (st->sync_state != PFSYNC_S_NONE) {
>                       mtx_leave(&sc->sc_st_mtx);
> @@ -2450,6 +2484,15 @@ pfsync_q_del(struct pf_state *st)
>  
>       mtx_enter(&sc->sc_st_mtx);
>       q = st->sync_state;
> +     /*
> +      * re-check under mutex
> +      * if state is snapped already, then just bail out, because we came
> +      * too late, the state is being just processed/dispatched to peer.
> +      */
> +     if ((q == PFSYNC_S_NONE) || (st->snapped)) {
> +             mtx_leave(&sc->sc_st_mtx);
> +             return;
> +     }
>       atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
>       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
>       if (TAILQ_EMPTY(&sc->sc_qs[q]))
> @@ -2495,6 +2538,7 @@ pfsync_update_tdb(struct tdb *t, int output)
>                       }
>  
>                       TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
> +                     tdb_ref(t);
>                       SET(t->tdb_flags, TDBF_PFSYNC);
>                       mtx_leave(&t->tdb_mtx);
>  
> @@ -2525,11 +2569,23 @@ pfsync_delete_tdb(struct tdb *t)
>  
>       mtx_enter(&sc->sc_tdb_mtx);
>  
> +     /*
> +      * if tdb entry is just being processed (found in snapshot),
> +      * then it can not be deleted. we just came too late
> +      */
> +     if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) {
> +             mtx_leave(&sc->sc_tdb_mtx);
> +             return;
> +     }
> +
>       TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
> +
>       mtx_enter(&t->tdb_mtx);
>       CLR(t->tdb_flags, TDBF_PFSYNC);
>       mtx_leave(&t->tdb_mtx);
>  
> +     tdb_unref(t);

This should be moved after mtx_leave(&sc->sc_tdb_mtx).  Unref may
free and who knows what happens.  It is not wise to hold a mutex
for more than needed.

> +
>       nlen = sizeof(struct pfsync_tdb);
>       if (TAILQ_EMPTY(&sc->sc_tdb_q))
>               nlen += sizeof(struct pfsync_subheader);
> diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
> index bd7ec1d88e7..558618a0f14 100644
> --- a/sys/net/pfvar.h
> +++ b/sys/net/pfvar.h
> @@ -749,6 +749,7 @@ struct pf_state {
>       u_int8_t                 pad[3];
>  
>       TAILQ_ENTRY(pf_state)    sync_list;
> +     TAILQ_ENTRY(pf_state)    sync_snap;

Again, do we need sync_snap or could we reuse sync_list?

>       TAILQ_ENTRY(pf_state)    entry_list;
>       SLIST_ENTRY(pf_state)    gc_list;
>       RB_ENTRY(pf_state)       entry_id;
> @@ -797,6 +798,7 @@ struct pf_state {
>       pf_refcnt_t              refcnt;
>       u_int16_t                delay;
>       u_int8_t                 rt;
> +     u_int8_t                 snapped;
>  };
>  
>  /*
> diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h
> index c697994047b..fa46a1e1282 100644
> --- a/sys/netinet/ip_ipsp.h
> +++ b/sys/netinet/ip_ipsp.h
> @@ -355,6 +355,7 @@ struct tdb {                              /* tunnel 
> descriptor block */
>  #define      TDBF_PFSYNC             0x40000 /* TDB will be synced */
>  #define      TDBF_PFSYNC_RPL         0x80000 /* Replay counter should be 
> bumped */
>  #define      TDBF_ESN                0x100000 /* 64-bit sequence numbers 
> (ESN) */
> +#define      TDBF_PFSYNC_SNAPPED     0x200000 /* entry is being dispatched 
> to peer */
>  
>  #define TDBF_BITS ("\20" \
>       "\1UNIQUE\2TIMER\3BYTES\4ALLOCATIONS" \
> @@ -439,6 +440,7 @@ struct tdb {                              /* tunnel 
> descriptor block */
>  
>       TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [p] */
>       TAILQ_ENTRY(tdb)        tdb_sync_entry;
> +     TAILQ_ENTRY(tdb)        tdb_sync_snap;

Again, do we need tdb_sync_snap or could we reuse tdb_sync_entry?
I am not demanding it.  If you think the code is clearer with two
entries that would be fine.

>  };
>  
>  enum tdb_counters {

Reply via email to