On Sat, Apr 09, 2022 at 01:51:05AM +0200, Alexandr Nedvedicky wrote: > updated diff is below.
I am not sure what Hrvoje actually tested and what not. My impression was that he got a panic with the previous version of this diff, but the machine was stable with the code in current. But maybe I got it wrong and we need this code to run pfsync with IPsec in parallel. In general it looks good, a few comments inline. > --------8<---------------8<---------------8<------------------8<-------- > diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c > index cb0f3fbdf52..536c3f9cb70 100644 > --- a/sys/net/if_pfsync.c > +++ b/sys/net/if_pfsync.c > @@ -181,6 +181,7 @@ void pfsync_q_del(struct pf_state *); > > struct pfsync_upd_req_item { > TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; > + TAILQ_ENTRY(pfsync_upd_req_item) ur_snap; Do we really need two list entries? My understanding is that the element is either in the sc_upd_req_list or in sn_upd_req_list. We could use the same entry for both. But of course with two entries it is easier to see what is going on. > struct pfsync_upd_req ur_msg; > }; > TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item); > @@ -295,7 +296,7 @@ void pfsync_bulk_update(void *); > void pfsync_bulk_fail(void *); > > void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); > -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); > +void pfsync_drop_snapshot(struct pfsync_snapshot *); > > void pfsync_send_dispatch(void *); > void pfsync_send_pkt(struct mbuf *); > @@ -422,8 +423,7 @@ pfsync_clone_destroy(struct ifnet *ifp) > sc->sc_deferred = 0; > mtx_leave(&sc->sc_deferrals_mtx); > > - while (!TAILQ_EMPTY(&deferrals)) { > - pd = TAILQ_FIRST(&deferrals); > + while ((pd = TAILQ_FIRST(&deferrals)) != NULL) { > TAILQ_REMOVE(&deferrals, pd, pd_entry); > pfsync_undefer(pd, 0); > } > @@ -1574,6 +1574,9 @@ void > pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) > { > int q; > + struct pf_state *st; > + struct pfsync_upd_req_item *ur; > + struct tdb *tdb; > > sn->sn_sc = sc; > > @@ -1583,14 
+1586,36 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, > struct pfsync_softc *sc) > > for (q = 0; q < PFSYNC_S_COUNT; q++) { > TAILQ_INIT(&sn->sn_qs[q]); > - TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); > + > + while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { > +#ifdef PFSYNC_DEBUG > + KASSERT(st->snapped == 0); > +#endif I see that there are other #ifdef PFSYNC_DEBUG. But why would you hide a cheap KASSERT behind another ifdef? If something is wrong I want to see the crash and not only while debugging. I will send a diff that removes existing PFSYNC_DEBUG. > + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); > + TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); > + st->snapped = 1; > + } > } > > TAILQ_INIT(&sn->sn_upd_req_list); > - TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry); > + while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { > + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); > + TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap); > + } > > TAILQ_INIT(&sn->sn_tdb_q); > - TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); > + while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { > + TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry); > + TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap); > + > + mtx_enter(&tdb->tdb_mtx); > +#ifdef PFSYNC_DEBUG > + KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED)); > +#endif > + /* SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); */ This comment looks like a debugging leftover. 
> + SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); > + mtx_leave(&tdb->tdb_mtx); > + } > > sn->sn_len = sc->sc_len; > sc->sc_len = PFSYNC_MINPKT; > @@ -1606,41 +1631,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, > struct pfsync_softc *sc) > } > > void > -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc) > +pfsync_drop_snapshot(struct pfsync_snapshot *sn) > { > struct pf_state *st; > struct pfsync_upd_req_item *ur; > struct tdb *t; > int q; > > - > for (q = 0; q < PFSYNC_S_COUNT; q++) { > if (TAILQ_EMPTY(&sn->sn_qs[q])) > continue; > > while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { > - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list); > #ifdef PFSYNC_DEBUG > KASSERT(st->sync_state == q); > + KASSERT(st->snapped == 1); > #endif > + TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); > st->sync_state = PFSYNC_S_NONE; > + st->snapped = 0; > pf_state_unref(st); > } > } > > while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { > - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry); > + TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap); > pool_put(&sn->sn_sc->sc_pool, ur); > } > > - mtx_enter(&sc->sc_tdb_mtx); > while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { > - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry); > + TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap); > mtx_enter(&t->tdb_mtx); > +#ifdef PFSYNC_DEBUG > + KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); > +#endif > + CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > } > - mtx_leave(&sc->sc_tdb_mtx); > } > > int > @@ -1667,7 +1695,7 @@ pfsync_drop(struct pfsync_softc *sc) > struct pfsync_snapshot sn; > > pfsync_grab_snapshot(&sn, sc); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > } > > void > @@ -1759,7 +1787,7 @@ pfsync_sendout(void) > if (m == NULL) { > sc->sc_if.if_oerrors++; > pfsyncstat_inc(pfsyncs_onomem); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > return; > } > > @@ -1769,7 +1797,7 @@ 
pfsync_sendout(void) > m_free(m); > sc->sc_if.if_oerrors++; > pfsyncstat_inc(pfsyncs_onomem); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > return; > } > } > @@ -1799,7 +1827,7 @@ pfsync_sendout(void) > > count = 0; > while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { > - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry); > + TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap); > > bcopy(&ur->ur_msg, m->m_data + offset, > sizeof(ur->ur_msg)); > @@ -1827,18 +1855,21 @@ pfsync_sendout(void) > subh = (struct pfsync_subheader *)(m->m_data + offset); > offset += sizeof(*subh); > > - mtx_enter(&sc->sc_tdb_mtx); > count = 0; > while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { > - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry); > + TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap); > pfsync_out_tdb(t, m->m_data + offset); > offset += sizeof(struct pfsync_tdb); > mtx_enter(&t->tdb_mtx); > +#ifdef PFSYNC_DEBUG > + KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); > +#endif > + CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > + tdb_unref(t); > count++; > } > - mtx_leave(&sc->sc_tdb_mtx); > > bzero(subh, sizeof(*subh)); > subh->action = PFSYNC_ACT_TDB; > @@ -1856,11 +1887,13 @@ pfsync_sendout(void) > > count = 0; > while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { > - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list); > + TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); > #ifdef PFSYNC_DEBUG > KASSERT(st->sync_state == q); > + KASSERT(st->snapped == 1); > #endif > st->sync_state = PFSYNC_S_NONE; > + st->snapped = 0; > pfsync_qs[q].write(st, m->m_data + offset); > offset += pfsync_qs[q].len; > > @@ -2411,8 +2444,9 @@ pfsync_q_ins(struct pf_state *st, int q) > mtx_enter(&sc->sc_st_mtx); > > /* > - * If two threads are competing to insert the same state, then > - * there must be just single winner. 
> + * There are either two threads trying to update the > + * the same state, or the state is just being processed > + * (is on snapshot queue). > */ > if (st->sync_state != PFSYNC_S_NONE) { > mtx_leave(&sc->sc_st_mtx); > @@ -2450,6 +2484,15 @@ pfsync_q_del(struct pf_state *st) > > mtx_enter(&sc->sc_st_mtx); > q = st->sync_state; > + /* > + * re-check under mutex > + * if state is snapped already, then just bail out, because we came > + * too late, the state is being just processed/dispatched to peer. > + */ > + if ((q == PFSYNC_S_NONE) || (st->snapped)) { > + mtx_leave(&sc->sc_st_mtx); > + return; > + } > atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); > TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); > if (TAILQ_EMPTY(&sc->sc_qs[q])) > @@ -2495,6 +2538,7 @@ pfsync_update_tdb(struct tdb *t, int output) > } > > TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry); > + tdb_ref(t); > SET(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > > @@ -2525,11 +2569,23 @@ pfsync_delete_tdb(struct tdb *t) > > mtx_enter(&sc->sc_tdb_mtx); > > + /* > + * if tdb entry is just being processed (found in snapshot), > + * then it can not be deleted. we just came too late > + */ > + if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) { > + mtx_leave(&sc->sc_tdb_mtx); > + return; > + } > + > TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); > + > mtx_enter(&t->tdb_mtx); > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > > + tdb_unref(t); This should be moved after mtx_leave(&sc->sc_tdb_mtx). Unref may free and who knows what happens. It is not wise to hold a mutex for more than needed. 
> + > nlen = sizeof(struct pfsync_tdb); > if (TAILQ_EMPTY(&sc->sc_tdb_q)) > nlen += sizeof(struct pfsync_subheader); > diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h > index bd7ec1d88e7..558618a0f14 100644 > --- a/sys/net/pfvar.h > +++ b/sys/net/pfvar.h > @@ -749,6 +749,7 @@ struct pf_state { > u_int8_t pad[3]; > > TAILQ_ENTRY(pf_state) sync_list; > + TAILQ_ENTRY(pf_state) sync_snap; Again, do we need sync_snap or could we reuse sync_list? > TAILQ_ENTRY(pf_state) entry_list; > SLIST_ENTRY(pf_state) gc_list; > RB_ENTRY(pf_state) entry_id; > @@ -797,6 +798,7 @@ struct pf_state { > pf_refcnt_t refcnt; > u_int16_t delay; > u_int8_t rt; > + u_int8_t snapped; > }; > > /* > diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h > index c697994047b..fa46a1e1282 100644 > --- a/sys/netinet/ip_ipsp.h > +++ b/sys/netinet/ip_ipsp.h > @@ -355,6 +355,7 @@ struct tdb { /* tunnel > descriptor block */ > #define TDBF_PFSYNC 0x40000 /* TDB will be synced */ > #define TDBF_PFSYNC_RPL 0x80000 /* Replay counter should be > bumped */ > #define TDBF_ESN 0x100000 /* 64-bit sequence numbers > (ESN) */ > +#define TDBF_PFSYNC_SNAPPED 0x200000 /* entry is being dispatched > to peer */ > > #define TDBF_BITS ("\20" \ > "\1UNIQUE\2TIMER\3BYTES\4ALLOCATIONS" \ > @@ -439,6 +440,7 @@ struct tdb { /* tunnel > descriptor block */ > > TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [p] */ > TAILQ_ENTRY(tdb) tdb_sync_entry; > + TAILQ_ENTRY(tdb) tdb_sync_snap; Again, do we need tdb_sync_snap or could we reuse tdb_sync_entry? I am not demanding it. If you think the code is clearer with two entries that would be fine. > }; > > enum tdb_counters {