Hello,
On Tue, May 09, 2023 at 06:26:43PM +0000, mabi wrote:
> Hi,
>
> On a brand new OpenBSD 7.3 firewall (amd64) I get a kernel panic every few
> days and was wondering if this panic I get is related to this issue/bug?
>
your panic got fixed by recent commit [1]
Hrvoje was/is hitting very close to that KASSERT() (now removed) at line 2274.
in Hrvoje's case the TAILQ_REMOVE() macro complains that we attempt to remove a
state which has already been removed:
2273 atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
2274 TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2275 if (TAILQ_EMPTY(&sc->sc_qs[q]))
2276 atomic_sub_long(&sc->sc_len, sizeof (struct
pfsync_subheader));
2277 st->sync_state = PFSYNC_S_NONE;
2278 mtx_leave(&sc->sc_st_mtx);
2279
2280 pf_state_unref(st);
the cause is very similar: pfsync relies on the volatile value of the
->sync_state member.
The ->sync_state member must be modified under protection of ->mtx.
The issue has been pointed out by bluhm@ during m2k23 hackathon where I shared
my pfsync(4) headache with him. Diff below is my attempt to fix it. I had no
chance to test it. I'd appreciate it if you could give it a try and let me know
how things look.
thanks and
regards
sashan
[1] https://marc.info/?l=openbsd-cvs&m=168269695603160&w=2
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index 822b4211d0f..811d9d59666 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -1362,14 +1362,17 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct
pfsync_softc *sc)
while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+ mtx_enter(&st->mtx);
if (st->snapped == 0) {
TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
st->snapped = 1;
+ mtx_leave(&st->mtx);
} else {
/*
* item is on snapshot list already, so we can
* skip it now.
*/
+ mtx_leave(&st->mtx);
pf_state_unref(st);
}
}
@@ -1422,11 +1425,13 @@ pfsync_drop_snapshot(struct pfsync_snapshot *sn)
continue;
while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
+ mtx_enter(&st->mtx);
KASSERT(st->sync_state == q);
KASSERT(st->snapped == 1);
TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
st->sync_state = PFSYNC_S_NONE;
st->snapped = 0;
+ mtx_leave(&st->mtx);
pf_state_unref(st);
}
}
@@ -1665,6 +1670,7 @@ pfsync_sendout(void)
count = 0;
while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
+ mtx_enter(&st->mtx);
TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
KASSERT(st->sync_state == q);
KASSERT(st->snapped == 1);
@@ -1672,6 +1678,7 @@ pfsync_sendout(void)
st->snapped = 0;
pfsync_qs[q].write(st, m->m_data + offset);
offset += pfsync_qs[q].len;
+ mtx_leave(&st->mtx);
pf_state_unref(st);
count++;
@@ -1725,8 +1732,6 @@ pfsync_insert_state(struct pf_state *st)
ISSET(st->state_flags, PFSTATE_NOSYNC))
return;
- KASSERT(st->sync_state == PFSYNC_S_NONE);
-
if (sc->sc_len == PFSYNC_MINPKT)
timeout_add_sec(&sc->sc_tmo, 1);
@@ -2221,6 +2226,7 @@ pfsync_q_ins(struct pf_state *st, int q)
panic("pfsync pkt len is too low %zd", sc->sc_len);
do {
mtx_enter(&sc->sc_st_mtx);
+ mtx_enter(&st->mtx);
/*
* There are either two threads trying to update the
@@ -2228,6 +2234,7 @@ pfsync_q_ins(struct pf_state *st, int q)
* (is on snapshot queue).
*/
if (st->sync_state != PFSYNC_S_NONE) {
+ mtx_leave(&st->mtx);
mtx_leave(&sc->sc_st_mtx);
break;
}
@@ -2240,6 +2247,7 @@ pfsync_q_ins(struct pf_state *st, int q)
sclen = atomic_add_long_nv(&sc->sc_len, nlen);
if (sclen > sc->sc_if.if_mtu) {
atomic_sub_long(&sc->sc_len, nlen);
+ mtx_leave(&st->mtx);
mtx_leave(&sc->sc_st_mtx);
pfsync_sendout();
continue;
@@ -2249,6 +2257,7 @@ pfsync_q_ins(struct pf_state *st, int q)
TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
st->sync_state = q;
+ mtx_leave(&st->mtx);
mtx_leave(&sc->sc_st_mtx);
} while (0);
}
@@ -2260,6 +2269,7 @@ pfsync_q_del(struct pf_state *st)
int q;
mtx_enter(&sc->sc_st_mtx);
+ mtx_enter(&st->mtx);
q = st->sync_state;
/*
* re-check under mutex
@@ -2267,6 +2277,7 @@ pfsync_q_del(struct pf_state *st)
* too late, the state is being just processed/dispatched to peer.
*/
if ((q == PFSYNC_S_NONE) || (st->snapped)) {
+ mtx_leave(&st->mtx);
mtx_leave(&sc->sc_st_mtx);
return;
}
@@ -2275,6 +2286,7 @@ pfsync_q_del(struct pf_state *st)
if (TAILQ_EMPTY(&sc->sc_qs[q]))
atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader));
st->sync_state = PFSYNC_S_NONE;
+ mtx_leave(&st->mtx);
mtx_leave(&sc->sc_st_mtx);
pf_state_unref(st);