On Tue, May 30, 2017 at 14:46 +0200, Alexandr Nedvedicky wrote:
> oh, not again...
>
> I'm sorry for not attaching the patch to the first email.
>
> On Tue, May 30, 2017 at 02:34:32PM +0200, Alexandr Nedvedicky wrote:
> > Hello,
> >
> > the patch delivers two changes to PF:
> >
> > it adds PF_LOCK() et al. At the moment PF_LOCK() sort of duplicates
> > the current NET_LOCK(). It essentially synchronizes packets with
> > ioctl(2) and with the timer thread, which purges states. The future
> > work is going to break PF_LOCK into smaller locks, each of which
> > will protect the relevant parts of PF. Think of pf_state_lock,
> > pf_rule_lock, ...
> >
> > The other change introduces a mutex for the IP reassembly done by
> > PF. The mutex synchronizes fragmented packets with the timer
> > thread, which expires incomplete packets from the fragment cache.
> >
> > O.K.?
> >
> > thanks and
> > regards
> > sasha
> >
It's great stuff, I think you should move forward with this.

A couple of nits:

 - pfvar*.h doesn't have tabs after #define, just one space;

 - do you really want to put MTX in the PF_FRAG_MTX_* defines? Why not
   PF_FRAG_LOCK? It would be aligned with PF_LOCK, and then we could
   change the nature of the lock without changing the defines.

Some additional comments inline (especially the one about nloops).

It would be really nice if you'd generate diffs with -p so that
additional location information for each hunk would be visible.

> --------8<---------------8<---------------8<------------------8<--------
> diff -r 21414694ee7a .hgtags
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/.hgtags	Tue May 30 14:25:01 2017 +0200
> @@ -0,0 +1,1 @@
> +d545881e2652dbc0c057691a39a095bce92f441f pf-lock.baseline
> diff -r 21414694ee7a src/sys/net/pf.c
> --- a/src/sys/net/pf.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf.c	Tue May 30 14:25:01 2017 +0200
> @@ -923,7 +923,7 @@
>  pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
>      struct pf_state_key **sks, struct pf_state *s)
>  {
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	s->kif = kif;
>  	if (*skw == *sks) {
> @@ -1186,7 +1186,7 @@
>  {
>  	struct pf_rule	*r;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	if (SLIST_EMPTY(&pf_rule_gcl))
>  		return;
> @@ -1207,6 +1207,7 @@
>  		tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
>
>  		NET_LOCK(s);
> +		PF_LOCK(s);
>
>  		/* process a fraction of the state table every second */
>  		pf_purge_expired_states(1 + (pf_status.states
> @@ -1214,13 +1215,20 @@
>
>  		/* purge other expired types every PFTM_INTERVAL seconds */
>  		if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -			pf_purge_expired_fragments();
>  			pf_purge_expired_src_nodes(0);
>  			pf_purge_expired_rules();
>  			nloops = 0;
>  		}
>
> +		PF_UNLOCK(s);
>  		NET_UNLOCK(s);
> +
> +		/*
> +		 * Fragments don't require PF_LOCK(), they use their own mutex.
> +		 */
> +		if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL])
> +			pf_purge_expired_fragments();
> +
>  	}
>  }
>

The way nloops gets set to 0 in the previous "if" statement makes this
look suspicious. Perhaps setting nloops to zero has to happen in the
last part?
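To make that concrete: since nloops is zeroed inside the locked block,
the unlocked check below it can never be true, so fragments would never
be purged. An untested sketch of one way to rearrange it, resetting
nloops only after the last periodic task has run:

	/* purge other expired types every PFTM_INTERVAL seconds */
	if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
		pf_purge_expired_src_nodes(0);
		pf_purge_expired_rules();
	}

	PF_UNLOCK(s);
	NET_UNLOCK(s);

	/*
	 * Fragments don't require PF_LOCK(), they use their own mutex.
	 */
	if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
		pf_purge_expired_fragments();
		nloops = 0;	/* reset after the last periodic task */
	}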
> @@ -1267,7 +1275,7 @@
>  {
>  	struct pf_src_node	*cur, *next;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
>  		next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
> @@ -1303,7 +1311,7 @@
>  void
>  pf_remove_state(struct pf_state *cur)
>  {
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	/* handle load balancing related tasks */
>  	pf_postprocess_addr(cur);
> @@ -1322,9 +1330,17 @@
>  #if NPFLOW > 0
>  	if (cur->state_flags & PFSTATE_PFLOW) {
>  		/* XXXSMP breaks atomicity */
> +		/*
> +		 * The only guy, who kills states (frees from memory) is
> +		 * pf_purge_thread(). The pf_purge_thread() kills only states,
> +		 * which are marked as PFTM_UNLINKED -> state will stay around,
> +		 * once we re-acquire netlock.

"The only guy, who kills states (frees from memory) is"

I think comments have to have a bit better style. There are no guys who
kill states :)

> +		 */
> +		rw_exit_write(&pf_lock);
>  		rw_exit_write(&netlock);
>  		export_pflow(cur);
>  		rw_enter_write(&netlock);
> +		rw_enter_write(&pf_lock);
>  	}
>  #endif /* NPFLOW > 0 */
>  #if NPFSYNC > 0
> @@ -1354,7 +1370,7 @@
>  {
>  	struct pf_rule_item	*ri;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  #if NPFSYNC > 0
>  	if (pfsync_state_in_use(cur))
> @@ -1390,7 +1406,7 @@
>  	static struct pf_state	*cur = NULL;
>  	struct pf_state		*next;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	while (maxcheck--) {
>  		/* wrap to start of list when we hit the end */
> @@ -3142,13 +3158,13 @@
>  	case IPPROTO_TCP:
>  		sport = pd->hdr.tcp.th_sport;
>  		dport = pd->hdr.tcp.th_dport;
> -		NET_ASSERT_LOCKED();
> +		PF_ASSERT_LOCKED();
>  		tb = &tcbtable;
>  		break;
>  	case IPPROTO_UDP:
>  		sport = pd->hdr.udp.uh_sport;
>  		dport = pd->hdr.udp.uh_dport;
> -		NET_ASSERT_LOCKED();
> +		PF_ASSERT_LOCKED();
>  		tb = &udbtable;
>  		break;
>  	default:
> @@ -6611,6 +6627,7 @@
>  	struct pf_pdesc pd;
>  	int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
>  	u_int32_t qid, pqid = 0;
> +	int spl;

I've realised 's' collides with the state pointer here. One can only
hope for SPLs to go away...

>
>  	if (!pf_status.running)
>  		return (PF_PASS);
> @@ -6669,6 +6686,7 @@
>  	/* if packet sits in reassembly queue, return without error */
>  	if (pd.m == NULL)
>  		return PF_PASS;
> +
>  	if (action != PF_PASS) {
>  #if NPFLOG > 0
>  		pd.pflog |= PF_LOG_FORCE;
> @@ -6688,6 +6706,9 @@
>  	}
>  	pd.m->m_pkthdr.pf.flags |= PF_TAG_PROCESSED;
>
> +	/* lock the look-up/write section of pf_test() */
> +	PF_LOCK(spl);
> +
>  	switch (pd.virtual_proto) {
>
>  	case PF_VPROTO_FRAGMENT: {
> @@ -6707,7 +6728,7 @@
>  			REASON_SET(&reason, PFRES_NORM);
>  			DPFPRINTF(LOG_NOTICE,
>  			    "dropping IPv6 packet with ICMPv4 payload");
> -			goto done;
> +			goto unlock;
>  		}
>  		action = pf_test_state_icmp(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6732,7 +6753,7 @@
>  			REASON_SET(&reason, PFRES_NORM);
>  			DPFPRINTF(LOG_NOTICE,
>  			    "dropping IPv4 packet with ICMPv6 payload");
> -			goto done;
> +			goto unlock;
>  		}
>  		action = pf_test_state_icmp(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6756,8 +6777,9 @@
>  			if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0)
>  				pqid = 1;
>  			action = pf_normalize_tcp(&pd);
> -			if (action == PF_DROP)
> -				goto done;
> +			if (action == PF_DROP) {
> +				goto unlock;
> +			}
>  		}
>  		action = pf_test_state(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6784,6 +6806,16 @@
>  		break;
>  	}
>
> +unlock:
> +	PF_UNLOCK(spl);
> +	/*
> +	 * We've just left the look-up section of pf_test(). Code further down

"lookup" should not have a dash.

> +	 * assumes, all objects (state, rule, anchor,...) are going to stay
> +	 * around. It's fair assumption, NET_LOCK() prevents the items

"It's fair to assume that NET_LOCK prevents removal of items..." reads
better to me, but YMMV.

> +	 * we've collected above from removing. Once NET_LOCK() will be gone,
> +	 * we must have reference counting ready.
> +	 */
> +

Perhaps rephrase the whole thing like this to stress that we rely on
NET_LOCK right now and that something will have to replace it when it's
gone?

  "At the moment, we rely on NET_LOCK() to prevent removal of items
   we've collected above. They'll have to be refcounted when NET_LOCK()
   is gone."

But again, YMMV.
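To restate the shape pf_test() takes with this change (a condensed
sketch, not the real code):

	PF_LOCK(spl);		/* covers the lookup/write section */
	switch (pd.virtual_proto) {
		/* state and ruleset lookups; errors goto unlock */
	}
unlock:
	PF_UNLOCK(spl);
	/*
	 * From here on only NET_LOCK() keeps the collected state, rule
	 * and anchor pointers alive; they'll need reference counts once
	 * NET_LOCK() is gone.
	 */
done:
	/* logging, statistics, scheduling etc. run without pf_lock */
	return (action);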
>  done:
>  	if (action != PF_DROP) {
>  		if (s) {
> @@ -6987,6 +7019,7 @@
>  	}
>
>  	*m0 = pd.m;
> +
>  	return (action);
>  }
>
> diff -r 21414694ee7a src/sys/net/pf_ioctl.c
> --- a/src/sys/net/pf_ioctl.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf_ioctl.c	Tue May 30 14:25:01 2017 +0200
> @@ -129,6 +129,8 @@
>  TAILQ_HEAD(pf_tags, pf_tagname)	pf_tags =
>      TAILQ_HEAD_INITIALIZER(pf_tags),
>      pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
>
> +struct rwlock		 pf_lock = RWLOCK_INITIALIZER("pf_lock");
> +
>  #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
>  #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
>  #endif
> @@ -999,6 +1001,7 @@
>  	}
>
>  	NET_LOCK(s);
> +	PF_LOCK(s);
>  	switch (cmd) {
>
>  	case DIOCSTART:
> @@ -2452,6 +2455,7 @@
>  		break;
>  	}
>  fail:
> +	PF_UNLOCK(s);
>  	NET_UNLOCK(s);
>  	return (error);
>  }
> diff -r 21414694ee7a src/sys/net/pf_norm.c
> --- a/src/sys/net/pf_norm.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf_norm.c	Tue May 30 14:25:01 2017 +0200
> @@ -39,6 +39,7 @@
>  #include <sys/time.h>
>  #include <sys/pool.h>
>  #include <sys/syslog.h>
> +#include <sys/mutex.h>
>
>  #include <net/if.h>
>  #include <net/if_var.h>
> @@ -135,6 +136,12 @@
>  struct pool		 pf_state_scrub_pl;
>  int			 pf_nfrents;
>
> +struct mutex		 pf_frag_mtx;
> +
> +#define PF_FRAG_MTX_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
> +#define PF_FRAG_MTX_ENTER()	mtx_enter(&pf_frag_mtx)
> +#define PF_FRAG_MTX_LEAVE()	mtx_leave(&pf_frag_mtx)
> +
>  void
>  pf_normalize_init(void)
>  {
> @@ -149,6 +156,8 @@
>  	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
>
>  	TAILQ_INIT(&pf_fragqueue);
> +
> +	PF_FRAG_MTX_INIT();
>  }
>
>  static __inline int
> @@ -176,15 +185,18 @@
>  	struct pf_fragment	*frag;
>  	int32_t			 expire;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_UNLOCKED();
>
>  	expire = time_uptime - pf_default_rule.timeout[PFTM_FRAG];
> +
> +	PF_FRAG_MTX_ENTER();
>  	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
>  		if (frag->fr_timeout > expire)
>  			break;
>  		DPFPRINTF(LOG_NOTICE, "expiring %d(%p)", frag->fr_id, frag);
>  		pf_free_fragment(frag);
>  	}
> +	PF_FRAG_MTX_LEAVE();
>  }
>
>  /*
> @@ -533,14 +545,19 @@
>  	key.fr_id = ip->ip_id;
>  	key.fr_direction = dir;
>
> -	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
> +	PF_FRAG_MTX_ENTER();
> +	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_DROP);
> +	}
>
>  	/* The mbuf is part of the fragment entry, no direct free or access */
>  	m = *m0 = NULL;
>
> -	if (!pf_isfull_fragment(frag))
> +	if (!pf_isfull_fragment(frag)) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_PASS); /* drop because *m0 is NULL, no error */
> +	}
>
>  	/* We have all the data */
>  	frent = TAILQ_FIRST(&frag->fr_queue);
> @@ -564,6 +581,7 @@
>  	ip->ip_off &= ~(IP_MF|IP_OFFMASK);
>
>  	if (hdrlen + total > IP_MAXPACKET) {
> +		PF_FRAG_MTX_LEAVE();
>  		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
>  		ip->ip_len = 0;
>  		REASON_SET(reason, PFRES_SHORT);
> @@ -572,6 +590,7 @@
>  	}
>
>  	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip->ip_len));
> +	PF_FRAG_MTX_LEAVE();
>  	return (PF_PASS);
>  }
>
> @@ -610,14 +629,19 @@
>  	key.fr_id = fraghdr->ip6f_ident;
>  	key.fr_direction = dir;
>
> -	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
> +	PF_FRAG_MTX_ENTER();
> +	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_DROP);
> +	}
>
>  	/* The mbuf is part of the fragment entry, no direct free or access */
>  	m = *m0 = NULL;
>
> -	if (!pf_isfull_fragment(frag))
> +	if (!pf_isfull_fragment(frag)) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_PASS); /* drop because *m0 is NULL, no error */
> +	}
>
>  	/* We have all the data */
>  	extoff = frent->fe_extoff;
> @@ -671,17 +695,20 @@
>  	ip6->ip6_nxt = proto;
>
>  	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
> +		PF_FRAG_MTX_LEAVE();
>  		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
>  		ip6->ip6_plen = 0;
>  		REASON_SET(reason, PFRES_SHORT);
>  		/* PF_DROP requires a valid mbuf *m0 in pf_test6() */
>  		return (PF_DROP);
>  	}
> +	PF_FRAG_MTX_LEAVE();
>
>  	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip6->ip6_plen));
>  	return (PF_PASS);
>
>  fail:
> +	PF_FRAG_MTX_LEAVE();
>  	REASON_SET(reason, PFRES_MEMORY);
>  	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later */
>  	return (PF_DROP);
> diff -r 21414694ee7a src/sys/net/pfvar_priv.h
> --- a/src/sys/net/pfvar_priv.h	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pfvar_priv.h	Tue May 30 14:25:01 2017 +0200
> @@ -37,6 +37,10 @@
>
>  #ifdef _KERNEL
>
> +#include <sys/rwlock.h>
> +
> +extern struct rwlock	pf_lock;
> +
>  struct pf_pdesc {
>  	struct {
>  		int	 done;
> @@ -94,6 +98,32 @@
>  	} hdr;
>  };
>
> +extern struct rwlock	pf_lock;
> +
> +#define PF_LOCK(s)	do {			\
> +		NET_ASSERT_LOCKED();		\
> +		rw_enter_write(&pf_lock);	\
> +		s = splsoftnet();	\
> +	} while (0)
> +
> +#define PF_UNLOCK(s)	do {		\
> +		PF_ASSERT_LOCKED();	\
> +		splx(s);		\
> +		rw_exit_write(&pf_lock);	\
> +	} while (0)
> +
> +#define PF_ASSERT_LOCKED()	do {		\
> +		if (rw_status(&pf_lock) != RW_WRITE)	\
> +			splassert_fail(RW_WRITE,	\
> +			    rw_status(&pf_lock),__func__);\
> +		splsoftassert(IPL_SOFTNET);	\
> +	} while (0)
> +
> +#define PF_ASSERT_UNLOCKED()	do {	\
> +		if (rw_status(&pf_lock) == RW_WRITE)	\
> +			splassert_fail(0, rw_status(&pf_lock), __func__);\
> +	} while (0)
> +
>  #endif /* _KERNEL */

Something is wrong with the spaces here. Unless it's your mail client
mangling diffs, could you please align the '\' symbols in the same
column?
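To tie the two nits together, the fragment wrappers could end up
looking something like this (just a sketch of the suggested
PF_FRAG_LOCK naming; PF_FRAG_LOCK_INIT and PF_FRAG_UNLOCK are one
possible spelling, and the mutex underneath stays the same):

	extern struct mutex pf_frag_mtx;

	#define PF_FRAG_LOCK_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
	#define PF_FRAG_LOCK()		mtx_enter(&pf_frag_mtx)
	#define PF_FRAG_UNLOCK()	mtx_leave(&pf_frag_mtx)

If the mutex is later replaced by some other lock type, only these
three defines need to change; all the call sites keep reading
PF_FRAG_LOCK()/PF_FRAG_UNLOCK().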