On 2022/06/07 16:58, David Gwynne wrote:
> the main change here is to move pf_purge out from under the kernel lock.
> 
> another part of the change is to limit the amount of work the state
> purging does to avoid hogging a cpu too much, and to also avoid holding
> NET_LOCK for too long.
I'm running this on a small firewall now. It doesn't have much traffic
going through it but is doing some wg and isakmpd. Might give it a try
on another one later. good find.

> the main part is achieved by using systqmp to run the purge tasks
> instead of systq. from what i can tell, all the data structures we're
> walking over are protected by pf locks and/or the net lock. the pf
> state list in particular was recently reworked so iteration over
> the list can be done without interfering with insertions. this means we
> can scan the state list to collect expired states without affecting
> packet processing.
> 
> however, if you have a lot of states (like me) you can still spend a
> lot of time scanning. i seem to sit at around 1 to 1.2 million states,
> and i run with the default scan interval of 10 seconds. that means i'm
> scanning about 100 to 120k states every second, which just takes time.
> 
> doing the scan while holding the kernel lock means most other stuff
> cannot run at the same time. worse, the timeout that schedules the pf
> purge work also seems to schedule something in the softclock thread. if
> the same timeout also wakes up a process in an unlocked syscall, a
> significant amount of time in the system is spent spinning for no
> reason. significant meaning dozens of milliseconds, which is actually
> noticeable if you're working interactively on the box.
> 
> before this change pf purges often took 10 to 50ms. the softclock
> thread runs that happened next to them often took a similar amount of
> time, presumably because they ended up spinning waiting for each other.
> after this change the pf_purges are more like 6 to 12ms, and don't
> block softclock. most of the variability in the runs now seems to come
> from contention on the net lock.
> 
> the state purge task now checks the SHOULDYIELD flag, so if the
> task is taking too long it will return early and requeue itself,
> allowing the taskq machinery to yield and let other threads run.
> 
> state purging is now also limited to removing 64 states per task run to
> avoid holding locks that would block packet processing for too long. if
> there are still states to scan in the interval after these 64 states
> are collected, the task reschedules so it can keep scanning from where
> it left off.
> 
> the other purge tasks for source nodes, rules, and fragments have been
> moved to their own timeout/task pair to simplify the time accounting.
> 
> the box feels a lot more responsive with this change. i'm still kicking
> it around, but i wanted to get it out early.
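As I read the diff below, the new state purge scheduling boils down to
the sketch here. This is simplified: locking, the empty state table
case, and the SHOULDYIELD check inside pf_purge_expired_states() are
left out, and the names are the ones the diff introduces.

/* timeout, once a second: set this interval's scan budget, kick the task */
void
pf_purge_states_tick(void *null)
{
        unsigned int limit = pf_status.states;
        unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];

        /* spread the full scan over the interval, eg 1.2M / 10s = 120k */
        if (interval > 1)
                limit /= interval;

        pf_purge_states_limit = limit;
        task_add(systqmp, &pf_purge_states_task);
}

/* task: scan states, but collect at most 64 expired ones per run */
void
pf_purge_states(void *null)
{
        unsigned int limit = pf_purge_states_limit;
        unsigned int scanned;

        if (limit < pf_purge_states_collect)
                limit = pf_purge_states_collect;

        scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
        if (scanned >= pf_purge_states_limit) {
                /* this second's budget is used up, wait for the next tick */
                timeout_add_sec(&pf_purge_states_to, 1);
                return;
        }

        /* budget left over, requeue so the scan continues */
        pf_purge_states_limit -= scanned;
        task_add(systqmp, &pf_purge_states_task);
}

So with the numbers above, one second's worth of scanning can end up
spread over many short task runs instead of one long one, and each run
holds the locks for at most 64 removals.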
> 
> Index: pf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf.c,v
> retrieving revision 1.1132
> diff -u -p -r1.1132 pf.c
> --- pf.c	23 May 2022 11:17:35 -0000	1.1132
> +++ pf.c	7 Jun 2022 06:49:51 -0000
> @@ -120,10 +120,6 @@ u_char pf_tcp_secret[16];
>  int              pf_tcp_secret_init;
>  int              pf_tcp_iss_off;
>  
> -int              pf_npurge;
> -struct task      pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge);
> -struct timeout   pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL);
> -
>  enum pf_test_status {
>          PF_TEST_FAIL = -1,
>          PF_TEST_OK,
> @@ -1276,49 +1272,111 @@ pf_purge_expired_rules(void)
>          }
>  }
>  
> -void
> -pf_purge_timeout(void *unused)
> -{
> -        /* XXX move to systqmp to avoid KERNEL_LOCK */
> -        task_add(systq, &pf_purge_task);
> -}
> +void             pf_purge_states(void *);
> +struct task      pf_purge_states_task =
> +                     TASK_INITIALIZER(pf_purge_states, NULL);
> +
> +void             pf_purge_states_tick(void *);
> +struct timeout   pf_purge_states_to =
> +                     TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL);
> +
> +unsigned int     pf_purge_expired_states(unsigned int, unsigned int);
> +
> +/*
> + * how many states to scan this interval.
> + *
> + * this is set when the timeout fires, and reduced by the task. the
> + * task will reschedule itself until the limit is reduced to zero,
> + * and then it adds the timeout again.
> + */
> +unsigned int     pf_purge_states_limit;
> +
> +/*
> + * limit how many states are processed with locks held per run of
> + * the state purge task.
> + */
> +unsigned int     pf_purge_states_collect = 64;
>  
>  void
> -pf_purge(void *xnloops)
> +pf_purge_states_tick(void *null)
>  {
> -        int *nloops = xnloops;
> +        unsigned int limit = pf_status.states;
> +        unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];
> +
> +        if (limit == 0) {
> +                timeout_add_sec(&pf_purge_states_to, 1);
> +                return;
> +        }
>  
>          /*
>           * process a fraction of the state table every second
> -         * Note:
> -         *     we no longer need PF_LOCK() here, because
> -         *     pf_purge_expired_states() uses pf_state_lock to maintain
> -         *     consistency.
>           */
> -        if (pf_default_rule.timeout[PFTM_INTERVAL] > 0)
> -                pf_purge_expired_states(1 + (pf_status.states
> -                    / pf_default_rule.timeout[PFTM_INTERVAL]));
>  
> +        if (interval > 1)
> +                limit /= interval;
> +
> +        pf_purge_states_limit = limit;
> +        task_add(systqmp, &pf_purge_states_task);
> +}
> +
> +void
> +pf_purge_states(void *null)
> +{
> +        unsigned int limit;
> +        unsigned int scanned;
> +
> +        limit = pf_purge_states_limit;
> +        if (limit < pf_purge_states_collect)
> +                limit = pf_purge_states_collect;
> +
> +        scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
> +        if (scanned >= pf_purge_states_limit) {
> +                /* we've run out of states to scan this "interval" */
> +                timeout_add_sec(&pf_purge_states_to, 1);
> +                return;
> +        }
> +
> +        pf_purge_states_limit -= scanned;
> +        task_add(systqmp, &pf_purge_states_task);
> +}
> +
> +void             pf_purge_tick(void *);
> +struct timeout   pf_purge_to =
> +                     TIMEOUT_INITIALIZER(pf_purge_tick, NULL);
> +
> +void             pf_purge(void *);
> +struct task      pf_purge_task =
> +                     TASK_INITIALIZER(pf_purge, NULL);
> +
> +void
> +pf_purge_tick(void *null)
> +{
> +        task_add(systqmp, &pf_purge_task);
> +}
> +
> +void
> +pf_purge(void *null)
> +{
> +        unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]);
> +
> +        /* XXX is NET_LOCK necessary? */
>          NET_LOCK();
>  
>          PF_LOCK();
> -        /* purge other expired types every PFTM_INTERVAL seconds */
> -        if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -                pf_purge_expired_src_nodes();
> -                pf_purge_expired_rules();
> -        }
> +
> +        pf_purge_expired_src_nodes();
> +        pf_purge_expired_rules();
> +
>          PF_UNLOCK();
>  
>          /*
>           * Fragments don't require PF_LOCK(), they use their own lock.
>           */
> -        if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -                pf_purge_expired_fragments();
> -                *nloops = 0;
> -        }
> +        pf_purge_expired_fragments();
>          NET_UNLOCK();
>  
> -        timeout_add_sec(&pf_purge_to, 1);
> +        /* interpret the interval as idle time between runs */
> +        timeout_add_sec(&pf_purge_to, interval);
>  }
>  
>  int32_t
> @@ -1502,8 +1560,8 @@ pf_free_state(struct pf_state *cur)
>          pf_status.states--;
>  }
>  
> -void
> -pf_purge_expired_states(u_int32_t maxcheck)
> +unsigned int
> +pf_purge_expired_states(const unsigned int limit, const unsigned int collect)
>  {
>          /*
>           * this task/thread/context/whatever is the only thing that
> @@ -1517,6 +1575,8 @@ pf_purge_expired_states(u_int32_t maxche
>          struct pf_state          *st;
>          SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
>          time_t                    now;
> +        unsigned int              scanned;
> +        unsigned int              collected = 0;
>  
>          PF_ASSERT_UNLOCKED();
>  
> @@ -1530,7 +1590,7 @@ pf_purge_expired_states(u_int32_t maxche
>          if (head == NULL) {
>                  /* the list is empty */
>                  rw_exit_read(&pf_state_list.pfs_rwl);
> -                return;
> +                return (limit);
>          }
>  
>          /* (re)start at the front of the list */
> @@ -1539,28 +1599,38 @@ pf_purge_expired_states(u_int32_t maxche
>  
>          now = getuptime();
>  
> -        do {
> +        for (scanned = 0; scanned < limit; scanned++) {
>                  uint8_t stimeout = cur->timeout;
> +                unsigned int limited = 0;
>  
>                  if ((stimeout == PFTM_UNLINKED) ||
>                      (pf_state_expires(cur, stimeout) <= now)) {
>                          st = pf_state_ref(cur);
>                          SLIST_INSERT_HEAD(&gcl, st, gc_list);
> +
> +                        if (++collected >= collect)
> +                                limited = 1;
>                  }
>  
>                  /* don't iterate past the end of our view of the list */
>                  if (cur == tail) {
> +                        scanned = limit;
>                          cur = NULL;
>                          break;
>                  }
>  
>                  cur = TAILQ_NEXT(cur, entry_list);
> -        } while (maxcheck--);
> +
> +                /* don't spend too much time here. */
> +                if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags),
> +                    SPCF_SHOULDYIELD) || limited)
> +                        break;
> +        }
>  
>          rw_exit_read(&pf_state_list.pfs_rwl);
>  
>          if (SLIST_EMPTY(&gcl))
> -                return;
> +                return (scanned);
>  
>          NET_LOCK();
>          rw_enter_write(&pf_state_list.pfs_rwl);
> @@ -1581,6 +1651,8 @@ pf_purge_expired_states(u_int32_t maxche
>                  SLIST_REMOVE_HEAD(&gcl, gc_list);
>                  pf_state_unref(st);
>          }
> +
> +        return (scanned);
>  }
>  
>  int
> Index: pf_ioctl.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf_ioctl.c,v
> retrieving revision 1.381
> diff -u -p -r1.381 pf_ioctl.c
> --- pf_ioctl.c	10 May 2022 23:12:25 -0000	1.381
> +++ pf_ioctl.c	7 Jun 2022 06:49:51 -0000
> @@ -1163,6 +1163,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>                          pf_status.stateid = gettime();
>                          pf_status.stateid = pf_status.stateid << 32;
>                  }
> +                timeout_add_sec(&pf_purge_states_to, 1);
>                  timeout_add_sec(&pf_purge_to, 1);
>                  pf_create_queues();
>                  DPFPRINTF(LOG_NOTICE, "pf: started");
> @@ -2761,8 +2762,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>                                  pf_default_rule.timeout[i] =
>                                      pf_default_rule_new.timeout[i];
>                                  if (pf_default_rule.timeout[i] == PFTM_INTERVAL &&
> -                                    pf_default_rule.timeout[i] < old)
> -                                        task_add(net_tq(0), &pf_purge_task);
> +                                    pf_default_rule.timeout[i] < old &&
> +                                    timeout_del(&pf_purge_to))
> +                                        task_add(systqmp, &pf_purge_task);
>                          }
>                          pfi_xcommit();
>                          pf_trans_set_commit();
> Index: pfvar.h
> ===================================================================
> RCS file: /cvs/src/sys/net/pfvar.h,v
> retrieving revision 1.507
> diff -u -p -r1.507 pfvar.h
> --- pfvar.h	29 Apr 2022 09:55:43 -0000	1.507
> +++ pfvar.h	7 Jun 2022 06:49:51 -0000
> @@ -1714,7 +1714,6 @@ extern void              pf_tbladdr_remove(struct
>  extern void              pf_tbladdr_copyout(struct pf_addr_wrap *);
>  extern void              pf_calc_skip_steps(struct pf_rulequeue *);
>  extern void              pf_purge_expired_src_nodes(void);
> -extern void              pf_purge_expired_states(u_int32_t);
>  extern void              pf_purge_expired_rules(void);
>  extern void              pf_remove_state(struct pf_state *);
>  extern void              pf_remove_divert_state(struct pf_state_key *);
> Index: pfvar_priv.h
> ===================================================================
> RCS file: /cvs/src/sys/net/pfvar_priv.h,v
> retrieving revision 1.10
> diff -u -p -r1.10 pfvar_priv.h
> --- pfvar_priv.h	29 Apr 2022 08:58:49 -0000	1.10
> +++ pfvar_priv.h	7 Jun 2022 06:49:51 -0000
> @@ -209,6 +209,7 @@ struct pf_pdesc {
>          } hdr;
>  };
>  
> +extern struct timeout   pf_purge_states_to;
>  extern struct task      pf_purge_task;
>  extern struct timeout   pf_purge_to;
>  
> @@ -262,8 +263,6 @@ extern struct rwlock pf_state_lock;
>                      rw_status(&pf_state_lock), __func__);\
>          } while (0)
>  
> -extern void              pf_purge_timeout(void *);
> -extern void              pf_purge(void *);
>  #endif /* _KERNEL */
>  
>  #endif /* _NET_PFVAR_PRIV_H_ */
> 
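For anyone else who wants to gauge how much scanning their box would do
per second with this diff: pfctl -si shows the state table's "current
entries" and pfctl -st shows the "interval" timeout (10s by default,
settable with "set timeout interval" in pf.conf). Current entries
divided by the interval is roughly the per-second scan budget that the
new pf_purge_states_tick sets.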