On 2022/06/07 16:58, David Gwynne wrote:
> the main change here is to move pf_purge out from under the kernel lock.
> 
> another part of the change is to limit the amount of work the state
> purging does to avoid hogging a cpu too much, and to also avoid holding
> NET_LOCK for too long.

I'm running this on a small firewall now, it doesn't have much traffic
going through it but is doing some wg and isakmpd. Might give it a try on
another one later.

good find.


> the main part is achieved by using systqmp to run the purge tasks
> instead of systq. from what i can tell, all the data structures we're
> walking over are protected by pf locks and/or the net lock. the pf
> state list in particular was recently reworked so iteration over
> the list can be done without interfering with insertions. this means we
> can scan the state list to collect expired states without affecting
> packet processing.
> 
> however, if you have a lot of states (like me) you can still spend a
> lot of time scanning. i seem to sit at around 1 to 1.2 million states,
> and i run with the default scan interval of 10 seconds. that means im
> scanning about 100 to 120k states every second, which just takes time.
> 
> doing the scan while holding the kernel lock means most other stuff
> cannot run at the same time. worse, the timeout that schedules the pf
> purge work also seems to schedule something in the softclock thread. if
> the same timeout also wakes up a process in an unlocked syscall, a
> significant amount of time in the system is spent spinning for no
> reason. significant meaning dozens of milliseconds, which is actually
> noticeable if you're working interactively on the box.
> 
> before this change pf purges often took 10 to 50ms. the softclock
> thread runs next to it often took a similar amount of time, presumably
> because they ended up spinning waiting for each other. after this
> change the pf_purges are more like 6 to 12ms, and don't block softclock.
> most of the variability in the runs now seems to come from contention on
> the net lock.
> 
> the state purge task now checks the SHOULDYIELD flag, so if the
> task is taking too long it will return early and requeue itself,
> allowing the taskq machinery to yield and let other threads run.
> 
> state purging is now also limited to removing 64 states per task run to
> avoid holding locks that would block packet processing for too long. if
> there's still states to scan in the interval after these 64 states are
> collected, the task reschedules so it can keep scanning from where it
> left off.
> 
> the other purge tasks for source nodes, rules, and fragments have been
> moved to their own timeout/task pair to simplify the time accounting.
> 
> the box feels a lot more responsive with this change. i'm still kicking
> it around, but i wanted to get it out early.
> 
> Index: pf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf.c,v
> retrieving revision 1.1132
> diff -u -p -r1.1132 pf.c
> --- pf.c      23 May 2022 11:17:35 -0000      1.1132
> +++ pf.c      7 Jun 2022 06:49:51 -0000
> @@ -120,10 +120,6 @@ u_char                    pf_tcp_secret[16];
>  int                   pf_tcp_secret_init;
>  int                   pf_tcp_iss_off;
>  
> -int           pf_npurge;
> -struct task   pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge);
> -struct timeout        pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, 
> NULL);
> -
>  enum pf_test_status {
>       PF_TEST_FAIL = -1,
>       PF_TEST_OK,
> @@ -1276,49 +1272,111 @@ pf_purge_expired_rules(void)
>       }
>  }
>  
> -void
> -pf_purge_timeout(void *unused)
> -{
> -     /* XXX move to systqmp to avoid KERNEL_LOCK */
> -     task_add(systq, &pf_purge_task);
> -}
> +void          pf_purge_states(void *);
> +struct task   pf_purge_states_task =
> +                  TASK_INITIALIZER(pf_purge_states, NULL);
> +
> +void          pf_purge_states_tick(void *);
> +struct timeout        pf_purge_states_to =
> +                  TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL);
> +
> +unsigned int  pf_purge_expired_states(unsigned int, unsigned int);
> +
> +/*
> + * how many states to scan this interval.
> + *
> + * this is set when the timeout fires, and reduced by the task. the
> + * task will reschedule itself until the limit is reduced to zero,
> + * and then it adds the timeout again.
> + */
> +unsigned int pf_purge_states_limit;
> +
> +/*
> + * limit how many states are processed with locks held per run of
> + * the state purge task.
> + */
> +unsigned int pf_purge_states_collect = 64;
>  
>  void
> -pf_purge(void *xnloops)
> +pf_purge_states_tick(void *null)
>  {
> -     int *nloops = xnloops;
> +     unsigned int limit = pf_status.states;
> +     unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];
> +
> +     if (limit == 0) {
> +             timeout_add_sec(&pf_purge_states_to, 1);
> +             return;
> +     }
>  
>       /*
>        * process a fraction of the state table every second
> -      * Note:
> -      *     we no longer need PF_LOCK() here, because
> -      *     pf_purge_expired_states() uses pf_state_lock to maintain
> -      *     consistency.
>        */
> -     if (pf_default_rule.timeout[PFTM_INTERVAL] > 0)
> -             pf_purge_expired_states(1 + (pf_status.states
> -                 / pf_default_rule.timeout[PFTM_INTERVAL]));
>  
> +     if (interval > 1)
> +             limit /= interval;
> +
> +     pf_purge_states_limit = limit;
> +     task_add(systqmp, &pf_purge_states_task);
> +}
> +
> +void
> +pf_purge_states(void *null)
> +{
> +     unsigned int limit;
> +     unsigned int scanned;
> +
> +     limit = pf_purge_states_limit;
> +     if (limit < pf_purge_states_collect)
> +             limit = pf_purge_states_collect;
> +
> +     scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
> +     if (scanned >= pf_purge_states_limit) {
> +             /* we've run out of states to scan this "interval" */
> +             timeout_add_sec(&pf_purge_states_to, 1);
> +             return;
> +     }
> +
> +     pf_purge_states_limit -= scanned;
> +     task_add(systqmp, &pf_purge_states_task);
> +}
> +
> +void          pf_purge_tick(void *);
> +struct timeout        pf_purge_to =
> +                  TIMEOUT_INITIALIZER(pf_purge_tick, NULL);
> +
> +void          pf_purge(void *);
> +struct task   pf_purge_task = 
> +                  TASK_INITIALIZER(pf_purge, NULL);
> +
> +void
> +pf_purge_tick(void *null)
> +{
> +     task_add(systqmp, &pf_purge_task);
> +}
> +
> +void
> +pf_purge(void *null)
> +{
> +     unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]);
> +
> +     /* XXX is NET_LOCK necessary? */
>       NET_LOCK();
>  
>       PF_LOCK();
> -     /* purge other expired types every PFTM_INTERVAL seconds */
> -     if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -             pf_purge_expired_src_nodes();
> -             pf_purge_expired_rules();
> -     }
> +
> +     pf_purge_expired_src_nodes();
> +     pf_purge_expired_rules();
> +
>       PF_UNLOCK();
>  
>       /*
>        * Fragments don't require PF_LOCK(), they use their own lock.
>        */
> -     if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -             pf_purge_expired_fragments();
> -             *nloops = 0;
> -     }
> +     pf_purge_expired_fragments();
>       NET_UNLOCK();
>  
> -     timeout_add_sec(&pf_purge_to, 1);
> +     /* interpret the interval as idle time between runs */
> +     timeout_add_sec(&pf_purge_to, interval);
>  }
>  
>  int32_t
> @@ -1502,8 +1560,8 @@ pf_free_state(struct pf_state *cur)
>       pf_status.states--;
>  }
>  
> -void
> -pf_purge_expired_states(u_int32_t maxcheck)
> +unsigned int
> +pf_purge_expired_states(const unsigned int limit, const unsigned int collect)
>  {
>       /*
>        * this task/thread/context/whatever is the only thing that
> @@ -1517,6 +1575,8 @@ pf_purge_expired_states(u_int32_t maxche
>       struct pf_state         *st;
>       SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
>       time_t                   now;
> +     unsigned int             scanned;
> +     unsigned int             collected = 0;
>  
>       PF_ASSERT_UNLOCKED();
>  
> @@ -1530,7 +1590,7 @@ pf_purge_expired_states(u_int32_t maxche
>       if (head == NULL) {
>               /* the list is empty */
>               rw_exit_read(&pf_state_list.pfs_rwl);
> -             return;
> +             return (limit);
>       }
>  
>       /* (re)start at the front of the list */
> @@ -1539,28 +1599,38 @@ pf_purge_expired_states(u_int32_t maxche
>  
>       now = getuptime();
>  
> -     do {
> +     for (scanned = 0; scanned < limit; scanned++) {
>               uint8_t stimeout = cur->timeout;
> +             unsigned int limited = 0;
>  
>               if ((stimeout == PFTM_UNLINKED) ||
>                   (pf_state_expires(cur, stimeout) <= now)) {
>                       st = pf_state_ref(cur);
>                       SLIST_INSERT_HEAD(&gcl, st, gc_list);
> +
> +                     if (++collected >= collect)
> +                             limited = 1;
>               }
>  
>               /* don't iterate past the end of our view of the list */
>               if (cur == tail) {
> +                     scanned = limit;
>                       cur = NULL;
>                       break;
>               }
>  
>               cur = TAILQ_NEXT(cur, entry_list);
> -     } while (maxcheck--);
> +
> +             /* don't spend too much time here. */
> +             if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags),
> +                  SPCF_SHOULDYIELD) || limited)
> +                     break;
> +     }
>  
>       rw_exit_read(&pf_state_list.pfs_rwl);
>  
>       if (SLIST_EMPTY(&gcl))
> -             return;
> +             return (scanned);
>  
>       NET_LOCK();
>       rw_enter_write(&pf_state_list.pfs_rwl);
> @@ -1581,6 +1651,8 @@ pf_purge_expired_states(u_int32_t maxche
>               SLIST_REMOVE_HEAD(&gcl, gc_list);
>               pf_state_unref(st);
>       }
> +
> +     return (scanned);
>  }
>  
>  int
> Index: pf_ioctl.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf_ioctl.c,v
> retrieving revision 1.381
> diff -u -p -r1.381 pf_ioctl.c
> --- pf_ioctl.c        10 May 2022 23:12:25 -0000      1.381
> +++ pf_ioctl.c        7 Jun 2022 06:49:51 -0000
> @@ -1163,6 +1163,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>                               pf_status.stateid = gettime();
>                               pf_status.stateid = pf_status.stateid << 32;
>                       }
> +                     timeout_add_sec(&pf_purge_states_to, 1);
>                       timeout_add_sec(&pf_purge_to, 1);
>                       pf_create_queues();
>                       DPFPRINTF(LOG_NOTICE, "pf: started");
> @@ -2761,8 +2762,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>                       pf_default_rule.timeout[i] =
>                           pf_default_rule_new.timeout[i];
>                       if (pf_default_rule.timeout[i] == PFTM_INTERVAL &&
> -                         pf_default_rule.timeout[i] < old)
> -                             task_add(net_tq(0), &pf_purge_task);
> +                         pf_default_rule.timeout[i] < old &&
> +                         timeout_del(&pf_purge_to))
> +                             task_add(systqmp, &pf_purge_task);
>               }
>               pfi_xcommit();
>               pf_trans_set_commit();
> Index: pfvar.h
> ===================================================================
> RCS file: /cvs/src/sys/net/pfvar.h,v
> retrieving revision 1.507
> diff -u -p -r1.507 pfvar.h
> --- pfvar.h   29 Apr 2022 09:55:43 -0000      1.507
> +++ pfvar.h   7 Jun 2022 06:49:51 -0000
> @@ -1714,7 +1714,6 @@ extern void                      
> pf_tbladdr_remove(struct 
>  extern void                   pf_tbladdr_copyout(struct pf_addr_wrap *);
>  extern void                   pf_calc_skip_steps(struct pf_rulequeue *);
>  extern void                   pf_purge_expired_src_nodes(void);
> -extern void                   pf_purge_expired_states(u_int32_t);
>  extern void                   pf_purge_expired_rules(void);
>  extern void                   pf_remove_state(struct pf_state *);
>  extern void                   pf_remove_divert_state(struct pf_state_key *);
> Index: pfvar_priv.h
> ===================================================================
> RCS file: /cvs/src/sys/net/pfvar_priv.h,v
> retrieving revision 1.10
> diff -u -p -r1.10 pfvar_priv.h
> --- pfvar_priv.h      29 Apr 2022 08:58:49 -0000      1.10
> +++ pfvar_priv.h      7 Jun 2022 06:49:51 -0000
> @@ -209,6 +209,7 @@ struct pf_pdesc {
>       } hdr;
>  };
>  
> +extern struct timeout        pf_purge_states_to;
>  extern struct task   pf_purge_task;
>  extern struct timeout        pf_purge_to;
>  
> @@ -262,8 +263,6 @@ extern struct rwlock      pf_state_lock;
>                           rw_status(&pf_state_lock), __func__);\
>       } while (0)
>  
> -extern void                   pf_purge_timeout(void *);
> -extern void                   pf_purge(void *);
>  #endif /* _KERNEL */
>  
>  #endif /* _NET_PFVAR_PRIV_H_ */
> 

Reply via email to