On Tue, May 30, 2017 at 14:46 +0200, Alexandr Nedvedicky wrote:
> oh, not again...
>
> I'm sorry for not attaching the patch to the first email.
>
> On Tue, May 30, 2017 at 02:34:32PM +0200, Alexandr Nedvedicky wrote:
> > Hello,
> >
> > the patch delivers two changes to PF:
> >
> > it adds PF_LOCK() et al. At the moment PF_LOCK() sort of duplicates
> > the current NET_LOCK(). It essentially synchronizes packets with
> > ioctl(2) and with the timer thread, which purges states. The future
> > work is going to break PF_LOCK into smaller locks, each of which
> > will protect the relevant parts of PF. Think of pf_state_lock,
> > pf_rule_lock, ...
> >
> > The other change introduces a mutex for the IP reassembly done by
> > PF. The mutex synchronizes fragmented packets with the timer
> > thread, which expires incomplete packets from the fragment cache.
> >
> > O.K.?
> >
> > thanks and
> > regards
> > sasha
> >
It's great stuff, I think you should move forward with this.

A couple of nits:

 - pfvar*.h doesn't have tabs after #define, just one space;

 - do you really want to put MTX in the PF_FRAG_MTX_* defines? Why not
   PF_FRAG_LOCK? It would be aligned with PF_LOCK, and then we could
   change the nature of the lock without changing the defines.

Some additional comments inline (especially the one about nloops).

It would be really nice if you'd generate diffs with -p so that
additional location information for each hunk would be visible.

> --------8<---------------8<---------------8<------------------8<--------
> diff -r 21414694ee7a .hgtags
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/.hgtags	Tue May 30 14:25:01 2017 +0200
> @@ -0,0 +1,1 @@
> +d545881e2652dbc0c057691a39a095bce92f441f pf-lock.baseline
> diff -r 21414694ee7a src/sys/net/pf.c
> --- a/src/sys/net/pf.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf.c	Tue May 30 14:25:01 2017 +0200
> @@ -923,7 +923,7 @@
>  pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
>      struct pf_state_key **sks, struct pf_state *s)
>  {
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	s->kif = kif;
>  	if (*skw == *sks) {
> @@ -1186,7 +1186,7 @@
>  {
>  	struct pf_rule	*r;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	if (SLIST_EMPTY(&pf_rule_gcl))
>  		return;
> @@ -1207,6 +1207,7 @@
>  		tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
>
>  		NET_LOCK(s);
> +		PF_LOCK(s);
>
>  		/* process a fraction of the state table every second */
>  		pf_purge_expired_states(1 + (pf_status.states
> @@ -1214,13 +1215,20 @@
>
>  		/* purge other expired types every PFTM_INTERVAL seconds */
>  		if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
> -			pf_purge_expired_fragments();
>  			pf_purge_expired_src_nodes(0);
>  			pf_purge_expired_rules();
>  			nloops = 0;
>  		}
>
> +		PF_UNLOCK(s);
>  		NET_UNLOCK(s);
> +
> +		/*
> +		 * Fragments don't require PF_LOCK(), they use their own mutex.
> +		 */
> +		if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL])
> +			pf_purge_expired_fragments();
> +
>  	}
>  }
>

The way nloops gets set to 0 in the previous "if" statement makes this
look suspicious. Perhaps setting nloops to zero has to happen in the
last part?
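To make that concrete: since nloops is zeroed inside the locked block,
the unlocked check below it can never be true, so fragments would never
be purged. An untested sketch of one way to rearrange it, resetting
nloops only after the last periodic task has run:

	/* purge other expired types every PFTM_INTERVAL seconds */
	if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
		pf_purge_expired_src_nodes(0);
		pf_purge_expired_rules();
	}

	PF_UNLOCK(s);
	NET_UNLOCK(s);

	/*
	 * Fragments don't require PF_LOCK(), they use their own mutex.
	 */
	if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
		pf_purge_expired_fragments();
		nloops = 0;	/* reset after the last periodic task */
	}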
> @@ -1267,7 +1275,7 @@
>  {
>  	struct pf_src_node	*cur, *next;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
>  		next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
> @@ -1303,7 +1311,7 @@
>  void
>  pf_remove_state(struct pf_state *cur)
>  {
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	/* handle load balancing related tasks */
>  	pf_postprocess_addr(cur);
> @@ -1322,9 +1330,17 @@
>  #if NPFLOW > 0
>  	if (cur->state_flags & PFSTATE_PFLOW) {
>  		/* XXXSMP breaks atomicity */
> +		/*
> +		 * The only guy, who kills states (frees from memory) is
> +		 * pf_purge_thread(). The pf_purge_thread() kills only states,
> +		 * which are marked as PFTM_UNLINKED -> state will stay around,
> +		 * once we re-acquire netlock.

"The only guy, who kills states (frees from memory) is"

I think comments have to have a bit better style. There are no guys who
kill states :)

> +		 */
> +		rw_exit_write(&pf_lock);
>  		rw_exit_write(&netlock);
>  		export_pflow(cur);
>  		rw_enter_write(&netlock);
> +		rw_enter_write(&pf_lock);
>  	}
>  #endif /* NPFLOW > 0 */
>  #if NPFSYNC > 0
> @@ -1354,7 +1370,7 @@
>  {
>  	struct pf_rule_item	*ri;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  #if NPFSYNC > 0
>  	if (pfsync_state_in_use(cur))
> @@ -1390,7 +1406,7 @@
>  	static struct pf_state	*cur = NULL;
>  	struct pf_state		*next;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_LOCKED();
>
>  	while (maxcheck--) {
>  		/* wrap to start of list when we hit the end */
> @@ -3142,13 +3158,13 @@
>  	case IPPROTO_TCP:
>  		sport = pd->hdr.tcp.th_sport;
>  		dport = pd->hdr.tcp.th_dport;
> -		NET_ASSERT_LOCKED();
> +		PF_ASSERT_LOCKED();
>  		tb = &tcbtable;
>  		break;
>  	case IPPROTO_UDP:
>  		sport = pd->hdr.udp.uh_sport;
>  		dport = pd->hdr.udp.uh_dport;
> -		NET_ASSERT_LOCKED();
> +		PF_ASSERT_LOCKED();
>  		tb = &udbtable;
>  		break;
>  	default:
> @@ -6611,6 +6627,7 @@
>  	struct pf_pdesc pd;
>  	int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
>  	u_int32_t qid, pqid = 0;
> +	int spl;

I've realised 's' collides with the state pointer here. One can only
hope for SPLs to go away...

>
>  	if (!pf_status.running)
>  		return (PF_PASS);
> @@ -6669,6 +6686,7 @@
>  	/* if packet sits in reassembly queue, return without error */
>  	if (pd.m == NULL)
>  		return PF_PASS;
> +
>  	if (action != PF_PASS) {
>  #if NPFLOG > 0
>  		pd.pflog |= PF_LOG_FORCE;
> @@ -6688,6 +6706,9 @@
>  	}
>  	pd.m->m_pkthdr.pf.flags |= PF_TAG_PROCESSED;
>
> +	/* lock the look-up/write section of pf_test() */
> +	PF_LOCK(spl);
> +
>  	switch (pd.virtual_proto) {
>
>  	case PF_VPROTO_FRAGMENT: {
> @@ -6707,7 +6728,7 @@
>  			REASON_SET(&reason, PFRES_NORM);
>  			DPFPRINTF(LOG_NOTICE,
>  			    "dropping IPv6 packet with ICMPv4 payload");
> -			goto done;
> +			goto unlock;
>  		}
>  		action = pf_test_state_icmp(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6732,7 +6753,7 @@
>  			REASON_SET(&reason, PFRES_NORM);
>  			DPFPRINTF(LOG_NOTICE,
>  			    "dropping IPv4 packet with ICMPv6 payload");
> -			goto done;
> +			goto unlock;
>  		}
>  		action = pf_test_state_icmp(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6756,8 +6777,9 @@
>  			if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0)
>  				pqid = 1;
>  			action = pf_normalize_tcp(&pd);
> -			if (action == PF_DROP)
> -				goto done;
> +			if (action == PF_DROP) {
> +				goto unlock;
> +			}
>  		}
>  		action = pf_test_state(&pd, &s, &reason);
>  		if (action == PF_PASS || action == PF_AFRT) {
> @@ -6784,6 +6806,16 @@
>  		break;
>  	}
>
> +unlock:
> +	PF_UNLOCK(spl);
> +	/*
> +	 * We've just left the look-up section of pf_test(). Code further down

"lookup" should not have a dash.

> +	 * assumes, all objects (state, rule, anchor,...) are going to stay
> +	 * around. It's fair assumption, NET_LOCK() prevents the items

"It's fair to assume that NET_LOCK prevents removal of items..." reads
better to me, but YMMV.

> +	 * we've collected above from removing. Once NET_LOCK() will be gone,
> +	 * we must have reference counting ready.
> +	 */
> +

Perhaps rephrase the whole thing like this to stress that we rely on
NET_LOCK right now and that something will have to replace it when it's
gone?

  "At the moment, we rely on NET_LOCK() to prevent removal of items
   we've collected above. They'll have to be refcounted when NET_LOCK()
   is gone."

But again, YMMV.
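To restate the shape pf_test() takes with this change (a condensed
sketch, not the real code):

	PF_LOCK(spl);		/* covers the lookup/write section */
	switch (pd.virtual_proto) {
		/* state and ruleset lookups; errors goto unlock */
	}
unlock:
	PF_UNLOCK(spl);
	/*
	 * From here on only NET_LOCK() keeps the collected state, rule
	 * and anchor pointers alive; they'll need reference counts once
	 * NET_LOCK() is gone.
	 */
done:
	/* logging, statistics, scheduling etc. run without pf_lock */
	return (action);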
>  done:
>  	if (action != PF_DROP) {
>  		if (s) {
> @@ -6987,6 +7019,7 @@
>  	}
>
>  	*m0 = pd.m;
> +
>  	return (action);
>  }
>
> diff -r 21414694ee7a src/sys/net/pf_ioctl.c
> --- a/src/sys/net/pf_ioctl.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf_ioctl.c	Tue May 30 14:25:01 2017 +0200
> @@ -129,6 +129,8 @@
>  TAILQ_HEAD(pf_tags, pf_tagname)	pf_tags =
>      TAILQ_HEAD_INITIALIZER(pf_tags),
>      pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
>
> +struct rwlock		 pf_lock = RWLOCK_INITIALIZER("pf_lock");
> +
>  #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
>  #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
>  #endif
> @@ -999,6 +1001,7 @@
>  	}
>
>  	NET_LOCK(s);
> +	PF_LOCK(s);
>  	switch (cmd) {
>
>  	case DIOCSTART:
> @@ -2452,6 +2455,7 @@
>  		break;
>  	}
>  fail:
> +	PF_UNLOCK(s);
>  	NET_UNLOCK(s);
>  	return (error);
>  }
> diff -r 21414694ee7a src/sys/net/pf_norm.c
> --- a/src/sys/net/pf_norm.c	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pf_norm.c	Tue May 30 14:25:01 2017 +0200
> @@ -39,6 +39,7 @@
>  #include <sys/time.h>
>  #include <sys/pool.h>
>  #include <sys/syslog.h>
> +#include <sys/mutex.h>
>
>  #include <net/if.h>
>  #include <net/if_var.h>
> @@ -135,6 +136,12 @@
>  struct pool		 pf_state_scrub_pl;
>  int			 pf_nfrents;
>
> +struct mutex		 pf_frag_mtx;
> +
> +#define PF_FRAG_MTX_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
> +#define PF_FRAG_MTX_ENTER()	mtx_enter(&pf_frag_mtx)
> +#define PF_FRAG_MTX_LEAVE()	mtx_leave(&pf_frag_mtx)
> +
>  void
>  pf_normalize_init(void)
>  {
> @@ -149,6 +156,8 @@
>  	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
>
>  	TAILQ_INIT(&pf_fragqueue);
> +
> +	PF_FRAG_MTX_INIT();
>  }
>
>  static __inline int
> @@ -176,15 +185,18 @@
>  	struct pf_fragment	*frag;
>  	int32_t			 expire;
>
> -	NET_ASSERT_LOCKED();
> +	PF_ASSERT_UNLOCKED();
>
>  	expire = time_uptime - pf_default_rule.timeout[PFTM_FRAG];
> +
> +	PF_FRAG_MTX_ENTER();
>  	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
>  		if (frag->fr_timeout > expire)
>  			break;
>  		DPFPRINTF(LOG_NOTICE, "expiring %d(%p)", frag->fr_id, frag);
>  		pf_free_fragment(frag);
>  	}
> +	PF_FRAG_MTX_LEAVE();
>  }
>
>  /*
> @@ -533,14 +545,19 @@
>  	key.fr_id = ip->ip_id;
>  	key.fr_direction = dir;
>
> -	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
> +	PF_FRAG_MTX_ENTER();
> +	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_DROP);
> +	}
>
>  	/* The mbuf is part of the fragment entry, no direct free or access */
>  	m = *m0 = NULL;
>
> -	if (!pf_isfull_fragment(frag))
> +	if (!pf_isfull_fragment(frag)) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_PASS); /* drop because *m0 is NULL, no error */
> +	}
>
>  	/* We have all the data */
>  	frent = TAILQ_FIRST(&frag->fr_queue);
> @@ -564,6 +581,7 @@
>  	ip->ip_off &= ~(IP_MF|IP_OFFMASK);
>
>  	if (hdrlen + total > IP_MAXPACKET) {
> +		PF_FRAG_MTX_LEAVE();
>  		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
>  		ip->ip_len = 0;
>  		REASON_SET(reason, PFRES_SHORT);
> @@ -572,6 +590,7 @@
>  	}
>
>  	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip->ip_len));
> +	PF_FRAG_MTX_LEAVE();
>  	return (PF_PASS);
>  }
>
> @@ -610,14 +629,19 @@
>  	key.fr_id = fraghdr->ip6f_ident;
>  	key.fr_direction = dir;
>
> -	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
> +	PF_FRAG_MTX_ENTER();
> +	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_DROP);
> +	}
>
>  	/* The mbuf is part of the fragment entry, no direct free or access */
>  	m = *m0 = NULL;
>
> -	if (!pf_isfull_fragment(frag))
> +	if (!pf_isfull_fragment(frag)) {
> +		PF_FRAG_MTX_LEAVE();
>  		return (PF_PASS); /* drop because *m0 is NULL, no error */
> +	}
>
>  	/* We have all the data */
>  	extoff = frent->fe_extoff;
> @@ -671,17 +695,20 @@
>  	ip6->ip6_nxt = proto;
>
>  	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
> +		PF_FRAG_MTX_LEAVE();
>  		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
>  		ip6->ip6_plen = 0;
>  		REASON_SET(reason, PFRES_SHORT);
>  		/* PF_DROP requires a valid mbuf *m0 in pf_test6() */
>  		return (PF_DROP);
>  	}
> +	PF_FRAG_MTX_LEAVE();
>
>  	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip6->ip6_plen));
>  	return (PF_PASS);
>
>  fail:
> +	PF_FRAG_MTX_LEAVE();
>  	REASON_SET(reason, PFRES_MEMORY);
>  	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later */
>  	return (PF_DROP);
> diff -r 21414694ee7a src/sys/net/pfvar_priv.h
> --- a/src/sys/net/pfvar_priv.h	Tue May 30 10:55:41 2017 +0200
> +++ b/src/sys/net/pfvar_priv.h	Tue May 30 14:25:01 2017 +0200
> @@ -37,6 +37,10 @@
>
>  #ifdef _KERNEL
>
> +#include <sys/rwlock.h>
> +
> +extern struct rwlock	pf_lock;
> +
>  struct pf_pdesc {
>  	struct {
>  		int	 done;
> @@ -94,6 +98,32 @@
>  	} hdr;
>  };
>
> +extern struct rwlock	pf_lock;
> +
> +#define PF_LOCK(s)	do {			\
> +		NET_ASSERT_LOCKED();		\
> +		rw_enter_write(&pf_lock);	\
> +		s = splsoftnet();	\
> +	} while (0)
> +
> +#define PF_UNLOCK(s)	do {		\
> +		PF_ASSERT_LOCKED();	\
> +		splx(s);		\
> +		rw_exit_write(&pf_lock);	\
> +	} while (0)
> +
> +#define PF_ASSERT_LOCKED()	do {		\
> +		if (rw_status(&pf_lock) != RW_WRITE)	\
> +			splassert_fail(RW_WRITE,	\
> +			    rw_status(&pf_lock),__func__);\
> +		splsoftassert(IPL_SOFTNET);	\
> +	} while (0)
> +
> +#define PF_ASSERT_UNLOCKED()	do {	\
> +		if (rw_status(&pf_lock) == RW_WRITE)	\
> +			splassert_fail(0, rw_status(&pf_lock), __func__);\
> +	} while (0)
> +
>  #endif /* _KERNEL */

Something is wrong with the spaces here. Unless it's your mail client
mangling diffs, could you please align the '\' symbols in the same
column?
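To tie the two nits together, the fragment wrappers could end up
looking something like this (just a sketch of the suggested
PF_FRAG_LOCK naming; PF_FRAG_LOCK_INIT and PF_FRAG_UNLOCK are one
possible spelling, and the mutex underneath stays the same):

	extern struct mutex pf_frag_mtx;

	#define PF_FRAG_LOCK_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
	#define PF_FRAG_LOCK()		mtx_enter(&pf_frag_mtx)
	#define PF_FRAG_UNLOCK()	mtx_leave(&pf_frag_mtx)

If the mutex is later replaced by some other lock type, only these
three defines need to change; all the call sites keep reading
PF_FRAG_LOCK()/PF_FRAG_UNLOCK().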