This patch makes bpf(4) serialize its event filter using bd_mtx, which
allows calling the filter without the kernel lock.

The patch uses the same filter pattern that has already been used with
pipes and sockets.

Note that bpfkqfilter() still runs with the kernel locked.

I think bpfilter_lookup() should always return non-NULL when
bpfkqfilter() calls it, provided that the code path from vn_kqfilter()
is free of sleeping points. A sleeping point would allow bpf detaching
to intervene.

Most kqfilter routines appear to return an error if the device lookup
fails for some reason. The patch makes bpfkqfilter() do the same.

Please test and review.

Index: net/bpf.c
===================================================================
RCS file: src/sys/net/bpf.c,v
retrieving revision 1.207
diff -u -p -r1.207 bpf.c
--- net/bpf.c   10 Nov 2021 04:45:15 -0000      1.207
+++ net/bpf.c   28 Dec 2021 15:31:28 -0000
@@ -109,6 +109,8 @@ int bpf_setdlt(struct bpf_d *, u_int);
 
 void   filt_bpfrdetach(struct knote *);
 int    filt_bpfread(struct knote *, long);
+int    filt_bpfreadmodify(struct kevent *, struct knote *);
+int    filt_bpfreadprocess(struct knote *, struct kevent *);
 
 int    bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);
 
@@ -377,6 +379,7 @@ bpfopen(dev_t dev, int flag, int mode, s
        task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
        smr_init(&bd->bd_smr);
        sigio_init(&bd->bd_sigio);
+       klist_init_mutex(&bd->bd_sel.si_note, &bd->bd_mtx);
 
        bd->bd_rtout = 0;       /* no timeout by default */
 
@@ -589,7 +592,9 @@ bpf_wakeup_cb(void *xd)
        if (d->bd_async && d->bd_sig)
                pgsigio(&d->bd_sigio, d->bd_sig, 0);
 
+       mtx_enter(&d->bd_mtx);
        selwakeup(&d->bd_sel);
+       mtx_leave(&d->bd_mtx);
        bpf_put(d);
 }
 
@@ -1162,10 +1167,12 @@ bpfpoll(dev_t dev, int events, struct pr
 }
 
 const struct filterops bpfread_filtops = {
-       .f_flags        = FILTEROP_ISFD,
+       .f_flags        = FILTEROP_ISFD | FILTEROP_MPSAFE,
        .f_attach       = NULL,
        .f_detach       = filt_bpfrdetach,
        .f_event        = filt_bpfread,
+       .f_modify       = filt_bpfreadmodify,
+       .f_process      = filt_bpfreadprocess,
 };
 
 int
@@ -1177,6 +1184,8 @@ bpfkqfilter(dev_t dev, struct knote *kn)
        KERNEL_ASSERT_LOCKED();
 
        d = bpfilter_lookup(minor(dev));
+       if (d == NULL)
+               return (ENXIO);
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
@@ -1189,7 +1198,7 @@ bpfkqfilter(dev_t dev, struct knote *kn)
 
        bpf_get(d);
        kn->kn_hook = d;
-       klist_insert_locked(klist, kn);
+       klist_insert(klist, kn);
 
        return (0);
 }
@@ -1199,26 +1208,60 @@ filt_bpfrdetach(struct knote *kn)
 {
        struct bpf_d *d = kn->kn_hook;
 
-       KERNEL_ASSERT_LOCKED();
-
-       klist_remove_locked(&d->bd_sel.si_note, kn);
+       klist_remove(&d->bd_sel.si_note, kn);
        bpf_put(d);
 }
 
 int
+filt_bpfread_common(struct knote *kn, struct bpf_d *d)
+{
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
+       kn->kn_data = d->bd_hlen;
+       if (d->bd_immediate)
+               kn->kn_data += d->bd_slen;
+
+       return (kn->kn_data > 0);
+}
+
+int
 filt_bpfread(struct knote *kn, long hint)
 {
        struct bpf_d *d = kn->kn_hook;
 
-       KERNEL_ASSERT_LOCKED();
+       return (filt_bpfread_common(kn, d));
+}
+
+int
+filt_bpfreadmodify(struct kevent *kev, struct knote *kn)
+{
+       struct bpf_d *d = kn->kn_hook;
+       int active;
 
        mtx_enter(&d->bd_mtx);
-       kn->kn_data = d->bd_hlen;
-       if (d->bd_immediate)
-               kn->kn_data += d->bd_slen;
+       knote_modify(kev, kn);
+       active = filt_bpfread_common(kn, d);
        mtx_leave(&d->bd_mtx);
 
-       return (kn->kn_data > 0);
+       return (active);
+}
+
+int
+filt_bpfreadprocess(struct knote *kn, struct kevent *kev)
+{
+       struct bpf_d *d = kn->kn_hook;
+       int active;
+
+       mtx_enter(&d->bd_mtx);
+       if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
+               active = 1;
+       else
+               active = filt_bpfread_common(kn, d);
+       if (active)
+               knote_submit(kn, kev);
+       mtx_leave(&d->bd_mtx);
+
+       return (active);
 }
 
 /*
@@ -1588,6 +1631,7 @@ bpf_d_smr(void *smr)
        if (bd->bd_wfilter != NULL)
                bpf_prog_smr(bd->bd_wfilter);
 
+       klist_free(&bd->bd_sel.si_note);
        free(bd, M_DEVBUF, sizeof(*bd));
 }
 

Reply via email to