On Thu, Feb 13, 2025 at 08:42:02PM +0100, Claudio Jeker wrote:
> On Thu, Feb 13, 2025 at 08:13:27PM +0300, Vitaliy Makkoveev wrote:
> > > On 13 Feb 2025, at 19:33, Claudio Jeker <[email protected]> wrote:
> > >
> > > On Wed, Feb 12, 2025 at 06:47:36PM +0300, Vitaliy Makkoveev wrote:
> > >> Hi,
> > >>
> > >> This diff serializes tun(4)/tap(4) interfaces creation and destruction
> > >> from ifconfig(8) and device nodes path. Not only simplifies, but fixes
> > >> syzkaller [1] report.
> > >>
> > >> Can I ask openvpn or vmd(8) users to test it and prove that there is no
> > >> behavior changes?
> > >
> > > Just to be clear the diff changes the way tun(4) and tap(4) interfaces are
> > > created and destroyed. So it matters on how the interface is created and
> > > if it does behave properly.
> >
> > The original creation and destruction paths of tun(4) are full of
> > sleep points, so those paths have dances to protect half-created
> > or half-destroyed interface. This diff uses the dedicated rwlock
> > for cloners in the device nodes handlers to make races impossible.
> >
> > > I think the things to test are:
> > > - If you open /dev/tun1 is tun1 created and if you close /dev/tun1 is
> > > it destroyed again.
> >
> > It works.
> >
> > > - If tun1 alreay exists on open of /dev/tun1 then the last close of
> > > /dev/tun1 should not destroy tun1.
> >
> > This is not the current behaviour. The TUN_STAYUP exists to prevent
> > the concurrent destruction from tun_dev_close() while we are in the
> > middle of tun_dev_open(). This could happen from the VOP_REWOKE()
> > path of tun_clone_destroy(). In this case the interface will be
> > destroyed, but not by tun_dev_close().
>
> It is very much the current behaviour. I use this every day. I have
> preconfigured tap0 and tap1 interfaces that are always around.
> So this is very much current behaviour.
>
> > > - If you have /dev/tun1 open and destroy tun1 (the interface) are you
> > > notified that /dev/tun1 is no longer available.
> > >
> >
> > Only tun_dev_read() should be notified, and yes, it works.
> >
> > > I think there are other cases to consider, those are just the obvious ones
> > > that I remember. I glanced at the diff, it changes a hell of a lot and the
> > > bit that scares me is that TUN_STAYUP becomes unused and looking at
> > > tun_dev_close() I think this is indeed not right. In other words I think
> > > the 2nd bit from the list above is violated.
> >
> > As I previously said, the TUN_STAYUP bit required only to prevent
> > concurrent destruction in the middle of tun_dev_open(). Now
> > concurrent destruction is impossible.
>
> I don't think your right at least I always understood that TUN_STAYUP was
> added exactly to prevent the destruction of an interface that was created
> through interface cloning.
>
I see. Current tun creation and destruction is complicated, because we
need to deal with concurrent threads. I looked at TUN_STAYUP flag from
this point and found it useless, because in the race we can't predict
the winner. So I decided to always destroy interface at close. This
updated diff restores existing behavior with TUN_STAYUP flag.
The diff doesn't change a lot. It only makes `if_cloners_lock' external
to if_clone_{create,destroy}() so now we could always say who is the tun
instance creator. This makes things much simple.
I could commit ifq hunks first. This should decrease the diff size.
Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.726
diff -u -p -r1.726 if.c
--- sys/net/if.c 3 Feb 2025 08:58:52 -0000 1.726
+++ sys/net/if.c 13 Feb 2025 22:33:28 -0000
@@ -1310,45 +1310,55 @@ if_isconnected(const struct ifnet *ifp0,
* Create a clone network interface.
*/
int
-if_clone_create(const char *name, int rdomain)
+if_clone_create_locked(const char *name, int rdomain)
{
struct if_clone *ifc;
struct ifnet *ifp;
int unit, ret;
+ rw_assert_wrlock(&if_cloners_lock);
+
ifc = if_clone_lookup(name, &unit);
if (ifc == NULL)
return (EINVAL);
-
- rw_enter_write(&if_cloners_lock);
-
if ((ifp = if_unit(name)) != NULL) {
ret = EEXIST;
- goto unlock;
+ goto put;
}
ret = (*ifc->ifc_create)(ifc, unit);
if (ret != 0 || (ifp = if_unit(name)) == NULL)
- goto unlock;
+ goto put;
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
if (rdomain != 0)
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
-unlock:
- rw_exit_write(&if_cloners_lock);
+put:
if_put(ifp);
return (ret);
}
+int
+if_clone_create(const char *name, int rdomain)
+{
+ int error;
+
+ rw_enter_write(&if_cloners_lock);
+ error = if_clone_create_locked(name, rdomain);
+ rw_exit_write(&if_cloners_lock);
+
+ return (error);
+}
+
/*
* Destroy a clone network interface.
*/
int
-if_clone_destroy(const char *name)
+if_clone_destroy_locked(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
@@ -1361,16 +1371,14 @@ if_clone_destroy(const char *name)
if (ifc->ifc_destroy == NULL)
return (EOPNOTSUPP);
- rw_enter_write(&if_cloners_lock);
+ rw_assert_wrlock(&if_cloners_lock);
TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
if (strcmp(ifp->if_xname, name) == 0)
break;
}
- if (ifp == NULL) {
- rw_exit_write(&if_cloners_lock);
+ if (ifp == NULL)
return (ENXIO);
- }
NET_LOCK();
if (ifp->if_flags & IFF_UP) {
@@ -1382,9 +1390,19 @@ if_clone_destroy(const char *name)
NET_UNLOCK();
ret = (*ifc->ifc_destroy)(ifp);
+ return (ret);
+}
+
+int
+if_clone_destroy(const char *name)
+{
+ int error;
+
+ rw_enter_write(&if_cloners_lock);
+ error = if_clone_destroy_locked(name);
rw_exit_write(&if_cloners_lock);
- return (ret);
+ return (error);
}
/*
Index: sys/net/if_tun.c
===================================================================
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.250
diff -u -p -r1.250 if_tun.c
--- sys/net/if_tun.c 30 Dec 2024 02:46:00 -0000 1.250
+++ sys/net/if_tun.c 13 Feb 2025 22:33:28 -0000
@@ -115,7 +115,7 @@ int tundebug = TUN_DEBUG;
void tunattach(int);
int tun_dev_open(dev_t, const struct if_clone *, int, struct proc *);
-int tun_dev_close(dev_t, struct proc *);
+int tun_dev_close(dev_t, const struct if_clone *, struct proc *);
int tun_dev_ioctl(dev_t, u_long, void *);
int tun_dev_read(dev_t, struct uio *, int);
int tun_dev_write(dev_t, struct uio *, int, int);
@@ -284,11 +284,7 @@ tun_create(struct if_clone *ifc, int uni
}
sigio_init(&sc->sc_sigio);
-
- /* tell tun_dev_open we're initialised */
-
- sc->sc_flags |= TUN_INITED|TUN_STAYUP;
- wakeup(sc);
+ sc->sc_flags |= TUN_STAYUP;
return (0);
@@ -303,25 +299,10 @@ int
tun_clone_destroy(struct ifnet *ifp)
{
struct tun_softc *sc = ifp->if_softc;
- dev_t dev;
KERNEL_ASSERT_LOCKED();
- if (ISSET(sc->sc_flags, TUN_DEAD))
- return (ENXIO);
SET(sc->sc_flags, TUN_DEAD);
-
- /* kick userland off the device */
- dev = sc->sc_dev;
- if (dev) {
- struct vnode *vp;
-
- if (vfinddev(dev, VCHR, &vp))
- VOP_REVOKE(vp, REVOKEALL);
-
- KASSERT(sc->sc_dev == 0);
- }
-
/* prevent userland from getting to the device again */
SMR_LIST_REMOVE_LOCKED(sc, sc_entry);
smr_barrier();
@@ -390,70 +371,29 @@ tun_dev_open(dev_t dev, const struct if_
struct tun_softc *sc;
struct ifnet *ifp;
int error;
- u_short stayup = 0;
- struct vnode *vp;
-
char name[IFNAMSIZ];
- unsigned int rdomain;
-
- /*
- * Find the vnode associated with this open before we sleep
- * and let something else revoke it. Our caller has a reference
- * to it so we don't need to account for it.
- */
- if (!vfinddev(dev, VCHR, &vp))
- panic("%s vfinddev failed", __func__);
snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev));
- rdomain = rtable_l2(p->p_p->ps_rtableid);
-
- /* let's find or make an interface to work with */
- while ((sc = tun_name_lookup(name)) == NULL) {
- error = if_clone_create(name, rdomain);
- switch (error) {
- case 0: /* it's probably ours */
- stayup = TUN_STAYUP;
- /* FALLTHROUGH */
- case EEXIST: /* we may have lost a race with someone else */
- break;
- default:
- return (error);
- }
- }
-
- refcnt_take(&sc->sc_refs);
- /* wait for it to be fully constructed before we use it */
- for (;;) {
- if (ISSET(sc->sc_flags, TUN_DEAD)) {
- error = ENXIO;
- goto done;
- }
+ rw_enter_write(&if_cloners_lock);
- if (ISSET(sc->sc_flags, TUN_INITED))
- break;
+ if ((sc = tun_name_lookup(name)) == NULL) {
+ unsigned int rdomain = rtable_l2(p->p_p->ps_rtableid);
- error = tsleep_nsec(sc, PCATCH, "tuninit", INFSLP);
- if (error != 0) {
- /* XXX if_clone_destroy if stayup? */
- goto done;
- }
- }
+ if ((error = if_clone_create_locked(name, rdomain)) != 0)
+ goto out;
- /* Has tun_clone_destroy torn the rug out under us? */
- if (vp->v_type == VBAD) {
- error = ENXIO;
- goto done;
+ sc = tun_name_lookup(name);
+ CLR(sc->sc_flags, TUN_STAYUP);
}
if (sc->sc_dev != 0) {
- /* aww, we lost */
+ /* already opened */
error = EBUSY;
- goto done;
+ goto out;
}
/* it's ours now */
sc->sc_dev = dev;
- CLR(sc->sc_flags, stayup);
/* automatically mark the interface running on open */
ifp = &sc->sc_if;
@@ -461,72 +401,61 @@ tun_dev_open(dev_t dev, const struct if_
SET(ifp->if_flags, IFF_UP | IFF_RUNNING);
NET_UNLOCK();
tun_link_state(ifp, LINK_STATE_FULL_DUPLEX);
- error = 0;
+out:
+ rw_exit_write(&if_cloners_lock);
-done:
- tun_put(sc);
return (error);
}
-/*
- * tunclose - close the device; if closing the real device, flush pending
- * output and unless STAYUP bring down and destroy the interface.
- */
int
tunclose(dev_t dev, int flag, int mode, struct proc *p)
{
- return (tun_dev_close(dev, p));
+ return (tun_dev_close(dev, &tun_cloner, p));
}
int
tapclose(dev_t dev, int flag, int mode, struct proc *p)
{
- return (tun_dev_close(dev, p));
+ return (tun_dev_close(dev, &tap_cloner, p));
}
int
-tun_dev_close(dev_t dev, struct proc *p)
+tun_dev_close(dev_t dev, const struct if_clone *ifc, struct proc *p)
{
struct tun_softc *sc;
struct ifnet *ifp;
int error = 0;
char name[IFNAMSIZ];
- int destroy = 0;
- sc = tun_get(dev);
- if (sc == NULL)
- return (ENXIO);
+ snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev));
- ifp = &sc->sc_if;
+ rw_enter_write(&if_cloners_lock);
- /*
- * junk all pending output
- */
- NET_LOCK();
- CLR(ifp->if_flags, IFF_UP | IFF_RUNNING);
- CLR(ifp->if_capabilities, TUN_IF_CAPS);
- NET_UNLOCK();
- ifq_purge(&ifp->if_snd);
+ if ((sc = tun_name_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto out;
+ }
- CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR);
- sigio_free(&sc->sc_sigio);
+ if (ISSET(sc->sc_flags, TUN_STAYUP)) {
+ ifp = &sc->sc_if;
- if (!ISSET(sc->sc_flags, TUN_DEAD)) {
- /* we can't hold a reference to sc before we start a dtor */
- if (!ISSET(sc->sc_flags, TUN_STAYUP)) {
- destroy = 1;
- strlcpy(name, ifp->if_xname, sizeof(name));
- } else {
- tun_link_state(ifp, LINK_STATE_DOWN);
- }
- }
+ /* junk all pending output */
+ NET_LOCK();
+ CLR(ifp->if_flags, IFF_UP | IFF_RUNNING);
+ CLR(ifp->if_capabilities, TUN_IF_CAPS);
+ NET_UNLOCK();
+ ifq_purge(&ifp->if_snd);
- sc->sc_dev = 0;
+ CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR);
+ sigio_free(&sc->sc_sigio);
- tun_put(sc);
+ tun_link_state(ifp, LINK_STATE_DOWN);
+ sc->sc_dev = 0;
+ } else
+ if_clone_destroy_locked(name);
- if (destroy)
- if_clone_destroy(name);
+out:
+ rw_exit_write(&if_cloners_lock);
return (error);
}
@@ -831,10 +760,24 @@ tun_dev_read(dev_t dev, struct uio *uio,
ifp = &sc->sc_if;
- error = ifq_deq_sleep(&ifp->if_snd, &m0, ISSET(ioflag, IO_NDELAY),
- (PZERO + 1)|PCATCH, "tunread", &sc->sc_reading, &sc->sc_dev);
- if (error != 0)
- goto put;
+ while ((m0 = ifq_dequeue(&ifp->if_snd)) == NULL) {
+ if (ISSET(ioflag, IO_NDELAY)) {
+ error = EWOULDBLOCK;
+ goto put;
+ }
+
+ sc->sc_reading = 1;
+ error = tsleep_nsec(&ifp->if_snd, (PZERO + 1) | PCATCH,
+ "tunread", INFSLP);
+ sc->sc_reading = 0;
+
+ if (error != 0)
+ goto put;
+ if (ISSET(sc->sc_flags, TUN_DEAD)) {
+ error = ENXIO;
+ goto put;
+ }
+ }
#if NBPFILTER > 0
if (ifp->if_bpf)
Index: sys/net/if_tun.h
===================================================================
RCS file: /cvs/src/sys/net/if_tun.h,v
retrieving revision 1.18
diff -u -p -r1.18 if_tun.h
--- sys/net/if_tun.h 17 Nov 2024 00:25:07 -0000 1.18
+++ sys/net/if_tun.h 13 Feb 2025 22:33:28 -0000
@@ -75,7 +75,7 @@ struct tun_hdr {
};
#define TUN_OPEN 0x0001
-#define TUN_INITED 0x0002
+#define TUN_INITED 0x0002 /* unused */
#define TUN_RCOLL 0x0004 /* unused */
#define TUN_IASET 0x0008
#define TUN_DSTADDR 0x0010
Index: sys/net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.135
diff -u -p -r1.135 if_var.h
--- sys/net/if_var.h 24 Jan 2025 09:19:07 -0000 1.135
+++ sys/net/if_var.h 13 Feb 2025 22:33:28 -0000
@@ -325,6 +325,7 @@ int niq_enlist(struct niqueue *, struct
sysctl_mq((_n), (_l), (_op), (_olp), (_np), (_nl), &(_niq)->ni_q)
extern struct ifnet_head ifnetlist;
+extern struct rwlock if_cloners_lock;
void if_start(struct ifnet *);
int if_enqueue(struct ifnet *, struct mbuf *);
@@ -355,6 +356,8 @@ int if_isconnected(const struct ifnet *,
void if_clone_attach(struct if_clone *);
+int if_clone_create_locked(const char *, int);
+int if_clone_destroy_locked(const char *);
int if_clone_create(const char *, int);
int if_clone_destroy(const char *);
Index: sys/net/ifq.c
===================================================================
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.56
diff -u -p -r1.56 ifq.c
--- sys/net/ifq.c 3 Feb 2025 08:58:52 -0000 1.56
+++ sys/net/ifq.c 13 Feb 2025 22:33:28 -0000
@@ -478,45 +478,6 @@ ifq_dequeue(struct ifqueue *ifq)
}
int
-ifq_deq_sleep(struct ifqueue *ifq, struct mbuf **mp, int nbio, int priority,
- const char *wmesg, volatile unsigned int *sleeping,
- volatile unsigned int *alive)
-{
- struct mbuf *m;
- void *cookie;
- int error = 0;
-
- ifq_deq_enter(ifq);
- if (ifq->ifq_len == 0 && nbio)
- error = EWOULDBLOCK;
- else {
- for (;;) {
- m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie);
- if (m != NULL) {
- ifq->ifq_ops->ifqop_deq_commit(ifq, m, cookie);
- ifq->ifq_len--;
- *mp = m;
- break;
- }
-
- (*sleeping)++;
- error = msleep_nsec(ifq, &ifq->ifq_mtx,
- priority, wmesg, INFSLP);
- (*sleeping)--;
- if (error != 0)
- break;
- if (!(*alive)) {
- error = EIO;
- break;
- }
- }
- }
- ifq_deq_leave(ifq);
-
- return (error);
-}
-
-int
ifq_hdatalen(struct ifqueue *ifq)
{
struct mbuf *m;
Index: sys/net/ifq.h
===================================================================
RCS file: /cvs/src/sys/net/ifq.h,v
retrieving revision 1.43
diff -u -p -r1.43 ifq.h
--- sys/net/ifq.h 3 Feb 2025 08:58:52 -0000 1.43
+++ sys/net/ifq.h 13 Feb 2025 22:33:28 -0000
@@ -446,10 +446,6 @@ void ifq_barrier(struct ifqueue *);
void ifq_set_oactive(struct ifqueue *);
void ifq_deq_set_oactive(struct ifqueue *);
-int ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int,
- const char *, volatile unsigned int *,
- volatile unsigned int *);
-
#define ifq_len(_ifq) READ_ONCE((_ifq)->ifq_len)
#define ifq_empty(_ifq) (ifq_len(_ifq) == 0)