This is basically pair(4), except that it works at layer 3 instead of at the Ethernet layer. I have a few reasons for this.
pair(4) basically exists to move packets between rdomains, which are a layer 3 construct. Pushing an Ethernet header onto an mbuf data payload and then pulling it off again on the other side of the link is a waste of time. rport(4) instead just stores the packet's address family in the mbuf header when sending a packet, and switches between layer 3 input routines on the receiving side. It doesn't have to touch the data payload at all.

Ethernet as a broadcast medium also implies a bunch of address-related issues. Having to do ARP/NDP between rdomains feels like a waste of time. Having to allocate a subnet or keep addresses adjacent on an IPv4 subnet, or using a whole /64 for IPv6, is annoying. rport(4), as a point-to-point interface, lets you use any address and doesn't waste network or broadcast addresses.

pair(4), as an Ethernet interface, technically supports all the virtual Ethernet interfaces you can stack on a real interface. What does a vlan(4) on a pair(4) interface mean? Does it make sense to add a pair interface to a bridge or tpmr? My gut feeling is that none of that makes sense and we'd be better off without these options.

I also made rport(4) MP-safe, and have a way to move packets around in parallel in the future too.
my test config (shell script) looked like this:

ifconfig lo1 create rdomain 1
ifconfig lo1 inet 127.0.0.1/8
ifconfig rport0 create
ifconfig rport1 create rdomain 1
ifconfig rport0 inet 169.254.169.254/32 169.254.254.169
ifconfig rport1 inet 169.254.254.169/32 169.254.169.254
ifconfig rport0 parent rport1
ifconfig rport0 up
ifconfig rport1 up

Index: net/if_rport.c
===================================================================
RCS file: net/if_rport.c
diff -N net/if_rport.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ net/if_rport.c	9 Jan 2023 04:13:20 -0000
@@ -0,0 +1,482 @@
+/*	$OpenBSD$ */
+
+/*
+ * Copyright (c) 2023 David Gwynne <d...@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_types.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_ipip.h>
+#include <netinet/ip_ecn.h>
+
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif /* INET6 */
+
+#include "bpfilter.h"
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#endif
+
+#ifdef MPLS
+#include <netmpls/mpls.h>
+#endif
+
+#include "pf.h"
+#if NPF > 0
+#include <net/pfvar.h>
+#endif
+
+#define RPORT_MTU_MIN		1280
+#define RPORT_MTU_MAX		32768 /* LOMTU, but could be higher */
+#define RPORT_MTU_DEFAULT	RPORT_MTU_MAX
+
+struct rport_softc {
+	struct ifnet			 sc_if;
+
+	unsigned int			 sc_peer_idx;	/* peer if_index, 0 = none */
+};
+
+static int	rport_clone_create(struct if_clone *, int);
+static int	rport_clone_destroy(struct ifnet *);
+
+static int	rport_ioctl(struct ifnet *, u_long, caddr_t);
+static int	rport_output(struct ifnet *, struct mbuf *, struct sockaddr *,
+		    struct rtentry *);
+static int	rport_enqueue(struct ifnet *, struct mbuf *);
+static void	rport_start(struct ifqueue *);
+static void	rport_input(struct ifnet *, struct mbuf *);
+
+static int	rport_up(struct rport_softc *);
+static int	rport_down(struct rport_softc *);
+
+static int	rport_set_parent(struct rport_softc *,
+		    const struct if_parent *);
+static int	rport_get_parent(struct rport_softc *, struct if_parent *);
+static int	rport_del_parent(struct rport_softc *);
+
+static struct if_clone rport_cloner =
+    IF_CLONE_INITIALIZER("rport", rport_clone_create, rport_clone_destroy);
+
+static struct rwlock rport_interfaces_lock =
+    RWLOCK_INITIALIZER("rports");
+
+void
+rportattach(int count)
+{
+	if_clone_attach(&rport_cloner);
+}
+
+/* Allocate, initialise and attach a new rport interface with no peer. */
+static int
+rport_clone_create(struct if_clone *ifc, int unit)
+{
+	struct rport_softc *sc;
+	struct ifnet *ifp;
+
+	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
+	ifp = &sc->sc_if;
+
+	snprintf(ifp->if_xname, sizeof(ifp->if_xname),
+	    "%s%d", ifc->ifc_name, unit);
+
+	ifp->if_mtu = RPORT_MTU_DEFAULT;
+	ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
+	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
+	ifp->if_ioctl = rport_ioctl;
+	ifp->if_bpf_mtap = p2p_bpf_mtap;
+	ifp->if_output = rport_output;
+	ifp->if_enqueue = rport_enqueue;
+	ifp->if_qstart = rport_start;
+	ifp->if_input = rport_input;
+	ifp->if_rtrequest = p2p_rtrequest;
+	ifp->if_type = IFT_TUNNEL;
+	ifp->if_softc = sc;
+
+	if_attach(ifp);
+	if_alloc_sadl(ifp);
+	if_counters_alloc(ifp);
+
+#if NBPFILTER > 0
+	bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(uint32_t));
+#endif
+
+	return (0);
+}
+
+/* Stop the interface, unlink it from its peer, then detach and free it. */
+static int
+rport_clone_destroy(struct ifnet *ifp)
+{
+	struct rport_softc *sc = ifp->if_softc;
+
+	NET_LOCK();
+	if (ISSET(ifp->if_flags, IFF_RUNNING))
+		rport_down(sc);
+	/*
+	 * NOTE(review): rport_del_parent() takes its lock with RW_INTR and
+	 * can fail; the result is ignored here, which would leave the peer
+	 * still holding this interface's index. Confirm this is acceptable.
+	 */
+	rport_del_parent(sc);
+	NET_UNLOCK();
+
+	if_detach(ifp);
+
+	free(sc, M_DEVBUF, sizeof(*sc));
+
+	return (0);
+}
+
+/*
+ * Check the interface state and address family, tag the mbuf to catch
+ * loops, record the family in the packet header and queue the packet.
+ */
+static int
+rport_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+    struct rtentry *rt)
+{
+	struct m_tag *mtag;
+	int error = 0;
+
+	if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
+		error = ENETDOWN;
+		goto drop;
+	}
+
+	switch (dst->sa_family) {
+	case AF_INET:
+#ifdef INET6
+	case AF_INET6:
+#endif
+#ifdef MPLS
+	case AF_MPLS:
+#endif
+		break;
+	default:
+		error = EAFNOSUPPORT;
+		goto drop;
+	}
+
+	/* Try to limit infinite recursion through misconfiguration. */
+	mtag = NULL;
+	while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
+		if (*(int *)(mtag + 1) == ifp->if_index) {
+			error = EIO;
+			goto drop;
+		}
+	}
+
+	mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
+	if (mtag == NULL) {
+		error = ENOBUFS;
+		goto drop;
+	}
+	*(int *)(mtag + 1) = ifp->if_index;
+	m_tag_prepend(m, mtag);
+
+	m->m_flags &= ~(M_BCAST|M_MCAST);
+	m->m_pkthdr.ph_family = dst->sa_family;
+#if NPF > 0
+	pf_pkt_addr_changed(m);
+#endif
+
+	error = if_enqueue(ifp, m);
+	if (error)
+		counters_inc(ifp->if_counters, ifc_oerrors);
+
+	return (error);
+
+drop:
+	m_freem(m);
+	return (error);
+}
+
+static int
+rport_enqueue(struct ifnet *ifp, struct mbuf *m)
+{
+	struct ifqueue *ifq = &ifp->if_snd;
+	int error;
+
+	error = ifq_enqueue(ifq, m);
+	if (error)
+		return (error);
+
+	/*
+	 * always defer handover of packets to the peer to the ifq
+	 * bundle task to provide control over the NET_LOCK scope.
+	 */
+	task_add(ifq->ifq_softnet, &ifq->ifq_bundle);
+
+	return (0);
+}
+
+/*
+ * Pass every queued packet to the peer's input path, or purge the queue
+ * if there is no peer or the peer is not running.
+ */
+static void
+rport_start(struct ifqueue *ifq)
+{
+	struct ifnet *ifp = ifq->ifq_if;
+	struct rport_softc *sc = ifp->if_softc;
+	struct ifnet *ifp0;
+	struct mbuf *m;
+
+	ifp0 = if_get(sc->sc_peer_idx);
+	if (ifp0 == NULL || !ISSET(ifp0->if_flags, IFF_RUNNING)) {
+		ifq_purge(ifq);
+		if_put(ifp0);
+		return;
+	}
+
+	NET_LOCK_SHARED();
+	while ((m = ifq_dequeue(ifq)) != NULL) {
+#if NBPFILTER > 0
+		caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
+		if (if_bpf && bpf_mtap_af(if_bpf, m->m_pkthdr.ph_family,
+		    m, BPF_DIRECTION_OUT)) {
+			m_freem(m);
+			continue;
+		}
+#endif
+
+		if_vinput(ifp0, m);
+	}
+	NET_UNLOCK_SHARED();
+
+	if_put(ifp0);
+}
+
+/* Dispatch to the layer 3 input routine for the family set on output. */
+static void
+rport_input(struct ifnet *ifp, struct mbuf *m)
+{
+	switch (m->m_pkthdr.ph_family) {
+	case AF_INET:
+		ipv4_input(ifp, m);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ipv6_input(ifp, m);
+		break;
+#endif
+#ifdef MPLS
+	case AF_MPLS:
+		mpls_input(ifp, m);
+		break;
+#endif
+	default:
+		counters_inc(ifp->if_counters, ifc_noproto);
+		m_freem(m);
+		break;
+	}
+}
+
+/* Mark the interface as running; the net lock must be held. */
+static int
+rport_up(struct rport_softc *sc)
+{
+	NET_ASSERT_LOCKED();
+
+	SET(sc->sc_if.if_flags, IFF_RUNNING);
+
+	return (0);
+}
+
+/* Clear the running flag; the net lock must be held. */
+static int
+rport_down(struct rport_softc *sc)
+{
+	NET_ASSERT_LOCKED();
+
+	CLR(sc->sc_if.if_flags, IFF_RUNNING);
+
+	return (0);
+}
+
+/*
+ * Pair this interface with the rport named in p. Fails with EBUSY if
+ * either end already has a different peer.
+ */
+static int
+rport_set_parent(struct rport_softc *sc, const struct if_parent *p)
+{
+	struct ifnet *ifp = &sc->sc_if;
+	struct ifnet *ifp0;
+	struct rport_softc *sc0;
+	int error;
+
+	error = rw_enter(&rport_interfaces_lock, RW_WRITE | RW_INTR);
+	if (error != 0)
+		return (error);
+
+	ifp0 = if_unit(p->ifp_parent);
+	if (ifp0 == NULL) {
+		error = EINVAL;
+		goto leave;
+	}
+
+	if (ifp0 == ifp) {
+		/* if_unit() took a reference; release it on the way out */
+		error = EINVAL;
+		goto put;
+	}
+
+	if (ifp0->if_input != rport_input) {
+		error = EPROTONOSUPPORT;
+		goto put;
+	}
+
+	sc0 = ifp0->if_softc;
+
+	if (sc->sc_peer_idx == ifp0->if_index) {
+		/* nop */
+		KASSERT(sc0->sc_peer_idx == ifp->if_index);
+		goto put;
+	}
+
+	if (sc->sc_peer_idx != 0 || sc0->sc_peer_idx != 0) {
+		error = EBUSY;
+		goto put;
+	}
+
+	/* commit */
+	sc->sc_peer_idx = ifp0->if_index;
+	sc0->sc_peer_idx = ifp->if_index;
+
+put:
+	if_put(ifp0);
+leave:
+	rw_exit(&rport_interfaces_lock);
+
+	return (error);
+}
+
+/* Copy the peer's name into p; EADDRNOTAVAIL if no peer is found. */
+static int
+rport_get_parent(struct rport_softc *sc, struct if_parent *p)
+{
+	struct ifnet *ifp0;
+	int error = 0;
+
+	ifp0 = if_get(sc->sc_peer_idx);
+	if (ifp0 == NULL)
+		error = EADDRNOTAVAIL;
+	else {
+		if (strlcpy(p->ifp_parent, ifp0->if_xname,
+		    sizeof(p->ifp_parent)) >= sizeof(p->ifp_parent))
+			panic("%s strlcpy", __func__);
+	}
+	if_put(ifp0);
+
+	return (error);
+}
+
+/* Clear the peer index on this interface and on its peer, if any. */
+static int
+rport_del_parent(struct rport_softc *sc)
+{
+	struct rport_softc *sc0;
+	struct ifnet *ifp0;
+	int error;
+
+	error = rw_enter(&rport_interfaces_lock, RW_WRITE | RW_INTR);
+	if (error != 0)
+		return (error);
+
+	ifp0 = if_get(sc->sc_peer_idx);
+	sc->sc_peer_idx = 0;
+
+	if (ifp0 != NULL) {
+		sc0 = ifp0->if_softc;
+		sc0->sc_peer_idx = 0;
+	}
+	if_put(ifp0);
+
+	rw_exit(&rport_interfaces_lock);
+
+	return (0);
+}
+
+/* ioctl handler: interface flags, MTU and peer ("parent") settings. */
+static int
+rport_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	struct rport_softc *sc = ifp->if_softc;
+	struct ifreq *ifr = (struct ifreq *)data;
+	int error = 0;
+
+	switch (cmd) {
+	case SIOCSIFADDR:
+		break;
+	case SIOCSIFFLAGS:
+		if (ISSET(ifp->if_flags, IFF_UP)) {
+			if (!ISSET(ifp->if_flags, IFF_RUNNING))
+				error = rport_up(sc);
+		} else {
+			if (ISSET(ifp->if_flags, IFF_RUNNING))
+				error = rport_down(sc);
+		}
+		break;
+
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		break;
+
+	case SIOCSIFMTU:
+		if (ifr->ifr_mtu < RPORT_MTU_MIN ||
+		    ifr->ifr_mtu > RPORT_MTU_MAX) {
+			error = EINVAL;
+			break;
+		}
+
+		ifp->if_mtu = ifr->ifr_mtu;
+		break;
+
+	case SIOCSIFPARENT:
+		error = rport_set_parent(sc, (struct if_parent *)data);
+		break;
+	case SIOCGIFPARENT:
+		error = rport_get_parent(sc, (struct if_parent *)data);
+		break;
+	case SIOCDIFPARENT:
+		error = rport_del_parent(sc);
+		break;
+
+	default:
+		error = ENOTTY;
+		break;
+	}
+
+	return (error);
+}
Index: conf/GENERIC
===================================================================
RCS file: /cvs/src/sys/conf/GENERIC,v
retrieving revision 1.286
diff -u -p -r1.286 GENERIC
--- conf/GENERIC	30 Sep 2022 02:56:23 -0000	1.286
+++ conf/GENERIC	9 Jan 2023 04:13:20 -0000
@@ -96,6 +96,7 @@ pseudo-device	mpe		# MPLS PE interface
 pseudo-device	mpw		# MPLS pseudowire support
 pseudo-device	mpip		# MPLS IP Layer2 pseudowire support
 pseudo-device	bpe		# Provider Backbone Bridge edge interface
+pseudo-device	rport		# rdomain port interface
 pseudo-device	pair		# Virtual Ethernet interface pair
 pseudo-device	ppp		# PPP
 pseudo-device	pppoe		# PPP over Ethernet (RFC 2516)
Index: conf/files
===================================================================
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.720
diff -u -p -r1.720 files
--- conf/files	22 Dec 2022 05:59:26 -0000	1.720
+++ conf/files	9 Jan 2023 04:13:20 -0000
@@ -547,6 +547,7 @@ pseudo-device
msts: tty pseudo-device endrun: tty pseudo-device loop: ifnet +pseudo-device rport: ifnet pseudo-device pair: ifnet, ether pseudo-device ppp: ifnet pseudo-device tun: ifnet @@ -833,6 +834,7 @@ file net/if_mpw.c mpw file net/if_mpip.c mpip file net/if_bpe.c bpe needs-count file net/if_vether.c vether +file net/if_rport.c rport file net/if_pair.c pair file net/if_pppx.c pppx needs-count file net/if_vxlan.c vxlan needs-count