On Fri, 29 Dec 2017 18:00:04 +0800 Jason Wang <[email protected]> wrote:
> This patch implements XDP transmission for TAP. Since we can't create
> new queues for TAP during XDP set, the existing ptr_ring is reused for
> queuing XDP buffers. To differentiate an xdp_buff from an sk_buff,
> TUN_XDP_FLAG (0x1ULL) is encoded into the lowest bit of the xdp_buff
> pointer during ptr_ring_produce, and decoded during consumption. XDP
> metadata is stored in the headroom of the packet, which should work in
> most cases since drivers usually reserve enough headroom. Very minor
> changes were done for vhost_net: it just needs to peek the length
> depending on the type of pointer.
>
> Tests were done on two Intel E5-2630 2.40GHz machines connected back to
> back through two 82599ES NICs. Traffic was generated through MoonGen,
> and testpmd (rxonly) in the guest reports 2.97 Mpps when
> xdp_redirect_map is doing redirection from ixgbe to TAP.

IMHO a performance measurement without something to compare against is
useless. What was the performance before?

> Cc: Jesper Dangaard Brouer <[email protected]>
> Signed-off-by: Jason Wang <[email protected]>
> ---
>  drivers/net/tun.c      | 205 ++++++++++++++++++++++++++++++++++++++++---------
>  drivers/vhost/net.c    |  13 +++-
>  include/linux/if_tun.h |  17 ++++
>  3 files changed, 197 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 2c89efe..be6d993 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -240,6 +240,24 @@ struct tun_struct {
>          struct tun_steering_prog __rcu *steering_prog;
>  };
>
> +bool tun_is_xdp_buff(void *ptr)
> +{
> +        return (unsigned long)ptr & TUN_XDP_FLAG;
> +}
> +EXPORT_SYMBOL(tun_is_xdp_buff);
> +
> +void *tun_xdp_to_ptr(void *ptr)
> +{
> +        return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
> +}
> +EXPORT_SYMBOL(tun_xdp_to_ptr);
> +
> +void *tun_ptr_to_xdp(void *ptr)
> +{
> +        return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
> +}
> +EXPORT_SYMBOL(tun_ptr_to_xdp);
> +
>  static int tun_napi_receive(struct napi_struct *napi, int budget)
>  {
>          struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> @@ -630,12 +648,25 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
>          return tun;
>  }
>
> +static void tun_ptr_free(void *ptr)
> +{
> +        if (!ptr)
> +                return;
> +        if (tun_is_xdp_buff(ptr)) {
> +                struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> +                put_page(virt_to_head_page(xdp->data));

(Yet another XDP-free call point; I need to convert this to use an
accessor later, to transition drivers into an xdp_buff return API.)
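For illustration only, a minimal sketch of the kind of accessor hinted
at above, assuming the memory model stays page-refcnt based for now
(the helper name and its placement are hypothetical, nothing like it
exists today):

/* Hypothetical core helper: hide how an XDP buffer's backing memory is
 * returned, so drivers stop open-coding put_page(virt_to_head_page())
 * at every free call point.
 */
static inline void xdp_return_buff_page(struct xdp_buff *xdp)
{
        /* Page-refcnt based today; a later return API could dispatch
         * on an allocator/memory type carried along with the xdp_buff.
         */
        put_page(virt_to_head_page(xdp->data));
}

With such an accessor, tun_ptr_free() above would simply call it
instead of hard-coding the page-refcnt assumption into the driver.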
> +        } else {
> +                __skb_array_destroy_skb(ptr);
> +        }
> +}
> +
>  static void tun_queue_purge(struct tun_file *tfile)
>  {
> -        struct sk_buff *skb;
> +        void *ptr;
>
> -        while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> -                kfree_skb(skb);
> +        while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> +                tun_ptr_free(ptr);
>
>          skb_queue_purge(&tfile->sk.sk_write_queue);
>          skb_queue_purge(&tfile->sk.sk_error_queue);
> @@ -688,8 +719,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>                  unregister_netdevice(tun->dev);
>          }
>          if (tun)
> -                ptr_ring_cleanup(&tfile->tx_ring,
> -                                 __skb_array_destroy_skb);
> +                ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
>          sock_put(&tfile->sk);
>          }
>  }
> @@ -1201,6 +1231,54 @@ static const struct net_device_ops tun_netdev_ops = {
>          .ndo_get_stats64        = tun_net_get_stats64,
>  };
>
> +static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
> +{
> +        struct tun_struct *tun = netdev_priv(dev);
> +        struct xdp_buff *buff = xdp->data_hard_start;
> +        int headroom = xdp->data - xdp->data_hard_start;
> +        struct tun_file *tfile;
> +        u32 numqueues;
> +        int ret = 0;
> +
> +        /* Assure headroom is available and buff is properly aligned */
> +        if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
> +                return -ENOSPC;
> +
> +        *buff = *xdp;
> +
> +        rcu_read_lock();
> +
> +        numqueues = READ_ONCE(tun->numqueues);
> +        if (!numqueues) {
> +                ret = -ENOSPC;
> +                goto out;
> +        }
> +        tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
> +                                            numqueues]);

Several concurrent CPUs can get the same 'tfile'.

> +        /* Encode the XDP flag into lowest bit for consumer to differ
> +         * XDP buffer from sk_buff.
> +         */
> +        if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
> +                this_cpu_inc(tun->pcpu_stats->tx_dropped);
> +                ret = -ENOSPC;
> +        }

The ptr_ring_produce() will take a lock per packet, limiting the
performance. (Again a case where I would have liked a bulk API for
ndo_xdp_xmit().)
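To make the bulking point concrete, a rough sketch of what a bulk
produce on the tun side could look like, assuming the caller already
holds a batch of TUN_XDP_FLAG-encoded pointers (there is no bulk
ndo_xdp_xmit signature today, so the helper below is purely
hypothetical):

/* Sketch: queue a batch of flag-encoded pointers while taking the
 * producer lock once, instead of once per packet as ptr_ring_produce()
 * does internally.
 */
static int tun_ring_produce_batch(struct ptr_ring *ring, void **ptrs, int n)
{
        int i;

        spin_lock(&ring->producer_lock);
        for (i = 0; i < n; i++) {
                /* __ptr_ring_produce() expects the producer lock held */
                if (__ptr_ring_produce(ring, ptrs[i]))
                        break;  /* ring full; caller frees ptrs[i..n-1] */
        }
        spin_unlock(&ring->producer_lock);

        return i;       /* number of buffers actually queued */
}

Amortizing the lock this way only pays off once ndo_xdp_xmit() itself
can hand the driver more than one frame at a time.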
> +
> +out:
> +        rcu_read_unlock();
> +        return ret;
> +}
> +
> +static void tun_xdp_flush(struct net_device *dev)
> +{
> +        struct tun_struct *tun = netdev_priv(dev);
> +        struct tun_file *tfile = tun->tfiles[0];
> +
> +        /* Notify and wake up reader process */
> +        if (tfile->flags & TUN_FASYNC)
> +                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> +        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> +}
> +
>  static const struct net_device_ops tap_netdev_ops = {
>          .ndo_uninit             = tun_net_uninit,
>          .ndo_open               = tun_net_open,
> @@ -1218,6 +1296,8 @@ static const struct net_device_ops tap_netdev_ops = {
>          .ndo_set_rx_headroom    = tun_set_headroom,
>          .ndo_get_stats64        = tun_net_get_stats64,
>          .ndo_bpf                = tun_xdp,
> +        .ndo_xdp_xmit           = tun_xdp_xmit,
> +        .ndo_xdp_flush          = tun_xdp_flush,
>  };
>
>  static void tun_flow_init(struct tun_struct *tun)
> @@ -1841,6 +1921,40 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
>          return result;
>  }
>
> +static ssize_t tun_put_user_xdp(struct tun_struct *tun,
> +                                struct tun_file *tfile,
> +                                struct xdp_buff *xdp,
> +                                struct iov_iter *iter)
> +{
> +        int vnet_hdr_sz = 0;
> +        size_t size = xdp->data_end - xdp->data;
> +        struct tun_pcpu_stats *stats;
> +        size_t ret;
> +
> +        if (tun->flags & IFF_VNET_HDR) {
> +                struct virtio_net_hdr gso = { 0 };
> +
> +                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> +                if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> +                        return -EINVAL;
> +                if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> +                             sizeof(gso)))
> +                        return -EFAULT;
> +                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +        }
> +
> +        ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
> +
> +        stats = get_cpu_ptr(tun->pcpu_stats);
> +        u64_stats_update_begin(&stats->syncp);
> +        stats->tx_packets++;
> +        stats->tx_bytes += ret;
> +        u64_stats_update_end(&stats->syncp);
> +        put_cpu_ptr(tun->pcpu_stats);
> +
> +        return ret;
> +}
> +
>  /* Put packet to the user space buffer */
>  static ssize_t tun_put_user(struct tun_struct *tun,
>                              struct tun_file *tfile,

[...]

>                          error = -ERESTARTSYS;
> @@ -1977,36 +2090,42 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
>
>  out:
>          *err = error;
> -        return skb;
> +        return ptr;
>  }
>
>  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>                             struct iov_iter *to,
> -                           int noblock, struct sk_buff *skb)
> +                           int noblock, void *ptr)
>  {
>          ssize_t ret;
>          int err;
>
>          tun_debug(KERN_INFO, tun, "tun_do_read\n");
>
> -        if (!iov_iter_count(to)) {
> -                if (skb)
> -                        kfree_skb(skb);
> -                return 0;
> -        }
> +        if (!iov_iter_count(to))
> +                tun_ptr_free(ptr);
>
> -        if (!skb) {
> +        if (!ptr) {
>                  /* Read frames from ring */
> -                skb = tun_ring_recv(tfile, noblock, &err);
> -                if (!skb)
> +                ptr = tun_ring_recv(tfile, noblock, &err);
> +                if (!ptr)
>                          return err;
>          }
>
> -        ret = tun_put_user(tun, tfile, skb, to);
> -        if (unlikely(ret < 0))
> -                kfree_skb(skb);
> -        else
> -                consume_skb(skb);
> +        if (tun_is_xdp_buff(ptr)) {
> +                struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> +                ret = tun_put_user_xdp(tun, tfile, xdp, to);
> +                put_page(virt_to_head_page(xdp->data));

This again ties us into a page-refcnt based XDP scheme.

> +        } else {
> +                struct sk_buff *skb = ptr;
> +
> +                ret = tun_put_user(tun, tfile, skb, to);
> +                if (unlikely(ret < 0))
> +                        kfree_skb(skb);
> +                else
> +                        consume_skb(skb);
> +        }
>
>          return ret;
>  }

[...]

> @@ -3191,8 +3323,7 @@ struct socket *tun_get_socket(struct file *file)
>          struct tun_file *tfile;
>          if (file->f_op != &tun_fops)
>                  return ERR_PTR(-EINVAL);
> -        tfile = file->private_data;
> -        if (!tfile)
> +        tfile = file->private_data; if (!tfile)

This change looks like a typo...

>                  return ERR_PTR(-EBADFD);
>          return &tfile->socket;
>  }

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
