On 02/08/2019 02:05 PM, Magnus Karlsson wrote: > This commit adds AF_XDP support to libbpf. The main reason for this is > to facilitate writing applications that use AF_XDP by offering > higher-level APIs that hide many of the details of the AF_XDP > uapi. This is in the same vein as libbpf facilitates XDP adoption by > offering easy-to-use higher level interfaces of XDP > functionality. Hopefully this will facilitate adoption of AF_XDP, make > applications using it simpler and smaller, and finally also make it > possible for applications to benefit from optimizations in the AF_XDP > user space access code. Previously, people just copied and pasted the > code from the sample application into their application, which is not > desirable. > > The interface is composed of two parts: > > * Low-level access interface to the four rings and the packet > * High-level control plane interface for creating and setting > up umems and af_xdp sockets as well as a simple XDP program. > > Tested-by: Björn Töpel <bjorn.to...@intel.com> > Signed-off-by: Magnus Karlsson <magnus.karls...@intel.com> [...] > diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c > new file mode 100644 > index 0000000..a982a76 > --- /dev/null > +++ b/tools/lib/bpf/xsk.c > @@ -0,0 +1,742 @@ > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) > + > +/* > + * AF_XDP user-space access library. > + * > + * Copyright(c) 2018 - 2019 Intel Corporation. > + * > + * Author(s): Magnus Karlsson <magnus.karls...@intel.com> > + */ > + > +#include <errno.h> > +#include <stdlib.h> > +#include <string.h> > +#include <unistd.h> > +#include <arpa/inet.h> > +#include <asm/barrier.h> > +#include <linux/compiler.h> > +#include <linux/filter.h> > +#include <linux/if_ether.h> > +#include <linux/if_link.h> > +#include <linux/if_packet.h> > +#include <linux/if_xdp.h> > +#include <linux/rtnetlink.h> > +#include <net/if.h> > +#include <sys/mman.h> > +#include <sys/socket.h> > +#include <sys/types.h> > + > +#include "bpf.h" > +#include "libbpf.h" > +#include "libbpf_util.h" > +#include "nlattr.h" > +#include "xsk.h" > + > +#ifndef SOL_XDP > + #define SOL_XDP 283 > +#endif > + > +#ifndef AF_XDP > + #define AF_XDP 44 > +#endif > + > +#ifndef PF_XDP > + #define PF_XDP AF_XDP > +#endif > + > +struct xsk_umem { > + struct xsk_ring_prod *fill; > + struct xsk_ring_cons *comp; > + char *umem_area; > + struct xsk_umem_config config; > + int fd; > + int refcount; > +}; > + > +struct xsk_socket { > + struct xsk_ring_cons *rx; > + struct xsk_ring_prod *tx; > + __u64 outstanding_tx; > + struct xsk_umem *umem; > + struct xsk_socket_config config; > + int fd; > + int xsks_map; > + int ifindex; > + int prog_fd; > + int qidconf_map_fd; > + int xsks_map_fd; > + __u32 queue_id; > +}; > + > +struct xsk_nl_info { > + bool xdp_prog_attached; > + int ifindex; > + int fd; > +}; > + > +#define MAX_QUEUES 128
Why is this a fixed constant here, shouldn't this be dynamic due to being NIC specific anyway? [...] > +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr) > +{ > + return &((char *)(umem->umem_area))[addr]; > +} There's also a xsk_umem__get_data_raw() doing the same. Why having both, resp. when to choose which? ;) > +int xsk_umem__fd(const struct xsk_umem *umem) > +{ > + return umem ? umem->fd : -EINVAL; > +} > + > +int xsk_socket__fd(const struct xsk_socket *xsk) > +{ > + return xsk ? xsk->fd : -EINVAL; > +} > + > +static bool xsk_page_aligned(void *buffer) > +{ > + unsigned long addr = (unsigned long)buffer; > + > + return !(addr & (getpagesize() - 1)); > +} > + > +static void xsk_set_umem_config(struct xsk_umem_config *cfg, > + const struct xsk_umem_config *usr_cfg) > +{ > + if (!usr_cfg) { > + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; > + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; > + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; > + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; > + return; > + } > + > + cfg->fill_size = usr_cfg->fill_size; > + cfg->comp_size = usr_cfg->comp_size; > + cfg->frame_size = usr_cfg->frame_size; > + cfg->frame_headroom = usr_cfg->frame_headroom; Just optional nit, might be a bit nicer to have it in this form: cfg->fill_size = usr_cfg ? usr_cfg->fill_size : XSK_RING_PROD__DEFAULT_NUM_DESCS; > +} > + > +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, > + const struct xsk_socket_config *usr_cfg) > +{ > + if (!usr_cfg) { > + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; > + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; > + cfg->libbpf_flags = 0; > + cfg->xdp_flags = 0; > + cfg->bind_flags = 0; > + return; > + } > + > + cfg->rx_size = usr_cfg->rx_size; > + cfg->tx_size = usr_cfg->tx_size; > + cfg->libbpf_flags = usr_cfg->libbpf_flags; > + cfg->xdp_flags = usr_cfg->xdp_flags; > + cfg->bind_flags = usr_cfg->bind_flags; (Ditto) > +} > + > +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size, > + struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, > + const struct xsk_umem_config *usr_config) > +{ > + struct xdp_mmap_offsets off; > + struct xdp_umem_reg mr; > + struct xsk_umem *umem; > + socklen_t optlen; > + void *map; > + int err; > + > + if (!umem_area || !umem_ptr || !fill || !comp) > + return -EFAULT; > + if (!size && !xsk_page_aligned(umem_area)) > + return -EINVAL; > + > + umem = calloc(1, sizeof(*umem)); > + if (!umem) > + return -ENOMEM; > + > + umem->fd = socket(AF_XDP, SOCK_RAW, 0); > + if (umem->fd < 0) { > + err = -errno; > + goto out_umem_alloc; > + } > + > + umem->umem_area = umem_area; > + xsk_set_umem_config(&umem->config, usr_config); > + > + mr.addr = (uintptr_t)umem_area; > + mr.len = size; > + mr.chunk_size = umem->config.frame_size; > + mr.headroom = umem->config.frame_headroom; > + > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); > + if (err) { > + err = -errno; > + goto out_socket; > + } > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING, > + &umem->config.fill_size, > + sizeof(umem->config.fill_size)); > + if (err) { > + err = -errno; > + goto out_socket; > + } > + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, > + &umem->config.comp_size, > + sizeof(umem->config.comp_size)); > + if (err) { > + err = -errno; > + goto out_socket; > + } > + > + optlen = sizeof(off); > + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); > + if (err) { > + err = -errno; > + goto out_socket; > + } > + > + map = xsk_mmap(NULL, off.fr.desc + > + umem->config.fill_size * sizeof(__u64), > + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, > + umem->fd, XDP_UMEM_PGOFF_FILL_RING); > + if (map == MAP_FAILED) { > + err = -errno; > + goto out_socket; > + } > + > + umem->fill = fill; > + fill->mask = umem->config.fill_size - 1; > + fill->size = umem->config.fill_size; > + fill->producer = map + off.fr.producer; > + fill->consumer = map + off.fr.consumer; > + fill->ring = map + off.fr.desc; > + fill->cached_cons = umem->config.fill_size; > + > + map = xsk_mmap(NULL, > + off.cr.desc + umem->config.comp_size * sizeof(__u64), > + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, > + umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING); > + if (map == MAP_FAILED) { > + err = -errno; > + goto out_mmap; > + } > + > + umem->comp = comp; > + comp->mask = umem->config.comp_size - 1; > + comp->size = umem->config.comp_size; > + comp->producer = map + off.cr.producer; > + comp->consumer = map + off.cr.consumer; > + comp->ring = map + off.cr.desc; > + > + *umem_ptr = umem; > + return 0; > + > +out_mmap: > + munmap(umem->fill, > + off.fr.desc + umem->config.fill_size * sizeof(__u64)); > +out_socket: > + close(umem->fd); > +out_umem_alloc: > + free(umem); > + return err; > +} > + > +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb) > +{ > + struct nlattr *tb_parsed[IFLA_XDP_MAX + 1]; > + struct xsk_nl_info *nl_info = cookie; > + struct ifinfomsg *ifinfo = msg; > + unsigned char mode; > + int err; > + > + if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index) > + return 0; > + > + if (!tb[IFLA_XDP]) > + return 0; > + > + err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP], > + NULL); > + if (err) > + return err; > + > + if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD]) > + return 0; > + > + mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]); > + if (mode == XDP_ATTACHED_NONE) > + return 0; > + > + nl_info->xdp_prog_attached = true; > + nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]); Hm, I don't think this works if I read the intention of this helper correctly. IFLA_XDP_FD is never set for retrieving the prog from the kernel. So the above is a bug. We also have bpf_get_link_xdp_id(). This should probably just be reused in this context here. > + return 0; > +} > + > +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk) > +{ > + struct xsk_nl_info nl_info; > + unsigned int nl_pid; > + char err_buf[256]; > + int sock, err; > + > + sock = libbpf_netlink_open(&nl_pid); > + if (sock < 0) > + return false; > + > + nl_info.xdp_prog_attached = false; > + nl_info.ifindex = xsk->ifindex; > + nl_info.fd = -1; > + > + err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info); > + if (err) { > + libbpf_strerror(err, err_buf, sizeof(err_buf)); > + pr_warning("Error:\n%s\n", err_buf); > + close(sock); > + return false; > + } > + > + close(sock); > + xsk->prog_fd = nl_info.fd; > + return nl_info.xdp_prog_attached; > +} (See bpf_get_link_xdp_id().) > + > +static int xsk_load_xdp_prog(struct xsk_socket *xsk) > +{ > + char bpf_log_buf[BPF_LOG_BUF_SIZE]; > + int err, prog_fd; > + > + /* This is the C-program: > + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) > + * { > + * int *qidconf, index = ctx->rx_queue_index; [...] > + > +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, > + __u32 queue_id, struct xsk_umem *umem, > + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, > + const struct xsk_socket_config *usr_config) > +{ > + struct sockaddr_xdp sxdp = {}; > + struct xdp_mmap_offsets off; > + struct xsk_socket *xsk; > + socklen_t optlen; > + void *map; > + int err; > + > + if (!umem || !xsk_ptr || !rx || !tx) > + return -EFAULT; > + > + if (umem->refcount) { > + pr_warning("Error: shared umems not supported by libbpf.\n"); > + return -EBUSY; > + } > + > + xsk = calloc(1, sizeof(*xsk)); > + if (!xsk) > + return -ENOMEM; > + > + if (umem->refcount++ > 0) { Should this refcount rather be atomic actually? > + xsk->fd = socket(AF_XDP, SOCK_RAW, 0); > + if (xsk->fd < 0) { > + err = -errno; > + goto out_xsk_alloc; > + } > + } else { > + xsk->fd = umem->fd; > + } > + > + xsk->outstanding_tx = 0; > + xsk->queue_id = queue_id; > + xsk->umem = umem; > + xsk->ifindex = if_nametoindex(ifname); > + if (!xsk->ifindex) { > + err = -errno; > + goto out_socket; > + } > + > + xsk_set_xdp_socket_config(&xsk->config, usr_config); [...]