On 02/08/2019 02:05 PM, Magnus Karlsson wrote:
> This commit adds AF_XDP support to libbpf. The main reason for this is
> to facilitate writing applications that use AF_XDP by offering
> higher-level APIs that hide many of the details of the AF_XDP
> uapi. This is in the same vein as libbpf facilitates XDP adoption by
> offering easy-to-use higher level interfaces of XDP
> functionality. Hopefully this will facilitate adoption of AF_XDP, make
> applications using it simpler and smaller, and finally also make it
> possible for applications to benefit from optimizations in the AF_XDP
> user space access code. Previously, people just copied and pasted the
> code from the sample application into their application, which is not
> desirable.
> 
> The interface is composed of two parts:
> 
> * Low-level access interface to the four rings and the packet
> * High-level control plane interface for creating and setting
>   up umems and af_xdp sockets as well as a simple XDP program.
> 
> Tested-by: Björn Töpel <bjorn.to...@intel.com>
> Signed-off-by: Magnus Karlsson <magnus.karls...@intel.com>
[...]
> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> new file mode 100644
> index 0000000..a982a76
> --- /dev/null
> +++ b/tools/lib/bpf/xsk.c
> @@ -0,0 +1,742 @@
> +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> +
> +/*
> + * AF_XDP user-space access library.
> + *
> + * Copyright(c) 2018 - 2019 Intel Corporation.
> + *
> + * Author(s): Magnus Karlsson <magnus.karls...@intel.com>
> + */
> +
> +#include <errno.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <arpa/inet.h>
> +#include <asm/barrier.h>
> +#include <linux/compiler.h>
> +#include <linux/filter.h>
> +#include <linux/if_ether.h>
> +#include <linux/if_link.h>
> +#include <linux/if_packet.h>
> +#include <linux/if_xdp.h>
> +#include <linux/rtnetlink.h>
> +#include <net/if.h>
> +#include <sys/mman.h>
> +#include <sys/socket.h>
> +#include <sys/types.h>
> +
> +#include "bpf.h"
> +#include "libbpf.h"
> +#include "libbpf_util.h"
> +#include "nlattr.h"
> +#include "xsk.h"
> +
> +#ifndef SOL_XDP
> + #define SOL_XDP 283
> +#endif
> +
> +#ifndef AF_XDP
> + #define AF_XDP 44
> +#endif
> +
> +#ifndef PF_XDP
> + #define PF_XDP AF_XDP
> +#endif
> +
> +struct xsk_umem {
> +     struct xsk_ring_prod *fill;
> +     struct xsk_ring_cons *comp;
> +     char *umem_area;
> +     struct xsk_umem_config config;
> +     int fd;
> +     int refcount;
> +};
> +
> +struct xsk_socket {
> +     struct xsk_ring_cons *rx;
> +     struct xsk_ring_prod *tx;
> +     __u64 outstanding_tx;
> +     struct xsk_umem *umem;
> +     struct xsk_socket_config config;
> +     int fd;
> +     int xsks_map;
> +     int ifindex;
> +     int prog_fd;
> +     int qidconf_map_fd;
> +     int xsks_map_fd;
> +     __u32 queue_id;
> +};
> +
> +struct xsk_nl_info {
> +     bool xdp_prog_attached;
> +     int ifindex;
> +     int fd;
> +};
> +
> +#define MAX_QUEUES 128

Why is this a fixed constant here? Shouldn't it be dynamic, given that it is
NIC-specific anyway?

[...]
> +void *xsk_umem__get_data(struct xsk_umem *umem, __u64 addr)
> +{
> +     return &((char *)(umem->umem_area))[addr];
> +}

There's also xsk_umem__get_data_raw(), which does the same thing. Why have both,
and when should one choose which? ;)

> +int xsk_umem__fd(const struct xsk_umem *umem)
> +{
> +     return umem ? umem->fd : -EINVAL;
> +}
> +
> +int xsk_socket__fd(const struct xsk_socket *xsk)
> +{
> +     return xsk ? xsk->fd : -EINVAL;
> +}
> +
> +static bool xsk_page_aligned(void *buffer)
> +{
> +     unsigned long addr = (unsigned long)buffer;
> +
> +     return !(addr & (getpagesize() - 1));
> +}
> +
> +static void xsk_set_umem_config(struct xsk_umem_config *cfg,
> +                             const struct xsk_umem_config *usr_cfg)
> +{
> +     if (!usr_cfg) {
> +             cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +             cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +             cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
> +             cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
> +             return;
> +     }
> +
> +     cfg->fill_size = usr_cfg->fill_size;
> +     cfg->comp_size = usr_cfg->comp_size;
> +     cfg->frame_size = usr_cfg->frame_size;
> +     cfg->frame_headroom = usr_cfg->frame_headroom;

Just an optional nit: it might be a bit nicer to have it in this form:

        cfg->fill_size = usr_cfg ? usr_cfg->fill_size :
                         XSK_RING_PROD__DEFAULT_NUM_DESCS;


> +}
> +
> +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
> +                                   const struct xsk_socket_config *usr_cfg)
> +{
> +     if (!usr_cfg) {
> +             cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
> +             cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
> +             cfg->libbpf_flags = 0;
> +             cfg->xdp_flags = 0;
> +             cfg->bind_flags = 0;
> +             return;
> +     }
> +
> +     cfg->rx_size = usr_cfg->rx_size;
> +     cfg->tx_size = usr_cfg->tx_size;
> +     cfg->libbpf_flags = usr_cfg->libbpf_flags;
> +     cfg->xdp_flags = usr_cfg->xdp_flags;
> +     cfg->bind_flags = usr_cfg->bind_flags;

(Ditto)

> +}
> +
> +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
> +                  struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
> +                  const struct xsk_umem_config *usr_config)
> +{
> +     struct xdp_mmap_offsets off;
> +     struct xdp_umem_reg mr;
> +     struct xsk_umem *umem;
> +     socklen_t optlen;
> +     void *map;
> +     int err;
> +
> +     if (!umem_area || !umem_ptr || !fill || !comp)
> +             return -EFAULT;
> +     if (!size && !xsk_page_aligned(umem_area))
> +             return -EINVAL;
> +
> +     umem = calloc(1, sizeof(*umem));
> +     if (!umem)
> +             return -ENOMEM;
> +
> +     umem->fd = socket(AF_XDP, SOCK_RAW, 0);
> +     if (umem->fd < 0) {
> +             err = -errno;
> +             goto out_umem_alloc;
> +     }
> +
> +     umem->umem_area = umem_area;
> +     xsk_set_umem_config(&umem->config, usr_config);
> +
> +     mr.addr = (uintptr_t)umem_area;
> +     mr.len = size;
> +     mr.chunk_size = umem->config.frame_size;
> +     mr.headroom = umem->config.frame_headroom;
> +
> +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
> +     if (err) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING,
> +                      &umem->config.fill_size,
> +                      sizeof(umem->config.fill_size));
> +     if (err) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +     err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
> +                      &umem->config.comp_size,
> +                      sizeof(umem->config.comp_size));
> +     if (err) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +
> +     optlen = sizeof(off);
> +     err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
> +     if (err) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +
> +     map = xsk_mmap(NULL, off.fr.desc +
> +                    umem->config.fill_size * sizeof(__u64),
> +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +                    umem->fd, XDP_UMEM_PGOFF_FILL_RING);
> +     if (map == MAP_FAILED) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +
> +     umem->fill = fill;
> +     fill->mask = umem->config.fill_size - 1;
> +     fill->size = umem->config.fill_size;
> +     fill->producer = map + off.fr.producer;
> +     fill->consumer = map + off.fr.consumer;
> +     fill->ring = map + off.fr.desc;
> +     fill->cached_cons = umem->config.fill_size;
> +
> +     map = xsk_mmap(NULL,
> +                    off.cr.desc + umem->config.comp_size * sizeof(__u64),
> +                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
> +                    umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
> +     if (map == MAP_FAILED) {
> +             err = -errno;
> +             goto out_mmap;
> +     }
> +
> +     umem->comp = comp;
> +     comp->mask = umem->config.comp_size - 1;
> +     comp->size = umem->config.comp_size;
> +     comp->producer = map + off.cr.producer;
> +     comp->consumer = map + off.cr.consumer;
> +     comp->ring = map + off.cr.desc;
> +
> +     *umem_ptr = umem;
> +     return 0;
> +
> +out_mmap:
> +     munmap(umem->fill,
> +            off.fr.desc + umem->config.fill_size * sizeof(__u64));
> +out_socket:
> +     close(umem->fd);
> +out_umem_alloc:
> +     free(umem);
> +     return err;
> +}
> +
> +static int xsk_parse_nl(void *cookie, void *msg, struct nlattr **tb)
> +{
> +     struct nlattr *tb_parsed[IFLA_XDP_MAX + 1];
> +     struct xsk_nl_info *nl_info = cookie;
> +     struct ifinfomsg *ifinfo = msg;
> +     unsigned char mode;
> +     int err;
> +
> +     if (nl_info->ifindex && nl_info->ifindex != ifinfo->ifi_index)
> +             return 0;
> +
> +     if (!tb[IFLA_XDP])
> +             return 0;
> +
> +     err = libbpf_nla_parse_nested(tb_parsed, IFLA_XDP_MAX, tb[IFLA_XDP],
> +                                   NULL);
> +     if (err)
> +             return err;
> +
> +     if (!tb_parsed[IFLA_XDP_ATTACHED] || !tb_parsed[IFLA_XDP_FD])
> +             return 0;
> +
> +     mode = libbpf_nla_getattr_u8(tb_parsed[IFLA_XDP_ATTACHED]);
> +     if (mode == XDP_ATTACHED_NONE)
> +             return 0;
> +
> +     nl_info->xdp_prog_attached = true;
> +     nl_info->fd = libbpf_nla_getattr_u32(tb_parsed[IFLA_XDP_FD]);

Hm, I don't think this works, if I read the intention of this helper correctly.

IFLA_XDP_FD is never set when retrieving the prog from the kernel, so the
above is a bug.

We also have bpf_get_link_xdp_id(). That should probably just be reused in
this context.

> +     return 0;
> +}
> +
> +static bool xsk_xdp_prog_attached(struct xsk_socket *xsk)
> +{
> +     struct xsk_nl_info nl_info;
> +     unsigned int nl_pid;
> +     char err_buf[256];
> +     int sock, err;
> +
> +     sock = libbpf_netlink_open(&nl_pid);
> +     if (sock < 0)
> +             return false;
> +
> +     nl_info.xdp_prog_attached = false;
> +     nl_info.ifindex = xsk->ifindex;
> +     nl_info.fd = -1;
> +
> +     err = libbpf_nl_get_link(sock, nl_pid, xsk_parse_nl, &nl_info);
> +     if (err) {
> +             libbpf_strerror(err, err_buf, sizeof(err_buf));
> +             pr_warning("Error:\n%s\n", err_buf);
> +             close(sock);
> +             return false;
> +     }
> +
> +     close(sock);
> +     xsk->prog_fd = nl_info.fd;
> +     return nl_info.xdp_prog_attached;
> +}

(See bpf_get_link_xdp_id().)

> +
> +static int xsk_load_xdp_prog(struct xsk_socket *xsk)
> +{
> +     char bpf_log_buf[BPF_LOG_BUF_SIZE];
> +     int err, prog_fd;
> +
> +     /* This is the C-program:
> +      * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
> +      * {
> +      *     int *qidconf, index = ctx->rx_queue_index;
[...]
> +
> +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
> +                    __u32 queue_id, struct xsk_umem *umem,
> +                    struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
> +                    const struct xsk_socket_config *usr_config)
> +{
> +     struct sockaddr_xdp sxdp = {};
> +     struct xdp_mmap_offsets off;
> +     struct xsk_socket *xsk;
> +     socklen_t optlen;
> +     void *map;
> +     int err;
> +
> +     if (!umem || !xsk_ptr || !rx || !tx)
> +             return -EFAULT;
> +
> +     if (umem->refcount) {
> +             pr_warning("Error: shared umems not supported by libbpf.\n");
> +             return -EBUSY;
> +     }
> +
> +     xsk = calloc(1, sizeof(*xsk));
> +     if (!xsk)
> +             return -ENOMEM;
> +
> +     if (umem->refcount++ > 0) {

Should this refcount actually be atomic instead?

> +             xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
> +             if (xsk->fd < 0) {
> +                     err = -errno;
> +                     goto out_xsk_alloc;
> +             }
> +     } else {
> +             xsk->fd = umem->fd;
> +     }
> +
> +     xsk->outstanding_tx = 0;
> +     xsk->queue_id = queue_id;
> +     xsk->umem = umem;
> +     xsk->ifindex = if_nametoindex(ifname);
> +     if (!xsk->ifindex) {
> +             err = -errno;
> +             goto out_socket;
> +     }
> +
> +     xsk_set_xdp_socket_config(&xsk->config, usr_config);
[...]

Reply via email to