Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
map entries per syscall.
  
https://lore.kernel.org/bpf/cabcgpau3xxx6cmmxd+1knapivtc2jlbhysdxw-0e9bqel0q...@mail.gmail.com/T/#t

During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138

Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations also useful as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
  
https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. For bpf syscall subcommands are introduced:
        BPF_MAP_LOOKUP_BATCH
        BPF_MAP_LOOKUP_AND_DELETE_BATCH
        BPF_MAP_UPDATE_BATCH
        BPF_MAP_DELETE_BATCH

The UAPI attribute structure looks like:

    struct { /* struct used by BPF_MAP_*_BATCH commands */
            __u64           batch;  /* input/output:
                                     * input: start batch,
                                     *        0 to start from beginning.
                                     * output: next start batch,
                                     *         0 to end batching.
                                     */
            __aligned_u64   keys;
            __aligned_u64   values;
            __u32           count;  /* input/output:
                                     * input: # of elements keys/values.
                                     * output: # of filled elements.
                                     */
            __u32           map_fd;
            __u64           elem_flags;
            __u64           flags;
    } batch;

An opaque value 'batch' is used for user/kernel space communication
for where in the map to start the operation for lookup/lookup_and_delete/delete.
  input 'batch' = 0: to start the operation from the beginning of the map.
  output 'batch': if not 0, the next input for batch operation.

For lookup/lookup_and_delete:
  operation: lookup/lookup_and_delete starting from a particular 'batch'.
  return:
     'batch'       'count'     return code     meaning
      0            0           0               Done. Nothing left
      0            0           -ENOSPC         no space to handle batch 0
      > 0          0           -ENOSPC         no space to handle 'batch'
      > 0          > 0         0               stopped right before 'batch'
Note that:
  (1). Even if return code is 0 and return 'count' > 0, the return 'count' may
       not be equal to input 'count'. This happens when there is no enough space
       to handle a batch.
  (2). If the return code is an error and not -EFAULT,
       'batch' indicates the batch has issues and 'count' indicates the number
       of elements successfully processed.

For delete:
  operation: deletion starting from a particular 'batch'.
  return: 0 means everything is deleted from 'batch'.
          error code means something deletion not happening.

For update:
  operation: update 'count' number of elements in 'keys'/'values'.
  return: 0 means successful updates for all elements.
          error code, if not -EFAULT, 'count' is the number of successful 
updates.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/bpf.h      |   9 ++
 include/uapi/linux/bpf.h |  22 +++
 kernel/bpf/hashtab.c     | 324 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     |  68 ++++++++
 4 files changed, 423 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d22338606..3c1302e8e2d4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,15 @@ struct bpf_map_ops {
        int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
        void (*map_release_uref)(struct bpf_map *map);
        void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+       int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
+                               union bpf_attr __user *uattr);
+       int (*map_lookup_and_delete_batch)(struct bpf_map *map,
+                                          const union bpf_attr *attr,
+                                          union bpf_attr __user *uattr);
+       int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
+                               union bpf_attr __user *uattr);
+       int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
+                               union bpf_attr __user *uattr);
 
        /* funcs callable from userspace and from eBPF programs */
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d2fb183ee2d..9d4f76073dd9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
        BPF_MAP_LOOKUP_AND_DELETE_ELEM,
        BPF_MAP_FREEZE,
        BPF_BTF_GET_NEXT_ID,
+       BPF_MAP_LOOKUP_BATCH,
+       BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+       BPF_MAP_UPDATE_BATCH,
+       BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -396,6 +400,24 @@ union bpf_attr {
                __u64           flags;
        };
 
+       struct { /* struct used by BPF_MAP_*_BATCH commands */
+               __u64           batch;  /* input/output:
+                                        * input: start batch,
+                                        *        0 to start from beginning.
+                                        * output: next start batch,
+                                        *         0 to end batching.
+                                        */
+               __aligned_u64   keys;
+               __aligned_u64   values;
+               __u32           count;  /* input/output:
+                                        * input: # of elements keys/values.
+                                        * output: # of filled elements.
+                                        */
+               __u32           map_fd;
+               __u64           elem_flags;
+               __u64           flags;
+       } batch;
+
        struct { /* anonymous struct used by BPF_PROG_LOAD command */
                __u32           prog_type;      /* one of enum bpf_prog_type */
                __u32           insn_cnt;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..ee7b90200f4d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1232,6 +1232,322 @@ static void htab_map_seq_show_elem(struct bpf_map *map, 
void *key,
        rcu_read_unlock();
 }
 
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+                                  const union bpf_attr *attr,
+                                  union bpf_attr __user *uattr,
+                                  bool do_delete, bool is_lru_map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+       void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+       u64 elem_map_flags, map_flags;
+       struct hlist_nulls_head *head;
+       void __user *ukeys, *uvalues;
+       struct hlist_nulls_node *n;
+       u32 batch, max_count;
+       unsigned long flags;
+       struct htab_elem *l;
+       struct bucket *b;
+       int ret = 0;
+
+       max_count = attr->batch.count;
+       if (!max_count)
+               return 0;
+
+       elem_map_flags = attr->batch.elem_flags;
+       if ((elem_map_flags & ~BPF_F_LOCK) ||
+           ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+               return -EINVAL;
+
+       map_flags = attr->batch.flags;
+       if (map_flags)
+               return -EINVAL;
+
+       batch = (u32)attr->batch.batch;
+       if (batch >= htab->n_buckets)
+               return -EINVAL;
+
+       /* We cannot do copy_from_user or copy_to_user inside
+        * the rcu_read_lock. Allocate enough space here.
+        */
+       key_size = htab->map.key_size;
+       roundup_key_size = round_up(htab->map.key_size, 8);
+       value_size = htab->map.value_size;
+       keys = kvmalloc(key_size * max_count, GFP_USER | __GFP_NOWARN);
+       values = kvmalloc(value_size * max_count, GFP_USER | __GFP_NOWARN);
+       if (!keys || !values) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       dst_key = keys;
+       dst_val = values;
+       total = 0;
+
+       preempt_disable();
+       this_cpu_inc(bpf_prog_active);
+       rcu_read_lock();
+
+again:
+       b = &htab->buckets[batch];
+       head = &b->head;
+       raw_spin_lock_irqsave(&b->lock, flags);
+
+       bucket_cnt = 0;
+       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+               bucket_cnt++;
+
+       if (bucket_cnt > (max_count - total)) {
+               if (total == 0)
+                       ret = -ENOSPC;
+               goto after_loop;
+       }
+
+       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+               memcpy(dst_key, l->key, key_size);
+
+               value = l->key + roundup_key_size;
+               if (elem_map_flags & BPF_F_LOCK)
+                       copy_map_value_locked(map, dst_val, value, true);
+               else
+                       copy_map_value(map, dst_val, value);
+               check_and_init_map_lock(map, dst_val);
+
+               dst_key += key_size;
+               dst_val += value_size;
+               total++;
+       }
+
+       if (do_delete) {
+               hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+                       hlist_nulls_del_rcu(&l->hash_node);
+                       if (is_lru_map)
+                               bpf_lru_push_free(&htab->lru, &l->lru_node);
+                       else
+                               free_htab_elem(htab, l);
+               }
+       }
+
+       batch++;
+       if (batch >= htab->n_buckets) {
+               batch = 0;
+               goto after_loop;
+       }
+
+       raw_spin_unlock_irqrestore(&b->lock, flags);
+       goto again;
+
+after_loop:
+       raw_spin_unlock_irqrestore(&b->lock, flags);
+
+       rcu_read_unlock();
+       this_cpu_dec(bpf_prog_active);
+       preempt_enable();
+
+       /* copy data back to user */
+       ukeys = u64_to_user_ptr(attr->batch.keys);
+       uvalues = u64_to_user_ptr(attr->batch.values);
+       if (put_user(batch, &uattr->batch.batch) ||
+           copy_to_user(ukeys, keys, total * key_size) ||
+           copy_to_user(uvalues, values, total * value_size) ||
+           put_user(total, &uattr->batch.count))
+               ret = -EFAULT;
+
+out:
+       kvfree(keys);
+       kvfree(values);
+       return ret;
+}
+
+static int
+__htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+                       union bpf_attr __user *uattr, bool is_lru_map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u32 count, max_count, key_size, roundup_key_size, value_size;
+       u64 elem_map_flags, map_flags;
+       void __user *ukey, *uvalue;
+       void *key, *value;
+       int ret = 0;
+
+       max_count = attr->batch.count;
+       if (!max_count)
+               return 0;
+
+       elem_map_flags = attr->batch.elem_flags;
+       if ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
+               return -EINVAL;
+
+       map_flags = attr->batch.flags;
+       if (map_flags)
+               return -EINVAL;
+
+       key_size = htab->map.key_size;
+       roundup_key_size = round_up(htab->map.key_size, 8);
+       value_size = htab->map.value_size;
+       key = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+       if (!key || !value) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ukey = u64_to_user_ptr(attr->batch.keys);
+       uvalue = u64_to_user_ptr(attr->batch.values);
+       for (count = 0; count < max_count; count++) {
+               if (copy_from_user(key, ukey + count * key_size, key_size) ||
+                   copy_from_user(value, uvalue + count * value_size, 
value_size)) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               preempt_disable();
+               __this_cpu_inc(bpf_prog_active);
+               rcu_read_lock();
+               if (is_lru_map)
+                       ret = htab_lru_map_update_elem(map, key, value, 
elem_map_flags);
+               else
+                       ret = htab_map_update_elem(map, key, value, 
elem_map_flags);
+               rcu_read_unlock();
+               __this_cpu_dec(bpf_prog_active);
+               preempt_enable();
+
+               if (ret) {
+                       if (put_user(count, &uattr->batch.count))
+                               ret = -EFAULT;
+                       break;
+               }
+       }
+
+out:
+       kfree(key);
+       kfree(value);
+       return ret;
+}
+
+static int
+__htab_map_delete_batch(struct bpf_map *map,
+                       const union bpf_attr *attr,
+                       union bpf_attr __user *uattr,
+                       bool is_lru_map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       u64 elem_map_flags, map_flags;
+       struct hlist_nulls_head *head;
+       struct hlist_nulls_node *n;
+       u32 batch, max_count;
+       unsigned long flags;
+       struct htab_elem *l;
+       struct bucket *b;
+
+       elem_map_flags = attr->batch.elem_flags;
+       map_flags = attr->batch.flags;
+       if (elem_map_flags || map_flags)
+               return -EINVAL;
+
+       max_count = attr->batch.count;
+       batch = (u32)attr->batch.batch;
+       if (max_count || batch >= htab->n_buckets)
+               return -EINVAL;
+
+       preempt_disable();
+       __this_cpu_inc(bpf_prog_active);
+       rcu_read_lock();
+
+again:
+       b = &htab->buckets[batch];
+       head = &b->head;
+       raw_spin_lock_irqsave(&b->lock, flags);
+
+       hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+               hlist_nulls_del_rcu(&l->hash_node);
+               if (is_lru_map)
+                       bpf_lru_push_free(&htab->lru, &l->lru_node);
+               else
+                       free_htab_elem(htab, l);
+       }
+
+       batch++;
+       if (batch >= htab->n_buckets)
+               goto out;
+
+       raw_spin_unlock_irqrestore(&b->lock, flags);
+       goto again;
+
+out:
+       raw_spin_unlock_irqrestore(&b->lock, flags);
+       rcu_read_unlock();
+       __this_cpu_dec(bpf_prog_active);
+       preempt_enable();
+
+       return 0;
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+                     union bpf_attr __user *uattr)
+{
+       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+                                                 false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+                                const union bpf_attr *attr,
+                                union bpf_attr __user *uattr)
+{
+       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+                                                 false);
+}
+
+static int
+htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+                     union bpf_attr __user *uattr)
+{
+       return __htab_map_update_batch(map, attr, uattr, false);
+}
+
+static int
+htab_map_delete_batch(struct bpf_map *map,
+                     const union bpf_attr *attr,
+                     union bpf_attr __user *uattr)
+{
+       return __htab_map_delete_batch(map, attr, uattr, false);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+                         union bpf_attr __user *uattr)
+{
+       return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+                                                 true);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+                                    const union bpf_attr *attr,
+                                    union bpf_attr __user *uattr)
+{
+       return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+                                                 true);
+}
+
+static int
+htab_lru_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+                         union bpf_attr __user *uattr)
+{
+       return __htab_map_update_batch(map, attr, uattr, true);
+}
+
+static int
+htab_lru_map_delete_batch(struct bpf_map *map,
+                         const union bpf_attr *attr,
+                         union bpf_attr __user *uattr)
+{
+       return __htab_map_delete_batch(map, attr, uattr, true);
+}
+
 const struct bpf_map_ops htab_map_ops = {
        .map_alloc_check = htab_map_alloc_check,
        .map_alloc = htab_map_alloc,
@@ -1242,6 +1558,10 @@ const struct bpf_map_ops htab_map_ops = {
        .map_delete_elem = htab_map_delete_elem,
        .map_gen_lookup = htab_map_gen_lookup,
        .map_seq_show_elem = htab_map_seq_show_elem,
+       .map_lookup_batch = htab_map_lookup_batch,
+       .map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
+       .map_update_batch = htab_map_update_batch,
+       .map_delete_batch = htab_map_delete_batch,
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1575,10 @@ const struct bpf_map_ops htab_lru_map_ops = {
        .map_delete_elem = htab_lru_map_delete_elem,
        .map_gen_lookup = htab_lru_map_gen_lookup,
        .map_seq_show_elem = htab_map_seq_show_elem,
+       .map_lookup_batch = htab_lru_map_lookup_batch,
+       .map_lookup_and_delete_batch = htab_lru_map_lookup_and_delete_batch,
+       .map_update_batch = htab_lru_map_update_batch,
+       .map_delete_batch = htab_lru_map_delete_batch,
 };
 
 /* Called from eBPF program */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca60eafa6922..e83bdf7efbd8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2816,6 +2816,62 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
        return err;
 }
 
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn)                       \
+       do {                                    \
+               if (!fn) {                      \
+                       err = -ENOTSUPP;        \
+                       goto err_put;           \
+               }                               \
+               err = fn(map, attr, uattr);     \
+       } while(0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+                           union bpf_attr __user *uattr,
+                           int cmd)
+{
+       struct bpf_map *map;
+       int err, ufd;
+       struct fd f;
+
+       if (CHECK_ATTR(BPF_MAP_BATCH))
+               return -EINVAL;
+
+       ufd = attr->batch.map_fd;
+       f = fdget(ufd);
+       map = __bpf_map_get(f);
+       if (IS_ERR(map))
+               return PTR_ERR(map);
+
+       if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+            cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+           !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
+       if (cmd != BPF_MAP_LOOKUP_BATCH &&
+           !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
+       if (cmd == BPF_MAP_LOOKUP_BATCH) {
+               BPF_DO_BATCH(map->ops->map_lookup_batch);
+       } else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) {
+               BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+       } else if (cmd == BPF_MAP_UPDATE_BATCH) {
+               BPF_DO_BATCH(map->ops->map_update_batch);
+       } else {
+               BPF_DO_BATCH(map->ops->map_delete_batch);
+       }
+
+err_put:
+       fdput(f);
+       return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, 
size)
 {
        union bpf_attr attr = {};
@@ -2913,6 +2969,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
        case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
                err = map_lookup_and_delete_elem(&attr);
                break;
+       case BPF_MAP_LOOKUP_BATCH:
+               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+               break;
+       case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+               err = bpf_map_do_batch(&attr, uattr, 
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+               break;
+       case BPF_MAP_UPDATE_BATCH:
+               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+               break;
+       case BPF_MAP_DELETE_BATCH:
+               err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+               break;
        default:
                err = -EINVAL;
                break;
-- 
2.17.1

Reply via email to