On Wed, Oct 10, 2018 at 10:48 AM Mauricio Vasquez <mauricio.vasq...@polito.it> wrote:
>
>
>
> On 10/10/2018 11:48 AM, Song Liu wrote:
> > On Wed, Oct 10, 2018 at 7:06 AM Mauricio Vasquez B
> > <mauricio.vasq...@polito.it> wrote:
> >> The following patch implements bpf queue/stack maps that
> >> provide the peek/pop/push functions. There is no direct
> >> relationship between those functions and the current map
> >> syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added;
> >> it is mapped to the pop operation in the queue/stack maps
> >> and is still to be implemented in other kinds of maps.
> >>
> >> Signed-off-by: Mauricio Vasquez B <mauricio.vasq...@polito.it>
> >> ---
> >>  include/linux/bpf.h      |    1 +
> >>  include/uapi/linux/bpf.h |    1 +
> >>  kernel/bpf/syscall.c     |   82 ++++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 84 insertions(+)
> >>
> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> >> index 9b558713447f..5793f0c7fbb5 100644
> >> --- a/include/linux/bpf.h
> >> +++ b/include/linux/bpf.h
> >> @@ -39,6 +39,7 @@ struct bpf_map_ops {
> >>         void *(*map_lookup_elem)(struct bpf_map *map, void *key);
> >>         int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
> >>         int (*map_delete_elem)(struct bpf_map *map, void *key);
> >> +       void *(*map_lookup_and_delete_elem)(struct bpf_map *map, void *key);
> >>
> >>         /* funcs called by prog_array and perf_event_array map */
> >>         void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index f9187b41dff6..3bb94aa2d408 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -103,6 +103,7 @@ enum bpf_cmd {
> >>         BPF_BTF_LOAD,
> >>         BPF_BTF_GET_FD_BY_ID,
> >>         BPF_TASK_FD_QUERY,
> >> +       BPF_MAP_LOOKUP_AND_DELETE_ELEM,
> >>  };
> >>
> >>  enum bpf_map_type {
> >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> >> index f36c080ad356..6907d661dea5 100644
> >> --- a/kernel/bpf/syscall.c
> >> +++ b/kernel/bpf/syscall.c
> >> @@ -980,6 +980,85 @@ static int map_get_next_key(union bpf_attr *attr)
> >>         return err;
> >>  }
> >>
> >> +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> >> +
> >> +static int map_lookup_and_delete_elem(union bpf_attr *attr)
> >> +{
> >> +       void __user *ukey = u64_to_user_ptr(attr->key);
> >> +       void __user *uvalue = u64_to_user_ptr(attr->value);
> >> +       int ufd = attr->map_fd;
> >> +       struct bpf_map *map;
> >> +       void *key, *value, *ptr;
> >> +       u32 value_size;
> >> +       struct fd f;
> >> +       int err;
> >> +
> >> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
> >> +               return -EINVAL;
> >> +
> >> +       f = fdget(ufd);
> >> +       map = __bpf_map_get(f);
> >> +       if (IS_ERR(map))
> >> +               return PTR_ERR(map);
> >> +
> >> +       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
> >> +               err = -EPERM;
> >> +               goto err_put;
> >> +       }
> >> +
> >> +       key = __bpf_copy_key(ukey, map->key_size);
> >> +       if (IS_ERR(key)) {
> >> +               err = PTR_ERR(key);
> >> +               goto err_put;
> >> +       }
> >> +
> >> +       value_size = map->value_size;
> >> +
> >> +       err = -ENOMEM;
> >> +       value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> >> +       if (!value)
> >> +               goto free_key;
> >> +
> >> +       err = -EFAULT;
> >> +       if (copy_from_user(value, uvalue, value_size) != 0)
> >> +               goto free_value;
> >> +
> >> +       /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
> >> +        * inside bpf map update or delete otherwise deadlocks are possible
> >> +        */
> >> +       preempt_disable();
> >> +       __this_cpu_inc(bpf_prog_active);
> >> +       if (map->ops->map_lookup_and_delete_elem) {
> >> +               rcu_read_lock();
> >> +               ptr = map->ops->map_lookup_and_delete_elem(map, key);
> >> +               if (ptr)
> >> +                       memcpy(value, ptr, value_size);
> > I think we are exposed to a race condition with push and pop in parallel.
> > map_lookup_and_delete_elem() only updates the head/tail, so it gives
> > no protection for the buffer pointed to by ptr.
>
> Queue/stack maps do not use this 'ptr'; the pop operation directly
> copies the value into the buffer allocated in map_lookup_and_delete_elem().
> The copy from the queue/stack buffer into 'value' and the head/tail
> update are protected by a spinlock in the queue/stack maps implementation.
>
> On the other hand, future implementations of the map_lookup_and_delete
> operation in other kinds of maps should guarantee that the returned ptr is
> RCU protected.
>
> Does it make sense to you?
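
For illustration, here is a minimal sketch of the pop path described above:
the element is copied into the caller-supplied buffer and the tail index is
advanced while the map's spinlock is held, so no pointer into the ring buffer
ever escapes the lock. The layout and names (bpf_queue_stack, qs->lock,
head/tail, queue_map_pop) are assumptions made for this sketch, not taken
from the actual queue/stack maps patch:

/* Hypothetical ring-buffer layout, for illustration only */
struct bpf_queue_stack {
        struct bpf_map map;
        raw_spinlock_t lock;
        u32 head, tail;
        u32 size;               /* number of slots in the ring */
        char elements[];        /* 'size' slots of map.value_size bytes each */
};

static int queue_map_pop(struct bpf_queue_stack *qs, void *value)
{
        unsigned long flags;
        int err = 0;

        raw_spin_lock_irqsave(&qs->lock, flags);

        if (qs->head == qs->tail) {     /* queue is empty */
                err = -ENOENT;
                goto out;
        }

        /* copy into the caller-supplied buffer while the lock is held;
         * no pointer into 'elements' is ever returned to the caller
         */
        memcpy(value, &qs->elements[qs->tail * qs->map.value_size],
               qs->map.value_size);

        if (++qs->tail >= qs->size)
                qs->tail = 0;
out:
        raw_spin_unlock_irqrestore(&qs->lock, flags);
        return err;
}

Because the element bytes land in the caller's buffer before the lock is
dropped, a concurrent push cannot overwrite the data the pop is returning,
which is the property being discussed above.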
I reread the other patch, and found it does NOT use the following logic
for queue and stack:

                rcu_read_lock();
                ptr = map->ops->map_lookup_and_delete_elem(map, key);
                if (ptr)
                        memcpy(value, ptr, value_size);

I guess this part is not used at all? Can we just remove it?

Thanks,
Song

> >
> >> +               rcu_read_unlock();
> >> +               err = ptr ? 0 : -ENOENT;
> >> +       } else {
> >> +               err = -ENOTSUPP;
> >> +       }
> >> +
> >> +       __this_cpu_dec(bpf_prog_active);
> >> +       preempt_enable();
> >> +
> >> +       if (err)
> >> +               goto free_value;
> >> +
> >> +       if (copy_to_user(uvalue, value, value_size) != 0)
> >> +               goto free_value;
> >> +
> >> +       err = 0;
> >> +
> >> +free_value:
> >> +       kfree(value);
> >> +free_key:
> >> +       kfree(key);
> >> +err_put:
> >> +       fdput(f);
> >> +       return err;
> >> +}
> >> +
> >>  static const struct bpf_prog_ops * const bpf_prog_types[] = {
> >>  #define BPF_PROG_TYPE(_id, _name) \
> >>         [_id] = & _name ## _prog_ops,
> >> @@ -2453,6 +2532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> >>         case BPF_TASK_FD_QUERY:
> >>                 err = bpf_task_fd_query(&attr, uattr);
> >>                 break;
> >> +       case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> >> +               err = map_lookup_and_delete_elem(&attr);
> >> +               break;
> >>         default:
> >>                 err = -EINVAL;
> >>                 break;
> >>
> >
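
As a usage note, from userspace the new command is driven through the bpf()
syscall like the other map operations handled in the switch above. The
following is a hedged sketch of a pop helper, assuming a uapi linux/bpf.h
that already carries the BPF_MAP_LOOKUP_AND_DELETE_ELEM value; the helper
name pop_elem is made up for illustration:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Hypothetical helper: pop one element from a queue/stack map.
 * 'value' must point to at least the map's value_size bytes.
 * Queue/stack maps take no key, so attr.key stays 0.
 * Returns 0 on success, -1 with errno set otherwise (e.g. ENOENT
 * when the map is empty).
 */
static int pop_elem(int map_fd, void *value)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.value = (__u64)(unsigned long)value;       /* user pointer as u64 */

        return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
                       &attr, sizeof(attr));
}

Note the map fd needs write access, since the kernel side checks
FMODE_CAN_WRITE before performing the pop.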