On Tue, Jun 16, 2020 at 06:04:14PM -0700, Stanislav Fomichev wrote:
> Attaching to these hooks can break iptables because its optval is
> usually quite big, or at least bigger than the current PAGE_SIZE limit.
> David also mentioned some SCTP options can be big (around 256k).
> 
> For such optvals we expose only the first PAGE_SIZE bytes to
> the BPF program. BPF program has two options:
> 1. Set ctx->optlen to 0 to indicate that the BPF's optval
>    should be ignored and the kernel should use original userspace
>    value.
> 2. Set ctx->optlen to something that's smaller than the PAGE_SIZE.
> 
> v5:
> * use ctx->optlen == 0 with trimmed buffer (Alexei Starovoitov)
> * update the docs accordingly
> 
> v4:
> * use temporary buffer to avoid optval == optval_end == NULL;
>   this removes the corner case in the verifier that might assume
>   non-zero PTR_TO_PACKET/PTR_TO_PACKET_END.
> 
> v3:
> * don't increase the limit, bypass the argument
> 
> v2:
> * proper comments formatting (Jakub Kicinski)
> 
> Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks")
> Cc: David Laight <david.lai...@aculab.com>
> Signed-off-by: Stanislav Fomichev <s...@google.com>
> ---
>  kernel/bpf/cgroup.c | 53 ++++++++++++++++++++++++++++-----------------
>  1 file changed, 33 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 4d76f16524cc..ac53102e244a 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct 
> cgroup *cgrp,
>  
>  static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  {
> -     if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
> +     if (unlikely(max_optlen < 0))
>               return -EINVAL;
>  
> +     if (unlikely(max_optlen > PAGE_SIZE)) {
> +             /* We don't expose optvals that are greater than PAGE_SIZE
> +              * to the BPF program.
> +              */
> +             max_optlen = PAGE_SIZE;
> +     }
> +
>       ctx->optval = kzalloc(max_optlen, GFP_USER);
>       if (!ctx->optval)
>               return -ENOMEM;
>  
>       ctx->optval_end = ctx->optval + max_optlen;
>  
> -     return 0;
> +     return max_optlen;
>  }
>  
>  static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
> @@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock 
> *sk, int *level,
>        */
>       max_optlen = max_t(int, 16, *optlen);
>  
> -     ret = sockopt_alloc_buf(&ctx, max_optlen);
> -     if (ret)
> -             return ret;
> +     max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
> +     if (max_optlen < 0)
> +             return max_optlen;
>  
>       ctx.optlen = *optlen;
>  
> -     if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
> +     if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
>               ret = -EFAULT;
>               goto out;
>       }
> @@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock 
> *sk, int *level,
>               /* export any potential modifications */
>               *level = ctx.level;
>               *optname = ctx.optname;
> -             *optlen = ctx.optlen;
> -             *kernel_optval = ctx.optval;
> +
> +             /* optlen == 0 from BPF indicates that we should
> +              * use original userspace data.
> +              */
> +             if (ctx.optlen != 0) {
> +                     *optlen = ctx.optlen;

I think it should be:
*optlen = min(ctx.optlen, max_optlen);

Otherwise when bpf prog doesn't adjust ctx.oplen the kernel will see
4k only in kernel_optval whereas optlen will be > 4k.
I suspect iptables sockopt should have crashed at this point.
How did you test it?

> +                     *kernel_optval = ctx.optval;
> +             }
>       }
>  

Reply via email to