On Wed, Oct 18, 2017 at 07:10:36AM -0700, John Fastabend wrote:
> From: John Fastabend <john.fastab...@gmail.com>
>
> SK_SKB BPF programs are run from the socket/tcp context but early in
> the stack before much of the TCP metadata is needed in tcp_skb_cb. So
> we can use some unused fields to place BPF metadata needed for SK_SKB
> programs when implementing the redirect function.
>
> This allows us to drop the preempt disable logic. It does however
> require an API change so sk_redirect_map() has been updated to
> additionally provide ctx_ptr to skb. Note, we do however continue to
> disable/enable preemption around actual BPF program running to account
> for map updates.
>
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
> Acked-by: Daniel Borkmann <dan...@iogearbox.net>

Acked-by: Alexei Starovoitov <a...@kernel.org>

> ---
>  include/linux/filter.h                              |  2 +
>  include/net/tcp.h                                   |  5 +++
>  kernel/bpf/sockmap.c                                | 19 ++++++-------
>  net/core/filter.c                                   | 29 ++++++++++----------
>  samples/sockmap/sockmap_kern.c                      |  2 +
>  tools/include/uapi/linux/bpf.h                      |  3 +-
>  tools/testing/selftests/bpf/bpf_helpers.h           |  2 +
>  tools/testing/selftests/bpf/sockmap_verdict_prog.c  |  4 +--
>  8 files changed, 36 insertions(+), 30 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d29e58f..818a0b2 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -728,7 +728,7 @@ void xdp_do_flush_map(void);
>  void bpf_warn_invalid_xdp_action(u32 act);
>  void bpf_warn_invalid_xdp_redirect(u32 ifindex);
>  
> -struct sock *do_sk_redirect_map(void);
> +struct sock *do_sk_redirect_map(struct sk_buff *skb);
>  
>  #ifdef CONFIG_BPF_JIT
>  extern int bpf_jit_enable;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 89974c5..b1ef98e 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -840,6 +840,11 @@ struct tcp_skb_cb {
>  			struct inet6_skb_parm	h6;
>  #endif
>  		} header;	/* For incoming skbs */
> +		struct {
> +			__u32 key;
> +			__u32 flags;
> +			struct bpf_map *map;
> +		} bpf;

I think it should be ok. cc Eric for visibility.

>  	};
>  };
>  
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index c68899d..beaabb2 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -39,6 +39,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/list.h>
>  #include <net/strparser.h>
> +#include <net/tcp.h>
>  
>  struct bpf_stab {
>  	struct bpf_map map;
> @@ -101,9 +102,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>  		return SK_DROP;
>  
>  	skb_orphan(skb);
> +	/* We need to ensure that BPF metadata for maps is also cleared
> +	 * when we orphan the skb so that we don't have the possibility
> +	 * to reference a stale map.
> +	 */
> +	TCP_SKB_CB(skb)->bpf.map = NULL;
>  	skb->sk = psock->sock;
>  	bpf_compute_data_end(skb);
> +	preempt_disable();
>  	rc = (*prog->bpf_func)(skb, prog->insnsi);
> +	preempt_enable();
>  	skb->sk = NULL;
>  
>  	return rc;
> @@ -114,17 +122,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
>  	struct sock *sk;
>  	int rc;
>  
> -	/* Because we use per cpu values to feed input from sock redirect
> -	 * in BPF program to do_sk_redirect_map() call we need to ensure we
> -	 * are not preempted. RCU read lock is not sufficient in this case
> -	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
> -	 */
> -	preempt_disable();
>  	rc = smap_verdict_func(psock, skb);
>  	switch (rc) {
>  	case SK_REDIRECT:
> -		sk = do_sk_redirect_map();
> -		preempt_enable();
> +		sk = do_sk_redirect_map(skb);
>  		if (likely(sk)) {
>  			struct smap_psock *peer = smap_psock_sk(sk);
>  
> @@ -141,8 +142,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
>  	/* Fall through and free skb otherwise */
>  	case SK_DROP:
>  	default:
> -		if (rc != SK_REDIRECT)
> -			preempt_enable();
>  		kfree_skb(skb);
>  	}
>  }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 74b8c91..ca1ba0b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1839,31 +1839,31 @@ static const struct bpf_func_proto bpf_redirect_proto = {
>  	.arg2_type      = ARG_ANYTHING,
>  };
>  
> -BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
> +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
> +	   struct bpf_map *, map, u32, key, u64, flags)
>  {
> -	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
> +	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
>  
>  	if (unlikely(flags))
>  		return SK_ABORTED;
>  
> -	ri->ifindex = key;
> -	ri->flags = flags;
> -	ri->map = map;
> +	tcb->bpf.key = key;
> +	tcb->bpf.flags = flags;
> +	tcb->bpf.map = map;
>  
> 	return SK_REDIRECT;
>  }
>  
> -struct sock *do_sk_redirect_map(void)
> +struct sock *do_sk_redirect_map(struct sk_buff *skb)
>  {
> -	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
> +	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
>  	struct sock *sk = NULL;
>  
> -	if (ri->map) {
> -		sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
>  
> -		ri->ifindex = 0;
> -		ri->map = NULL;
> -		/* we do not clear flags for future lookup */
> +	if (tcb->bpf.map) {
> +		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
> +
> +		tcb->bpf.key = 0;
> +		tcb->bpf.map = NULL;
>  	}
> 
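
For anyone following the API change from the program side, the verdict
program ends up looking roughly like the sketch below. This is only an
illustration of the new calling convention, not the actual sample: the
map name, sizing, key and section name here are made up, the real users
are samples/sockmap/sockmap_kern.c and the selftest verdict prog that
this patch updates. The point is that the skb is now the first argument,
so the helper can stash the map/key in tcp_skb_cb instead of the
per-cpu redirect_info:

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  /* sockmap the verdict program redirects into; name and size are
   * placeholders for illustration only.
   */
  struct bpf_map_def SEC("maps") sock_map = {
  	.type		= BPF_MAP_TYPE_SOCKMAP,
  	.key_size	= sizeof(int),
  	.value_size	= sizeof(int),
  	.max_entries	= 20,
  };

  SEC("sk_skb")
  int verdict_prog(struct __sk_buff *skb)
  {
  	/* skb is the new first argument; redirect everything to the
  	 * socket at key 0, flags must currently be 0.
  	 */
  	return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
  }

  char _license[] SEC("license") = "GPL";

Since the helper now writes into the skb's own cb area, whatever it
records stays attached to that skb until do_sk_redirect_map(skb) reads
it back, which is what lets the explicit preempt_disable() around the
redirect path go away.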