On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patch implements BPF_LWT_ENCAP_IP mode in the bpf_lwt_push_encap
> BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
> and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
> to packets (e.g. IP/GRE, GUE, IPIP).
>
> This is useful when thousands of different short-lived flows should be
> encapped, each with a different, dynamically determined destination.
> Although lwtunnels can be used in some of these scenarios, the ability
> to dynamically generate encap headers adds more flexibility, e.g.
> when routing depends on the state of the host (reflected in global bpf
> maps).
>
> Signed-off-by: Peter Oskolkov <p...@google.com>
> ---
>  include/net/lwtunnel.h |   3 +
>  net/core/filter.c      |   3 +-
>  net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 33fd9ba7e0e5..f0973eca8036 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
>  int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
>  int lwtunnel_input(struct sk_buff *skb);
>  int lwtunnel_xmit(struct sk_buff *skb);
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
> +			  bool ingress);
>
>  static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  {
> @@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  		dst->input = lwtunnel_input;
>  	}
>  }
> +
>  #else
>
>  static inline void lwtstate_free(struct lwtunnel_state *lws)
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fd3ae092d3d7..81d18660c38b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -73,6 +73,7 @@
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> +#include <net/lwtunnel.h>
>
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4796,7 +4797,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>  static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
>  			     bool ingress)
>  {
> -	return -EINVAL; /* Implemented in the next patch. */
> +	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
>  }
>
>  BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
> diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
> index 3e85437f7106..a3f79bff3776 100644
> --- a/net/core/lwt_bpf.c
> +++ b/net/core/lwt_bpf.c
> @@ -16,6 +16,7 @@
>  #include <linux/types.h>
>  #include <linux/bpf.h>
>  #include <net/lwtunnel.h>
> +#include <net/ip6_route.h>
>
>  struct bpf_lwt_prog {
>  	struct bpf_prog *prog;
> @@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
>
>  	switch (ret) {
>  	case BPF_OK:
> +	case BPF_LWT_REROUTE:
>  		break;
>
>  	case BPF_REDIRECT:
> @@ -97,6 +99,8 @@ static int bpf_input(struct sk_buff *skb)
>  		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
>  		if (ret < 0)
>  			return ret;
> +		if (ret == BPF_LWT_REROUTE)
> +			return dst_input(skb);
>  	}
>
>  	if (unlikely(!dst->lwtstate->orig_input)) {
> @@ -168,6 +172,13 @@ static int bpf_xmit(struct sk_buff *skb)
>  			return LWTUNNEL_XMIT_CONTINUE;
>  		case BPF_REDIRECT:
>  			return LWTUNNEL_XMIT_DONE;
> +		case BPF_LWT_REROUTE:
> +			ret = dst_output(dev_net(skb_dst(skb)->dev),
> +					 skb->sk, skb);
> +			if (unlikely(ret))
> +				return ret;
> +			/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
> +			return LWTUNNEL_XMIT_DONE;
>  		default:
>  			return ret;
>  		}
> @@ -389,6 +400,137 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
>  	.owner = THIS_MODULE,
>  };
>
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
> +{
> +	struct dst_entry *dst = NULL;
> +	struct iphdr *iph;
> +	bool ipv4;
> +	int err;
> +
> +	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
> +		return -EINVAL;
> +
> +	/* validate protocol and length */
> +	iph = (struct iphdr *)hdr;
> +	if (iph->version == 4) {
> +		ipv4 = true;
> +		if (iph->ihl * 4 > len)
> +			return -EINVAL;
> +	} else if (iph->version == 6) {
> +		ipv4 = false;
> +		if (unlikely(len < sizeof(struct ipv6hdr)))
> +			return -EINVAL;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	/* allocate enough space for the encap headers + L2 hdr */
> +	if (ingress) {
> +		err = skb_cow_head(skb, len + skb->mac_len);
> +		if (unlikely(err))
> +			return err;
> +	} else {
> +		/* ip_route_input_noref below does route lookup and dst
> +		 * drop/set for ingress. There is no similar function for
> +		 * egress, so we need to do route lookup and replace skb's
> +		 * dst in this function.
> +		 */
> +		struct sock *sk;
> +		struct net *net;
> +
> +		sk = sk_to_full_sk(skb->sk);
> +		if (sk)
> +			net = sock_net(sk);
> +		else
> +			net = dev_net(skb_dst(skb)->dev);

This delta gets VRF tests to pass too. Also, you should be able to
always get net from the device:

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 526b7cfc6d52..79feebd6da34 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -436,20 +436,24 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
 	 * egress, so we need to do route lookup and replace skb's
 	 * dst in this function.
 	 */
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
 	struct sock *sk;
 	struct net *net;
 
 	sk = sk_to_full_sk(skb->sk);
-	if (sk)
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
 		net = sock_net(sk);
-	else
+	} else
 		net = dev_net(skb_dst(skb)->dev);
 
 	if (ipv4) {
 		struct flowi4 fl4 = {0};
 		struct rtable *rt;
 
-		fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+		fl4.flowi4_oif = oif;
 		fl4.flowi4_mark = skb->mark;
 		fl4.flowi4_uid = sock_net_uid(net, sk);
 		fl4.flowi4_tos = RT_TOS(iph->tos);
@@ -466,7 +470,7 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
 		struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
 		struct flowi6 fl6 = {0};
 
-		fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+		fl6.flowi6_oif = oif;
 		fl6.flowi6_mark = skb->mark;
 		fl6.flowi6_uid = sock_net_uid(net, sk);
 		fl6.flowlabel = ip6_flowinfo(iph6);

> +
> +		if (ipv4) {
> +			struct flowi4 fl4 = {0};
> +			struct rtable *rt;
> +
> +			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl4.flowi4_mark = skb->mark;
> +			fl4.flowi4_uid = sock_net_uid(net, sk);
> +			fl4.flowi4_tos = RT_TOS(iph->tos);
> +			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
> +			fl4.flowi4_proto = iph->protocol;
> +			fl4.daddr = iph->daddr;
> +			fl4.saddr = iph->saddr;
> +
> +			rt = ip_route_output_key(net, &fl4);
> +			if (IS_ERR(rt) || rt->dst.error)
> +				return -EINVAL;
> +			dst = &rt->dst;
> +		} else {
> +			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
> +			struct flowi6 fl6 = {0};
> +
> +			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl6.flowi6_mark = skb->mark;
> +			fl6.flowi6_uid = sock_net_uid(net, sk);
> +			fl6.flowlabel = ip6_flowinfo(iph6);
> +			fl6.flowi6_proto = iph6->nexthdr;
> +			fl6.daddr = iph6->daddr;
> +			fl6.saddr = iph6->saddr;
> +
> +			dst = ip6_route_output(net, skb->sk, &fl6);
> +			if (IS_ERR(dst) || dst->error)
> +				return -EINVAL;
> +		}
> +
> +		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
> +		if (unlikely(err))
> +			return err;
> +	}
> +
> +	/* push the encap headers and fix pointers */
> +	skb_reset_inner_headers(skb);
> +	skb->encapsulation = 1;
> +	skb_push(skb, len);
> +	if (ingress)
> +		skb_postpush_rcsum(skb, iph, len);
> +	skb_reset_network_header(skb);
> +	iph = ip_hdr(skb);
> +	memcpy(iph, hdr, len);

Calling it iph and using ip_hdr() seems wrong given that hdr can also be
IPv6. Why not just use skb_network_header()?
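
For illustration, an untested sketch of that suggestion (behavior should be
unchanged; the checksum update just moves after the memcpy so it runs over
the bytes now in the skb, and the remaining IPv4-only uses of iph would move
under the if (ipv4) branch):

	/* push the encap header without assuming an IPv4 outer header */
	skb_push(skb, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	if (ingress)
		skb_postpush_rcsum(skb, skb_network_header(skb), len);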

> +	bpf_compute_data_pointers(skb);
> +
> +	/* final skb touches + routing */
> +	if (ipv4) {
> +		skb->protocol = htons(ETH_P_IP);
> +		if (iph->ihl * 4 < len)
> +			skb_set_transport_header(skb, iph->ihl * 4);
> +
> +		if (!iph->check)
> +			iph->check = ip_fast_csum((unsigned char *)iph,
> +						  iph->ihl);
> +
> +		if (ingress) {
> +			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> +						   iph->tos, skb_dst(skb)->dev);
> +			if (err)
> +				return err;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	} else {
> +		skb->protocol = htons(ETH_P_IPV6);
> +		if (sizeof(struct ipv6hdr) < len)
> +			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
> +
> +		if (ingress) {
> +			ip6_route_input(skb);
> +			if (skb_dst(skb)->error)
> +				return skb_dst(skb)->error;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int __init bpf_lwt_init(void)
>  {
>  	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
>
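
For anyone wanting to try this, here is a minimal, untested sketch of an
lwt_xmit program using the new encap mode (the section name, program name,
and addresses are made up; headers assume current libbpf):

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("lwt_xmit")
int encap_ipip(struct __sk_buff *skb)
{
	/* Outer IPv4 header for IPIP; tot_len covers old packet + encap. */
	struct iphdr hdr = {
		.version  = 4,
		.ihl      = sizeof(hdr) / 4,
		.ttl      = 64,
		.protocol = IPPROTO_IPIP,	    /* inner packet is IPv4 */
		.saddr    = bpf_htonl(0x0a000001),  /* 10.0.0.1, made up */
		.daddr    = bpf_htonl(0x0a000002),  /* 10.0.0.2, made up */
	};

	hdr.tot_len = bpf_htons(skb->len + sizeof(hdr));
	/* hdr.check stays 0: bpf_lwt_push_ip_encap() fills it in via
	 * ip_fast_csum() when check == 0 (see the patch above).
	 */

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
		return BPF_DROP;

	/* ask the stack to route the now-encapped packet */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";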