Switch the IPv4 output path from using the VRF dst to using the l3mdev l3_out (tx) hook.
Signed-off-by: David Ahern <d...@cumulusnetworks.com> --- drivers/net/vrf.c | 171 ++++++++++++++++++++---------------------------------- net/ipv4/route.c | 4 -- 2 files changed, 64 insertions(+), 111 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 1ce7420322ee..7517645347c3 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -230,79 +230,28 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { - struct iphdr *ip4h = ip_hdr(skb); - int ret = NET_XMIT_DROP; - struct flowi4 fl4 = { - /* needed to match OIF rule */ - .flowi4_oif = vrf_dev->ifindex, - .flowi4_iif = LOOPBACK_IFINDEX, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF, - .daddr = ip4h->daddr, - }; - struct net *net = dev_net(vrf_dev); - struct rtable *rt; - - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto err; - - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto err; - } + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rtable *rth_local; skb_dst_drop(skb); - /* if dst.dev is loopback or the VRF device again this is locally - * originated traffic destined to a local address. 
Short circuit - * to Rx path using our local dst - */ - if (rt->dst.dev == net->loopback_dev || rt->dst.dev == vrf_dev) { - struct net_vrf *vrf = netdev_priv(vrf_dev); - struct rtable *rth_local; - struct dst_entry *dst = NULL; - - ip_rt_put(rt); - - rcu_read_lock(); - - rth_local = rcu_dereference(vrf->rth_local); - if (likely(rth_local)) { - dst = &rth_local->dst; - dst_hold(dst); - } - - rcu_read_unlock(); - - if (unlikely(!dst)) - goto err; + rcu_read_lock(); - return vrf_local_xmit(skb, vrf_dev, dst); + rth_local = rcu_dereference(vrf->rth_local); + if (likely(rth_local)) { + dst = &rth_local->dst; + dst_hold(dst); } - skb_dst_set(skb, &rt->dst); - - /* strip the ethernet header added for pass through VRF device */ - __skb_pull(skb, skb_network_offset(skb)); + rcu_read_unlock(); - if (!ip4h->saddr) { - ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, - RT_SCOPE_LINK); + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NET_XMIT_DROP; } - ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - if (unlikely(net_xmit_eval(ret))) - vrf_dev->stats.tx_errors++; - else - ret = NET_XMIT_SUCCESS; - -out: - return ret; -err: - vrf_tx_error(vrf_dev, skb); - goto out; + return vrf_local_xmit(skb, vrf_dev, dst); } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) @@ -473,64 +422,71 @@ static int vrf_rt6_create(struct net_device *dev) } #endif -/* modelled after ip_finish_output2 */ +/* run skb through packet sockets for tcpdump with dev set to vrf dev */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; - struct net_device *dev = dst->dev; - unsigned int hh_len = LL_RESERVED_SPACE(dev); - struct neighbour *neigh; - u32 nexthop; - int ret = -EINVAL; - - /* Be paranoid, rather than too clever. 
*/ - if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); - if (!skb2) { - ret = -ENOMEM; - goto err; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - consume_skb(skb); - skb = skb2; + if (likely(skb_headroom(skb) >= ETH_HLEN)) { + struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + + ether_addr_copy(eth->h_source, skb->dev->dev_addr); + eth_zero_addr(eth->h_dest); + eth->h_proto = skb->protocol; + dev_queue_xmit_nit(skb, skb->dev); + skb_pull(skb, ETH_HLEN); } - rcu_read_lock_bh(); - - nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); - if (!IS_ERR(neigh)) - ret = dst_neigh_output(dst, neigh, skb); - - rcu_read_unlock_bh(); -err: - if (unlikely(ret < 0)) - vrf_tx_error(skb->dev, skb); - return ret; + return 1; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; - - IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); - - skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, dev, + net, sk, skb, NULL, skb->dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } +static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net *net = dev_net(vrf_dev); + struct net_device *dev = skb->dev; + int err; + + skb->dev = vrf_dev; + + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, + skb, NULL, vrf_dev, vrf_output); + if (likely(err == 1)) + err = vrf_output(net, sk, skb); + + if (likely(err == 1)) { + skb->dev = dev; + nf_reset(skb); + } else { + skb = NULL; + } + + return skb; +} + +/* called with rcu lock held */ +static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, + struct sock *sk, + struct 
sk_buff *skb, + u16 proto) +{ + switch (proto) { + case AF_INET: + return vrf_ip_out(vrf_dev, sk, skb); + } + + return skb; +} + /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { @@ -1067,6 +1023,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_get_rtable = vrf_get_rtable, .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, + .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_get_rt6_dst = vrf_get_rt6_dst, .l3mdev_get_saddr6 = vrf_get_saddr6, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1119f18fb720..d9936f90a755 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2244,10 +2244,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - - rth = l3mdev_get_rtable(dev_out, fl4); - if (rth) - goto out; } if (!fl4->daddr) { -- 2.1.4