On Fri, 2018-12-21 at 08:53 +0100, Steffen Klassert wrote:
> @@ -403,10 +428,17 @@ struct sk_buff *udp_gro_receive(struct list_head *head,
> struct sk_buff *skb,
> 
>  	rcu_read_lock();
>  	sk = (*lookup)(skb, uh->source, uh->dest);
> -	if (!sk)
> -		goto out_unlock;
> +	if (!sk) {
> +		NAPI_GRO_CB(skb)->is_flist = 1;
> +		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
> +		rcu_read_unlock();
> +		return pp;
> +	}
> +
> +	if (!udp_sk(sk)->gro_receive) {
> +		if (!udp_sk(sk)->gro_enabled)
> +			NAPI_GRO_CB(skb)->is_flist = 1;
> 
> -	if (udp_sk(sk)->gro_enabled) {
>  		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
>  		rcu_read_unlock();
>  		return pp;
I think we could still avoid the lookup when no vxlan/GRO sockets are
present by moving the lookup into udp{4,6}_gro_receive. Very roughly,
something like this:

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f79f1b5b2f9e..b0c0983eac6b 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -420,20 +420,16 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
 INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
 						__be16 sport, __be16 dport));
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
-				struct udphdr *uh, udp_lookup_t lookup)
+				struct udphdr *uh, struct sock *sk)
 {
 	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
-	struct sock *sk;
 
-	rcu_read_lock();
-	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
-				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
-	if (!sk) {
-		NAPI_GRO_CB(skb)->is_flist = 1;
+	if (!sk || !udp_sk(sk)->gro_receive) {
+		NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
 		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
 		rcu_read_unlock();
 		return pp;
@@ -506,7 +502,12 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;
-	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
+	rcu_read_lock();
+	sk = static_branch_unlikely(&udp_encap_needed_key) ?
+	     udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+	pp = udp_gro_receive(head, skb, uh, sk);
+	rcu_read_unlock();
+	return pp;
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
---

Regardless of the above, I think we should drop the later check for
gro_receive:

--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -450,8 +450,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
 	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-	     !NAPI_GRO_CB(skb)->csum_valid) ||
-	    !udp_sk(sk)->gro_receive)
+	     !NAPI_GRO_CB(skb)->csum_valid))
 		goto out_unlock;
 
 	/* mark that this skb passed once through the tunnel gro layer */
---

Finally, this will cause GRO/GSO for local UDP packet delivery to non
GSO_SEGMENT sockets. That could possibly be a win or a regression: we
save on netfilter/IP stack traversal, but we add additional work; some
performance figures would probably help.

Cheers,

Paolo
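P.S.: for clarity, here is a minimal standalone sketch (plain userspace
C, not kernel code; the struct and helper names are made up) of the
decision table that the condensed is_flist assignment above encodes:

---
/* Only the two udp_sock fields the check looks at are modelled. */
#include <stdbool.h>
#include <stdio.h>

struct fake_udp_sock {
	bool gro_receive_set;	/* tunnel callback present (e.g. vxlan) */
	bool gro_enabled;	/* UDP_GRO requested by the application */
};

/* 1: fraglist path (is_flist), 0: plain GRO coalescing, -1: tunnel GRO */
static int gro_path(const struct fake_udp_sock *sk)
{
	if (!sk)
		return 1;		/* no local socket: forwarded traffic */
	if (sk->gro_receive_set)
		return -1;		/* encap socket: per-tunnel gro_receive */
	return sk->gro_enabled ? 0 : 1;	/* local delivery */
}

int main(void)
{
	const struct fake_udp_sock tunnel = { .gro_receive_set = true };
	const struct fake_udp_sock gro_sk = { .gro_enabled = true };
	const struct fake_udp_sock plain = { false, false };

	printf("no socket      -> %d\n", gro_path(NULL));	/*  1 */
	printf("tunnel socket  -> %d\n", gro_path(&tunnel));	/* -1 */
	printf("UDP_GRO socket -> %d\n", gro_path(&gro_sk));	/*  0 */
	printf("plain socket   -> %d\n", gro_path(&plain));	/*  1 */
	return 0;
}
---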