On 6/5/2018 10:57 AM, Tom Herbert wrote: > > > On Tue, Jun 5, 2018 at 1:38 AM, Amritha Nambiar > <amritha.namb...@intel.com <mailto:amritha.namb...@intel.com>> wrote: > > This patch adds support to pick Tx queue based on the Rx queue(s) map > configuration set by the admin through the sysfs attribute > for each Tx queue. If the user configuration for receive queue(s) map > does not apply, then the Tx queue selection falls back to CPU(s) map > based selection and finally to hashing. > > Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com > <mailto:amritha.namb...@intel.com>> > Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com > <mailto:sridhar.samudr...@intel.com>> > --- > include/net/busy_poll.h | 3 ++ > include/net/sock.h | 14 +++++++++++ > net/core/dev.c | 60 > ++++++++++++++++++++++++++++++++--------------- > net/core/sock.c | 4 +++ > net/ipv4/tcp_input.c | 3 ++ > 5 files changed, 65 insertions(+), 19 deletions(-) > > diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h > index 71c72a9..fc4fb68 100644 > --- a/include/net/busy_poll.h > +++ b/include/net/busy_poll.h > @@ -136,6 +136,9 @@ static inline void sk_mark_napi_id(struct sock > *sk, const struct sk_buff *skb) > #ifdef CONFIG_NET_RX_BUSY_POLL > sk->sk_napi_id = skb->napi_id; > #endif > +#ifdef CONFIG_XPS > + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb); > +#endif > } > > /* variant used for unconnected sockets */ > diff --git a/include/net/sock.h b/include/net/sock.h > index 4f7c584..12313653 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair; > * @skc_node: main hash linkage for various protocol lookup tables > * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol > * @skc_tx_queue_mapping: tx queue number for this connection > + * @skc_rx_queue_mapping: rx queue number for this connection > * @skc_flags: place holder for sk_flags > * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, > * 
%SO_OOBINLINE settings, %SO_TIMESTAMPING settings > @@ -215,6 +216,9 @@ struct sock_common { > struct hlist_nulls_node skc_nulls_node; > }; > int skc_tx_queue_mapping; > +#ifdef CONFIG_XPS > + int skc_rx_queue_mapping; > > > This is still expensive cost to be adding an int field into sock_common > for a relatively rare use case. Maybe there should be a CONFIG_XPS_RQS? > Or maybe skc_tx_queue_mapping and skc_rx_queue_mapping could be shorts > (so maximum queue mapping would then be 2^16-2).
Thanks for the review, Tom. I will fix up the code incorporating all your feedback in the next version (v4). I could have a new config option CONFIG_XPS_RXQS that would be default off, in addition to the CONFIG_XPS option that's already there. As for changing 'skc_tx_queue_mapping' to a short, my concern is that the change would become extensive; there are a lot of places where this field gets filled with int or u32 values. > > +#endif > union { > int skc_incoming_cpu; > u32 skc_rcv_wnd; > @@ -326,6 +330,9 @@ struct sock { > #define sk_nulls_node __sk_common.skc_nulls_node > #define sk_refcnt __sk_common.skc_refcnt > #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping > +#ifdef CONFIG_XPS > +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping > +#endif > > #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin > #define sk_dontcopy_end __sk_common.skc_dontcopy_end > @@ -1696,6 +1703,13 @@ static inline int sk_tx_queue_get(const > struct sock *sk) > return sk ? sk->sk_tx_queue_mapping : -1; > } > > +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff > *skb) > +{ > +#ifdef CONFIG_XPS > + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb); > +#endif > +} > + > static inline void sk_set_socket(struct sock *sk, struct socket *sock) > { > sk_tx_queue_clear(sk); > diff --git a/net/core/dev.c b/net/core/dev.c > index bba755f..1880e6c 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -3479,36 +3479,58 @@ sch_handle_egress(struct sk_buff *skb, int > *ret, struct net_device *dev) > } > #endif /* CONFIG_NET_EGRESS */ > > -static inline int get_xps_queue(struct net_device *dev, struct > sk_buff *skb) > +#ifdef CONFIG_XPS > +static int __get_xps_queue_idx(struct net_device *dev, struct > sk_buff *skb, > + struct xps_dev_maps *dev_maps, > unsigned int tci) > +{ > + struct xps_map *map; > + int queue_index = -1; > + > + if (dev->num_tc) { > + tci *= dev->num_tc; > + tci += netdev_get_prio_tc_map(dev, skb->priority); > + } > + > + map = 
rcu_dereference(dev_maps->attr_map[tci]); > + if (map) { > + if (map->len == 1) > + queue_index = map->queues[0]; > + else > + queue_index = map->queues[reciprocal_scale( > + skb_get_hash(skb), > map->len)]; > + if (unlikely(queue_index >= dev->real_num_tx_queues)) > + queue_index = -1; > + } > + return queue_index; > +} > +#endif > + > +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) > { > #ifdef CONFIG_XPS > struct xps_dev_maps *dev_maps; > - struct xps_map *map; > + struct sock *sk = skb->sk; > int queue_index = -1; > + unsigned int tci = 0; > > if (!static_key_false(&xps_needed)) > return -1; > > + if (sk && sk->sk_rx_queue_mapping <= dev->num_rx_queues) > + tci = sk->sk_rx_queue_mapping; > > > This is only be needed if xps_rxqs_map is not null so it should be in > the block below. > > > + > rcu_read_lock(); > - dev_maps = rcu_dereference(dev->xps_cpus_map); > - if (dev_maps) { > - unsigned int tci = skb->sender_cpu - 1; > + dev_maps = rcu_dereference(dev->xps_rxqs_map); > + if (dev_maps) > + queue_index = __get_xps_queue_idx(dev, skb, > dev_maps, tci); > > - if (dev->num_tc) { > - tci *= dev->num_tc; > - tci += netdev_get_prio_tc_map(dev, > skb->priority); > - } > > - map = rcu_dereference(dev_maps->attr_map[tci]); > - if (map) { > - if (map->len == 1) > - queue_index = map->queues[0]; > - else > - queue_index = > map->queues[reciprocal_scale(skb_get_hash(skb), > - > map->len)]; > - if (unlikely(queue_index >= > dev->real_num_tx_queues)) > - queue_index = -1; > - } > + if (queue_index < 0) { > + tci = skb->sender_cpu - 1; > + dev_maps = rcu_dereference(dev->xps_cpus_map); > + if (dev_maps) > + queue_index = __get_xps_queue_idx(dev, skb, > dev_maps, > + tci); > } > rcu_read_unlock(); > > diff --git a/net/core/sock.c b/net/core/sock.c > index 435a0ba..3c10d31 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -2824,6 +2824,10 @@ void sock_init_data(struct socket *sock, > struct sock *sk) > sk->sk_pacing_rate = ~0U; > 
sk->sk_pacing_shift = 10; > sk->sk_incoming_cpu = -1; > + > +#ifdef CONFIG_XPS > + sk->sk_rx_queue_mapping = -1; > +#endif > /* > * Before updating sk_refcnt, we must commit prior changes > to memory > * (Documentation/RCU/rculist_nulls.txt for details) > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index d5ffb57..cc69f75 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -78,6 +78,7 @@ > #include <linux/errqueue.h> > #include <trace/events/tcp.h> > #include <linux/static_key.h> > +#include <net/busy_poll.h> > > int sysctl_tcp_max_orphans __read_mostly = NR_FILE; > > @@ -5574,6 +5575,7 @@ void tcp_finish_connect(struct sock *sk, > struct sk_buff *skb) > if (skb) { > icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); > security_inet_conn_established(sk, skb); > + sk_mark_napi_id(sk, skb); > } > > tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); > @@ -6402,6 +6404,7 @@ int tcp_conn_request(struct request_sock_ops > *rsk_ops, > tcp_rsk(req)->snt_isn = isn; > tcp_rsk(req)->txhash = net_tx_rndhash(); > tcp_openreq_init_rwin(req, sk, dst); > + sk_mark_rx_queue(req_to_sk(req), skb); > if (!want_cookie) { > tcp_reqsk_record_syn(sk, req, skb); > fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); > >