Infrastructure changes to allow aRFS to be based on Per Thread Queues
instead of just CPUs. The basic change is to extend the field in
rps_dev_flow so that it can hold either a CPU number or a device queue
index (rather than only a CPU).
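
As a quick illustration, here is a minimal user-space sketch of the
new encoding (the names mirror the patch; bitfield layout is compiler
and endian dependent, so take it as illustrative rather than as the
exact in-memory format):

        #include <stdint.h>
        #include <stdio.h>

        typedef uint16_t u16;

        #define RPS_NO_CPU_QID 0xffff   /* NO_QUEUE in the patch */

        struct rps_cpu_qid {
                union {
                        u16 val;
                        struct {
                                u16 use_qid: 1;
                                union {
                                        u16 cpu: 15;
                                        u16 qid: 15;
                                };
                        };
                };
        };

        int main(void)
        {
                /* Empty entry: neither a CPU nor a queue index */
                struct rps_cpu_qid id = { .val = RPS_NO_CPU_QID };

                id.val = 0;
                id.use_qid = 0;         /* entry holds a CPU number */
                id.cpu = 5;
                printf("cpu entry: use_qid=%u cpu=%u\n", id.use_qid, id.cpu);

                id.val = 0;
                id.use_qid = 1;         /* entry holds a queue index */
                id.qid = 42;
                printf("qid entry: use_qid=%u qid=%u\n", id.use_qid, id.qid);

                return 0;
        }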

Changes include:
        - Replace the u16 cpu field in the rps_dev_flow structure with
          an rps_cpu_qid structure that contains either a CPU number or
          a device queue index. Note that the structure is still
          sixteen bits
        - Add helper functions to clear the rps_cpu_qid of an
          rps_dev_flow and to set its cpu
        - Create an rps_sock_masks structure that describes the
          partitioning of the thirty-two bit entries in
          rps_sock_flow_table. The structure contains two masks, one to
          extract the upper bits of the hash and one to extract the CPU
          number or queue index (a worked example follows this list)
        - Replace rps_cpu_mask with the rps_sock_masks fields in
          rps_sock_flow_table
        - Add rps_max_num_queues, which is used when creating the
          rps_sock_masks for queue entries in rps_sock_flow_table
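
The following is a minimal user-space sketch of how the two mask sets
partition a 32-bit sock flow entry, assuming 64 possible CPUs and a
hypothetical rps_max_num_queues of 256 (roundup_pow_of_two() is elided
by passing power-of-two inputs):

        #include <stdint.h>
        #include <stdio.h>

        #define RPS_SOCK_FLOW_USE_QID   (1U << 31)

        /* Mirrors rps_init_sock_masks() from the patch; num must be a
         * power of two here so roundup_pow_of_two(num) - 1 == num - 1.
         */
        static void init_sock_masks(uint32_t num, uint32_t *mask,
                                    uint32_t *hash_mask)
        {
                *mask = num - 1;
                *hash_mask = ~*mask & ~RPS_SOCK_FLOW_USE_QID;
        }

        int main(void)
        {
                uint32_t mask, hash_mask;

                init_sock_masks(64, &mask, &hash_mask);
                printf("cpu:   mask=0x%08x hash_mask=0x%08x\n",
                       mask, hash_mask);
                /* cpu:   mask=0x0000003f hash_mask=0x7fffffc0
                 * (25 bits of hash above the 6 CPU bits)
                 */

                init_sock_masks(256, &mask, &hash_mask);
                printf("queue: mask=0x%08x hash_mask=0x%08x\n",
                       mask, hash_mask);
                /* queue: mask=0x000000ff hash_mask=0x7fffff00
                 * (23 bits of hash above the 8 queue bits)
                 */

                return 0;
        }
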
---
 include/linux/netdevice.h  | 94 +++++++++++++++++++++++++++++++++-----
 net/core/dev.c             | 47 ++++++++++++-------
 net/core/net-sysfs.c       |  2 +-
 net/core/sysctl_net_core.c |  6 ++-
 4 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf5f2a85da97..d528aa61fea3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -674,18 +674,65 @@ struct rps_map {
 };
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
 
+/* The rps_cpu_qid structure is sixteen bits and holds either a CPU number or
+ * a queue index. The use_qid field specifies which type of value is set (i.e.
+ * if use_qid is 1 then cpu_qid contains a fifteen bit queue identifier, and if
+ * use_qid is 0 then cpu_qid contains a fifteen bit CPU number). An empty
+ * entry is signified by setting val to RPS_NO_CPU_QID, which is defined as
+ * NO_QUEUE (0xffff). So the range of CPU numbers that can be stored is
+ * 0..32,767 (0x7fff) and the range of queue identifiers is 0..32,766. Note
+ * that CPU numbers are limited by CONFIG_NR_CPUS, which currently has a
+ * maximum supported value of 8,192 (per arch/x86/Kconfig), so a WARN_ON
+ * checks that a CPU number is less than 0x8000 when setting the cpu in an
+ * rps_cpu_qid. The queue index is limited by configuration.
+ */
+struct rps_cpu_qid {
+       union {
+               u16 val;
+               struct {
+                       u16 use_qid: 1;
+                       union {
+                               u16 cpu: 15;
+                               u16 qid: 15;
+                       };
+               };
+       };
+};
+
+#define RPS_NO_CPU_QID NO_QUEUE        /* No CPU or qid in rps_cpu_qid */
+#define RPS_MAX_CPU    0x7fff          /* Maximum cpu in rps_cpu_qid */
+#define RPS_MAX_QID    0x7ffe          /* Maximum qid in rps_cpu_qid */
+
 /*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
  * tail pointer for that CPU's input queue at the time of last enqueue, and
  * a hardware filter index.
  */
 struct rps_dev_flow {
-       u16 cpu;
+       struct rps_cpu_qid cpu_qid;
        u16 filter;
        unsigned int last_qtail;
 };
 #define RPS_NO_FILTER 0xffff
 
+static inline void rps_dev_flow_clear(struct rps_dev_flow *dev_flow)
+{
+       dev_flow->cpu_qid.val = RPS_NO_CPU_QID;
+}
+
+static inline void rps_dev_flow_set_cpu(struct rps_dev_flow *dev_flow, u16 cpu)
+{
+       struct rps_cpu_qid cpu_qid;
+
+       if (WARN_ON(cpu > RPS_MAX_CPU))
+               return;
+
+       /* Set the rflow target to the CPU atomically */
+       cpu_qid.use_qid = 0;
+       cpu_qid.cpu = cpu;
+       dev_flow->cpu_qid = cpu_qid;
+}
+
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
  */
@@ -697,34 +744,57 @@ struct rps_dev_flow_table {
 #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
     ((_num) * sizeof(struct rps_dev_flow)))
 
+struct rps_sock_masks {
+       u32 mask;
+       u32 hash_mask;
+};
+
 /*
- * The rps_sock_flow_table contains mappings of flows to the last CPU
- * on which they were processed by the application (set in recvmsg).
- * Each entry is a 32bit value. Upper part is the high-order bits
- * of flow hash, lower part is CPU number.
- * rps_cpu_mask is used to partition the space, depending on number of
- * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
- * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
- * meaning we use 32-6=26 bits for the hash.
+ * The rps_sock_flow_table contains mappings of flows to the last CPU on which
+ * they were processed by the application (set in recvmsg), or the mapping of
+ * the flow to a per thread queue for the application. Each entry is a 32bit
+ * value. The high-order bit indicates whether a CPU number or a queue index is
+ * stored. The next bits hold the high-order bits of the flow hash, and the
+ * lower bits contain the CPU number or queue index. The sock_flow_table
+ * contains two sets of masks, one for CPU entries (cpu_masks) and one for
+ * queue entries (queue_masks), that are used to partition the space between
+ * the hash bits and the CPU number or queue index. cpu_masks.mask is set
+ * to roundup_pow_of_two(nr_cpu_ids) - 1 and the corresponding hash mask,
+ * cpu_masks.hash_mask, is set to (~cpu_masks.mask & ~RPS_SOCK_FLOW_USE_QID).
+ * For example, if 64 CPUs are possible, cpu_masks.mask == 0x3f, meaning we use
+ * 31-6=25 bits for the hash (so cpu_masks.hash_mask == 0x7fffffc0). Similarly,
+ * queue_masks in rps_sock_flow_table is used to partition the space when a
+ * queue index is present.
  */
 struct rps_sock_flow_table {
        u32     mask;
+       struct  rps_sock_masks cpu_masks;
+       struct  rps_sock_masks queue_masks;
 
        u32     ents[] ____cacheline_aligned_in_smp;
 };
#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
-#define RPS_NO_CPU 0xffff
+#define RPS_SOCK_FLOW_USE_QID  (1U << 31)      /* Entry holds a queue index */
+#define RPS_SOCK_FLOW_NO_IDENT -1U             /* No CPU or queue in entry */
 
-extern u32 rps_cpu_mask;
 extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+extern unsigned int rps_max_num_queues;
+
+static inline void rps_init_sock_masks(struct rps_sock_masks *masks, u32 num)
+{
+       u32 mask = roundup_pow_of_two(num) - 1;
+
+       masks->mask = mask;
+       masks->hash_mask = (~mask & ~RPS_SOCK_FLOW_USE_QID);
+}
 
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                        u32 hash)
 {
        if (table && hash) {
+               u32 val = hash & table->cpu_masks.hash_mask;
                unsigned int index = hash & table->mask;
-               u32 val = hash & ~rps_cpu_mask;
 
                /* We only give a hint, preemption can change CPU under us */
                val |= raw_smp_processor_id();
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f7a3e78e23a..946940bdd583 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4242,8 +4242,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
+unsigned int rps_max_num_queues;
 
 struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
@@ -4302,7 +4301,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                        per_cpu(softnet_data, next_cpu).input_queue_head;
        }
 
-       rflow->cpu = next_cpu;
+       rps_dev_flow_set_cpu(rflow, next_cpu);
        return rflow;
 }
 
@@ -4349,22 +4348,39 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
        sock_flow_table = rcu_dereference(rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
+               u32 next_cpu, comparator, ident;
                struct rps_dev_flow *rflow;
-               u32 next_cpu;
-               u32 ident;
 
                /* First check into global flow table if there is a match */
                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
-               if ((ident ^ hash) & ~rps_cpu_mask)
-                       goto try_rps;
+               comparator = ((ident & RPS_SOCK_FLOW_USE_QID) ?
+                               sock_flow_table->queue_masks.hash_mask :
+                               sock_flow_table->cpu_masks.hash_mask);
 
-               next_cpu = ident & rps_cpu_mask;
+               if ((ident ^ hash) & comparator)
+                       goto try_rps;
 
                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
                rflow = &flow_table->flows[hash & flow_table->mask];
-               tcpu = rflow->cpu;
+
+               /* The flow_sock entry may refer to either a queue or a
+                * CPU. Proceed accordingly.
+                */
+               if (ident & RPS_SOCK_FLOW_USE_QID) {
+                       /* A queue identifier is in the sock_flow_table entry */
+
+                       /* Don't use aRFS to set CPU in this case, skip to
+                        * trying RPS
+                        */
+                       goto try_rps;
+               }
+
+               /* A CPU number is in the sock_flow_table entry */
+
+               next_cpu = ident & sock_flow_table->cpu_masks.mask;
+               tcpu = rflow->cpu_qid.use_qid ? NO_QUEUE : rflow->cpu_qid.cpu;
 
                /*
                 * If the desired CPU (where last recvmsg was done) is
@@ -4396,10 +4412,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
        if (map) {
                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
-               if (cpu_online(tcpu)) {
+               if (cpu_online(tcpu))
                        cpu = tcpu;
-                       goto done;
-               }
        }
 
 done:
@@ -4424,17 +4438,18 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 {
        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
        struct rps_dev_flow_table *flow_table;
+       struct rps_cpu_qid cpu_qid;
        struct rps_dev_flow *rflow;
        bool expire = true;
-       unsigned int cpu;
 
        rcu_read_lock();
        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        if (flow_table && flow_id <= flow_table->mask) {
                rflow = &flow_table->flows[flow_id];
-               cpu = READ_ONCE(rflow->cpu);
-               if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
-                   ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+               cpu_qid = READ_ONCE(rflow->cpu_qid);
+               if (rflow->filter == filter_id && !cpu_qid.use_qid &&
+                   cpu_qid.cpu < nr_cpu_ids &&
+                   ((int)(per_cpu(softnet_data, cpu_qid.cpu).input_queue_head -
                           rflow->last_qtail) <
                     (int)(10 * flow_table->mask)))
                        expire = false;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e353b822bb15..56d27463d466 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -858,7 +858,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 
                table->mask = mask;
                for (count = 0; count <= mask; count++)
-                       table->flows[count].cpu = RPS_NO_CPU;
+                       rps_dev_flow_clear(&table->flows[count]);
        } else {
                table = NULL;
        }
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 9c7d46fbb75a..d09471f29d89 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -65,12 +65,16 @@ static int rps_create_sock_flow_table(size_t size, size_t orig_size,
                                return -ENOMEM;
 
                        sock_table->mask = size - 1;
+                       rps_init_sock_masks(&sock_table->cpu_masks,
+                                           nr_cpu_ids);
+                       rps_init_sock_masks(&sock_table->queue_masks,
+                                           rps_max_num_queues);
                } else {
                        sock_table = orig_table;
                }
 
                for (i = 0; i < size; i++)
-                       sock_table->ents[i] = RPS_NO_CPU;
+                       sock_table->ents[i] = RPS_NO_CPU_QID;
        } else {
                sock_table = NULL;
        }
-- 
2.25.1
