On Mon, 2016-11-21 at 10:10 -0800, Eric Dumazet wrote:

> Not sure it this has been tried before, but the doorbell avoidance could
> be done by the driver itself, because it knows a TX completion will come
> shortly (well... if softirqs are not delayed too much !)
> 
> Doorbell would be forced only if :
> 
> (    "skb->xmit_more is not set" AND "TX engine is not 'started yet'" )
> OR
> ( too many [1] packets were put in TX ring buffer, no point deferring
> more)
> 
> Start the pump, but once it is started, let the doorbells being done by
> TX completion.
> 
> ndo_start_xmit and TX completion handler would have to maintain a shared
> state describing if packets were ready but doorbell deferred.
> 
> 
> Note that TX completion means "if at least one packet was drained",
> otherwise busy polling, constantly calling napi->poll() would force a
> doorbell too soon for devices sharing a NAPI for both RX and TX.
> 
> But then, maybe busy poll would like to force a doorbell...
> 
> I could try these ideas on mlx4 shortly.
> 
> 
> [1] limit could be derived from active "ethtool -c" params, eg tx-frames

I have a WIP, that increases pktgen rate by 75 % on mlx4 when bulking is
not used.

lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt 
lpaa23:~# sar -n DEV 1 10|grep eth0
22:43:26         eth0      0.00 5822800.00      0.00 597064.41      0.00      
0.00      1.00
22:43:27         eth0     24.00 5788237.00      2.09 593520.26      0.00      
0.00      0.00
22:43:28         eth0     12.00 5817777.00      1.43 596551.47      0.00      
0.00      1.00
22:43:29         eth0     22.00 5841516.00      1.61 598982.87      0.00      
0.00      0.00
22:43:30         eth0      4.00 4389137.00      0.71 450058.08      0.00      
0.00      1.00
22:43:31         eth0      4.00 5871008.00      0.72 602007.79      0.00      
0.00      0.00
22:43:32         eth0     12.00 5891809.00      1.43 604142.60      0.00      
0.00      1.00
22:43:33         eth0     10.00 5901904.00      1.12 605175.70      0.00      
0.00      0.00
22:43:34         eth0      5.00 5907982.00      0.69 605798.99      0.00      
0.00      1.00
22:43:35         eth0      2.00 5847086.00      0.12 599554.71      0.00      
0.00      0.00
Average:         eth0      9.50 5707925.60      0.99 585285.69      0.00      
0.00      0.50
lpaa23:~# echo 1 >/sys/class/net/eth0/doorbell_opt 
lpaa23:~# sar -n DEV 1 10|grep eth0
22:43:47         eth0      9.00 10226424.00      1.02 1048608.05      0.00      
0.00      1.00
22:43:48         eth0      1.00 10316955.00      0.06 1057890.89      0.00      
0.00      0.00
22:43:49         eth0      1.00 10310104.00      0.10 1057188.32      0.00      
0.00      1.00
22:43:50         eth0      0.00 10249423.00      0.00 1050966.23      0.00      
0.00      0.00
22:43:51         eth0      0.00 10210441.00      0.00 1046969.05      0.00      
0.00      1.00
22:43:52         eth0      2.00 10198389.00      0.16 1045733.17      0.00      
0.00      1.00
22:43:53         eth0      8.00 10079257.00      1.43 1033517.83      0.00      
0.00      0.00
22:43:54         eth0      0.00 7693509.00      0.00 788885.16      0.00      
0.00      0.00
22:43:55         eth0      2.00 10343076.00      0.20 1060569.32      0.00      
0.00      1.00
22:43:56         eth0      1.00 10224571.00      0.14 1048417.93      0.00      
0.00      0.00
Average:         eth0      2.40 9985214.90      0.31 1023874.60      0.00      
0.00      0.50

And about 11 % improvement on an mono-flow UDP_STREAM test.

skb_set_owner_w() is now the most consuming function.


lpaa23:~# ./udpsnd -4 -H 10.246.7.152 -d 2 &
[1] 13696
lpaa23:~# echo 0 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:50:47         eth0      3.00 1355422.00      0.45 319706.04      0.00      
0.00      0.00
22:50:48         eth0      2.00 1344270.00      0.42 317035.21      0.00      
0.00      1.00
22:50:49         eth0      3.00 1350503.00      0.51 318478.34      0.00      
0.00      0.00
22:50:50         eth0     29.00 1348593.00      2.86 318113.02      0.00      
0.00      1.00
22:50:51         eth0     14.00 1354855.00      1.83 319508.56      0.00      
0.00      0.00
22:50:52         eth0      7.00 1357794.00      0.73 320226.89      0.00      
0.00      1.00
22:50:53         eth0      5.00 1326130.00      0.63 312784.72      0.00      
0.00      0.00
22:50:54         eth0      2.00 994584.00      0.12 234598.40      0.00      
0.00      1.00
22:50:55         eth0      5.00 1318209.00      0.75 310932.46      0.00      
0.00      0.00
22:50:56         eth0     20.00 1323445.00      1.73 312178.11      0.00      
0.00      1.00
Average:         eth0      9.00 1307380.50      1.00 308356.18      0.00      
0.00      0.50
lpaa23:~# echo 3 >/sys/class/net/eth0/doorbell_opt
lpaa23:~# sar -n DEV 1 10|grep eth0
22:51:03         eth0      4.00 1512055.00      0.54 356599.40      0.00      
0.00      0.00
22:51:04         eth0      4.00 1507631.00      0.55 355609.46      0.00      
0.00      1.00
22:51:05         eth0      4.00 1487789.00      0.42 350917.47      0.00      
0.00      0.00
22:51:06         eth0      7.00 1474460.00      1.22 347811.16      0.00      
0.00      1.00
22:51:07         eth0      2.00 1496529.00      0.24 352995.18      0.00      
0.00      0.00
22:51:08         eth0      3.00 1485856.00      0.49 350425.65      0.00      
0.00      1.00
22:51:09         eth0      1.00 1114808.00      0.06 262905.38      0.00      
0.00      0.00
22:51:10         eth0      2.00 1510924.00      0.30 356397.53      0.00      
0.00      1.00
22:51:11         eth0      2.00 1506408.00      0.30 355345.76      0.00      
0.00      0.00
22:51:12         eth0      2.00 1499122.00      0.32 353668.75      0.00      
0.00      1.00
Average:         eth0      3.10 1459558.20      0.44 344267.57      0.00      
0.00      0.50

 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |    2 
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |   90 +++++++++++------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |    4 
 include/linux/netdevice.h                    |    1 
 net/core/net-sysfs.c                         |   18 +++
 5 files changed, 83 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..fbea83218fc0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1089,7 +1089,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
 
        if (polled) {
                if (doorbell_pending)
-                       mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq->ring]);
+                       mlx4_en_xmit_doorbell(dev, 
priv->tx_ring[TX_XDP][cq->ring]);
 
                mlx4_cq_set_ci(&cq->mcq);
                wmb(); /* ensure HW sees CQ consumer before we post new buffers 
*/
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 4b597dca5c52..affebb435679 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -67,7 +67,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        ring->size = size;
        ring->size_mask = size - 1;
        ring->sp_stride = stride;
-       ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
+       ring->full_size = ring->size - HEADROOM - 2*MAX_DESC_TXBBS;
 
        tmp = size * sizeof(struct mlx4_en_tx_info);
        ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
@@ -193,6 +193,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
        ring->sp_cqn = cq;
        ring->prod = 0;
        ring->cons = 0xffffffff;
+       ring->ncons = 0;
        ring->last_nr_txbb = 1;
        memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
        memset(ring->buf, 0, ring->buf_size);
@@ -227,9 +228,9 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
                       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
 }
 
-static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
+static inline bool mlx4_en_is_tx_ring_full(const struct mlx4_en_tx_ring *ring)
 {
-       return ring->prod - ring->cons > ring->full_size;
+       return READ_ONCE(ring->prod) - READ_ONCE(ring->cons) > ring->full_size;
 }
 
 static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
@@ -374,6 +375,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct 
mlx4_en_tx_ring *ring)
 
        /* Skip last polled descriptor */
        ring->cons += ring->last_nr_txbb;
+       ring->ncons += ring->last_nr_txbb;
        en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
                 ring->cons, ring->prod);
 
@@ -389,6 +391,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct 
mlx4_en_tx_ring *ring)
                                                !!(ring->cons & ring->size), 0,
                                                0 /* Non-NAPI caller */);
                ring->cons += ring->last_nr_txbb;
+               ring->ncons += ring->last_nr_txbb;
                cnt++;
        }
 
@@ -401,6 +404,38 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct 
mlx4_en_tx_ring *ring)
        return cnt;
 }
 
+void mlx4_en_xmit_doorbell(const struct net_device *dev,
+                          struct mlx4_en_tx_ring *ring)
+{
+
+       if (dev->doorbell_opt & 1) {
+               u32 oval = READ_ONCE(ring->prod_bell);
+               u32 nval = READ_ONCE(ring->prod);
+
+               if (oval == nval)
+                       return;
+
+               /* I can not tell yet if a cmpxchg() is needed or not */
+               if (dev->doorbell_opt & 2)
+                       WRITE_ONCE(ring->prod_bell, nval);
+               else
+                       if (cmpxchg(&ring->prod_bell, oval, nval) != oval)
+                               return;
+       }
+       /* Since there is no iowrite*_native() that writes the
+        * value as is, without byteswapping - using the one
+        * the doesn't do byteswapping in the relevant arch
+        * endianness.
+        */
+#if defined(__LITTLE_ENDIAN)
+       iowrite32(
+#else
+       iowrite32be(
+#endif
+                 ring->doorbell_qpn,
+                 ring->bf.uar->map + MLX4_SEND_DOORBELL);
+}
+
 static bool mlx4_en_process_tx_cq(struct net_device *dev,
                                  struct mlx4_en_cq *cq, int napi_budget)
 {
@@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
        wmb();
 
        /* we want to dirty this cache line once */
-       ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
-       ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+       WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
+       ring_cons += txbbs_skipped;
+       WRITE_ONCE(ring->cons, ring_cons);
+       WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
+
+       if (dev->doorbell_opt)
+               mlx4_en_xmit_doorbell(dev, ring);
 
        if (ring->free_tx_desc == mlx4_en_recycle_tx_desc)
                return done < budget;
@@ -725,29 +765,14 @@ static void mlx4_bf_copy(void __iomem *dst, const void 
*src,
        __iowrite64_copy(dst, src, bytecnt / 8);
 }
 
-void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
-{
-       wmb();
-       /* Since there is no iowrite*_native() that writes the
-        * value as is, without byteswapping - using the one
-        * the doesn't do byteswapping in the relevant arch
-        * endianness.
-        */
-#if defined(__LITTLE_ENDIAN)
-       iowrite32(
-#else
-       iowrite32be(
-#endif
-                 ring->doorbell_qpn,
-                 ring->bf.uar->map + MLX4_SEND_DOORBELL);
-}
 
 static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
                                  struct mlx4_en_tx_desc *tx_desc,
                                  union mlx4_wqe_qpn_vlan qpn_vlan,
                                  int desc_size, int bf_index,
                                  __be32 op_own, bool bf_ok,
-                                 bool send_doorbell)
+                                 bool send_doorbell,
+                                 const struct net_device *dev, int nr_txbb)
 {
        tx_desc->ctrl.qpn_vlan = qpn_vlan;
 
@@ -761,6 +786,7 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring 
*ring,
 
                wmb();
 
+               ring->prod += nr_txbb;
                mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
                             desc_size);
 
@@ -773,8 +799,9 @@ static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring 
*ring,
                 */
                dma_wmb();
                tx_desc->ctrl.owner_opcode = op_own;
+               ring->prod += nr_txbb;
                if (send_doorbell)
-                       mlx4_en_xmit_doorbell(ring);
+                       mlx4_en_xmit_doorbell(dev, ring);
                else
                        ring->xmit_more++;
        }
@@ -1017,8 +1044,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct 
net_device *dev)
                        op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
        }
 
-       ring->prod += nr_txbb;
-
        /* If we used a bounce buffer then copy descriptor back into place */
        if (unlikely(bounce))
                tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
@@ -1033,6 +1058,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct 
net_device *dev)
        }
        send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
 
+       /* Doorbell avoidance : We can omit doorbell if we know a TX completion
+        * will happen shortly.
+        */
+       if (send_doorbell &&
+           dev->doorbell_opt &&
+           (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
+               send_doorbell = false;
+
        real_size = (real_size / 16) & 0x3f;
 
        bf_ok &= desc_size <= MAX_BF && send_doorbell;
@@ -1043,7 +1076,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct 
net_device *dev)
                qpn_vlan.fence_size = real_size;
 
        mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index,
-                             op_own, bf_ok, send_doorbell);
+                             op_own, bf_ok, send_doorbell, dev, nr_txbb);
 
        if (unlikely(stop_queue)) {
                /* If queue was emptied after the if (stop_queue) , and before
@@ -1054,7 +1087,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct 
net_device *dev)
                 */
                smp_rmb();
 
-               ring_cons = ACCESS_ONCE(ring->cons);
                if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
                        netif_tx_wake_queue(ring->tx_queue);
                        ring->wake_queue++;
@@ -1158,8 +1190,6 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring 
*rx_ring,
        rx_ring->xdp_tx++;
        AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length);
 
-       ring->prod += nr_txbb;
-
        stop_queue = mlx4_en_is_tx_ring_full(ring);
        send_doorbell = stop_queue ||
                                *doorbell_pending > MLX4_EN_DOORBELL_BUDGET;
@@ -1173,7 +1203,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring 
*rx_ring,
                qpn_vlan.fence_size = real_size;
 
        mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, bf_index,
-                             op_own, bf_ok, send_doorbell);
+                             op_own, bf_ok, send_doorbell, dev, nr_txbb);
        *doorbell_pending = send_doorbell ? 0 : *doorbell_pending + 1;
 
        return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 574bcbb1b38f..c3fd0deda198 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -280,6 +280,7 @@ struct mlx4_en_tx_ring {
         */
        u32                     last_nr_txbb;
        u32                     cons;
+       u32                     ncons;
        unsigned long           wake_queue;
        struct netdev_queue     *tx_queue;
        u32                     (*free_tx_desc)(struct mlx4_en_priv *priv,
@@ -290,6 +291,7 @@ struct mlx4_en_tx_ring {
 
        /* cache line used and dirtied in mlx4_en_xmit() */
        u32                     prod ____cacheline_aligned_in_smp;
+       u32                     prod_bell;
        unsigned int            tx_dropped;
        unsigned long           bytes;
        unsigned long           packets;
@@ -699,7 +701,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring 
*rx_ring,
                               struct mlx4_en_rx_alloc *frame,
                               struct net_device *dev, unsigned int length,
                               int tx_ind, int *doorbell_pending);
-void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring);
+void mlx4_en_xmit_doorbell(const struct net_device *dev, struct 
mlx4_en_tx_ring *ring);
 bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
                        struct mlx4_en_rx_alloc *frame);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..39565b5425a6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1816,6 +1816,7 @@ struct net_device {
        DECLARE_HASHTABLE       (qdisc_hash, 4);
 #endif
        unsigned long           tx_queue_len;
+       unsigned long           doorbell_opt;
        spinlock_t              tx_global_lock;
        int                     watchdog_timeo;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b0c04cf4851d..df05f81f5150 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -367,6 +367,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
 }
 NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
 
+static int change_doorbell_opt(struct net_device *dev, unsigned long val)
+{
+       dev->doorbell_opt = val;
+       return 0;
+}
+
+static ssize_t doorbell_opt_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t len)
+{
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       return netdev_store(dev, attr, buf, len, change_doorbell_opt);
+}
+NETDEVICE_SHOW_RW(doorbell_opt, fmt_ulong);
+
 static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t len)
 {
@@ -531,6 +548,7 @@ static struct attribute *net_class_attrs[] = {
        &dev_attr_phys_port_name.attr,
        &dev_attr_phys_switch_id.attr,
        &dev_attr_proto_down.attr,
+       &dev_attr_doorbell_opt.attr,
        NULL,
 };
 ATTRIBUTE_GROUPS(net_class);


Reply via email to