From: Yamin Friedman <yam...@mellanox.com>

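Add an interface to the InfiniBand core that applies rdma_dim adaptive
moderation to completion queues. A new allocation helper,
ib_alloc_cq_dim(), allocates an ib_cq that uses rdma_dim; if the device
does not implement modify_cq, the CQ is allocated without moderation.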

Performance improvement (ConnectX-5 100GbE, x86) running the FIO benchmark
over NVMf between two identical end-hosts with 56 cores, connected across a
Mellanox switch, using a null_blk device:

IO READS without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.8GiB/s | 7.7M | 1401  usec               | 2442  usec
4k       | 7.0GiB/s | 1.8M | 4817  usec               | 6587  usec
64k      | 10.7GiB/s| 175k | 9896  usec               | 10028 usec

IO WRITES without DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 3.6GiB/s | 7.5M | 1434  usec               | 2474  usec
4k       | 6.3GiB/s | 1.6M | 938   usec               | 1221  usec
64k      | 10.7GiB/s| 175k | 8979  usec               | 12780 usec

IO READS with DIM:
blk size | BW       | IOPS | 99th percentile latency  | 99.99th latency
512B     | 4GiB/s   | 8.2M | 816    usec              | 889   usec
4k       | 10.1GiB/s| 2.65M| 3359   usec              | 5080  usec
64k      | 10.7GiB/s| 175k | 9896   usec              | 10028 usec

IO WRITES with DIM:
blk size | BW       | IOPS  | 99th percentile latency | 99.99th latency
512B     | 3.9GiB/s | 8.1M  | 799   usec              | 922   usec
4k       | 9.6GiB/s | 2.5M  | 717   usec              | 1004  usec
64k      | 10.7GiB/s| 176k  | 8586  usec              | 12256 usec

The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.

Signed-off-by: Yamin Friedman <yam...@mellanox.com>
Reviewed-by: Max Gurtovoy <m...@mellanox.com>
Signed-off-by: Saeed Mahameed <sae...@mellanox.com>
---
 drivers/infiniband/core/cq.c                  | 78 ++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/Kconfig    |  1 +
 .../net/ethernet/mellanox/mlx5/core/Kconfig   |  1 +
 include/rdma/ib_verbs.h                       | 27 ++++++-
 4 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a4c81992267c..326d928d2763 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>
+#include <linux/rdma_dim.h>
 
 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH                  16
@@ -26,6 +27,32 @@
 #define IB_POLL_FLAGS \
        (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+       struct dim *dim = container_of(w, struct dim, work);
+       struct ib_cq *cq = container_of(dim, struct ib_cq, dim);
+
+       u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+       u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+       dim->state = DIM_START_MEASURE;
+
+       cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static bool rdma_dim_init(struct dim *dim, struct ib_cq *cq)
+{
+       if (!cq->device->ops.modify_cq)
+               return false;
+
+       memset(dim, 0, sizeof(*dim));
+       dim->state = DIM_START_MEASURE;
+       dim->tune_state = DIM_GOING_RIGHT;
+       dim->profile_ix = RDMA_DIM_START_PROFILE;
+
+       return true;
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
                           int batch)
 {
@@ -98,6 +125,24 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
        return completed;
 }
 
+static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
+{
+       struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+       int completed;
+       struct dim *dim = &cq->dim;
+
+       completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+       if (completed < budget) {
+               irq_poll_complete(&cq->iop);
+               if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+                       irq_poll_sched(&cq->iop);
+       }
+
+       rdma_dim(dim, completed);
+
+       return completed;
+}
+
 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 {
        irq_poll_sched(&cq->iop);
@@ -105,14 +150,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 
 static void ib_cq_poll_work(struct work_struct *work)
 {
-       struct ib_cq *cq = container_of(work, struct ib_cq, work);
+       struct ib_cq *cq = container_of(work, struct ib_cq,
+                                       work);
        int completed;
 
        completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
                                    IB_POLL_BATCH);
+
        if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
            ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
                queue_work(cq->comp_wq, &cq->work);
+       else if (cq->dim_used)
+               rdma_dim(&cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -129,6 +178,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
  * @poll_ctx:          context to poll the CQ from.
  * @caller:            module owner name.
  * @udata:             Valid user data or NULL for kernel object
+ * @use_dim:           use dynamic interrupt moderation
  *
  * This is the proper interface to allocate a CQ for in-kernel users. A
  * CQ allocated with this interface will automatically be polled from the
@@ -138,7 +188,8 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
                                 int nr_cqe, int comp_vector,
                                 enum ib_poll_context poll_ctx,
-                                const char *caller, struct ib_udata *udata)
+                                const char *caller, struct ib_udata *udata,
+                                bool use_dim)
 {
        struct ib_cq_init_attr cq_attr = {
                .cqe            = nr_cqe,
@@ -173,13 +224,30 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
        case IB_POLL_SOFTIRQ:
                cq->comp_handler = ib_cq_completion_softirq;
 
-               irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+               if (use_dim)
+                       cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+               if (cq->dim_used) {
+                       irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+                                     ib_poll_dim_handler);
+                       INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+               } else {
+                       irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+                                     ib_poll_handler);
+               }
+
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cq->comp_handler = ib_cq_completion_workqueue;
                INIT_WORK(&cq->work, ib_cq_poll_work);
+               if (use_dim)
+                       cq->dim_used = rdma_dim_init(&cq->dim, cq);
+
+               if (cq->dim_used)
+                       INIT_WORK(&cq->dim.work, ib_cq_rdma_dim_work);
+
                ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
                cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
                                ib_comp_wq : ib_comp_unbound_wq;
@@ -217,10 +285,14 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
                break;
        case IB_POLL_SOFTIRQ:
                irq_poll_disable(&cq->iop);
+               if (cq->dim_used)
+                       cancel_work_sync(&cq->dim.work);
                break;
        case IB_POLL_WORKQUEUE:
        case IB_POLL_UNBOUND_WORKQUEUE:
                cancel_work_sync(&cq->work);
+               if (cq->dim_used)
+                       cancel_work_sync(&cq->dim.work);
                break;
        default:
                WARN_ON_ONCE(1);
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
index e69c3c31e701..93cd25997b24 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -28,6 +28,7 @@ config MLX4_CORE
        tristate
        depends on PCI
        select NET_DEVLINK
+       select DIMLIB
        default n
 
 config MLX4_DEBUG
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..ef292fbb53c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -7,6 +7,7 @@ config MLX5_CORE
        tristate "Mellanox 5th generation network adapters (ConnectX series) core driver"
        depends on PCI
        select NET_DEVLINK
+       select DIMLIB
        imply PTP_1588_CLOCK
        imply VXLAN
        imply MLXFW
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0742095355f2..7b03fb3e4f0b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,7 @@
 #include <rdma/restrack.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
+#include <linux/dim.h>
 
 #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
 
@@ -1638,6 +1639,8 @@ struct ib_cq {
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
+       struct dim              dim;
+       bool                    dim_used;
 };
 
 struct ib_srq {
@@ -3746,7 +3749,8 @@ static inline int ib_post_recv(struct ib_qp *qp,
 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
                                 int nr_cqe, int comp_vector,
                                 enum ib_poll_context poll_ctx,
-                                const char *caller, struct ib_udata *udata);
+                                const char *caller, struct ib_udata *udata,
+                                bool use_dim);
 
 /**
  * ib_alloc_cq_user: Allocate kernel/user CQ
@@ -3764,7 +3768,7 @@ static inline struct ib_cq *ib_alloc_cq_user(struct ib_device *dev,
                                             struct ib_udata *udata)
 {
        return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
-                                 KBUILD_MODNAME, udata);
+                                 KBUILD_MODNAME, udata, false);
 }
 
 /**
@@ -3785,6 +3789,25 @@ static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
                                NULL);
 }
 
+/**
+ * ib_alloc_cq_dim: Allocate kernel CQ with dynamic interrupt moderation
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @comp_vector: Completion vector used for the IRQs
+ * @poll_ctx: Context used for polling the CQ
+ *
+ * NOTE: for user cq use ib_alloc_cq_user with valid udata!
+ */
+static inline struct ib_cq *ib_alloc_cq_dim(struct ib_device *dev,
+                                           void *private, int nr_cqe,
+                                           int comp_vector,
+                                           enum ib_poll_context poll_ctx)
+{
+       return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+                                 KBUILD_MODNAME, NULL, true);
+}
+
 /**
  * ib_free_cq_user - Free kernel/user CQ
  * @cq: The CQ to free
-- 
2.21.0

Reply via email to