> On 2013-9-3 13:03, Lei Li wrote:
>> Hi Frank,
>>
>> I failed to apply this patch. Please make sure to use git-send-email;
>> otherwise it's a little hard to review. :)
>>
>> On 08/30/2013 08:39 PM, Frank Yang wrote:
>>> When several VMs migrate with RDMA at the same time, the increased
>>> pressure causes packet loss probabilistically and makes the source and
>>> destination wait for each other. Some VMs may stay blocked during the
>>> migration.
>>>
>>> Fix the bug by using two completion queues, for sending and receiving
>>> respectively.
>>
>>>
>>> From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
>>> From: Frank Yang <[email protected]>
>>> Date: Fri, 30 Aug 2013 17:53:34 +0800
>>> Subject: [PATCH] rdma: fix multiple VMs parallel migration
>>
>> The commit message should be here, within the patch. You can use
>> 'git commit --amend' to add it.
>>
>>>
>>> Signed-off-by: Frank Yang <[email protected]>
>>> ---
>>>  migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
>>>  1 file changed, 37 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/migration-rdma.c b/migration-rdma.c
>>> index 3d1266f..d0eacbb 100644
>>> --- a/migration-rdma.c
>>> +++ b/migration-rdma.c
>>> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>>>      struct ibv_qp *qp;                      /* queue pair */
>>>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>>>      struct ibv_pd *pd;                      /* protection domain */
>>> -    struct ibv_cq *cq;                      /* completion queue */
>>> +    struct ibv_cq *send_cq;                 /* send completion queue */
>>> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>>>
>>>      /*
>>>       * If a previous write failed (perhaps because of a failed
>>> @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>>>       * Completion queue can be filled by both read and write work requests,
>>>       * so must reflect the sum of both possible queue sizes.
>>>       */
>>> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
>>> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>>>                               NULL, rdma->comp_channel, 0);
>>> -    if (!rdma->cq) {
>>> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
>>> +                                  rdma->comp_channel, 0);
>>> +
>>> +    if (!rdma->send_cq || !rdma->recv_cq) {
>>>          fprintf(stderr, "failed to allocate completion queue\n");
>>>          goto err_alloc_pd_cq;
>>>      }
>>> @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>>>      attr.cap.max_recv_wr = 3;
>>>      attr.cap.max_send_sge = 1;
>>>      attr.cap.max_recv_sge = 1;
>>> -    attr.send_cq = rdma->cq;
>>> -    attr.recv_cq = rdma->cq;
>>> +    attr.send_cq = rdma->send_cq;
>>> +    attr.recv_cq = rdma->recv_cq;
>>>      attr.qp_type = IBV_QPT_RC;
>>>
>>>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
>>> @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>>>   * Return the work request ID that completed.
>>>   */
>>>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
>>> -                               uint32_t *byte_len)
>>> +                               uint32_t *byte_len, int wrid_requested)
>>>  {
>>>      int ret;
>>>      struct ibv_wc wc;
>>>      uint64_t wr_id;
>>>
>>> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
>>> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
>>> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
>>> +    }
>>>
>>>      if (!ret) {
>>>          *wr_id_out = RDMA_WRID_NONE;
>>> @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>      void *cq_ctx;
>>>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>>>
>>> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
>>> -        return -1;
>>> -    }
>>>      /* poll cq first */
>>>      while (wr_id != wrid_requested) {
>>> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>>          if (ret < 0) {
>>>              return ret;
>>>          }
>>> @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>      }
>>>
>>>      while (1) {
>>> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
>>> +                return -1;
>>> +            }
>>> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
>>> +                return -1;
>>> +            }
>>> +        }
>>> +
>>>          /*
>>>           * Coroutine doesn't start until process_incoming_migration()
>>>           * so don't yield unless we know we're running inside of a coroutine.
>>> @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>
>>>          num_cq_events++;
>>>
>>> -        if (ibv_req_notify_cq(cq, 0)) {
>>> -            goto err_block_for_wrid;
>>> -        }
>>> -
>>>          while (wr_id != wrid_requested) {
>>> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>>              if (ret < 0) {
>>>                  goto err_block_for_wrid;
>>>              }
>>> @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>>>          ibv_destroy_qp(rdma->qp);
>>>          rdma->qp = NULL;
>>>      }
>>> -    if (rdma->cq) {
>>> -        ibv_destroy_cq(rdma->cq);
>>> -        rdma->cq = NULL;
>>> +    if (rdma->send_cq) {
>>> +        ibv_destroy_cq(rdma->send_cq);
>>> +        rdma->send_cq = NULL;
>>> +    }
>>> +    if (rdma->recv_cq) {
>>> +        ibv_destroy_cq(rdma->recv_cq);
>>> +        rdma->recv_cq = NULL;
>>>      }
>>>      if (rdma->comp_channel) {
>>>          ibv_destroy_comp_channel(rdma->comp_channel);
>>> @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
>>>       */
>>>      while (1) {
>>>          uint64_t wr_id, wr_id_in;
>>> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
>>> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
>>>          if (ret < 0) {
>>>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>>>              goto err;
>>> --
>>> 1.8.3.msysgit.0
>>>
>>>
>>
>>
> Understood. Thank you. The following patch should be fine.
>
> From 7b7d2c5b51c53c23f7194d35b469dedd892ef89f Mon Sep 17 00:00:00 2001
> From: Frank Yang <[email protected]>
> Date: Tue, 3 Sep 2013 18:26:54 +0800
> Subject: [PATCH] rdma: fix multiple VMs parallel migration
>
> Signed-off-by: Frank Yang <[email protected]>
> ---
>  migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 42 insertions(+), 22 deletions(-)
>
> diff --git a/migration-rdma.c b/migration-rdma.c
> index 3d1266f..30f8c11 100644
> --- a/migration-rdma.c
> +++ b/migration-rdma.c
> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>      struct ibv_qp *qp;                      /* queue pair */
>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>      struct ibv_pd *pd;                      /* protection domain */
> -    struct ibv_cq *cq;                      /* completion queue */
> +    struct ibv_cq *send_cq;                 /* send completion queue */
> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>
>      /*
>       * If a previous write failed (perhaps because of a failed
> @@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>      }
>
>      /*
> -     * Completion queue can be filled by both read and write work requests,
> -     * so must reflect the sum of both possible queue sizes.
> +     * Create two completion queues for sending and receiving
> +     * respectively.
> +     * Send completion queue can be filled by both send and
> +     * write work requests, so must reflect the sum of both
> +     * possible queue sizes.
>       */
> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>                               NULL, rdma->comp_channel, 0);
> -    if (!rdma->cq) {
> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
> +                                  rdma->comp_channel, 0);
> +
> +    if (!rdma->send_cq || !rdma->recv_cq) {
>          fprintf(stderr, "failed to allocate completion queue\n");
>          goto err_alloc_pd_cq;
>      }
> @@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>      attr.cap.max_recv_wr = 3;
>      attr.cap.max_send_sge = 1;
>      attr.cap.max_recv_sge = 1;
> -    attr.send_cq = rdma->cq;
> -    attr.recv_cq = rdma->cq;
> +    attr.send_cq = rdma->send_cq;
> +    attr.recv_cq = rdma->recv_cq;
>      attr.qp_type = IBV_QPT_RC;
>
>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> @@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>   * Return the work request ID that completed.
>   */
>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> -                               uint32_t *byte_len)
> +                               uint32_t *byte_len, int wrid_requested)
>  {
>      int ret;
>      struct ibv_wc wc;
>      uint64_t wr_id;
>
> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> +    }
>
>      if (!ret) {
>          *wr_id_out = RDMA_WRID_NONE;
> @@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>      void *cq_ctx;
>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>
> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> -        return -1;
> -    }
>      /* poll cq first */
>      while (wr_id != wrid_requested) {
> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>          if (ret < 0) {
>              return ret;
>          }
> @@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>      }
>
>      while (1) {
> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> +                return -1;
> +            }
> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> +                return -1;
> +            }
> +        }
> +
>          /*
>           * Coroutine doesn't start until process_incoming_migration()
>           * so don't yield unless we know we're running inside of a coroutine.
> @@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>
>          num_cq_events++;
>
> -        if (ibv_req_notify_cq(cq, 0)) {
> -            goto err_block_for_wrid;
> -        }
> -
>          while (wr_id != wrid_requested) {
> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>              if (ret < 0) {
>                  goto err_block_for_wrid;
>              }
> @@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_destroy_qp(rdma->qp);
>          rdma->qp = NULL;
>      }
> -    if (rdma->cq) {
> -        ibv_destroy_cq(rdma->cq);
> -        rdma->cq = NULL;
> +    if (rdma->send_cq) {
> +        ibv_destroy_cq(rdma->send_cq);
> +        rdma->send_cq = NULL;
> +    }
> +    if (rdma->recv_cq) {
> +        ibv_destroy_cq(rdma->recv_cq);
> +        rdma->recv_cq = NULL;
>      }
>      if (rdma->comp_channel) {
>          ibv_destroy_comp_channel(rdma->comp_channel);
> @@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
>       */
>      while (1) {
>          uint64_t wr_id, wr_id_in;
> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
>          if (ret < 0) {
>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>              goto err;
> --
> 1.8.3.msysgit.0
>
>
Sorry, my bad. Please use this patch instead:

From: Frank Yang <[email protected]>

Signed-off-by: Frank Yang <[email protected]>
---
 migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..f3206c4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

     /*
      * If a previous write failed (perhaps because of a failed
@@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
     }

     /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Create two completion queues for sending and receiving
+     * respectively.
+     * Send completion queue can be filled by both send and
+     * write work requests, so must reflect the sum of both
+     * possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
                              NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+                                  rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");
         goto err_alloc_pd_cq;
     }
@@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;

     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  * Return the work request ID that completed.
  */
 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
 {
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;

-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }

     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
         if (ret < 0) {
             return ret;
         }
@@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     }

     while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
         /*
          * Coroutine doesn't start until process_incoming_migration()
          * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,

         num_cq_events++;

-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_destroy_qp(rdma->qp);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     if (rdma->comp_channel) {
         ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
         if (ret < 0) {
             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
             goto err;
--
1.8.3.msysgit.0
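
A note on the design, for reviewers who want to see the event flow in isolation: both CQs still share the single rdma->comp_channel, so ibv_get_cq_event() can wake up for either queue and reports which CQ fired; the patch instead chooses the CQ to poll from wrid_requested. Below is a minimal standalone sketch (not QEMU code) of the shared-channel pattern the patch builds on. It assumes an already-opened struct ibv_context *ctx, the queue depths are made up, and most error handling is trimmed:

/* Sketch: two CQs on one completion channel. Arm both, block on the
 * channel, then poll whichever CQ the event names. Assumes an opened
 * struct ibv_context *ctx; depths 64/32 are arbitrary for illustration. */
#include <stdio.h>
#include <infiniband/verbs.h>

static int drain_events(struct ibv_context *ctx)
{
    struct ibv_comp_channel *ch = ibv_create_comp_channel(ctx);
    struct ibv_cq *send_cq = ibv_create_cq(ctx, 64, NULL, ch, 0);
    struct ibv_cq *recv_cq = ibv_create_cq(ctx, 32, NULL, ch, 0);

    if (!ch || !send_cq || !recv_cq) {
        return -1;
    }

    /* Arm both queues; a CQ must be re-armed after every event. */
    ibv_req_notify_cq(send_cq, 0);
    ibv_req_notify_cq(recv_cq, 0);

    for (;;) {
        struct ibv_cq *ev_cq;
        void *ev_ctx;
        struct ibv_wc wc;

        /* Blocks until either CQ generates a completion event. */
        if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx)) {
            return -1;
        }
        ibv_ack_cq_events(ev_cq, 1);

        /* Re-arm the queue that fired, then drain its completions. */
        if (ibv_req_notify_cq(ev_cq, 0)) {
            return -1;
        }
        while (ibv_poll_cq(ev_cq, 1, &wc) > 0) {
            printf("%s completion: wr_id %llu, status %d\n",
                   ev_cq == send_cq ? "send" : "recv",
                   (unsigned long long)wc.wr_id, (int)wc.status);
        }
    }
}

The send CQ is the larger one for the same reason as in the patch: both RDMA-write and control-send completions land on it, while receives get a queue of their own.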
