> On 2013-9-3 13:03, Lei Li wrote:
>> Hi Frank,
>>
>> I failed to apply this patch. Please make sure to use git-send-email;
>> otherwise it's a little hard to review. :)
>>
>> On 08/30/2013 08:39 PM, Frank Yang wrote:
>>> When several VMs migrate with RDMA at the same time, the increased
>>> pressure causes packet loss probabilistically and makes the source and
>>> destination wait for each other. Some VMs may stay blocked during the
>>> migration.
>>>
>>> Fix the bug by using two completion queues, for sending and receiving
>>> respectively.
>>
>>>
>>> From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
>>> From: Frank Yang <[email protected]>
>>> Date: Fri, 30 Aug 2013 17:53:34 +0800
>>> Subject: [PATCH] rdma: fix multiple VMs parallel migration
>>
>> The commit message should be here, within the patch. You can use
>> 'git commit --amend' to add it.
>>
>>>
>>> Signed-off-by: Frank Yang <[email protected]>
>>> ---
>>>  migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
>>>  1 file changed, 37 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/migration-rdma.c b/migration-rdma.c
>>> index 3d1266f..d0eacbb 100644
>>> --- a/migration-rdma.c
>>> +++ b/migration-rdma.c
>>> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>>>      struct ibv_qp *qp;                      /* queue pair */
>>>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>>>      struct ibv_pd *pd;                      /* protection domain */
>>> -    struct ibv_cq *cq;                      /* completion queue */
>>> +    struct ibv_cq *send_cq;                 /* send completion queue */
>>> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>>>
>>>      /*
>>>       * If a previous write failed (perhaps because of a failed
>>> @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>>>       * Completion queue can be filled by both read and write work requests,
>>>       * so must reflect the sum of both possible queue sizes.
>>>       */
>>> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
>>> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>>>                               NULL, rdma->comp_channel, 0);
>>> -    if (!rdma->cq) {
>>> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
>>> +                                  rdma->comp_channel, 0);
>>> +
>>> +    if (!rdma->send_cq || !rdma->recv_cq) {
>>>          fprintf(stderr, "failed to allocate completion queue\n");
>>>          goto err_alloc_pd_cq;
>>>      }
>>> @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>>>      attr.cap.max_recv_wr = 3;
>>>      attr.cap.max_send_sge = 1;
>>>      attr.cap.max_recv_sge = 1;
>>> -    attr.send_cq = rdma->cq;
>>> -    attr.recv_cq = rdma->cq;
>>> +    attr.send_cq = rdma->send_cq;
>>> +    attr.recv_cq = rdma->recv_cq;
>>>      attr.qp_type = IBV_QPT_RC;
>>>
>>>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
>>> @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>>>   * Return the work request ID that completed.
>>>   */
>>>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
>>> -                               uint32_t *byte_len)
>>> +                               uint32_t *byte_len, int wrid_requested)
>>>  {
>>>      int ret;
>>>      struct ibv_wc wc;
>>>      uint64_t wr_id;
>>>
>>> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
>>> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
>>> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
>>> +    }
>>>
>>>      if (!ret) {
>>>          *wr_id_out = RDMA_WRID_NONE;
>>> @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>      void *cq_ctx;
>>>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>>>
>>> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
>>> -        return -1;
>>> -    }
>>>      /* poll cq first */
>>>      while (wr_id != wrid_requested) {
>>> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>>          if (ret < 0) {
>>>              return ret;
>>>          }
>>> @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>      }
>>>
>>>      while (1) {
>>> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
>>> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
>>> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
>>> +                return -1;
>>> +            }
>>> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
>>> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
>>> +                return -1;
>>> +            }
>>> +        }
>>> +
>>>          /*
>>>           * Coroutine doesn't start until process_incoming_migration()
>>>           * so don't yield unless we know we're running inside of a coroutine.
>>> @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>>>
>>>          num_cq_events++;
>>>
>>> -        if (ibv_req_notify_cq(cq, 0)) {
>>> -            goto err_block_for_wrid;
>>> -        }
>>> -
>>>          while (wr_id != wrid_requested) {
>>> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
>>> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>>>              if (ret < 0) {
>>>                  goto err_block_for_wrid;
>>>              }
>>> @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>>>          ibv_destroy_qp(rdma->qp);
>>>          rdma->qp = NULL;
>>>      }
>>> -    if (rdma->cq) {
>>> -        ibv_destroy_cq(rdma->cq);
>>> -        rdma->cq = NULL;
>>> +    if (rdma->send_cq) {
>>> +        ibv_destroy_cq(rdma->send_cq);
>>> +        rdma->send_cq = NULL;
>>> +    }
>>> +    if (rdma->recv_cq) {
>>> +        ibv_destroy_cq(rdma->recv_cq);
>>> +        rdma->recv_cq = NULL;
>>>      }
>>>      if (rdma->comp_channel) {
>>>          ibv_destroy_comp_channel(rdma->comp_channel);
>>> @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
>>>       */
>>>      while (1) {
>>>          uint64_t wr_id, wr_id_in;
>>> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
>>> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
>>>          if (ret < 0) {
>>>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>>>              goto err;
>>> --
>>> 1.8.3.msysgit.0
>>>
>>>
>>
>>
> Understood. Thank you. The following patch should be fine.
>
> From 7b7d2c5b51c53c23f7194d35b469dedd892ef89f Mon Sep 17 00:00:00 2001
> From: Frank Yang <[email protected]>
> Date: Tue, 3 Sep 2013 18:26:54 +0800
> Subject: [PATCH] rdma: fix multiple VMs parallel migration
>
> Signed-off-by: Frank Yang <[email protected]>
> ---
>  migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 42 insertions(+), 22 deletions(-)
>
> diff --git a/migration-rdma.c b/migration-rdma.c
> index 3d1266f..30f8c11 100644
> --- a/migration-rdma.c
> +++ b/migration-rdma.c
> @@ -362,7 +362,8 @@ typedef struct RDMAContext {
>      struct ibv_qp *qp;                      /* queue pair */
>      struct ibv_comp_channel *comp_channel;  /* completion channel */
>      struct ibv_pd *pd;                      /* protection domain */
> -    struct ibv_cq *cq;                      /* completion queue */
> +    struct ibv_cq *send_cq;                 /* send completion queue */
> +    struct ibv_cq *recv_cq;                 /* receive completion queue */
>
>      /*
>       * If a previous write failed (perhaps because of a failed
> @@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
>      }
>
>      /*
> -     * Completion queue can be filled by both read and write work requests,
> -     * so must reflect the sum of both possible queue sizes.
> +     * Create two completion queues for sending and receiving
> +     * respectively.
> +     * Send completion queue can be filled by both send and
> +     * write work requests, so must reflect the sum of both
> +     * possible queue sizes.
>       */
> -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
>                               NULL, rdma->comp_channel, 0);
> -    if (!rdma->cq) {
> +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
> +                                  rdma->comp_channel, 0);
> +
> +    if (!rdma->send_cq || !rdma->recv_cq) {
>          fprintf(stderr, "failed to allocate completion queue\n");
>          goto err_alloc_pd_cq;
>      }
> @@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>      attr.cap.max_recv_wr = 3;
>      attr.cap.max_send_sge = 1;
>      attr.cap.max_recv_sge = 1;
> -    attr.send_cq = rdma->cq;
> -    attr.recv_cq = rdma->cq;
> +    attr.send_cq = rdma->send_cq;
> +    attr.recv_cq = rdma->recv_cq;
>      attr.qp_type = IBV_QPT_RC;
>
>      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> @@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
>   * Return the work request ID that completed.
>   */
>  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> -                               uint32_t *byte_len)
> +                               uint32_t *byte_len, int wrid_requested)
>  {
>      int ret;
>      struct ibv_wc wc;
>      uint64_t wr_id;
>
> -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> +    }
>
>      if (!ret) {
>          *wr_id_out = RDMA_WRID_NONE;
> @@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>      void *cq_ctx;
>      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
>
> -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> -        return -1;
> -    }
>      /* poll cq first */
>      while (wr_id != wrid_requested) {
> -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>          if (ret < 0) {
>              return ret;
>          }
> @@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>      }
>
>      while (1) {
> +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> +                return -1;
> +            }
> +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> +                return -1;
> +            }
> +        }
> +
>          /*
>           * Coroutine doesn't start until process_incoming_migration()
>           * so don't yield unless we know we're running inside of a coroutine.
> @@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
>
>          num_cq_events++;
>
> -        if (ibv_req_notify_cq(cq, 0)) {
> -            goto err_block_for_wrid;
> -        }
> -
>          while (wr_id != wrid_requested) {
> -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
>              if (ret < 0) {
>                  goto err_block_for_wrid;
>              }
> @@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>          ibv_destroy_qp(rdma->qp);
>          rdma->qp = NULL;
>      }
> -    if (rdma->cq) {
> -        ibv_destroy_cq(rdma->cq);
> -        rdma->cq = NULL;
> +    if (rdma->send_cq) {
> +        ibv_destroy_cq(rdma->send_cq);
> +        rdma->send_cq = NULL;
> +    }
> +    if (rdma->recv_cq) {
> +        ibv_destroy_cq(rdma->recv_cq);
> +        rdma->recv_cq = NULL;
>      }
>      if (rdma->comp_channel) {
>          ibv_destroy_comp_channel(rdma->comp_channel);
> @@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
>       */
>      while (1) {
>          uint64_t wr_id, wr_id_in;
> -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
>          if (ret < 0) {
>              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
>              goto err;
> --
> 1.8.3.msysgit.0
>
>
Sorry, my bad. Please use this patch instead:

From: Frank Yang <[email protected]>

Signed-off-by: Frank Yang <[email protected]>
---
 migration-rdma.c | 64 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/migration-rdma.c b/migration-rdma.c
index 3d1266f..f3206c4 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -362,7 +362,8 @@ typedef struct RDMAContext {
     struct ibv_qp *qp;                      /* queue pair */
     struct ibv_comp_channel *comp_channel;  /* completion channel */
     struct ibv_pd *pd;                      /* protection domain */
-    struct ibv_cq *cq;                      /* completion queue */
+    struct ibv_cq *send_cq;                 /* send completion queue */
+    struct ibv_cq *recv_cq;                 /* receive completion queue */

     /*
      * If a previous write failed (perhaps because of a failed
@@ -1003,12 +1004,18 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
     }

     /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Create two completion queues for sending and receiving
+     * respectively.
+     * Send completion queue can be filled by both send and
+     * write work requests, so must reflect the sum of both
+     * possible queue sizes.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
                              NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
+                                  rdma->comp_channel, 0);
+
+    if (!rdma->send_cq || !rdma->recv_cq) {
         fprintf(stderr, "failed to allocate completion queue\n");
         goto err_alloc_pd_cq;
     }
@@ -1040,8 +1047,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;

     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1361,13 +1368,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  * Return the work request ID that completed.
  */
 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+                               uint32_t *byte_len, int wrid_requested)
 {
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;

-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+        wrid_requested == RDMA_WRID_SEND_CONTROL) {
+        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
+    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
+    }

     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1460,12 +1472,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

-    if (ibv_req_notify_cq(rdma->cq, 0)) {
-        return -1;
-    }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
         if (ret < 0) {
             return ret;
         }
@@ -1487,6 +1496,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     }

     while (1) {
+        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
+            wrid_requested == RDMA_WRID_SEND_CONTROL) {
+            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
+                return -1;
+            }
+        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
+            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
+                return -1;
+            }
+        }
+
         /*
          * Coroutine doesn't start until process_incoming_migration()
          * so don't yield unless we know we're running inside of a coroutine.
@@ -1502,12 +1522,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,

         num_cq_events++;

-        if (ibv_req_notify_cq(cq, 0)) {
-            goto err_block_for_wrid;
-        }
-
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2236,9 +2252,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         ibv_destroy_qp(rdma->qp);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     if (rdma->comp_channel) {
         ibv_destroy_comp_channel(rdma->comp_channel);
@@ -2770,7 +2790,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
         if (ret < 0) {
             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
             goto err;
--
1.8.3.msysgit.0
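
A note on the design, for reviewers who want to see the event flow in isolation: both CQs still share the single rdma->comp_channel, so ibv_get_cq_event() can wake up for either queue and reports which CQ fired; the patch instead chooses the CQ to poll from wrid_requested. Below is a minimal standalone sketch (not QEMU code) of the shared-channel pattern the patch builds on. It assumes an already-opened struct ibv_context *ctx, the queue depths are made up, and most error handling is trimmed:

/* Sketch: two CQs on one completion channel. Arm both, block on the
 * channel, then poll whichever CQ the event names. Assumes an opened
 * struct ibv_context *ctx; depths 64/32 are arbitrary for illustration. */
#include <stdio.h>
#include <infiniband/verbs.h>

static int drain_events(struct ibv_context *ctx)
{
    struct ibv_comp_channel *ch = ibv_create_comp_channel(ctx);
    struct ibv_cq *send_cq = ibv_create_cq(ctx, 64, NULL, ch, 0);
    struct ibv_cq *recv_cq = ibv_create_cq(ctx, 32, NULL, ch, 0);

    if (!ch || !send_cq || !recv_cq) {
        return -1;
    }

    /* Arm both queues; a CQ must be re-armed after every event. */
    ibv_req_notify_cq(send_cq, 0);
    ibv_req_notify_cq(recv_cq, 0);

    for (;;) {
        struct ibv_cq *ev_cq;
        void *ev_ctx;
        struct ibv_wc wc;

        /* Blocks until either CQ generates a completion event. */
        if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx)) {
            return -1;
        }
        ibv_ack_cq_events(ev_cq, 1);

        /* Re-arm the queue that fired, then drain its completions. */
        if (ibv_req_notify_cq(ev_cq, 0)) {
            return -1;
        }
        while (ibv_poll_cq(ev_cq, 1, &wc) > 0) {
            printf("%s completion: wr_id %llu, status %d\n",
                   ev_cq == send_cq ? "send" : "recv",
                   (unsigned long long)wc.wr_id, (int)wc.status);
        }
    }
}

The send CQ is the larger one for the same reason as in the patch: both RDMA-write and control-send completions land on it, while receives get a queue of their own.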
