On Wed, Feb 21, 2024 at 8:00 AM Elena Ufimtseva <[email protected]> wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang <[email protected]> wrote:
>>
>> 1. Implements the zero page detection and handling on the multifd
>> threads for non-compression, zlib and zstd compression backends.
>> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
>> 3. Add proper asserts to ensure pages->normal are used for normal pages
>> in all scenarios.
>>
>> Signed-off-by: Hao Xiang <[email protected]>
>> ---
>> migration/meson.build | 1 +
>> migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
>> migration/multifd-zlib.c | 26 ++++++++++++---
>> migration/multifd-zstd.c | 25 ++++++++++++---
>> migration/multifd.c | 50 +++++++++++++++++++++++------
>> migration/multifd.h | 7 +++++
>> qapi/migration.json | 4 ++-
>> 7 files changed, 151 insertions(+), 21 deletions(-)
>> create mode 100644 migration/multifd-zero-page.c
>>
>> diff --git a/migration/meson.build b/migration/meson.build
>> index 92b1cc4297..1eeb915ff6 100644
>> --- a/migration/meson.build
>> +++ b/migration/meson.build
>> @@ -22,6 +22,7 @@ system_ss.add(files(
>> 'migration.c',
>> 'multifd.c',
>> 'multifd-zlib.c',
>> + 'multifd-zero-page.c',
>> 'ram-compress.c',
>> 'options.c',
>> 'postcopy-ram.c',
>> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
>> new file mode 100644
>> index 0000000000..f0cd8e2c53
>> --- /dev/null
>> +++ b/migration/multifd-zero-page.c
>> @@ -0,0 +1,59 @@
>> +/*
>> + * Multifd zero page detection implementation.
>> + *
>> + * Copyright (c) 2024 Bytedance Inc
>> + *
>> + * Authors:
>> + * Hao Xiang <[email protected]>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/cutils.h"
>> +#include "exec/ramblock.h"
>> +#include "migration.h"
>> +#include "multifd.h"
>> +#include "options.h"
>> +#include "ram.h"
>> +
>> +void multifd_zero_page_check_send(MultiFDSendParams *p)
>> +{
>> + /*
>> + * QEMU older than 9.0 don't understand zero page
>> + * on multifd channel. This switch is required to
>> + * maintain backward compatibility.
>> + */
>> + bool use_multifd_zero_page =
>> + (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
>> + MultiFDPages_t *pages = p->pages;
>> + RAMBlock *rb = pages->block;
>> +
>> + assert(pages->num != 0);
>
>
> Not needed, the check is done right before calling send_prepare.
>
>>
>> + assert(pages->normal_num == 0);
>> + assert(pages->zero_num == 0);
>
>
> Why these asserts are needed?
The idea is that when multifd_zero_page_check_send is called, I want
to make sure zero page checking was not processed on this packet
before. It is perhaps redundant. It won't compile in free build.
>>
>> +
>>
>> + for (int i = 0; i < pages->num; i++) {
>> + uint64_t offset = pages->offset[i];
>> + if (use_multifd_zero_page &&
>> + buffer_is_zero(rb->host + offset, p->page_size)) {
>> + pages->zero[pages->zero_num] = offset;
>> + pages->zero_num++;
>> + ram_release_page(rb->idstr, offset);
>> + } else {
>> + pages->normal[pages->normal_num] = offset;
>> + pages->normal_num++;
>> + }
>> + }
>> +}
>> +
>> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
>> +{
>> + for (int i = 0; i < p->zero_num; i++) {
>> + void *page = p->host + p->zero[i];
>> + if (!buffer_is_zero(page, p->page_size)) {
>> + memset(page, 0, p->page_size);
>> + }
>> + }
>> +}
>> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> index 012e3bdea1..cdfe0fa70e 100644
>> --- a/migration/multifd-zlib.c
>> +++ b/migration/multifd-zlib.c
>> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> int ret;
>> uint32_t i;
>>
>> + multifd_zero_page_check_send(p);
>> +
>> + if (!pages->normal_num) {
>> + p->next_packet_size = 0;
>> + goto out;
>> + }
>> +
>> multifd_send_prepare_header(p);
>>
>> - for (i = 0; i < pages->num; i++) {
>> + for (i = 0; i < pages->normal_num; i++) {
>> uint32_t available = z->zbuff_len - out_size;
>> int flush = Z_NO_FLUSH;
>>
>> - if (i == pages->num - 1) {
>> + if (i == pages->normal_num - 1) {
>> flush = Z_SYNC_FLUSH;
>> }
>>
>> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error
>> **errp)
>> * with compression. zlib does not guarantee that this is safe,
>> * therefore copy the page before calling deflate().
>> */
>> - memcpy(z->buf, p->pages->block->host + pages->offset[i],
>> p->page_size);
>> + memcpy(z->buf, p->pages->block->host + pages->normal[i],
>> p->page_size);
>> zs->avail_in = p->page_size;
>> zs->next_in = z->buf;
>>
>> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> p->iov[p->iovs_num].iov_len = out_size;
>> p->iovs_num++;
>> p->next_packet_size = out_size;
>> - p->flags |= MULTIFD_FLAG_ZLIB;
>>
>> +out:
>> + p->flags |= MULTIFD_FLAG_ZLIB;
>> multifd_send_fill_packet(p);
>> -
>
> Spurious?
We need to set the flag anyway otherwise the receiver side will complain.
>
>> return 0;
>> }
>>
>> @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error
>> **errp)
>> p->id, flags, MULTIFD_FLAG_ZLIB);
>> return -1;
>> }
>> +
>> + multifd_zero_page_check_recv(p);
>> +
>> + if (!p->normal_num) {
>> + assert(in_size == 0);
>> + return 0;
>
>
> return here will have no effect. Also, why is assert needed here?
We return here so we don't end up calling qio_channel_read_all with
buflen = 0. The assert makes sure that normal_num/next_packet_size
states are consistent.
> This change also does not seem to fit the description of the patch, probaby
> separate patch will be better.
These changes are needed to ensure basic functionalities are working
after enabling the multifd zero page format.
>
>>
>> + }
>> +
>> ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>>
>> if (ret != 0) {
>> @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error
>> **errp)
>> p->id, out_size, expected_size);
>> return -1;
>> }
>> +
>> return 0;
>> }
>>
>> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
>> index dc8fe43e94..27a1eba075 100644
>> --- a/migration/multifd-zstd.c
>> +++ b/migration/multifd-zstd.c
>> @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> int ret;
>> uint32_t i;
>>
>> + multifd_zero_page_check_send(p);
>> +
>> + if (!pages->normal_num) {
>> + p->next_packet_size = 0;
>> + goto out;
>> + }
>> +
>> multifd_send_prepare_header(p);
>>
>> z->out.dst = z->zbuff;
>> z->out.size = z->zbuff_len;
>> z->out.pos = 0;
>>
>> - for (i = 0; i < pages->num; i++) {
>> + for (i = 0; i < pages->normal_num; i++) {
>> ZSTD_EndDirective flush = ZSTD_e_continue;
>>
>> - if (i == pages->num - 1) {
>> + if (i == pages->normal_num - 1) {
>> flush = ZSTD_e_flush;
>> }
>> - z->in.src = p->pages->block->host + pages->offset[i];
>> + z->in.src = p->pages->block->host + pages->normal[i];
>> z->in.size = p->page_size;
>> z->in.pos = 0;
>>
>> @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> p->iov[p->iovs_num].iov_len = z->out.pos;
>> p->iovs_num++;
>> p->next_packet_size = z->out.pos;
>> - p->flags |= MULTIFD_FLAG_ZSTD;
>>
>> +out:
>> + p->flags |= MULTIFD_FLAG_ZSTD;
>> multifd_send_fill_packet(p);
>> -
>
> Spurious removal.
Can you elaborate on this?
>
>>
>> return 0;
>> }
>>
>> @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error
>> **errp)
>> p->id, flags, MULTIFD_FLAG_ZSTD);
>> return -1;
>> }
>> +
>> + multifd_zero_page_check_recv(p);
>> +
>> + if (!p->normal_num) {
>> + assert(in_size == 0);
>> + return 0;
>> + }
>> +
>
> Same question here about assert.
>
>>
>> ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>>
>> if (ret != 0) {
>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index a33dba40d9..fbb40ea10b 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -11,6 +11,7 @@
>> */
>>
>> #include "qemu/osdep.h"
>> +#include "qemu/cutils.h"
>> #include "qemu/rcu.h"
>> #include "exec/target_page.h"
>> #include "sysemu/sysemu.h"
>> @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> MultiFDPages_t *pages = p->pages;
>> int ret;
>>
>> + multifd_zero_page_check_send(p);
>> +
>> if (!use_zero_copy_send) {
>> /*
>> * Only !zerocopy needs the header in IOV; zerocopy will
>> @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p,
>> Error **errp)
>> multifd_send_prepare_header(p);
>> }
>>
>> - for (int i = 0; i < pages->num; i++) {
>> - p->iov[p->iovs_num].iov_base = pages->block->host +
>> pages->offset[i];
>> + for (int i = 0; i < pages->normal_num; i++) {
>> + p->iov[p->iovs_num].iov_base = pages->block->host +
>> pages->normal[i];
>> p->iov[p->iovs_num].iov_len = p->page_size;
>> p->iovs_num++;
>> }
>>
>> - p->next_packet_size = pages->num * p->page_size;
>> + p->next_packet_size = pages->normal_num * p->page_size;
>> p->flags |= MULTIFD_FLAG_NOCOMP;
>>
>> multifd_send_fill_packet(p);
>> @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p,
>> Error **errp)
>> p->id, flags, MULTIFD_FLAG_NOCOMP);
>> return -1;
>> }
>> +
>> + multifd_zero_page_check_recv(p);
>> +
>> + if (!p->normal_num) {
>> + return 0;
>> + }
>> +
>> for (int i = 0; i < p->normal_num; i++) {
>> p->iov[i].iov_base = p->host + p->normal[i];
>> p->iov[i].iov_len = p->page_size;
>> @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>>
>> packet->flags = cpu_to_be32(p->flags);
>> packet->pages_alloc = cpu_to_be32(p->pages->allocated);
>> - packet->normal_pages = cpu_to_be32(pages->num);
>> + packet->normal_pages = cpu_to_be32(pages->normal_num);
>> packet->zero_pages = cpu_to_be32(pages->zero_num);
>> packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>>
>> @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>> strncpy(packet->ramblock, pages->block->idstr, 256);
>> }
>>
>> - for (i = 0; i < pages->num; i++) {
>> + for (i = 0; i < pages->normal_num; i++) {
>> /* there are architectures where ram_addr_t is 32 bit */
>> - uint64_t temp = pages->offset[i];
>> + uint64_t temp = pages->normal[i];
>>
>> packet->offset[i] = cpu_to_be64(temp);
>> }
>>
>> + for (i = 0; i < pages->zero_num; i++) {
>> + /* there are architectures where ram_addr_t is 32 bit */
>> + uint64_t temp = pages->zero[i];
>> +
>> + packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
>> + }
>> +
>> p->packets_sent++;
>> - p->total_normal_pages += pages->num;
>> + p->total_normal_pages += pages->normal_num;
>> p->total_zero_pages += pages->zero_num;
>>
>> - trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
>> + trace_multifd_send(p->id, packet_num, pages->normal_num,
>> pages->zero_num,
>> p->flags, p->next_packet_size);
>> }
>>
>> @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams
>> *p, Error **errp)
>> p->normal[i] = offset;
>> }
>>
>> + for (i = 0; i < p->zero_num; i++) {
>> + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
>> +
>> + if (offset > (p->block->used_length - p->page_size)) {
>> + error_setg(errp, "multifd: offset too long %" PRIu64
>> + " (max " RAM_ADDR_FMT ")",
>> + offset, p->block->used_length);
>> + return -1;
>> + }
>> + p->zero[i] = offset;
>> + }
>> +
>> return 0;
>> }
>>
>> @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
>>
>> stat64_add(&mig_stats.multifd_bytes,
>> p->next_packet_size + p->packet_len);
>> - stat64_add(&mig_stats.normal_pages, pages->num);
>> + stat64_add(&mig_stats.normal_pages, pages->normal_num);
>> stat64_add(&mig_stats.zero_pages, pages->zero_num);
>>
>> multifd_pages_reset(p->pages);
>> @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
>> p->flags &= ~MULTIFD_FLAG_SYNC;
>> qemu_mutex_unlock(&p->mutex);
>>
>> - if (p->normal_num) {
>> + if (p->normal_num + p->zero_num) {
>> + assert(!(flags & MULTIFD_FLAG_SYNC));
>
> This assertion seems to be not relevant to this patch. Could you post it
> separately and explain why it's needed here?
I thought this is nice to have because it ensures that sync packet and
data packet are mutually exclusive. I agree this is not needed for
things to work but does this deserve its own patch?
>
>>
>> ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>> if (ret != 0) {
>> break;
>> diff --git a/migration/multifd.h b/migration/multifd.h
>> index 9822ff298a..125f0bbe60 100644
>> --- a/migration/multifd.h
>> +++ b/migration/multifd.h
>> @@ -53,6 +53,11 @@ typedef struct {
>> uint32_t unused32[1]; /* Reserved for future use */
>> uint64_t unused64[3]; /* Reserved for future use */
>> char ramblock[256];
>> + /*
>> + * This array contains the pointers to:
>> + * - normal pages (initial normal_pages entries)
>> + * - zero pages (following zero_pages entries)
>> + */
>> uint64_t offset[];
>> } __attribute__((packed)) MultiFDPacket_t;
>>
>> @@ -224,6 +229,8 @@ typedef struct {
>>
>> void multifd_register_ops(int method, MultiFDMethods *ops);
>> void multifd_send_fill_packet(MultiFDSendParams *p);
>> +void multifd_zero_page_check_send(MultiFDSendParams *p);
>> +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
>>
>> static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>> {
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index 99843a8e95..e2450b92d4 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -660,9 +660,11 @@
>> #
>> # @none: Do not perform zero page checking.
>> #
>> +# @multifd: Perform zero page checking on the multifd sender thread. (since
>> 9.0)
>> +#
>> ##
>> { 'enum': 'ZeroPageDetection',
>> - 'data': [ 'legacy', 'none' ] }
>> + 'data': [ 'legacy', 'none', 'multifd' ] }
>>
>> ##
>> # @BitmapMigrationBitmapAliasTransform:
>> --
>> 2.30.2
>>
>>
>
>
> --
> Elena