On Wed, Feb 25, 2015 at 04:51:50PM +0000, Dr. David Alan Gilbert (git) wrote: > From: "Dr. David Alan Gilbert" <[email protected]> > > Where postcopy is preceeded by a period of precopy, the destination will > have received pages that may have been dirtied on the source after the > page was sent. The destination must throw these pages away before > starting it's CPUs. > > Maintain a 'sentmap' of pages that have already been sent. > Calculate list of sent & dirty pages > Provide helpers on the destination side to discard these. > > Signed-off-by: Dr. David Alan Gilbert <[email protected]> > --- > arch_init.c | 275 > ++++++++++++++++++++++++++++++++++++++- > include/migration/migration.h | 12 ++ > include/migration/postcopy-ram.h | 34 +++++ > include/qemu/typedefs.h | 1 + > migration/migration.c | 1 + > migration/postcopy-ram.c | 111 ++++++++++++++++ > savevm.c | 3 - > trace-events | 4 + > 8 files changed, 435 insertions(+), 6 deletions(-) > > diff --git a/arch_init.c b/arch_init.c > index 7bc5fa6..21e7ebe 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -40,6 +40,7 @@ > #include "hw/audio/audio.h" > #include "sysemu/kvm.h" > #include "migration/migration.h" > +#include "migration/postcopy-ram.h" > #include "hw/i386/smbios.h" > #include "exec/address-spaces.h" > #include "hw/audio/pcspk.h" > @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t > **current_data, > return bytes_sent; > } > > +/* mr: The region to search for dirty pages in > + * start: Start address (typically so we can continue from previous page) > + * ram_addr_abs: Pointer into which to store the address of the dirty page > + * within the global ram_addr space > + * > + * Returns: byte offset within memory region of the start of a dirty page > + */ > static inline > ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > - ram_addr_t start) > + ram_addr_t start, > + ram_addr_t *ram_addr_abs) > { > unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; > unsigned long nr = base + (start >> TARGET_PAGE_BITS); > @@ -435,6 +444,7 @@ ram_addr_t > migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > clear_bit(next, migration_bitmap); > migration_dirty_pages--; > } > + *ram_addr_abs = next << TARGET_PAGE_BITS; > return (next - base) << TARGET_PAGE_BITS; > } > > @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void) > } > } > > +static RAMBlock *ram_find_block(const char *id) > +{ > + RAMBlock *block; > + > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > + if (!strcmp(id, block->idstr)) { > + return block; > + } > + } > + > + return NULL; > +} > + > /* > * ram_save_page: Send the given page to the stream > * > @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool > last_stage) > bool complete_round = false; > int bytes_sent = 0; > MemoryRegion *mr; > + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in > + ram_addr_t space */ > > if (!block) > block = QTAILQ_FIRST(&ram_list.blocks); > > while (true) { > mr = block->mr; > - offset = migration_bitmap_find_and_reset_dirty(mr, offset); > + offset = migration_bitmap_find_and_reset_dirty(mr, offset, > + &dirty_ram_abs); > if (complete_round && block == last_seen_block && > offset >= last_offset) { > break; > @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool > last_stage) > > /* if page is unmodified, continue to the next */ > if (bytes_sent > 0) { > + MigrationState *ms = migrate_get_current(); > + if (ms->sentmap) { > + set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap); > + } > + > last_sent_block = block; > break; > } > @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void) > > static void migration_end(void) > { > + MigrationState *s = migrate_get_current(); > + > if (migration_bitmap) { > memory_global_dirty_log_stop(); > g_free(migration_bitmap); > migration_bitmap = NULL; > } > > + if (s->sentmap) { > + g_free(s->sentmap); > + s->sentmap = NULL; > + } > + > XBZRLE_cache_lock(); > if (XBZRLE.cache) { > cache_fini(XBZRLE.cache); > @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool > expected) > } > } > > +/* **** functions for postcopy ***** */ > + > +/* > + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32 > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > + */ > +static uint32_t get_32bits_map(unsigned long *map, int64_t start) > +{ > +#if HOST_LONG_BITS == 64 > + uint64_t tmp64; > + > + tmp64 = map[start / 64]; > + return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu); > +#elif HOST_LONG_BITS == 32 > + /* > + * Irrespective of host endianness, sentmap[n] is for pages earlier > + * than sentmap[n+1] so we can't just cast up > + */ > + return map[start / 32]; > +#else > +#error "Host long other than 64/32 not supported" > +#endif > +} > + > +/* > + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32 > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > + */ > +__attribute__ (( unused )) /* Until later in patch series */ > +static void put_32bits_map(unsigned long *map, int64_t start, > + uint32_t v) > +{ > +#if HOST_LONG_BITS == 64 > + uint64_t tmp64 = v; > + uint64_t mask = 0xffffffffu; > + > + if (start & 32) { > + tmp64 = tmp64 << 32; > + mask = mask << 32; > + } > + > + map[start / 64] = (map[start / 64] & ~mask) | tmp64; > +#elif HOST_LONG_BITS == 32 > + /* > + * Irrespective of host endianness, sentmap[n] is for pages earlier > + * than sentmap[n+1] so we can't just cast up > + */ > + map[start / 32] = v; > +#else > +#error "Host long other than 64/32 not supported" > +#endif > +} > + > +/* > + * When working on 32bit chunks of a bitmap where the only valid section > + * is between start..end (inclusive), generate a mask with only those > + * valid bits set for the current 32bit word within that bitmask. > + */ > +static int make_32bit_mask(unsigned long start, unsigned long end, > + unsigned long cur32) > +{ > + unsigned long first32, last32; > + uint32_t mask = ~(uint32_t)0; > + first32 = start / 32; > + last32 = end / 32; > + > + if ((cur32 == first32) && (start & 31)) { > + /* e.g. (start & 31) = 3 > + * 1 << . -> 2^3 > + * . - 1 -> 2^3 - 1 i.e. mask 2..0 > + * ~. -> mask 31..3 > + */ > + mask &= ~((((uint32_t)1) << (start & 31)) - 1); > + } > + > + if ((cur32 == last32) && ((end & 31) != 31)) { > + /* e.g. (end & 31) = 3 > + * . +1 -> 4 > + * 1 << . -> 2^4 > + * . -1 -> 2^4 - 1 > + * = mask set 3..0 > + */ > + mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1; > + } > + > + return mask; > +}
Urgh.. looks correct, but continue to makes me question if this is a
sensible wire encoding of the discard map.
> +
> +/*
> + * Callback from ram_postcopy_each_ram_discard for each RAMBlock
> + * start,end: Indexes into the bitmap for the first and last bit
> + * representing the named block
> + */
> +static int pc_send_discard_bm_ram(MigrationState *ms,
> + PostcopyDiscardState *pds,
> + unsigned long start, unsigned long end)
I know it's a long name, but I'd prefer to see this called
"postcopy_". I have to keep reminding myself that "pc" means
"postcopy" here, not that it's a PC machine type specific function.
> +{
> + /*
> + * There is no guarantee that start, end are on convenient 32bit
> multiples
> + * (We always send 32bit chunks over the wire, irrespective of long size)
> + */
> + unsigned long first32, last32, cur32;
> + first32 = start / 32;
> + last32 = end / 32;
> +
> + for (cur32 = first32; cur32 <= last32; cur32++) {
> + /* Deal with start/end not on alignment */
> + uint32_t mask = make_32bit_mask(start, end, cur32);
> +
> + uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32);
> + data &= mask;
> +
> + if (data) {
> + postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data);
> + }
> + }
> +
> + return 0;
> +}
> +/*
> + * Utility for the outgoing postcopy code.
> + * Calls postcopy_send_discard_bm_ram for each RAMBlock
> + * passing it bitmap indexes and name.
> + * Returns: 0 on success
> + * (qemu_ram_foreach_block ends up passing unscaled lengths
> + * which would mean postcopy code would have to deal with target page)
> + */
> +static int pc_each_ram_discard(MigrationState *ms)
s/discard/send_discard/
> +{
> + struct RAMBlock *block;
> + int ret;
> +
> + QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> + unsigned long first = block->offset >> TARGET_PAGE_BITS;
> + unsigned long last = (block->offset + (block->max_length-1))
> + >> TARGET_PAGE_BITS;
> + PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
> + first & 31,
> + block->idstr);
> +
> + /*
> + * Postcopy sends chunks of bitmap over the wire, but it
> + * just needs indexes at this point, avoids it having
> + * target page specific code.
> + */
> + ret = pc_send_discard_bm_ram(ms, pds, first, last);
> + postcopy_discard_send_finish(ms, pds);
> + if (ret) {
> + return ret;
> + }
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * Transmit the set of pages to be discarded after precopy to the target
> + * these are pages that have been sent previously but have been dirtied
> + * Hopefully this is pretty sparse
> + */
> +int ram_postcopy_send_discard_bitmap(MigrationState *ms)
> +{
> + /* This should be our last sync, the src is now paused */
> + migration_bitmap_sync();
> +
> + /*
> + * Update the sentmap to be sentmap&=dirty
> + */
> + bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap,
> + last_ram_offset() >> TARGET_PAGE_BITS);
> +
> +
> + trace_ram_postcopy_send_discard_bitmap();
> +#ifdef DEBUG_POSTCOPY
> + ram_debug_dump_bitmap(ms->sentmap, false);
> +#endif
> +
> + return pc_each_ram_discard(ms);
> +}
> +
> +/*
> + * At the start of the postcopy phase of migration, any now-dirty
> + * precopied pages are discarded.
> + *
> + * start..end is an inclusive range of bits indexed in the source
> + * VMs bitmap for this RAMBlock, source_target_page_bits tells
> + * us what one of those bits represents.
> + *
> + * start/end are offsets from the start of the bitmap for RAMBlock
> 'block_name'
> + *
> + * Returns 0 on success.
> + */
> +int ram_discard_range(MigrationIncomingState *mis,
> + const char *block_name,
> + uint64_t start, uint64_t end)
> +{
> + assert(end >= start);
> +
> + RAMBlock *rb = ram_find_block(block_name);
> +
> + if (!rb) {
> + error_report("ram_discard_range: Failed to find block '%s'",
> + block_name);
> + return -1;
> + }
> +
> + uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS;
> + postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1);
> +
> + /* +1 gives the byte after the end of the last page to be discarded */
> + ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS;
> + uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS);
> + uint8_t *host_endaddr;
> +
> + if (end_offset <= rb->used_length) {
> + host_endaddr = rb->host + (end_offset-1);
> + return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr);
> + } else {
> + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
> + "/%" PRIu64 "/%zu)",
> + block_name, start, end, rb->used_length);
> + return -1;
> + }
> +}
> +
> static int ram_save_setup(QEMUFile *f, void *opaque)
> {
> RAMBlock *block;
> @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>
> acct_clear();
> }
> -
> qemu_mutex_lock_iothread();
> qemu_mutex_lock_ramlist();
> bytes_transferred = 0;
> @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> migration_bitmap = bitmap_new(ram_bitmap_pages);
> bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
>
> + if (migrate_postcopy_ram()) {
> + MigrationState *s = migrate_get_current();
> + s->sentmap = bitmap_new(ram_bitmap_pages);
> + bitmap_clear(s->sentmap, 0, ram_bitmap_pages);
> + }
> +
> /*
> * Count the total number of pages used by ram blocks not including any
> * gaps due to alignment or unplugs.
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 86200b9..e749f4c 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -125,6 +125,13 @@ struct MigrationState
>
> /* Flag set once the migration has been asked to enter postcopy */
> bool start_postcopy;
> +
> + /* bitmap of pages that have been sent at least once
> + * only maintained and used in postcopy at the moment
> + * where it's used to send the dirtymap at the start
> + * of the postcopy phase
> + */
> + unsigned long *sentmap;
> };
>
> void process_incoming_migration(QEMUFile *f);
> @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void);
>
> void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
> void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
> +/* For outgoing discard bitmap */
> +int ram_postcopy_send_discard_bitmap(MigrationState *ms);
> +/* For incoming postcopy discard */
> +int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
> + uint64_t start, uint64_t end);
>
> /**
> * @migrate_add_blocker - prevent migration from proceeding
> diff --git a/include/migration/postcopy-ram.h
> b/include/migration/postcopy-ram.h
> index e93ee8a..1fec1c1 100644
> --- a/include/migration/postcopy-ram.h
> +++ b/include/migration/postcopy-ram.h
> @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis);
> void postcopy_pmi_discard_range(MigrationIncomingState *mis,
> size_t start, size_t npages);
> void postcopy_pmi_dump(MigrationIncomingState *mis);
> +
> +/*
> + * Discard the contents of memory start..end inclusive.
> + * We can assume that if we've been called postcopy_ram_hosttest returned
> true
> + */
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> + uint8_t *end);
> +
> +
> +/*
> + * Called at the start of each RAMBlock by the bitmap code
> + * offset is the bit within the first 32bit chunk of mask
> + * that represents the first page of the RAM Block
> + * Returns a new PDS
> + */
> +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> + uint8_t offset,
> + const char *name);
> +
> +/*
> + * Called by the bitmap code for each chunk to discard
> + * May send a discard message, may just leave it queued to
> + * be sent later
> + */
> +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState
> *pds,
> + unsigned long pos, uint32_t bitmap);
> +
> +/*
> + * Called at the end of each RAMBlock by the bitmap code
> + * Sends any outstanding discard messages, frees the PDS
> + */
> +void postcopy_discard_send_finish(MigrationState *ms,
> + PostcopyDiscardState *pds);
> +
> #endif
> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> index 924eeb6..0651275 100644
> --- a/include/qemu/typedefs.h
> +++ b/include/qemu/typedefs.h
> @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost;
> typedef struct PCIHostState PCIHostState;
> typedef struct PCMCIACardState PCMCIACardState;
> typedef struct PixelFormat PixelFormat;
> +typedef struct PostcopyDiscardState PostcopyDiscardState;
> typedef struct PostcopyPMI PostcopyPMI;
> typedef struct PropertyInfo PropertyInfo;
> typedef struct Property Property;
> diff --git a/migration/migration.c b/migration/migration.c
> index 6b20b56..850fe1a 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -22,6 +22,7 @@
> #include "block/block.h"
> #include "qemu/sockets.h"
> #include "migration/block.h"
> +#include "migration/postcopy-ram.h"
> #include "qemu/thread.h"
> #include "qmp-commands.h"
> #include "trace.h"
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index 4f29055..391e9c6 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -28,6 +28,19 @@
> #include "qemu/error-report.h"
> #include "trace.h"
>
> +#define MAX_DISCARDS_PER_COMMAND 12
> +
> +struct PostcopyDiscardState {
> + const char *name;
> + uint16_t cur_entry;
> + uint64_t addrlist[MAX_DISCARDS_PER_COMMAND];
> + uint32_t masklist[MAX_DISCARDS_PER_COMMAND];
> + uint8_t offset; /* Offset within 32bit mask at addr0 representing 1st
> + page of block */
> + unsigned int nsentwords;
> + unsigned int nsentcmds;
> +};
> +
> /* Postcopy needs to detect accesses to pages that haven't yet been copied
> * across, and efficiently map new pages in, the techniques for doing this
> * are target OS specific.
> @@ -364,6 +377,21 @@ out:
> return ret;
> }
>
> +/*
> + * Discard the contents of memory start..end inclusive.
> + * We can assume that if we've been called postcopy_ram_hosttest returned
> true
> + */
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> + uint8_t *end)
> +{
> + if (madvise(start, (end-start)+1, MADV_DONTNEED)) {
> + perror("postcopy_ram_discard_range MADV_DONTNEED");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> #else
> /* No target OS support, stubs just fail */
>
> @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState
> *mis,
> /* We don't support postcopy so don't care */
> }
>
> +void postcopy_pmi_destroy(MigrationIncomingState *mis)
> +{
> + /* Called in normal cleanup path - so it's OK */
> +}
> +
> +void postcopy_pmi_discard_range(MigrationIncomingState *mis,
> + size_t start, size_t npages)
> +{
> + assert(0);
> +}
> +
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> + uint8_t *end)
> +{
> + assert(0);
> +}
> #endif
>
> +/* -------------------------------------------------------------------------
> */
> +
> +/*
> + * Called at the start of each RAMBlock by the bitmap code
> + * offset is the bit within the first 64bit chunk of mask
> + * that represents the first page of the RAM Block
> + * Returns a new PDS
> + */
> +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> + uint8_t offset,
> + const char *name)
> +{
> + PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState));
> +
> + if (res) {
> + res->name = name;
> + res->cur_entry = 0;
> + res->nsentwords = 0;
> + res->nsentcmds = 0;
> + res->offset = offset;
> + }
> +
> + return res;
> +}
> +
> +/*
> + * Called by the bitmap code for each chunk to discard
> + * May send a discard message, may just leave it queued to
> + * be sent later
> + */
> +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState
> *pds,
> + unsigned long pos, uint32_t bitmap)
> +{
> + pds->addrlist[pds->cur_entry] = pos;
> + pds->masklist[pds->cur_entry] = bitmap;
> + pds->cur_entry++;
> + pds->nsentwords++;
> +
> + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
> + /* Full set, ship it! */
> + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> + pds->cur_entry, pds->offset,
> + pds->addrlist, pds->masklist);
> + pds->nsentcmds++;
> + pds->cur_entry = 0;
> + }
> +}
> +
> +/*
> + * Called at the end of each RAMBlock by the bitmap code
> + * Sends any outstanding discard messages, frees the PDS
> + */
> +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState
> *pds)
> +{
> + /* Anything unsent? */
> + if (pds->cur_entry) {
> + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> + pds->cur_entry, pds->offset,
> + pds->addrlist, pds->masklist);
> + pds->nsentcmds++;
> + }
> +
> + trace_postcopy_discard_send_finish(pds->name, pds->nsentwords,
> + pds->nsentcmds);
> +
> + g_free(pds);
> +}
> diff --git a/savevm.c b/savevm.c
> index 1e8d289..2589b8c 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -1282,15 +1282,12 @@ static int
> loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> * we know there must be at least 1 bit set due to the loop entry
> * If there is no 0 firstzero will be 32
> */
> - /* TODO - ram_discard_range gets added in a later patch
> int ret = ram_discard_range(mis, ramid,
> startaddr + firstset - first_bit_offset,
> startaddr + (firstzero - 1) -
> first_bit_offset);
> - ret = -1;
> if (ret) {
> return ret;
> }
> - */
>
> /* mask= .?0000000000 */
> /* ^fz ^fs */
> diff --git a/trace-events b/trace-events
> index a555b56..f985117 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) ""
> migration_bitmap_sync_start(void) ""
> migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
> migration_throttle(void) ""
> +ram_postcopy_send_discard_bitmap(void) ""
>
> # hw/display/qxl.c
> disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
> @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) ""
> rdma_start_outgoing_migration_after_rdma_connect(void) ""
> rdma_start_outgoing_migration_after_rdma_source_init(void) ""
>
> +# migration/postcopy-ram.c
> +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds)
> "%s mask words sent=%d in %d commands"
> +
> # kvm-all.c
> kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
> kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
pgpa3AbLzFDzf.pgp
Description: PGP signature
