Peter Xu <[email protected]> wrote:
> rp_state.error was a boolean used to show error happened in return path
> thread. That's not only duplicating error reporting (migrate_set_error),
> but also not good enough in that we only do error_report() and set it to
> true, we never can keep a history of the exact error and show it in
> query-migrate.
>
> To make this better, a few things done:
>
> - Use error_setg() rather than error_report() across the whole lifecycle
> of return path thread, keeping the error in an Error*.
Good.
> - Use migrate_set_error() to apply that captured error to the global
> migration object when error occured in this thread.
Good.
> - With above, no need to have mark_source_rp_bad(), remove it, alongside
> with rp_state.error itself.
Good.
> uint64_t ram_pagesize_summary(void);
> -int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t
> len);
> +int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t
> len,
> + Error **errp);
good.
> @@ -1793,37 +1782,36 @@ static void
> migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
> */
> if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
> !QEMU_IS_ALIGNED(len, our_host_ps)) {
> - error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
> - " len: %zd", __func__, start, len);
> - mark_source_rp_bad(ms);
> + error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request,
> start:"
> + RAM_ADDR_FMT " len: %zd", start, len);
> return;
> }
>
> - if (ram_save_queue_pages(rbname, start, len)) {
> - mark_source_rp_bad(ms);
> - }
> + ram_save_queue_pages(rbname, start, len, errp);
ram_save_queue_pages() returns an int.
I think this function should return an int.
Next is independent of this patch:
> -static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
> +static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name,
> + Error **errp)
> {
> RAMBlock *block = qemu_ram_block_by_name(block_name);
>
> if (!block) {
> - error_report("%s: invalid block name '%s'", __func__, block_name);
> + error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name
> '%s'",
> + block_name);
> return -EINVAL;
We sent -EINVAL.
> }
>
> /* Fetch the received bitmap and refresh the dirty bitmap */
> - return ram_dirty_bitmap_reload(s, block);
> + return ram_dirty_bitmap_reload(s, block, errp);
> }
>
> -static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
> +static int migrate_handle_rp_resume_ack(MigrationState *s,
> + uint32_t value, Error **errp)
> {
> trace_source_return_path_thread_resume_ack(value);
>
> if (value != MIGRATION_RESUME_ACK_VALUE) {
> - error_report("%s: illegal resume_ack value %"PRIu32,
> - __func__, value);
> + error_setg(errp, "illegal resume_ack value %"PRIu32, value);
> return -1;
And here -1.
On both callers we just check if it is different from zero. We never
use the return value as errno, so I think we should move to -1, if there
is an error, that is what errp is for.
> -/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
> -static int await_return_path_close_on_source(MigrationState *ms)
> +static void await_return_path_close_on_source(MigrationState *ms)
> {
> - int ret;
> -
> if (!ms->rp_state.rp_thread_created) {
> - return 0;
> + return;
> }
>
> trace_migration_return_path_end_before();
> @@ -2060,18 +2050,10 @@ static int
> await_return_path_close_on_source(MigrationState *ms)
> }
> }
>
> - trace_await_return_path_close_on_source_joining();
> qemu_thread_join(&ms->rp_state.rp_thread);
> ms->rp_state.rp_thread_created = false;
> - trace_await_return_path_close_on_source_close();
> -
> - ret = ms->rp_state.error;
> - ms->rp_state.error = false;
> -
> migration_release_dst_files(ms);
> -
> - trace_migration_return_path_end_after(ret);
> - return ret;
> + trace_migration_return_path_end_after();
> }
>
> static inline void
> @@ -2367,7 +2349,10 @@ static void migration_completion(MigrationState *s)
> goto fail;
> }
>
> - if (await_return_path_close_on_source(s)) {
> + await_return_path_close_on_source(s);
> +
> + /* If return path has error, should have been set here */
> + if (migrate_has_error(s)) {
> goto fail;
> }
In general, I think this is bad. We are moving for
int foo(..)
{
}
....
if (foo()) {
goto fail;
}
to:
void foo(..)
{
}
....
foo();
if (bar()) {
goto fail;
}
I would preffer to move the other way around. Move the error
synchrconously. My plan is that at some point in time
qemu_file_get_error() dissapears, i.e. we return the error when we
receive it and we handle it synchronously.
And yes, that is a something will take a lot of time, but I will hope we
move on that direction, not in trusting more setting internal errors,
use void functions and then check with yet another functions.
On top of your changes:
> -int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
> +int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
> {
> int ret = -EINVAL;
> /* from_dst_file is always valid because we're within rp_thread */
> @@ -4163,8 +4165,8 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock
> *block)
> trace_ram_dirty_bitmap_reload_begin(block->idstr);
>
> if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
> - error_report("%s: incorrect state %s", __func__,
> - MigrationStatus_str(s->state));
> + error_setg(errp, "Reload bitmap in incorrect state %s",
> + MigrationStatus_str(s->state));
> return -EINVAL;
return -1
same for the rest of the cases. Callers only check for != 0, and if you
want details, you need to look at errp.
See the nice series for migration/rdma.c for why this is better (and
more consistent).
Rest of the patch is very nice.
Thanks, Juan.