Re: [PATCH] migration: Add more error handling to analyze-migration.py

Peter Xu Thu, 02 Jan 2025 14:04:24 -0800

On Thu, Jan 02, 2025 at 03:58:31PM -0300, Fabiano Rosas wrote:
> The analyze-migration script was seen failing in s390x in misterious
> ways. It seems we're reaching the subsection constructor without any


It might be a good idea to add some debug lines indeed. Though are you sure
it's from parsing a subsection?

https://lore.kernel.org/all/20241219123213.GA692742@fedora/

====8<====
stderr:
Traceback (most recent call last):
  File 
"/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py",
 line 688, in <module>
    dump.read(dump_memory = args.memory)
  File 
"/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py",
 line 625, in read
    section.read()
  File 
"/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py",
 line 461, in read
    field['data'] = reader(field, self.file)
  File 
"/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py",
 line 434, in __init__
    for field in self.desc['struct']['fields']:
KeyError: 'fields'
====8<====

It reads to me that it's not in "if 'subsections' in self.desc['struct']"
loop yet, instead it looks like a real STRUCT field in one of the device
sections?  If that's true, then...

> fields, which would indicate an empty .subsection entry in the vmstate
> definition. We don't have any of those, at least not without the
> unmigratable flag set, so this should never happen.
> 
> Add some debug statements so that we can see what's going on the next
> time the issue happens.
> 
> Signed-off-by: Fabiano Rosas <faro...@suse.de>
> ---
>  scripts/analyze-migration.py | 33 +++++++++++++++++++++++++++------
>  1 file changed, 27 insertions(+), 6 deletions(-)
> 
> diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
> index 8a254a5b6a..1dd98f1d5a 100755
> --- a/scripts/analyze-migration.py
> +++ b/scripts/analyze-migration.py
> @@ -429,6 +429,10 @@ def __init__(self, desc, file):
>          super(VMSDFieldStruct, self).__init__(desc, file)
>          self.data = collections.OrderedDict()
>  
> +        if 'fields' not in self.desc['struct']:
> +            raise Exception("No fields in subsection key=%s name=%s" %
> +                            (self.section_key, self.vmsd_name))

... this self.section_key / self.vmsd_name may not exist..

In all cases, IMHO this would be better the debug lines work with both pure
structs and sections.

> +
>          # When we see compressed array elements, unfold them here
>          new_fields = []
>          for field in self.desc['struct']['fields']:
> @@ -477,6 +481,11 @@ def read(self):
>                      raise Exception("Subsection %s not found at offset %x" % 
> ( subsection['vmsd_name'], self.file.tell()))
>                  name = self.file.readstr()
>                  version_id = self.file.read32()
> +
> +                if not subsection:
> +                    raise Exception("Empty description for subsection %s" %
> +                                    name)

This is checking subsection==None??  I doubt whether this will hit..

> +
>                  self.data[name] = VMSDSection(self.file, version_id, 
> subsection, (name, 0))
>                  self.data[name].read()
>  
> @@ -575,9 +584,8 @@ def __init__(self, filename):
>          self.filename = filename
>          self.vmsd_desc = None
>  
> -    def read(self, desc_only = False, dump_memory = False, write_memory = 
> False):
> -        # Read in the whole file
> -        file = MigrationFile(self.filename)
> +    def _read(self, file, vmsd_json, desc_only = False, dump_memory = False,
> +              write_memory = False):
>  
>          # File magic
>          data = file.read32()
> @@ -589,7 +597,7 @@ def read(self, desc_only = False, dump_memory = False, 
> write_memory = False):
>          if data != self.QEMU_VM_FILE_VERSION:
>              raise Exception("Invalid version number %d" % data)
>  
> -        self.load_vmsd_json(file)
> +        self.load_vmsd_json(file, vmsd_json)
>  
>          # Read sections
>          self.sections = collections.OrderedDict()
> @@ -632,12 +640,25 @@ def read(self, desc_only = False, dump_memory = False, 
> write_memory = False):
>                      raise Exception("Mismatched section footer: %x vs %x" % 
> (read_section_id, section_id))
>              else:
>                  raise Exception("Unknown section type: %d" % section_type)
> -        file.close()
>  
> -    def load_vmsd_json(self, file):
> +    def read(self, desc_only = False, dump_memory = False,
> +             write_memory = False):
> +        file = MigrationFile(self.filename)
>          vmsd_json = file.read_migration_debug_json()
> +
> +        try:
> +            self._read(file, vmsd_json, desc_only, dump_memory, write_memory)
> +        except:
> +            raise Exception("Full JSON dump:\n%s", vmsd_json)

Better dump the Exception line itself too? IIUC that needs things like:

  except Exception as e:
      raise Exception(XXX, e, vmsd_json)

More below..

> +        finally:
> +            file.close()
> +
> +    def load_vmsd_json(self, file, vmsd_json):
>          self.vmsd_desc = json.loads(vmsd_json, 
> object_pairs_hook=collections.OrderedDict)
>          for device in self.vmsd_desc['devices']:
> +            if 'fields' not in device:
> +                raise Exception("vmstate for device %s has no fields" %
> +                                device['name'])

This looks a valid check, though I still doubt whether this would hit at
all for this specific error (which looks like a top level STRUCT of a
section, that is missing "fields").

>              key = (device['name'], device['instance_id'])
>              value = ( VMSDSection, device )
>              self.section_classes[key] = value
> -- 
> 2.35.3
> 

For the capture part, would it be easier if we trap at the very top level
once and for all, then try to dump the vmsd_desc as long as existed?  Like
this:

====8<====
@@ -685,9 +686,15 @@ def default(self, o):
     f.close()
 elif args.dump == "state":
     dump = MigrationDump(args.file)
-    dump.read(dump_memory = args.memory)
-    dict = dump.getDict()
-    print(jsonenc.encode(dict))
+    try:
+        dump.read(dump_memory = args.memory)
+        dict = dump.getDict()
+        print(jsonenc.encode(dict))
+    except Exception as e:
+        # If loading vmstate stream failed, try the best to dump vmsd desc
+        raise Exception(
+            f"Caught exception when reading dump: {e}\n"
+            f"Trying to dump vmsd_desc: {jsonenc.encode(dump.vmsd_desc)}")
 elif args.dump == "desc":
     dump = MigrationDump(args.file)
     dump.read(desc_only = True)
====8<====

So no matter what caused error, fallback to try dump vmsd_desc as long as
available.

Thanks,

-- 
Peter Xu

Re: [PATCH] migration: Add more error handling to analyze-migration.py

Reply via email to