Peter Xu <pet...@redhat.com> writes: > On Thu, Jan 02, 2025 at 03:58:31PM -0300, Fabiano Rosas wrote: >> The analyze-migration script was seen failing in s390x in misterious >> ways. It seems we're reaching the subsection constructor without any > > It might be a good idea to add some debug lines indeed. Though are you sure > it's from parsing a subsection?
Ah, indeed it's not always a subsection. So I need to go back to the code and do more auditing, we might have a wrong struct macro somewhere. > > https://lore.kernel.org/all/20241219123213.GA692742@fedora/ > > ====8<==== > stderr: > Traceback (most recent call last): > File > "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", > line 688, in <module> > dump.read(dump_memory = args.memory) > File > "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", > line 625, in read > section.read() > File > "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", > line 461, in read > field['data'] = reader(field, self.file) > File > "/home/gitlab-runner/builds/4S3awx_3/0/qemu-project/qemu/build/scripts/analyze-migration.py", > line 434, in __init__ > for field in self.desc['struct']['fields']: > KeyError: 'fields' > ====8<==== > > It reads to me that it's not in "if 'subsections' in self.desc['struct']" > loop yet, instead it looks like a real STRUCT field in one of the device > sections? If that's true, then... > >> fields, which would indicate an empty .subsection entry in the vmstate >> definition. We don't have any of those, at least not without the >> unmigratable flag set, so this should never happen. >> >> Add some debug statements so that we can see what's going on the next >> time the issue happens. >> >> Signed-off-by: Fabiano Rosas <faro...@suse.de> >> --- >> scripts/analyze-migration.py | 33 +++++++++++++++++++++++++++------ >> 1 file changed, 27 insertions(+), 6 deletions(-) >> >> diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py >> index 8a254a5b6a..1dd98f1d5a 100755 >> --- a/scripts/analyze-migration.py >> +++ b/scripts/analyze-migration.py >> @@ -429,6 +429,10 @@ def __init__(self, desc, file): >> super(VMSDFieldStruct, self).__init__(desc, file) >> self.data = collections.OrderedDict() >> >> + if 'fields' not in self.desc['struct']: >> + raise Exception("No fields in subsection key=%s name=%s" % >> + (self.section_key, self.vmsd_name)) > > ... this self.section_key / self.vmsd_name may not exist.. Right, so there's no information we can get at this point I think. But raising will still trigger the dump later on. > > In all cases, IMHO this would be better the debug lines work with both pure > structs and sections. Yeah, I need to change that to something generic. > >> + >> # When we see compressed array elements, unfold them here >> new_fields = [] >> for field in self.desc['struct']['fields']: >> @@ -477,6 +481,11 @@ def read(self): >> raise Exception("Subsection %s not found at offset %x" >> % ( subsection['vmsd_name'], self.file.tell())) >> name = self.file.readstr() >> version_id = self.file.read32() >> + >> + if not subsection: >> + raise Exception("Empty description for subsection %s" % >> + name) > > This is checking subsection==None?? I doubt whether this will hit.. Well, there could be a None in the list of subsections somehow, no? > >> + >> self.data[name] = VMSDSection(self.file, version_id, >> subsection, (name, 0)) >> self.data[name].read() >> >> @@ -575,9 +584,8 @@ def __init__(self, filename): >> self.filename = filename >> self.vmsd_desc = None >> >> - def read(self, desc_only = False, dump_memory = False, write_memory = >> False): >> - # Read in the whole file >> - file = MigrationFile(self.filename) >> + def _read(self, file, vmsd_json, desc_only = False, dump_memory = False, >> + write_memory = False): >> >> # File magic >> data = file.read32() >> @@ -589,7 +597,7 @@ def read(self, desc_only = False, dump_memory = False, >> write_memory = False): >> if data != self.QEMU_VM_FILE_VERSION: >> raise Exception("Invalid version number %d" % data) >> >> - self.load_vmsd_json(file) >> + self.load_vmsd_json(file, vmsd_json) >> >> # Read sections >> self.sections = collections.OrderedDict() >> @@ -632,12 +640,25 @@ def read(self, desc_only = False, dump_memory = False, >> write_memory = False): >> raise Exception("Mismatched section footer: %x vs %x" % >> (read_section_id, section_id)) >> else: >> raise Exception("Unknown section type: %d" % section_type) >> - file.close() >> >> - def load_vmsd_json(self, file): >> + def read(self, desc_only = False, dump_memory = False, >> + write_memory = False): >> + file = MigrationFile(self.filename) >> vmsd_json = file.read_migration_debug_json() >> + >> + try: >> + self._read(file, vmsd_json, desc_only, dump_memory, >> write_memory) >> + except: >> + raise Exception("Full JSON dump:\n%s", vmsd_json) > > Better dump the Exception line itself too? IIUC that needs things like: > > except Exception as e: > raise Exception(XXX, e, vmsd_json) It already shows both exceptions in the form: Traceback (most recent call last): <backtrace> Exception: <msg> During handling of the above exception, another exception occurred: Traceback (most recent call last): <backtrace> Exception: ('Full JSON dump:\n%s', '{"configuration": {"vmsd_name" ... > > More below.. > >> + finally: >> + file.close() >> + >> + def load_vmsd_json(self, file, vmsd_json): >> self.vmsd_desc = json.loads(vmsd_json, >> object_pairs_hook=collections.OrderedDict) >> for device in self.vmsd_desc['devices']: >> + if 'fields' not in device: >> + raise Exception("vmstate for device %s has no fields" % >> + device['name']) > > This looks a valid check, though I still doubt whether this would hit at > all for this specific error (which looks like a top level STRUCT of a > section, that is missing "fields"). Ok, but it's a (VMSDSection, device) tuple below. That "device" might end up in the constructor of the FieldStruct: MigrationDump.load_vmsd_json: ... value = ( VMSDSection, device ) self.section_classes[key] = value ... MigrationDump.read: ... classdesc = self.section_classes[section_key] section = classdesc[0](file, version_id, classdesc[1], section_key) class VMSDSection(VMSDFieldStruct): def __init__(self, file, version_id, device, section_key): ... # A section really is nothing but a FieldStruct :) super(VMSDSection, self).__init__({ 'struct' : desc }, file) >> key = (device['name'], device['instance_id']) >> value = ( VMSDSection, device ) >> self.section_classes[key] = value >> -- >> 2.35.3 >> > > For the capture part, would it be easier if we trap at the very top level > once and for all, then try to dump the vmsd_desc as long as existed? Like > this: > > ====8<==== > @@ -685,9 +686,15 @@ def default(self, o): > f.close() > elif args.dump == "state": > dump = MigrationDump(args.file) > - dump.read(dump_memory = args.memory) > - dict = dump.getDict() > - print(jsonenc.encode(dict)) > + try: > + dump.read(dump_memory = args.memory) This is not the only dump.read() call in the file. Ideally we'd wrap them all. But then there's a slight clash with the parameter validation part (args.dump == ...) because it raises if no valid option is used. I'll try to improve this a bit. I was just trying to get this thing to spit something useful so we can unblock the migration PR. > + dict = dump.getDict() > + print(jsonenc.encode(dict)) > + except Exception as e: > + # If loading vmstate stream failed, try the best to dump vmsd desc > + raise Exception( > + f"Caught exception when reading dump: {e}\n" > + f"Trying to dump vmsd_desc: {jsonenc.encode(dump.vmsd_desc)}") I wanted to avoid doing any processing after the error because there's the chance there's Python issue during decoding. But I can maybe store the string instead at load_vmsd_json. > elif args.dump == "desc": > dump = MigrationDump(args.file) > dump.read(desc_only = True) > ====8<==== > > So no matter what caused error, fallback to try dump vmsd_desc as long as > available. > > Thanks,