https://github.com/JDevlieghere updated https://github.com/llvm/llvm-project/pull/154727
>From defb8e0fe69009362537b3e2c9c05c4eac544505 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere <jo...@devlieghere.com> Date: Thu, 21 Aug 2025 03:25:19 -0700 Subject: [PATCH] [lldb] Correctly parse Wasm segments My original implementation for parsing Wasm segments was wrong in two related ways. I had a bug in calculating the file vm address and I didn't fully understand the difference between active and passive segments and how that impacted their file vm address. With this PR, we now support parsing init expressions for active segments, rather than just skipping over them. This is necessary to determine where they get loaded. Similar to llvm-objdump, we currently only support simple opcodes (i.e. constants). We also currently do not support active segments that use a non-zero memory index. However this covers all segments for a non-trivial Swift binary compiled to Wasm. --- .../ObjectFile/wasm/ObjectFileWasm.cpp | 329 +++++++++++------- .../Plugins/ObjectFile/wasm/ObjectFileWasm.h | 13 +- lldb/test/Shell/Symtab/symtab-wasm.test | 25 +- 3 files changed, 227 insertions(+), 140 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp index 777b20e9bb0f6..492b441867205 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp @@ -71,6 +71,47 @@ GetWasmString(llvm::DataExtractor &data, llvm::DataExtractor::Cursor &c) { return std::string(toStringRef(llvm::ArrayRef(str_storage))); } +/// An "init expr" refers to a constant expression used to determine the initial +/// value of certain elements within a module during instantiation. These +/// expressions are restricted to operations that can be evaluated at module +/// instantiation time. Currently we only support simple constant opcodes. 
+static lldb::offset_t GetWasmOffsetFromInitExpr(DataExtractor &data, + lldb::offset_t &offset) { + lldb::offset_t init_expr_offset = LLDB_INVALID_OFFSET; + + uint8_t opcode = data.GetU8(&offset); + switch (opcode) { + case llvm::wasm::WASM_OPCODE_I32_CONST: + case llvm::wasm::WASM_OPCODE_I64_CONST: + init_expr_offset = data.GetSLEB128(&offset); + break; + case llvm::wasm::WASM_OPCODE_GLOBAL_GET: + init_expr_offset = data.GetULEB128(&offset); + break; + case llvm::wasm::WASM_OPCODE_F32_CONST: + case llvm::wasm::WASM_OPCODE_F64_CONST: + // Not a meaningful offset. + data.GetFloat(&offset); + break; + case llvm::wasm::WASM_OPCODE_REF_NULL: + // Not a meaningful offset. + data.GetULEB128(&offset); + break; + } + + // Make sure the opcodes we read aren't part of an extended init expr. + opcode = data.GetU8(&offset); + if (opcode == llvm::wasm::WASM_OPCODE_END) + return init_expr_offset; + + // Extended init expressions are not supported, but we still have to parse + // them to skip over them and read the next segment. + do { + opcode = data.GetU8(&offset); + } while (opcode != llvm::wasm::WASM_OPCODE_END); + return LLDB_INVALID_OFFSET; +} + /// Checks whether the data buffer starts with a valid Wasm module header. 
static bool ValidateModuleHeader(const DataBufferSP &data_sp) { if (!data_sp || data_sp->GetByteSize() < kWasmHeaderSize) @@ -261,17 +302,20 @@ bool ObjectFileWasm::ParseHeader() { return true; } -static llvm::Expected<std::vector<AddressRange>> -ParseFunctions(SectionSP code_section_sp) { - DataExtractor data; - code_section_sp->GetSectionData(data); +struct WasmFunction { + lldb::offset_t section_offset = LLDB_INVALID_OFFSET; + uint32_t size = 0; +}; + +static llvm::Expected<std::vector<WasmFunction>> +ParseFunctions(DataExtractor &data) { lldb::offset_t offset = 0; llvm::Expected<uint32_t> function_count = GetULEB32(data, offset); if (!function_count) return function_count.takeError(); - std::vector<AddressRange> functions; + std::vector<WasmFunction> functions; functions.reserve(*function_count); for (uint32_t i = 0; i < *function_count; ++i) { @@ -281,7 +325,7 @@ ParseFunctions(SectionSP code_section_sp) { // llvm-objdump considers the ULEB with the function size to be part of the // function. We can't do that here because that would break symbolic // breakpoints, as that address is never executed. 
- functions.emplace_back(code_section_sp, offset, *function_size); + functions.push_back({offset, *function_size}); std::optional<lldb::offset_t> next_offset = llvm::checkedAddUnsigned<lldb::offset_t>(offset, *function_size); @@ -294,17 +338,22 @@ ParseFunctions(SectionSP code_section_sp) { } struct WasmSegment { - WasmSegment(SectionSP section_sp, lldb::offset_t offset, uint32_t size) - : address_range(section_sp, offset, size) {}; + enum SegmentType { + Active, + Passive, + }; + std::string name; - AddressRange address_range; -}; + SegmentType type = Passive; + lldb::offset_t section_offset = LLDB_INVALID_OFFSET; + uint32_t size = 0; + uint32_t memory_index = 0; + lldb::offset_t init_expr_offset = 0; -static llvm::Expected<std::vector<WasmSegment>> -ParseData(SectionSP data_section_sp) { - DataExtractor data; - data_section_sp->GetSectionData(data); + lldb::offset_t GetFileOffset() const { return section_offset & 0xffffffff; } +}; +static llvm::Expected<std::vector<WasmSegment>> ParseData(DataExtractor &data) { lldb::offset_t offset = 0; llvm::Expected<uint32_t> segment_count = GetULEB32(data, offset); @@ -319,27 +368,34 @@ ParseData(SectionSP data_section_sp) { if (!flags) return flags.takeError(); + WasmSegment segment; + // Data segments have a mode that identifies them as either passive or // active. An active data segment copies its contents into a memory during // instantiation, as specified by a memory index and a constant expression // defining an offset into that memory. + segment.type = (*flags & llvm::wasm::WASM_DATA_SEGMENT_IS_PASSIVE) + ? WasmSegment::Passive + : WasmSegment::Active; + if (*flags & llvm::wasm::WASM_DATA_SEGMENT_HAS_MEMINDEX) { + assert(segment.type == WasmSegment::Active); llvm::Expected<uint32_t> memidx = GetULEB32(data, offset); if (!memidx) return memidx.takeError(); + segment.memory_index = *memidx; } - if ((*flags & llvm::wasm::WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { - // Skip over the constant expression. 
- for (uint8_t b = 0; b != llvm::wasm::WASM_OPCODE_END;) - b = data.GetU8(&offset); - } + if (segment.type == WasmSegment::Active) + segment.init_expr_offset = GetWasmOffsetFromInitExpr(data, offset); llvm::Expected<uint32_t> segment_size = GetULEB32(data, offset); if (!segment_size) return segment_size.takeError(); - segments.emplace_back(data_section_sp, offset, *segment_size); + segment.section_offset = offset; + segment.size = *segment_size; + segments.push_back(segment); std::optional<lldb::offset_t> next_offset = llvm::checkedAddUnsigned<lldb::offset_t>(offset, *segment_size); @@ -352,13 +408,11 @@ ParseData(SectionSP data_section_sp) { } static llvm::Expected<std::vector<Symbol>> -ParseNames(SectionSP name_section_sp, - const std::vector<AddressRange> &function_ranges, +ParseNames(SectionSP code_section_sp, DataExtractor &name_data, + const std::vector<WasmFunction> &functions, std::vector<WasmSegment> &segments) { - DataExtractor name_section_data; - name_section_sp->GetSectionData(name_section_data); - llvm::DataExtractor data = name_section_data.GetAsLLVM(); + llvm::DataExtractor data = name_data.GetAsLLVM(); llvm::DataExtractor::Cursor c(0); std::vector<Symbol> symbols; while (c && c.tell() < data.size()) { @@ -380,12 +434,13 @@ ParseNames(SectionSP name_section_sp, llvm::Expected<std::string> name = GetWasmString(data, c); if (!name) return name.takeError(); - if (*idx >= function_ranges.size()) + if (*idx >= functions.size()) continue; symbols.emplace_back( - symbols.size(), Mangled(*name), lldb::eSymbolTypeCode, + symbols.size(), *name, lldb::eSymbolTypeCode, /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false, - /*is_artificial=*/false, function_ranges[*idx], + /*is_artificial=*/false, code_section_sp, + functions[i].section_offset, functions[i].size, /*size_is_valid=*/true, /*contains_linker_annotations=*/false, /*flags=*/0); } @@ -405,12 +460,6 @@ ParseNames(SectionSP name_section_sp, continue; // Update the segment name. 
segments[i].name = *name; - symbols.emplace_back( - symbols.size(), Mangled(*name), lldb::eSymbolTypeData, - /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false, - /*is_artificial=*/false, segments[i].address_range, - /*size_is_valid=*/true, /*contains_linker_annotations=*/false, - /*flags=*/0); } } break; @@ -432,80 +481,11 @@ ParseNames(SectionSP name_section_sp, } void ObjectFileWasm::ParseSymtab(Symtab &symtab) { - assert(m_sections_up && "sections must be parsed"); - Log *log = GetLog(LLDBLog::Object); - - // The name section contains names and indexes. First parse the data from the - // relevant sections so we can access it by its index. - std::vector<AddressRange> functions; - std::vector<WasmSegment> segments; - - // Parse the code section. - if (SectionSP code_section_sp = - m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false)) { - llvm::Expected<std::vector<AddressRange>> maybe_functions = - ParseFunctions(code_section_sp); - if (!maybe_functions) { - LLDB_LOG_ERROR(log, maybe_functions.takeError(), - "Failed to parse Wasm code section: {0}"); - return; - } - functions = *maybe_functions; - } - - // Parse the data section. - SectionSP data_section_sp = - m_sections_up->FindSectionByType(lldb::eSectionTypeData, false); - if (data_section_sp) { - llvm::Expected<std::vector<WasmSegment>> maybe_segments = - ParseData(data_section_sp); - if (!maybe_segments) { - LLDB_LOG_ERROR(log, maybe_segments.takeError(), - "Failed to parse Wasm data section: {0}"); - return; - } - segments = *maybe_segments; - } - - // Parse the name section. 
- SectionSP name_section_sp = - m_sections_up->FindSectionByType(lldb::eSectionTypeWasmName, false); - if (!name_section_sp) { - LLDB_LOG(log, "Failed to parse Wasm symbol table: no names section"); - return; - } - - llvm::Expected<std::vector<Symbol>> symbols = - ParseNames(name_section_sp, functions, segments); - if (!symbols) { - LLDB_LOG_ERROR(log, symbols.takeError(), "Failed to parse Wasm names: {0}"); - return; - } - - for (const Symbol &symbol : *symbols) + for (const Symbol &symbol : m_symbols) symtab.AddSymbol(symbol); - lldb::user_id_t segment_id = 0; - for (const WasmSegment &segment : segments) { - const lldb::addr_t segment_addr = - segment.address_range.GetBaseAddress().GetFileAddress(); - const size_t segment_size = segment.address_range.GetByteSize(); - SectionSP segment_sp = std::make_shared<Section>( - /*parent_section_sp=*/data_section_sp, GetModule(), - /*obj_file=*/this, - ++segment_id << 8, // 1-based segment index, shifted by 8 bits to avoid - // collision with section IDs. 
- ConstString(segment.name), eSectionTypeData, - /*file_vm_addr=*/segment_addr, - /*vm_size=*/segment_size, - /*file_offset=*/segment_addr, - /*file_size=*/segment_size, - /*log2align=*/0, /*flags=*/0); - m_sections_up->AddSection(segment_sp); - GetModule()->GetSectionList()->AddSection(segment_sp); - } - symtab.Finalize(); + m_symbols.clear(); } static SectionType GetSectionTypeFromName(llvm::StringRef Name) { @@ -516,7 +496,27 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) { return eSectionTypeOther; } +std::optional<ObjectFileWasm::section_info> +ObjectFileWasm::GetSectionInfo(uint32_t section_id) { + for (const section_info &sect_info : m_sect_infos) { + if (sect_info.id == section_id) + return sect_info; + } + return std::nullopt; +} + +std::optional<ObjectFileWasm::section_info> +ObjectFileWasm::GetSectionInfo(llvm::StringRef section_name) { + for (const section_info &sect_info : m_sect_infos) { + if (sect_info.name == section_name) + return sect_info; + } + return std::nullopt; +} + void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { + Log *log = GetLog(LLDBLog::Object); + if (m_sections_up) return; @@ -530,7 +530,7 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { SectionType section_type = eSectionTypeOther; ConstString section_name; offset_t file_offset = sect_info.offset & 0xffffffff; - addr_t vm_addr = file_offset; + addr_t vm_addr = sect_info.offset; size_t vm_size = sect_info.size; if (llvm::wasm::WASM_SEC_CODE == sect_info.id) { @@ -542,9 +542,6 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { // For this reason Section::GetFileAddress() must return zero for the // Code section. 
vm_addr = 0; - } else if (llvm::wasm::WASM_SEC_DATA == sect_info.id) { - section_type = eSectionTypeData; - section_name = ConstString("data"); } else { section_type = GetSectionTypeFromName(sect_info.name.GetStringRef()); if (section_type == eSectionTypeOther) @@ -556,23 +553,107 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { } } - SectionSP section_sp( - new Section(GetModule(), // Module to which this section belongs. - this, // ObjectFile to which this section belongs and - // should read section data from. - section_type, // Section ID. - section_name, // Section name. - section_type, // Section type. - vm_addr, // VM address. - vm_size, // VM size in bytes of this section. - file_offset, // Offset of this section in the file. - sect_info.size, // Size of the section as found in the file. - 0, // Alignment of the section - 0, // Flags for this section. - 1)); // Number of host bytes per target byte + SectionSP section_sp = std::make_shared<Section>( + GetModule(), // Module to which this section belongs. + this, // ObjectFile to which this section belongs and + // should read section data from. + section_type, // Section ID. + section_name, // Section name. + section_type, // Section type. + vm_addr, // VM address. + vm_size, // VM size in bytes of this section. + file_offset, // Offset of this section in the file. + sect_info.size, // Size of the section as found in the file. + 0, // Alignment of the section + 0, // Flags for this section. + 1); // Number of host bytes per target byte m_sections_up->AddSection(section_sp); unified_section_list.AddSection(section_sp); } + + // The name section contains names and indexes. First parse the data from the + // relevant sections so we can access it by its index. + std::vector<WasmFunction> functions; + std::vector<WasmSegment> segments; + + // Parse the code section. 
+ if (std::optional<section_info> info = + GetSectionInfo(llvm::wasm::WASM_SEC_CODE)) { + DataExtractor code_data = ReadImageData(info->offset, info->size); + llvm::Expected<std::vector<WasmFunction>> maybe_functions = + ParseFunctions(code_data); + if (!maybe_functions) { + LLDB_LOG_ERROR(log, maybe_functions.takeError(), + "Failed to parse Wasm code section: {0}"); + } else { + functions = *maybe_functions; + } + } + + // Parse the data section. + std::optional<section_info> data_info = + GetSectionInfo(llvm::wasm::WASM_SEC_DATA); + if (data_info) { + DataExtractor data_data = ReadImageData(data_info->offset, data_info->size); + llvm::Expected<std::vector<WasmSegment>> maybe_segments = + ParseData(data_data); + if (!maybe_segments) { + LLDB_LOG_ERROR(log, maybe_segments.takeError(), + "Failed to parse Wasm data section: {0}"); + } else { + segments = *maybe_segments; + } + } + + if (std::optional<section_info> info = GetSectionInfo("name")) { + DataExtractor names_data = ReadImageData(info->offset, info->size); + llvm::Expected<std::vector<Symbol>> symbols = ParseNames( + m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false), + names_data, functions, segments); + if (!symbols) { + LLDB_LOG_ERROR(log, symbols.takeError(), + "Failed to parse Wasm names: {0}"); + } else { + m_symbols = *symbols; + } + } + + lldb::user_id_t segment_id = 0; + for (const WasmSegment &segment : segments) { + if (segment.type == WasmSegment::Active) { + // FIXME: Support segments with a memory index. + if (segment.memory_index != 0) { + LLDB_LOG(log, "Skipping segment {0}: non-zero memory index is " + "currently unsupported"); + continue; + } + + if (segment.init_expr_offset == LLDB_INVALID_OFFSET) { + LLDB_LOG(log, "Skipping segment {0}: unsupported init expression"); + continue; + } + } + + const lldb::addr_t file_vm_addr = + segment.type == WasmSegment::Active + ? 
segment.init_expr_offset + : data_info->offset + segment.section_offset; + const lldb::offset_t file_offset = + data_info->GetFileOffset() + segment.GetFileOffset(); + SectionSP segment_sp = std::make_shared<Section>( + GetModule(), + /*obj_file=*/this, + ++segment_id << 8, // 1-based segment index, shifted by 8 bits to avoid + // collision with section IDs. + ConstString(segment.name), eSectionTypeData, + /*file_vm_addr=*/file_vm_addr, + /*vm_size=*/segment.size, + /*file_offset=*/file_offset, + /*file_size=*/segment.size, + /*log2align=*/0, /*flags=*/0); + m_sections_up->AddSection(segment_sp); + GetModule()->GetSectionList()->AddSection(segment_sp); + } } bool ObjectFileWasm::SetLoadAddress(Target &target, lldb::addr_t load_address, @@ -697,7 +778,7 @@ void ObjectFileWasm::Dump(Stream *s) { } void ObjectFileWasm::DumpSectionHeader(llvm::raw_ostream &ostream, - const section_info_t &sh) { + const section_info &sh) { ostream << llvm::left_justify(sh.name.GetStringRef(), 16) << " " << llvm::format_hex(sh.offset, 10) << " " << llvm::format_hex(sh.size, 10) << " " << llvm::format_hex(sh.id, 6) diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h index 531b5f0437a43..86ecbf26803cf 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h @@ -128,20 +128,25 @@ class ObjectFileWasm : public ObjectFile { /// Read a range of bytes from the Wasm module. DataExtractor ReadImageData(lldb::offset_t offset, uint32_t size); - typedef struct section_info { + struct section_info { lldb::offset_t offset; uint32_t size; uint32_t id; ConstString name; - } section_info_t; + lldb::offset_t GetFileOffset() const { return offset & 0xffffffff; } + }; + + std::optional<section_info> GetSectionInfo(uint32_t section_id); + std::optional<section_info> GetSectionInfo(llvm::StringRef section_name); /// Wasm section header dump routines. 
/// \{ - void DumpSectionHeader(llvm::raw_ostream &ostream, const section_info_t &sh); + void DumpSectionHeader(llvm::raw_ostream &ostream, const section_info &sh); void DumpSectionHeaders(llvm::raw_ostream &ostream); /// \} - std::vector<section_info_t> m_sect_infos; + std::vector<section_info> m_sect_infos; + std::vector<Symbol> m_symbols; ArchSpec m_arch; UUID m_uuid; }; diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test index 4170d9aba9eea..524691b897322 100644 --- a/lldb/test/Shell/Symtab/symtab-wasm.test +++ b/lldb/test/Shell/Symtab/symtab-wasm.test @@ -1,15 +1,16 @@ # RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm -# RUN: %lldb %t.wasm -o 'image dump symtab' -o 'image dump sections' | FileCheck %s -CHECK: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors -CHECK: Code 0x0000000000000005 0x0000000000000029 0x00000000 add -CHECK: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main -CHECK: Code 0x000000000000007c 0x0000000000000009 0x00000000 main -CHECK: Data 0x0000000000000233 0x0000000000000009 0x00000000 .rodata -CHECK: Data 0x0000000000000242 0x0000000000000004 0x00000000 .data +# RUN: %lldb %t.wasm -o 'image dump symtab' | FileCheck %s --check-prefix SYMTAB +SYMTAB: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors +SYMTAB: Code 0x0000000000000005 0x0000000000000029 0x00000000 add +SYMTAB: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main +SYMTAB: Code 0x000000000000007c 0x0000000000000009 0x00000000 main -CHECK: 0x0000000000000001 code {{.*}} 0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code -CHECK: 0x0000000000000003 data {{.*}} 0x0000022c 0x0000001a 0x00000000 symtab-wasm.test.tmp.wasm.data -CHECK: 0x0000000000000040 wasm-name {{.*}} 0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name -CHECK: 0x0000000000000100 data {{.*}} 0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm.data..rodata 
-CHECK: 0x0000000000000200 data {{.*}} 0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm.data..data +# RUN: %lldb %t.wasm -o 'image dump sections' | FileCheck %s --check-prefix SECTIONS +SECTIONS: 0x0000000000000001 code [0x0000000000000000-0x0000000000000085) --- 0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code +SECTIONS: 0x0000000000000040 wasm-name --- 0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name +SECTIONS: 0x0000000000000100 data [0x0000000000000400-0x0000000000000409) --- 0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata +SECTIONS: 0x0000000000000200 data [0x000000000000040c-0x0000000000000410) --- 0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data + +# RUN: %lldb %t.wasm -o 'x/s 0x0000000000000400' | FileCheck %s --check-prefix STR +STR: "data str" _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits