[lldb] Support parsing data symbols from the Wasm name section #153494

JDevlieghere · 2025-08-13T20:54:55Z

This PR adds support for parsing the data symbols from the WebAssembly name section, which consists of a name and address range for the segments in the Wasm data section. Unlike other object file formats, Wasm has no symbols for referencing items within those segments (i.e. symbols the user has defined).

llvmbot · 2025-08-13T21:23:44Z

@llvm/pr-subscribers-lldb

Author: Jonas Devlieghere (JDevlieghere)

Changes

This PR adds support for parsing data symbols from the WebAssembly name section.

Full diff: https://github.com/llvm/llvm-project/pull/153494.diff

3 Files Affected:

(modified) lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp (+92-22)
(modified) lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml (+78-19)
(modified) lldb/test/Shell/Symtab/symtab-wasm.test (+6-4)

diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
index 919cc21c32ffd..b3144f28f4913 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
@@ -251,11 +251,11 @@ bool ObjectFileWasm::ParseHeader() {
 
 static llvm::Expected<std::vector<AddressRange>>
 ParseFunctions(SectionSP code_section_sp) {
-  DataExtractor code_section_data;
-  code_section_sp->GetSectionData(code_section_data);
+  DataExtractor data;
+  code_section_sp->GetSectionData(data);
   lldb::offset_t offset = 0;
 
-  const uint64_t function_count = code_section_data.GetULEB128(&offset);
+  const uint64_t function_count = data.GetULEB128(&offset);
   if (function_count > std::numeric_limits<uint32_t>::max())
     return llvm::createStringError("function count overflows uint32_t");
 
@@ -263,7 +263,7 @@ ParseFunctions(SectionSP code_section_sp) {
   functions.reserve(function_count);
 
   for (uint32_t i = 0; i < function_count; ++i) {
-    const uint64_t function_size = code_section_data.GetULEB128(&offset);
+    const uint64_t function_size = data.GetULEB128(&offset);
     if (function_size > std::numeric_limits<uint32_t>::max())
       return llvm::createStringError("function size overflows uint32_t");
     // llvm-objdump considers the ULEB with the function size to be part of the
@@ -281,9 +281,45 @@ ParseFunctions(SectionSP code_section_sp) {
   return functions;
 }
 
+/// Parse the segments of the Wasm data section into address ranges.
+///
+/// Reads a ULEB128 segment count and then, for every segment, a ULEB128 flags
+/// field followed by a ULEB128 size. Each segment is recorded as an
+/// AddressRange built from \p data_section_sp, the offset reached after
+/// reading the size, and the size itself.
+///
+/// \param data_section_sp The section whose contents are decoded.
+///
+/// \return The segment ranges in the order they were read, or an error if the
+/// segment count, a flags field or a segment size does not fit in 32 bits, or
+/// if stepping over a segment would overflow the offset.
+static llvm::Expected<std::vector<AddressRange>>
+ParseData(SectionSP data_section_sp) {
+  DataExtractor data;
+  data_section_sp->GetSectionData(data);
+
+  lldb::offset_t offset = 0;
+
+  const uint64_t segment_count = data.GetULEB128(&offset);
+  if (segment_count > std::numeric_limits<uint32_t>::max())
+    return llvm::createStringError("segment count overflows uint32_t");
+
+  std::vector<AddressRange> segments;
+  segments.reserve(segment_count);
+
+  for (uint32_t i = 0; i < segment_count; ++i) {
+    const uint64_t flags = data.GetULEB128(&offset);
+    if (flags > std::numeric_limits<uint32_t>::max())
+      return llvm::createStringError("segment flags overflows uint32_t");
+
+    // NOTE(review): the size is read immediately after the flags. The Wasm
+    // binary format appears to place a memory index and/or an offset
+    // expression between the two for active segments -- confirm whether those
+    // need to be consumed here before the size.
+    const uint64_t segment_size = data.GetULEB128(&offset);
+    // Validate the size itself; checking `flags` a second time would let an
+    // oversized segment size through unnoticed.
+    if (segment_size > std::numeric_limits<uint32_t>::max())
+      return llvm::createStringError("segment size overflows uint32_t");
+
+    // The range begins at the first byte after the size field.
+    segments.emplace_back(data_section_sp, offset, segment_size);
+
+    // Step over the segment contents to reach the next segment header.
+    std::optional<lldb::offset_t> next_offset =
+        llvm::checkedAddUnsigned(offset, segment_size);
+    if (!next_offset)
+      return llvm::createStringError("segment offset overflows uint64_t");
+    offset = *next_offset;
+  }
+
+  return segments;
+}
+
+
 static llvm::Expected<std::vector<Symbol>>
 ParseNames(SectionSP name_section_sp,
-           const std::vector<AddressRange> &functions) {
+           const std::vector<AddressRange> &function_ranges,
+           const std::vector<AddressRange> &segment_ranges) {
   DataExtractor name_section_data;
   name_section_sp->GetSectionData(name_section_data);
 
@@ -305,17 +341,34 @@ ParseNames(SectionSP name_section_sp,
       for (uint64_t i = 0; c && i < count; ++i) {
         const uint64_t idx = data.getULEB128(c);
         const std::optional<std::string> name = GetWasmString(data, c);
-        if (!name || idx >= functions.size())
+        if (!name || idx >= function_ranges.size())
           continue;
         symbols.emplace_back(
             symbols.size(), Mangled(*name), lldb::eSymbolTypeCode,
             /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
-            /*is_artificial=*/false, functions[idx],
+            /*is_artificial=*/false, function_ranges[idx],
             /*size_is_valid=*/true, /*contains_linker_annotations=*/false,
             /*flags=*/0);
       }
     } break;
-    case llvm::wasm::WASM_NAMES_DATA_SEGMENT:
+    case llvm::wasm::WASM_NAMES_DATA_SEGMENT: {
+      const uint64_t count = data.getULEB128(c);
+      if (count > std::numeric_limits<uint32_t>::max())
+        return llvm::createStringError("data count overflows uint32_t");
+      for (uint64_t i = 0; c && i < count; ++i) {
+        const uint64_t idx = data.getULEB128(c);
+        const std::optional<std::string> name = GetWasmString(data, c);
+        if (!name || idx >= segment_ranges.size())
+          continue;
+        symbols.emplace_back(
+            symbols.size(), Mangled(*name), lldb::eSymbolTypeData,
+            /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
+            /*is_artificial=*/false, segment_ranges[idx],
+            /*size_is_valid=*/true, /*contains_linker_annotations=*/false,
+            /*flags=*/0);
+      }
+
+    } break;
     case llvm::wasm::WASM_NAMES_GLOBAL:
     case llvm::wasm::WASM_NAMES_LOCAL:
     default:
@@ -336,21 +389,35 @@ void ObjectFileWasm::ParseSymtab(Symtab &symtab) {
   assert(m_sections_up && "sections must be parsed");
   Log *log = GetLog(LLDBLog::Object);
 
-  // The name section contains names and indexes. First parse the functions from
-  // the code section so we can access them by their index.
-  SectionSP code_section_sp =
-      m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false);
-  if (!code_section_sp) {
-    LLDB_LOG(log, "Failed to parse Wasm symbol table: no functions section");
-    return;
+  // The name section contains names and indexes. First parse the data from the
+  // relevant sections so we can access it by its index.
+  std::vector<AddressRange> function_ranges;
+  std::vector<AddressRange> segment_ranges;
+
+  // Parse the code section.
+  if (SectionSP code_section_sp =
+          m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false)) {
+    llvm::Expected<std::vector<AddressRange>> functions =
+        ParseFunctions(code_section_sp);
+    if (!functions) {
+      LLDB_LOG_ERROR(log, functions.takeError(),
+                     "Failed to parse Wasm code section: {0}");
+      return;
+    }
+    function_ranges = *functions;
   }
 
-  llvm::Expected<std::vector<AddressRange>> functions =
-      ParseFunctions(code_section_sp);
-  if (!functions) {
-    LLDB_LOG_ERROR(log, functions.takeError(),
-                   "Failed to parse Wasm functions: {0}");
-    return;
+  // Parse the data section.
+  if (SectionSP data_section_sp =
+          m_sections_up->FindSectionByType(lldb::eSectionTypeData, false)) {
+    llvm::Expected<std::vector<AddressRange>> segments =
+        ParseData(data_section_sp);
+    if (!segments) {
+      LLDB_LOG_ERROR(log, segments.takeError(),
+                     "Failed to parse Wasm data section: {0}");
+      return;
+    }
+    segment_ranges = *segments;
   }
 
   // Parse the name section.
@@ -362,7 +429,7 @@ void ObjectFileWasm::ParseSymtab(Symtab &symtab) {
   }
 
   llvm::Expected<std::vector<Symbol>> symbols =
-      ParseNames(name_section_sp, *functions);
+      ParseNames(name_section_sp, function_ranges, segment_ranges);
   if (!symbols) {
     LLDB_LOG_ERROR(log, symbols.takeError(), "Failed to parse Wasm names: {0}");
     return;
@@ -408,6 +475,9 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
       // For this reason Section::GetFileAddress() must return zero for the
       // Code section.
       vm_addr = 0;
+    } else if (llvm::wasm::WASM_SEC_DATA == sect_info.id) {
+      section_type = eSectionTypeData;
+      section_name = ConstString("data");
     } else {
       section_type = GetSectionTypeFromName(sect_info.name.GetStringRef());
       if (section_type == eSectionTypeOther)
diff --git a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
index 165bb53662f40..67b04aa3cf81c 100644
--- a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
+++ b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
@@ -1,3 +1,15 @@
+# clang -target wasm32 -nostdlib -Wl,--no-entry -Wl,--export-all -O0 -g -o simple.wasm simple.c
+# char* str = "data str";
+#
+# int add(int a, int b) {
+#   return a + b;
+# }
+#
+# int main() {
+#   int i = 1;
+#   int j = 2;
+#   return add(i, j);
+# }
 --- !WASM
 FileHeader:
   Version:         0x1
@@ -37,13 +49,13 @@ Sections:
         Mutable:         true
         InitExpr:
           Opcode:          I32_CONST
-          Value:           66560
+          Value:           66576
       - Index:           1
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           1024
+          Value:           1036
       - Index:           2
         Type:            I32
         Mutable:         false
@@ -55,44 +67,50 @@ Sections:
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           1024
+          Value:           1040
       - Index:           4
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           66560
+          Value:           1040
       - Index:           5
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           1024
+          Value:           66576
       - Index:           6
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           66560
+          Value:           1024
       - Index:           7
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           131072
+          Value:           66576
       - Index:           8
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           0
+          Value:           131072
       - Index:           9
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
-          Value:           1
+          Value:           0
       - Index:           10
+        Type:            I32
+        Mutable:         false
+        InitExpr:
+          Opcode:          I32_CONST
+          Value:           1
+      - Index:           11
         Type:            I32
         Mutable:         false
         InitExpr:
@@ -115,6 +133,9 @@ Sections:
       - Name:            main
         Kind:            FUNCTION
         Index:           3
+      - Name:            str
+        Kind:            GLOBAL
+        Index:           1
       - Name:            __main_void
         Kind:            FUNCTION
         Index:           2
@@ -123,34 +144,34 @@ Sections:
         Index:           0
       - Name:            __dso_handle
         Kind:            GLOBAL
-        Index:           1
+        Index:           2
       - Name:            __data_end
         Kind:            GLOBAL
-        Index:           2
+        Index:           3
       - Name:            __stack_low
         Kind:            GLOBAL
-        Index:           3
+        Index:           4
       - Name:            __stack_high
         Kind:            GLOBAL
-        Index:           4
+        Index:           5
       - Name:            __global_base
         Kind:            GLOBAL
-        Index:           5
+        Index:           6
       - Name:            __heap_base
         Kind:            GLOBAL
-        Index:           6
+        Index:           7
       - Name:            __heap_end
         Kind:            GLOBAL
-        Index:           7
+        Index:           8
       - Name:            __memory_base
         Kind:            GLOBAL
-        Index:           8
+        Index:           9
       - Name:            __table_base
         Kind:            GLOBAL
-        Index:           9
+        Index:           10
       - Name:            __wasm_first_page_end
         Kind:            GLOBAL
-        Index:           10
+        Index:           11
   - Type:            CODE
     Functions:
       - Index:           0
@@ -169,6 +190,35 @@ Sections:
       - Index:           3
         Locals:          []
         Body:            1082808080000F0B
+  - Type:            DATA
+    Segments:
+      - SectionOffset:   7
+        InitFlags:       0
+        Offset:
+          Opcode:          I32_CONST
+          Value:           1024
+        Content:         '646174612073747200'
+      - SectionOffset:   22
+        InitFlags:       0
+        Offset:
+          Opcode:          I32_CONST
+          Value:           1036
+        Content:         '00040000'
+  - Type:            CUSTOM
+    Name:            .debug_abbrev
+    Payload:         011101250E1305030E10171B0E11015517000002340049133A0B3B0B02180000030101491300000421004913370B0000052400030E3E0B0B0B0000062400030E0B0B3E0B0000073400030E49133F193A0B3B0B02180000080F0049130000092E01110112064018030E3A0B3B0B271949133F1900000A05000218030E3A0B3B0B491300000B2E01110112064018030E3A0B3B0B49133F1900000C34000218030E3A0B3B0B4913000000
+  - Type:            CUSTOM
+    Name:            .debug_info
+    Payload:         D100000004000000000004017F0000001D005E0000000000000016000000000000000000000002330000000101050300040000033F0000000446000000090005080000000601066B000000080707040000005E000000010105030C040000083F00000009050000002900000004ED00029F5A0000000103CD0000000A02910C690000000103CD0000000A029108670000000103CD000000000B2F0000004C00000004ED00009F0D0000000107CD0000000C029108140000000108CD0000000C029104120000000109CD000000000500000000050400
+  - Type:            CUSTOM
+    Name:            .debug_ranges
+    Payload:         050000002E0000002F0000007B0000000000000000000000
+  - Type:            CUSTOM
+    Name:            .debug_str
+    Payload:         696E74007374720063686172006D61696E006A0069002F55736572732F6A6F6E61732F7761736D2D6D6963726F2D72756E74696D652F70726F647563742D6D696E692F706C6174666F726D732F64617277696E2F6275696C64006164640073696D706C652E6300620061005F5F41525241595F53495A455F545950455F5F00636C616E672076657273696F6E2032322E302E306769742028676974406769746875622E636F6D3A6C6C766D2F6C6C766D2D70726F6A6563742E67697420363363633265333930646235376362633430306235313937373162373030356561623166633736612900
+  - Type:            CUSTOM
+    Name:            .debug_line
+    Payload:         62000000040020000000010101FB0E0D0001010101000000010000010073696D706C652E6300000000000005020500000014050A0A08AD050E0658050C5805032002020001010005022F0000001805070A08BB75050E7505110658050A58050382020F000101
   - Type:            CUSTOM
     Name:            name
     FunctionNames:
@@ -183,8 +233,17 @@ Sections:
     GlobalNames:
       - Index:           0
         Name:            __stack_pointer
+    DataSegmentNames:
+      - Index:           0
+        Name:            .rodata
+      - Index:           1
+        Name:            .data
   - Type:            CUSTOM
+    HeaderSecSizeEncodingLen: 2
     Name:            producers
+    Languages:
+      - Name:            C11
+        Version:         ''
     Tools:
       - Name:            clang
         Version:         '22.0.0git'
diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test
index fc185cd81a0ec..5374b0c2f2892 100644
--- a/lldb/test/Shell/Symtab/symtab-wasm.test
+++ b/lldb/test/Shell/Symtab/symtab-wasm.test
@@ -1,7 +1,9 @@
 # RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm
 # RUN: %lldb %t.wasm -o 'image dump symtab'
 
-# CHECK: Code 0x0000000000000002 {{.*}} __wasm_call_ctors
-# CHECK: Code 0x0000000000000005 {{.*}} add
-# CHECK: Code 0x000000000000002f {{.*}} __original_main
-# CHECK: Code 0x000000000000007c {{.*}} main
+# CHECK: Code 0x0000000000000002 0x0000000000000002 {{.*}} __wasm_call_ctors
+# CHECK: Code 0x0000000000000005 0x0000000000000029 {{.*}} add
+# CHECK: Code 0x000000000000002f 0x000000000000004c {{.*}} __original_main
+# CHECK: Code 0x000000000000007c 0x0000000000000009 {{.*}} main
+# CHECK: Data 0x000000000000022f 0x0000000000000041 {{.*}} .rodata
+# CHECK: Data 0x0000000000000270 0x0000000000000000 {{.*}} .data

This PR adds support for parsing data symbols from the WebAssembly name section.

DavidSpickett · 2025-08-14T08:09:52Z

This PR adds support for parsing data symbols from the WebAssembly name section.

What I see is parsing the names of data segments, no test checks we can read a symbol that's in those segments. You added a global str, can we read that by name?

Or is a "data symbol" a symbol telling us where the data segment is, and that's where the "symbol symbols" will all be and later you'll add code to read those?

lldb/test/Shell/Symtab/symtab-wasm.test

JDevlieghere · 2025-08-14T14:07:21Z

@DavidSpickett Great question and that definitely deserved more explanation from my end. I was a bit rushed in getting the PR up to unlock Adrian who needs the data for the Swift Reflection metadata.

In WebAssembly, the data section (WASM_NAMES_DATA_SEGMENT) contains the actual initialized data that will be copied into (linear) memory when the module is loaded. The names section has names for the different segments (like .data and .rodata), but there are no symbols for referencing items within those segments the way there are for ELF and Mach-O. I also believe those segment names are not "standardized": I couldn't find anything about .data or .rodata in the Wasm spec.

llvm-objdump gives you the same:

SYMBOL TABLE:
00000188 g     F CODE   00000003 __wasm_call_ctors
0000018b g     F CODE   0000002a add
000001b5 g     F CODE   0000004d __original_main
00000202 g     F CODE   0000000a main
0000002e g       GLOBAL 00000007 __stack_pointer
00000400 l     O DATA   00000009 .rodata
0000040c l     O DATA   00000004 .data

That said, I do think these should also be represented as segments in LLDB, and that's indeed something I have planned to do next.

DavidSpickett · 2025-08-14T14:42:25Z

So a "data symbol" is nothing to do with symbols the user has defined, these are kind of internal symbols which are section name offsets into this one giant data segment.

Add something to the PR description like "data symbols provide the names of the sub-sections of the wasm data segment", whatever the correct form of that is.

Just wanted to understand that. Otherwise, I have no other comments.

This is a continuation of llvm#153494. In a WebAssembly file, the "name" section contains names for the segments in the data section (WASM_NAMES_DATA_SEGMENT). We already parse these as symbols, and with this PR, we also create sub-sections for the data segments.

…153494) This PR adds support for parsing the data symbols from the WebAssembly name section, which consists of a name and address range for the segments in the Wasm data section. Unlike other object file formats, Wasm has no symbols for referencing items within those segments (i.e. symbols the user has defined). (cherry picked from commit d0e40ff)

JDevlieghere requested review from DavidSpickett and adrian-prantl August 13, 2025 20:54

JDevlieghere force-pushed the wasm-data-symbols branch from e68b6d3 to b12ff18 Compare August 13, 2025 21:23

JDevlieghere marked this pull request as ready for review August 13, 2025 21:23

llvmbot added the lldb label Aug 13, 2025

[lldb] Support parsing data symbols from the Wasm name section

2d0a3da

This PR adds support for parsing data symbols from the WebAssembly name section.

JDevlieghere force-pushed the wasm-data-symbols branch from b12ff18 to 2d0a3da Compare August 13, 2025 21:24

JDevlieghere mentioned this pull request Aug 13, 2025

WebAssembly support in LLDB #150449

Open

6 tasks

adrian-prantl approved these changes Aug 13, 2025

View reviewed changes

DavidSpickett reviewed Aug 14, 2025

View reviewed changes

lldb/test/Shell/Symtab/symtab-wasm.test Show resolved Hide resolved

JDevlieghere merged commit d0e40ff into llvm:main Aug 14, 2025
9 checks passed

JDevlieghere deleted the wasm-data-symbols branch August 14, 2025 15:01

JDevlieghere mentioned this pull request Aug 14, 2025

[lldb] Create sections for Wasm segments #153634

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[lldb] Support parsing data symbols from the Wasm name section #153494

[lldb] Support parsing data symbols from the Wasm name section #153494

Uh oh!

JDevlieghere commented Aug 13, 2025 •

edited

Loading

Uh oh!

llvmbot commented Aug 13, 2025

Uh oh!

DavidSpickett commented Aug 14, 2025 •

edited

Loading

Uh oh!

Uh oh!

JDevlieghere commented Aug 14, 2025 •

edited

Loading

Uh oh!

DavidSpickett commented Aug 14, 2025

Uh oh!

Uh oh!

Uh oh!

[lldb] Support parsing data symbols from the Wasm name section #153494

[lldb] Support parsing data symbols from the Wasm name section #153494

Uh oh!

Conversation

JDevlieghere commented Aug 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Aug 13, 2025

Uh oh!

DavidSpickett commented Aug 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

JDevlieghere commented Aug 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

DavidSpickett commented Aug 14, 2025

Uh oh!

Uh oh!

Uh oh!

JDevlieghere commented Aug 13, 2025 •

edited

Loading

DavidSpickett commented Aug 14, 2025 •

edited

Loading

JDevlieghere commented Aug 14, 2025 •

edited

Loading