[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Neil Nair via cfe-commits Fri, 12 Jun 2026 15:48:11 -0700

https://github.com/Neil-N4 updated 
https://github.com/llvm/llvm-project/pull/202991


>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH 1/9] [clang-doc] Add standalone Markdown parsing library

---
 .../clang-doc/support/CMakeLists.txt          |   3 +-
 .../clang-doc/support/Markdown.cpp            | 145 ++++++++++++++++++
 .../clang-doc/support/Markdown.h              |  72 +++++++++
 .../unittests/clang-doc/CMakeLists.txt        |   4 +-
 .../clang-doc/MarkdownParserTest.cpp          |  94 ++++++++++++
 5 files changed, 316 insertions(+), 2 deletions(-)
 create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
 create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
 create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp

diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_library(clangDocSupport STATIC
   File.cpp
+  Markdown.cpp
   Utils.cpp
-  )
+  )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+  return {NodeKind::NK_Text, S, {}};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+  return Line.contains('-') &&
+         Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+  return Line.starts_with("- ") || Line.starts_with("* ") ||
+         Line.starts_with("+ ");
+}
+
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+                                      BumpPtrAllocator &Arena) {
+  if (Nodes.empty())
+    return {};
+  MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
+  std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
+  return ArrayRef<MDNode>(Allocated, Nodes.size());
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+                               BumpPtrAllocator &Arena) {
+  if (ParagraphText.trim().empty())
+    return {};
+
+  SmallVector<StringRef, 16> Lines;
+  ParagraphText.split(Lines, '\n');
+
+  SmallVector<MDNode> Nodes;
+  size_t I = 0, E = Lines.size();
+
+  while (I < E) {
+    StringRef Line = Lines[I].trim();
+
+    if (Line.empty()) {
+      ++I;
+      continue;
+    }
+
+    // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+    // indented up to 3 spaces, the closing fence must use the same character
+    // and be at least as long as the opening fence, and the closing fence may
+    // only be followed by spaces. Doxygen specifics should be handled on a
+    // case-by-case basis.
+    if (Line.starts_with("```") || Line.starts_with("~~~")) {
+      char Fence = Line[0];
+      StringRef Lang = Line.drop_front(3).trim();
+      SmallVector<MDNode> CodeLines;
+      ++I;
+      while (I < E) {
+        StringRef CodeLine = Lines[I].trim();
+        if (CodeLine.size() >= 3 &&
+            all_of(CodeLine.take_front(3),
+                   [Fence](char C) { return C == Fence; }))
+          break;
+        CodeLines.push_back(makeText(Lines[I]));
+        ++I;
+      }
+      ++I; // skip closing fence
+      MDNode Code;
+      Code.Kind = NodeKind::NK_FencedCode;
+      Code.Content = Lang;
+      Code.Children = allocateNodes(CodeLines, Arena);
+      LDBG() << "emitting NK_FencedCode lang='" << Lang
+             << "' lines=" << CodeLines.size();
+      Nodes.push_back(Code);
+      continue;
+    }
+
+    // Pipe table: current line has | and next line is a separator row.
+    if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+      SmallVector<MDNode> Rows;
+      while (I < E && Lines[I].trim().contains('|')) {
+        Rows.push_back(makeText(Lines[I].trim()));
+        ++I;
+      }
+      MDNode Table;
+      Table.Kind = NodeKind::NK_Table;
+      Table.Content = {};
+      Table.Children = allocateNodes(Rows, Arena);
+      LDBG() << "emitting NK_Table rows=" << Rows.size();
+      Nodes.push_back(Table);
+      continue;
+    }
+
+    // Unordered list item.
+    if (isListItem(Line)) {
+      SmallVector<MDNode> Items;
+      while (I < E) {
+        StringRef L = Lines[I].trim();
+        if (!isListItem(L))
+          break;
+        MDNode Item;
+        Item.Kind = NodeKind::NK_ListItem;
+        Item.Content = L.drop_front(2).trim();
+        Item.Children = {};
+        Items.push_back(Item);
+        ++I;
+      }
+      MDNode List;
+      List.Kind = NodeKind::NK_UnorderedList;
+      List.Content = {};
+      List.Children = allocateNodes(Items, Arena);
+      LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+      Nodes.push_back(List);
+      continue;
+    }
+
+    // Plain text fallback.
+    Nodes.push_back(makeText(Line));
+    ++I;
+  }
+
+  LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+  return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
+/// you get back an empty list and can fall back to plain-text output. If it
+/// does, you get a tree of MDNode structs where each node has a kind, optional
+/// content (like the language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+  // Block nodes
+  NK_Paragraph,
+  NK_FencedCode,
+  NK_Table,
+  NK_UnorderedList,
+  NK_OrderedList,
+  NK_ListItem,
+  NK_ThematicBreak,
+  // Inline nodes
+  NK_Text,
+  NK_InlineCode,
+  NK_Emphasis,
+  NK_Strong,
+  NK_SoftBreak,
+};
+
+struct MDNode {
+  NodeKind Kind;
+  llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
+  llvm::ArrayRef<MDNode> Children; // arena allocated
+};
+
+/// Parses Markdown from a single comment paragraph's text.
+/// Returns an empty ArrayRef if no Markdown constructs are found,
+/// so generators can fall back to plain-text rendering at zero cost.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+                                     llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
   ClangDocTest.cpp
   GeneratorTest.cpp
   HTMLGeneratorTest.cpp
+  MarkdownParserTest.cpp
   MDGeneratorTest.cpp
   MergeTest.cpp
   SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
 target_link_libraries(ClangDocTests
   PRIVATE
   clangDoc
+  clangDocSupport
   LLVMTestingSupport
-  )
+  )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("", Arena);
+  EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("   \n  \n", Arena);
+  EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("hello world", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+  EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(Nodes[0].Content, "cpp");
+  ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+  // Unterminated fence should not crash and should produce a code node
+  // with whatever lines were found.
+  EXPECT_FALSE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+  // No separator row so should not be parsed as a table
+  for (const auto &Node : Nodes)
+    EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+  ASSERT_EQ(Nodes[0].Children.size(), 3u);
+  EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+  EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+  EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+  EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file

>From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Wed, 10 Jun 2026 11:35:54 -0400
Subject: [PATCH 2/9] [clang-doc] Address review feedback: test fixture, raw
 strings, DEBUG_TYPE, EOF newlines

---
 .../clang-doc/support/Markdown.cpp            |  4 +-
 .../clang-doc/support/Markdown.h              |  2 +-
 .../clang-doc/MarkdownParserTest.cpp          | 97 +++++++++++--------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 776150b939d27..9e008abf8b08d 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,7 +12,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
 
-#define DEBUG_TYPE "clang-doc-markdown"
+#define DEBUG_TYPE "clang-doc"
 
 using namespace llvm;
 
@@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
   return allocateNodes(Nodes, Arena);
 }
 
-} // namespace clang::doc::markdown
\ No newline at end of file
+} // namespace clang::doc::markdown
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 890f764f937b1..09b79cc8f2437 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
 
 } // namespace clang::doc::markdown
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 8df5efc7f1d5f..ff9bad88da136 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -14,80 +14,99 @@ using namespace clang::doc::markdown;
 
 namespace {
 
-TEST(MarkdownParserTest, EmptyInput) {
+struct MarkdownParserTest : public ::testing::Test {
   llvm::BumpPtrAllocator Arena;
+};
+
+TEST_F(MarkdownParserTest, EmptyInput) {
   auto Nodes = parseMarkdown("", Arena);
   EXPECT_TRUE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, WhitespaceOnlyInput) {
-  llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
   auto Nodes = parseMarkdown("   \n  \n", Arena);
   EXPECT_TRUE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, PlainText) {
-  llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, PlainText) {
   auto Nodes = parseMarkdown("hello world", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
-  EXPECT_EQ(Nodes[0].Content, "hello world");
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_Text);
+  EXPECT_EQ(N.Content, "hello world");
 }
 
-TEST(MarkdownParserTest, FencedCodeBlock) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlock) {
+  auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;
+````)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(Nodes[0].Content, "cpp");
-  ASSERT_EQ(Nodes[0].Children.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "cpp");
+  ASSERT_EQ(N.Children.size(), 1u);
 }
 
-TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
+  auto Nodes = parseMarkdown(R"(```
+some code
+```)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(Nodes[0].Content.empty());
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Content.empty());
 }
 
-TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+  auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;)",
+                             Arena);
   // Unterminated fence should not crash and should produce a code node
   // with whatever lines were found.
   EXPECT_FALSE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, PipeTable) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+TEST_F(MarkdownParserTest, PipeTable) {
+  auto Nodes = parseMarkdown(R"(| A | B |
+|---|---|
+| 1 | 2 |)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
 }
 
-TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("a | b\nc | d", Arena);
-  // No separator row so should not be parsed as a table
+TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+  auto Nodes = parseMarkdown(R"(a | b
+c | d)",
+                             Arena);
+  // No separator row so should not be parsed as a table.
   for (const auto &Node : Nodes)
     EXPECT_NE(Node.Kind, NodeKind::NK_Table);
 }
 
-TEST(MarkdownParserTest, UnorderedList) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+TEST_F(MarkdownParserTest, UnorderedList) {
+  auto Nodes = parseMarkdown(R"(- foo
+- bar
+- baz)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
-  ASSERT_EQ(Nodes[0].Children.size(), 3u);
-  EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
-  EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
-  EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
+  ASSERT_EQ(N.Children.size(), 3u);
+  EXPECT_EQ(N.Children[0].Content, "foo");
+  EXPECT_EQ(N.Children[1].Content, "bar");
+  EXPECT_EQ(N.Children[2].Content, "baz");
 }
 
-TEST(MarkdownParserTest, MixedContent) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+TEST_F(MarkdownParserTest, MixedContent) {
+  auto Nodes = parseMarkdown(R"(some text
+```
+code
+````
+- item)",
+                             Arena);
   EXPECT_EQ(Nodes.size(), 3u);
 }
 

>From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Wed, 10 Jun 2026 13:45:44 -0400
Subject: [PATCH 3/9] [clang-doc] Add CommonMark spec tests for fenced code
 blocks

---
 .../clang-doc/MarkdownParserTest.cpp          | 112 +++++++++++++++++-
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ff9bad88da136..4ca979c1f1d24 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) {
 TEST_F(MarkdownParserTest, FencedCodeBlock) {
   auto Nodes = parseMarkdown(R"(```cpp
 int x = 0;
-````)",
+````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   const auto &N = Nodes[0];
@@ -51,7 +51,7 @@ int x = 0;
 TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
   auto Nodes = parseMarkdown(R"(```
 some code
-```)",
+```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   const auto &N = Nodes[0];
@@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) {
 
 TEST_F(MarkdownParserTest, MixedContent) {
   auto Nodes = parseMarkdown(R"(some text
-```
+```````
 code
-````
+````````
 - item)",
                              Arena);
   EXPECT_EQ(Nodes.size(), 3u);
 }
 
+// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
+TEST_F(MarkdownParserTest, TildeFence) {
+  auto Nodes = parseMarkdown(R"(~~~
+int x = 0;
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Content.empty());
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 120: tilde fence with a language tag.
+TEST_F(MarkdownParserTest, TildeFenceWithLang) {
+  auto Nodes = parseMarkdown(R"(~~~cpp
+int x = 0;
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "cpp");
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
+TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) {
+  auto Nodes = parseMarkdown(R"(```
+aaa
+~~~
+````````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  // ~~~ is content, not a closing fence.
+  ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 130: a code block can be empty.
+TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
+  auto Nodes = parseMarkdown(R"(```
+```````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Children.empty());
+}
+
+// CommonMark §4.5 example 129: a code block may contain only blank lines.
+TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
+  auto Nodes = parseMarkdown("```\n\n  \n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 142: lang tag is captured from the info string.
+TEST_F(MarkdownParserTest, InfoStringLangTag) {
+  auto Nodes = parseMarkdown(R"(```ruby
+def foo(x)
+  return 3
+end
+``````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "ruby");
+  ASSERT_EQ(N.Children.size(), 3u);
+}
+
+// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) {
+  auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~
+foo
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "aa ``` ~~~");
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 124: closing fence must be at least as long as the
+// opening fence.
+// TODO: our parser currently closes on the first line with 3 matching fence
+// chars regardless of opening fence length. Fix as part of the CommonMark
+// TODO in parseMarkdown().
+TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
+  auto Nodes = parseMarkdown("````\naaa\n```", Arena);
+  // The ``` line should not close the ```` fence per CommonMark, but our
+  // parser currently treats it as a closing fence. This test documents the
+  // current (non-conformant) behavior.
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
 } // namespace
\ No newline at end of file

>From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Wed, 10 Jun 2026 13:59:52 -0400
Subject: [PATCH 4/9] [clang-doc] Replace flat MDNode with typed node hierarchy
 using LLVM RTTI

---
 .../clang-doc/support/Markdown.cpp            |  84 +++---
 .../clang-doc/support/Markdown.h              | 264 ++++++++++++++++--
 .../clang-doc/MarkdownParserTest.cpp          |  84 +++---
 3 files changed, 312 insertions(+), 120 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9e008abf8b08d..bee15c3e23ec3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -18,8 +18,24 @@ using namespace llvm;
 
 namespace clang::doc::markdown {
 
-static MDNode makeText(StringRef S) {
-  return {NodeKind::NK_Text, S, {}};
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  if (Vec.empty())
+    return {};
+  T *Allocated = Arena.Allocate<T>(Vec.size());
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+  return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop.
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+  if (S.empty())
+    return {};
+  char *Buf = Arena.Allocate<char>(S.size());
+  std::copy(S.begin(), S.end(), Buf);
+  return StringRef(Buf, S.size());
 }
 
 // A line is a table separator if it only contains |, -, :, and spaces,
@@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
-static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
-                                      BumpPtrAllocator &Arena) {
-  if (Nodes.empty())
-    return {};
-  MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
-  std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
-  return ArrayRef<MDNode>(Allocated, Nodes.size());
-}
-
-ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
-                               BumpPtrAllocator &Arena) {
+ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
+                                 BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
     return {};
 
   SmallVector<StringRef, 16> Lines;
   ParagraphText.split(Lines, '\n');
 
-  SmallVector<MDNode> Nodes;
+  SmallVector<MDNode *> Nodes;
   size_t I = 0, E = Lines.size();
 
   while (I < E) {
@@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
     // case-by-case basis.
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
       char Fence = Line[0];
-      StringRef Lang = Line.drop_front(3).trim();
-      SmallVector<MDNode> CodeLines;
+      StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+      SmallVector<StringRef> CodeLines;
       ++I;
       while (I < E) {
         StringRef CodeLine = Lines[I].trim();
@@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
             all_of(CodeLine.take_front(3),
                    [Fence](char C) { return C == Fence; }))
           break;
-        CodeLines.push_back(makeText(Lines[I]));
+        CodeLines.push_back(internString(Lines[I], Arena));
         ++I;
       }
       ++I; // skip closing fence
-      MDNode Code;
-      Code.Kind = NodeKind::NK_FencedCode;
-      Code.Content = Lang;
-      Code.Children = allocateNodes(CodeLines, Arena);
-      LDBG() << "emitting NK_FencedCode lang='" << Lang
+      auto *Code =
+          new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+      LDBG() << "emitting FencedCodeNode lang='" << Lang
              << "' lines=" << CodeLines.size();
       Nodes.push_back(Code);
       continue;
@@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
 
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
-      SmallVector<MDNode> Rows;
+      SmallVector<StringRef> Rows;
       while (I < E && Lines[I].trim().contains('|')) {
-        Rows.push_back(makeText(Lines[I].trim()));
+        Rows.push_back(internString(Lines[I].trim(), Arena));
         ++I;
       }
-      MDNode Table;
-      Table.Kind = NodeKind::NK_Table;
-      Table.Content = {};
-      Table.Children = allocateNodes(Rows, Arena);
-      LDBG() << "emitting NK_Table rows=" << Rows.size();
+      auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+      LDBG() << "emitting TableNode rows=" << Rows.size();
       Nodes.push_back(Table);
       continue;
     }
 
     // Unordered list item.
     if (isListItem(Line)) {
-      SmallVector<MDNode> Items;
+      SmallVector<ListItemNode *> Items;
       while (I < E) {
         StringRef L = Lines[I].trim();
         if (!isListItem(L))
           break;
-        MDNode Item;
-        Item.Kind = NodeKind::NK_ListItem;
-        Item.Content = L.drop_front(2).trim();
-        Item.Children = {};
+        StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+        SmallVector<MDNode *> ItemChildren;
+        ItemChildren.push_back(new (Arena) TextNode(ItemText));
+        auto *Item =
+            new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
         Items.push_back(Item);
         ++I;
       }
-      MDNode List;
-      List.Kind = NodeKind::NK_UnorderedList;
-      List.Content = {};
-      List.Children = allocateNodes(Items, Arena);
-      LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+      auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+      LDBG() << "emitting UnorderedListNode items=" << Items.size();
       Nodes.push_back(List);
       continue;
     }
 
     // Plain text fallback.
-    Nodes.push_back(makeText(Line));
+    Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
     ++I;
   }
 
   LDBG() << "parseMarkdown done nodes=" << Nodes.size();
-  return allocateNodes(Nodes, Arena);
+  return allocateArray(Nodes, Arena);
 }
 
-} // namespace clang::doc::markdown
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 09b79cc8f2437..3d457bcddfac6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -7,30 +7,50 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines a standalone Markdown parsing library for the LLVM
-/// ecosystem. The parser takes plain text and returns a tree of typed nodes
-/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+/// Standalone Markdown parsing library for the LLVM ecosystem.
 ///
-/// This is a simple Markdown parser for use inside Clang-Doc's comment
-/// pipeline. You give it a paragraph of text and an arena allocator, and it
-/// gives back a list of typed nodes describing the Markdown structure it found.
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
 ///
-/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
-/// you get back an empty list and can fall back to plain-text output. If it
-/// does, you get a tree of MDNode structs where each node has a kind, optional
-/// content (like the language tag on a code fence), and optional children.
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
 ///
-/// All nodes are allocated in the arena you pass in. You own the arena and are
-/// responsible for keeping it alive as long as you use the nodes.
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
 ///
-/// The parser handles fenced code blocks, pipe tables, and unordered lists.
-/// Anything it does not recognize comes back as a plain text node. It will
-/// never crash on bad input.
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+///   TextNode       -- plain text run
+///   SoftBreakNode  -- soft line break
+///   HardBreakNode  -- hard line break (trailing spaces or backslash)
+///   InlineCodeNode -- inline code span (`code`)
+///   EmphasisNode   -- emphasis (*text* or _text_)
+///   StrongNode     -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+///   ParagraphNode     -- sequence of inline nodes
+///   HeadingNode       -- ATX heading (# through ######), level 1-6
+///   FencedCodeNode    -- fenced code block (``` or ~~~)
+///   TableNode         -- pipe table (raw row text; TODO: structured cells)
+///   UnorderedListNode -- bullet list (-, *, +)
+///   OrderedListNode   -- numbered list with explicit start number
+///   ListItemNode      -- single item inside a list
+///   BlockQuoteNode    -- block quote (>)
+///   ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -38,35 +58,217 @@
 
 namespace clang::doc::markdown {
 
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
+/// cheap range-based checks in classof() implementations.
 enum class NodeKind {
+  // Inline nodes
+  NK_Text,
+  NK_SoftBreak,
+  NK_HardBreak,
+  NK_InlineCode,
+  NK_Emphasis,
+  NK_Strong,
+  NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
   // Block nodes
   NK_Paragraph,
+  NK_Heading,
   NK_FencedCode,
   NK_Table,
   NK_UnorderedList,
   NK_OrderedList,
   NK_ListItem,
+  NK_BlockQuote,
   NK_ThematicBreak,
-  // Inline nodes
-  NK_Text,
-  NK_InlineCode,
-  NK_Emphasis,
-  NK_Strong,
-  NK_SoftBreak,
+  NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
 };
 
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
 struct MDNode {
   NodeKind Kind;
-  llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
-  llvm::ArrayRef<MDNode> Children; // arena allocated
+  explicit MDNode(NodeKind K) : Kind(K) {}
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run.
+struct TextNode : MDNode {
+  llvm::StringRef Text;
+  explicit TextNode(llvm::StringRef Text)
+      : MDNode(NodeKind::NK_Text), Text(Text) {}
+  static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+  SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_SoftBreak;
+  }
+};
+
+/// Hard line break -- two trailing spaces or a backslash before a newline.
+struct HardBreakNode : MDNode {
+  HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_HardBreak;
+  }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+  llvm::StringRef Code;
+  explicit InlineCodeNode(llvm::StringRef Code)
+      : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_InlineCode;
+  }
+};
+
+/// Emphasized text: *text* or _text_.
+struct EmphasisNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Emphasis;
+  }
 };
 
-/// Parses Markdown from a single comment paragraph's text.
-/// Returns an empty ArrayRef if no Markdown constructs are found,
-/// so generators can fall back to plain-text rendering at zero cost.
-llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
-                                     llvm::BumpPtrAllocator &Arena);
+/// Strongly emphasized text: **text** or __text__.
+struct StrongNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit StrongNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Strong), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Strong;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines.
+struct ParagraphNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Paragraph;
+  }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children so it fills the gap after the base class's 4-byte Kind, keeping
+/// sizeof(HeadingNode) at 24 bytes on typical 64-bit targets.
+struct HeadingNode : MDNode {
+  unsigned Level;                    // 1-6
+  llvm::ArrayRef<MDNode *> Children; // inline content
+  HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Heading;
+  }
+};
+
+/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
+/// "cpp"); empty when no language was specified. Lines contains the raw text
+/// of each interior line, without the opening or closing fence.
+///
+/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// to 3 spaces; the closing fence must use the same character and be at least
+/// as long as the opening fence; only spaces may follow the closing fence.
+struct FencedCodeNode : MDNode {
+  llvm::StringRef Lang;
+  llvm::ArrayRef<llvm::StringRef> Lines;
+  FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines)
+      : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_FencedCode;
+  }
+};
+
+/// Pipe table. Rows contains the raw text of each row line including the
+/// header and separator rows.
+/// TODO: replace with a structured header/body/cell representation.
+struct TableNode : MDNode {
+  llvm::ArrayRef<llvm::StringRef> Rows;
+  explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
+      : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+  static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
+};
+
+/// A single list item. Children may contain block-level nodes for loose
+/// lists, or a single inline sequence for tight lists.
+struct ListItemNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_ListItem), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_ListItem;
+  }
+};
+
+/// Unordered (bullet) list. Markers are -, *, or +.
+struct UnorderedListNode : MDNode {
+  llvm::ArrayRef<ListItemNode *> Items;
+  explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items)
+      : MDNode(NodeKind::NK_UnorderedList), Items(Items) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_UnorderedList;
+  }
+};
+
+/// Ordered (numbered) list. Start is the number on the first item. Start is
+/// declared before Items to avoid padding, so sizeof is 24 on 64-bit targets.
+struct OrderedListNode : MDNode {
+  unsigned Start;
+  llvm::ArrayRef<ListItemNode *> Items;
+  OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items)
+      : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_OrderedList;
+  }
+};
+
+/// Block quote (> ...). Children are block-level nodes inside the quote.
+struct BlockQuoteNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_BlockQuote), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_BlockQuote;
+  }
+};
+
+/// Thematic break: a line of three or more ---, ***, or ___ characters.
+struct ThematicBreakNode : MDNode {
+  ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_ThematicBreak;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Parser entry point
+//===----------------------------------------------------------------------===//
+
+/// Parse Markdown from a single paragraph of plain text. Returns the top-level
+/// nodes, allocated in Arena. Lines that match no Markdown construct come back
+/// as TextNode, so plain prose still yields nodes; empty or whitespace-only
+/// input yields an empty ArrayRef. The parser never crashes on malformed input.
+///
+/// The caller must keep Arena alive for the lifetime of any returned nodes.
+llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
+                                       llvm::BumpPtrAllocator &Arena);
 
 } // namespace clang::doc::markdown
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 4ca979c1f1d24..b61094f034375 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -8,9 +8,11 @@
 
 #include "support/Markdown.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include "gtest/gtest.h"
 
 using namespace clang::doc::markdown;
+using namespace llvm;
 
 namespace {
 
@@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
 TEST_F(MarkdownParserTest, PlainText) {
   auto Nodes = parseMarkdown("hello world", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_Text);
-  EXPECT_EQ(N.Content, "hello world");
+  auto *N = cast<TextNode>(Nodes[0]);
+  EXPECT_EQ(N->Text, "hello world");
 }
 
 TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -42,10 +43,9 @@ int x = 0;
 ````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "cpp");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "cpp");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
@@ -54,9 +54,8 @@ some code
 ```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Content.empty());
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lang.empty());
 }
 
 TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
@@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) {
 | 1 | 2 |)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+  EXPECT_TRUE(isa<TableNode>(Nodes[0]));
 }
 
 TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
@@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
 c | d)",
                              Arena);
   // No separator row so should not be parsed as a table.
-  for (const auto &Node : Nodes)
-    EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+  for (const auto *Node : Nodes)
+    EXPECT_FALSE(isa<TableNode>(Node));
 }
 
 TEST_F(MarkdownParserTest, UnorderedList) {
@@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) {
 - baz)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
-  ASSERT_EQ(N.Children.size(), 3u);
-  EXPECT_EQ(N.Children[0].Content, "foo");
-  EXPECT_EQ(N.Children[1].Content, "bar");
-  EXPECT_EQ(N.Children[2].Content, "baz");
+  auto *N = cast<UnorderedListNode>(Nodes[0]);
+  ASSERT_EQ(N->Items.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+  EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
+  EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
 }
 
 TEST_F(MarkdownParserTest, MixedContent) {
@@ -117,10 +115,9 @@ int x = 0;
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Content.empty());
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lang.empty());
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 120: tilde fence with a language tag.
@@ -130,10 +127,9 @@ int x = 0;
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "cpp");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "cpp");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
@@ -144,10 +140,9 @@ aaa
 ````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
   // ~~~ is content, not a closing fence.
-  ASSERT_EQ(N.Children.size(), 2u);
+  ASSERT_EQ(N->Lines.size(), 2u);
 }
 
 // CommonMark §4.5 example 130: a code block can be empty.
@@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
 ```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Children.empty());
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lines.empty());
 }
 
 // CommonMark §4.5 example 129: a code block may contain only blank lines.
 TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
   auto Nodes = parseMarkdown("```\n\n  \n```", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  ASSERT_EQ(N.Children.size(), 2u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  ASSERT_EQ(N->Lines.size(), 2u);
 }
 
 // CommonMark §4.5 example 142: lang tag is captured from the info string.
@@ -179,10 +172,9 @@ end
 ``````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "ruby");
-  ASSERT_EQ(N.Children.size(), 3u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "ruby");
+  ASSERT_EQ(N->Lines.size(), 3u);
 }
 
 // CommonMark §4.5 example 146: tilde fence info string may contain backticks.
@@ -192,10 +184,9 @@ foo
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "aa ``` ~~~");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "aa ``` ~~~");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 124: closing fence must be at least as long as the
@@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
   // parser currently treats it as a closing fence. This test documents the
   // current (non-conformant) behavior.
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 } // namespace
\ No newline at end of file

>From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Fri, 12 Jun 2026 02:33:44 -0400
Subject: [PATCH 5/9] [clang-doc] Introduce LineReader cursor for the Markdown
 parse loop

Replace the raw size_t I = 0, E = Lines.size() index arithmetic in
parseMarkdown() with a LineReader cursor that encapsulates the position
and exposes peek(), peek(Offset), advance(), and atEnd(). The parse
logic and emitted nodes are unchanged; this only removes manual index
bookkeeping. All 18 MarkdownParserTest cases still pass.

Co-Authored-By: Claude Opus 4.8 <[email protected]>
---
 .../clang-doc/support/Markdown.cpp            | 73 ++++++++++++++-----
 1 file changed, 54 insertions(+), 19 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index bee15c3e23ec3..f171457e73046 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
+#include <cassert>
 
 #define DEBUG_TYPE "clang-doc"
 
@@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // True once every line has been consumed.
+  bool atEnd() const { return Pos >= Lines.size(); }
+
+  // The current line, untrimmed. Must not be called when atEnd().
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // The line Offset positions ahead of the cursor, or an empty StringRef when
+  // that position is past the end. peek(0) is the current line.
+  StringRef peek(size_t Offset) const {
+    size_t Target = Pos + Offset;
+    return Target < Lines.size() ? Lines[Target] : StringRef();
+  }
+
+  // Consume the current line and return it, untrimmed. Must not be called when
+  // atEnd().
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    return Lines[Pos++];
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Pos = 0;
+};
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
   ParagraphText.split(Lines, '\n');
 
   SmallVector<MDNode *> Nodes;
-  size_t I = 0, E = Lines.size();
+  LineReader Reader(Lines);
 
-  while (I < E) {
-    StringRef Line = Lines[I].trim();
+  while (!Reader.atEnd()) {
+    StringRef Line = Reader.peek().trim();
 
     if (Line.empty()) {
-      ++I;
+      Reader.advance();
       continue;
     }
 
@@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
       char Fence = Line[0];
       StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+      Reader.advance(); // consume opening fence
       SmallVector<StringRef> CodeLines;
-      ++I;
-      while (I < E) {
-        StringRef CodeLine = Lines[I].trim();
+      while (!Reader.atEnd()) {
+        StringRef CodeLine = Reader.peek().trim();
         if (CodeLine.size() >= 3 &&
             all_of(CodeLine.take_front(3),
                    [Fence](char C) { return C == Fence; }))
           break;
-        CodeLines.push_back(internString(Lines[I], Arena));
-        ++I;
+        CodeLines.push_back(internString(Reader.advance(), Arena));
       }
-      ++I; // skip closing fence
+      if (!Reader.atEnd())
+        Reader.advance(); // consume closing fence
       auto *Code =
           new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
       LDBG() << "emitting FencedCodeNode lang='" << Lang
@@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     }
 
     // Pipe table: current line has | and next line is a separator row.
-    if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+    if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       SmallVector<StringRef> Rows;
-      while (I < E && Lines[I].trim().contains('|')) {
-        Rows.push_back(internString(Lines[I].trim(), Arena));
-        ++I;
-      }
+      while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+        Rows.push_back(internString(Reader.advance().trim(), Arena));
       auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
       LDBG() << "emitting TableNode rows=" << Rows.size();
       Nodes.push_back(Table);
@@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     // Unordered list item.
     if (isListItem(Line)) {
       SmallVector<ListItemNode *> Items;
-      while (I < E) {
-        StringRef L = Lines[I].trim();
+      while (!Reader.atEnd()) {
+        StringRef L = Reader.peek().trim();
         if (!isListItem(L))
           break;
         StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
@@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
         auto *Item =
             new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
         Items.push_back(Item);
-        ++I;
+        Reader.advance();
       }
       auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
       LDBG() << "emitting UnorderedListNode items=" << Items.size();
@@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
 
     // Plain text fallback.
     Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
-    ++I;
+    Reader.advance();
   }
 
   LDBG() << "parseMarkdown done nodes=" << Nodes.size();

>From 060bf63fe9f19fa45ef941f10594897351591d56 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Fri, 12 Jun 2026 02:44:27 -0400
Subject: [PATCH 6/9] [clang-doc] Parse inline emphasis, strong, and code in
 Markdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an inline pass over paragraph text that recognizes emphasis
(*text* or _text_), strong (**text** or __text__), and inline code
(`code`), emitting the EmphasisNode, StrongNode, and InlineCodeNode
types already in the hierarchy. Emphasis and strong recurse into their
content, and runs that match no construct stay plain TextNodes.

Delimiter matching uses a simplified subset of the CommonMark §6
flanking rules: a delimiter opens only with non-whitespace inside it and
closes only with non-whitespace before it, and code spans close on a
backtick run of equal length. The full delimiter-stack model is left as
a TODO. Adds 12 unit tests covering each construct plus the unmatched
and unterminated cases.

Co-Authored-By: Claude Opus 4.8 <[email protected]>
---
 .../clang-doc/support/Markdown.cpp            | 121 +++++++++++++++++-
 .../clang-doc/MarkdownParserTest.cpp          |  97 ++++++++++++++
 2 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f171457e73046..f1af4f5430772 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -8,6 +8,7 @@
 
 #include "Markdown.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
@@ -89,6 +90,121 @@ class LineReader {
   size_t Pos = 0;
 };
 
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t I = Start;
+  while (I < S.size() && S[I] == C)
+    ++I;
+  return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+  if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+      Code.find_first_not_of(' ') != StringRef::npos)
+    return Code.drop_front().drop_back();
+  return Code;
+}
+
+// Finds the start index of a closing emphasis run of exactly Count copies of C,
+// searching forward from From. Requires non-whitespace immediately inside both
+// the opening and closing delimiters and non-empty content, a simplified take
+// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
+// closing run exists.
+static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+  size_t E = S.size();
+  // Opening delimiter is not left-flanking if whitespace follows it.
+  if (From >= E || isSpace(S[From]))
+    return StringRef::npos;
+  for (size_t J = From; J + Count <= E; ++J) {
+    if (S[J] != C)
+      continue;
+    size_t Run = countRun(S, J, C);
+    if (Run != Count) {
+      J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
+      continue;
+    }
+    // Reject empty content and closing runs that are not right-flanking.
+    if (J == From || isSpace(S[J - 1]))
+      continue;
+    return J;
+  }
+  return StringRef::npos;
+}
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
+// _text_). Runs that match no construct become TextNodes. Emphasis and strong
+// recurse so their content may itself contain inline constructs. Text with no
+// markers yields a single TextNode.
+//
+// TODO: This covers the common cases but not the full CommonMark §6 inline
+// model (delimiter stacks, intraword underscore rules, links, autolinks).
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+  SmallVector<MDNode *> Nodes;
+  size_t TextStart = 0, I = 0, E = S.size();
+
+  auto flushText = [&](size_t End) {
+    if (End > TextStart)
+      Nodes.push_back(new (Arena) TextNode(
+          internString(S.substr(TextStart, End - TextStart), Arena)));
+  };
+
+  while (I < E) {
+    char C = S[I];
+
+    // Inline code span: a run of N backticks closed by a run of N backticks.
+    if (C == '`') {
+      size_t N = countRun(S, I, '`');
+      size_t J = I + N;
+      while (J < E && countRun(S, J, '`') != N)
+        J += S[J] == '`' ? countRun(S, J, '`') : 1;
+      if (J < E) {
+        flushText(I);
+        StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+        Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+        I = J + N;
+        TextStart = I;
+        continue;
+      }
+      // No closing run; leave the backticks as literal text.
+      I += N;
+      continue;
+    }
+
+    // Emphasis (*text*, _text_) and strong (**text**, __text__).
+    if (C == '*' || C == '_') {
+      // Strong binds the two-delimiter form before single-delimiter emphasis.
+      if (I + 1 < E && S[I + 1] == C) {
+        size_t Close = findClosingDelim(S, I + 2, C, 2);
+        if (Close != StringRef::npos) {
+          flushText(I);
+          StringRef Inner = S.substr(I + 2, Close - (I + 2));
+          Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+          I = Close + 2;
+          TextStart = I;
+          continue;
+        }
+      }
+      size_t Close = findClosingDelim(S, I + 1, C, 1);
+      if (Close != StringRef::npos) {
+        flushText(I);
+        StringRef Inner = S.substr(I + 1, Close - (I + 1));
+        Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+        I = Close + 1;
+        TextStart = I;
+        continue;
+      }
+    }
+
+    ++I;
+  }
+
+  flushText(E);
+  return allocateArray(Nodes, Arena);
+}
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -168,8 +284,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
-    // Plain text fallback.
-    Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
+    // Plain text, scanned for inline constructs (emphasis, strong, code).
+    for (MDNode *Inline : parseInline(Line, Arena))
+      Nodes.push_back(Inline);
     Reader.advance();
   }
 
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index b61094f034375..ea72dacfb08e5 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -204,4 +204,101 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
   ASSERT_EQ(N->Lines.size(), 1u);
 }
 
+TEST_F(MarkdownParserTest, EmphasisAsterisk) {
+  auto Nodes = parseMarkdown("an *important* word", Arena);
+  ASSERT_EQ(Nodes.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(Nodes[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+}
+
+TEST_F(MarkdownParserTest, EmphasisUnderscore) {
+  auto Nodes = parseMarkdown("_em_", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
+}
+
+TEST_F(MarkdownParserTest, StrongAsterisk) {
+  auto Nodes = parseMarkdown("**bold**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+TEST_F(MarkdownParserTest, StrongUnderscore) {
+  auto Nodes = parseMarkdown("__bold__", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+// Two delimiters must be parsed as strong, not as nested emphasis.
+TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
+  auto Nodes = parseMarkdown("**strong**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, InlineCode) {
+  auto Nodes = parseMarkdown("call `foo()` here", Arena);
+  ASSERT_EQ(Nodes.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
+  EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
+  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+}
+
+// CommonMark §6.1: a doubled backtick fence lets the span contain a single
+// backtick.
+TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
+  auto Nodes = parseMarkdown("``a`b``", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+}
+
+// Emphasis and strong recurse, so a code span inside emphasis is parsed.
+TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
+  auto Nodes = parseMarkdown("*see `x`*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  ASSERT_EQ(Em->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
+  EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
+}
+
+TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
+  auto Nodes = parseMarkdown("**a `b`**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
+  EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
+}
+
+// A delimiter with whitespace on the inside does not open emphasis.
+TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
+  auto Nodes = parseMarkdown("a * b", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+}
+
+// An unterminated code span leaves the backtick as literal text.
+TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
+  auto Nodes = parseMarkdown("a `b c", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+}
+
+// Inline parsing must not disturb plain text with no markers.
+TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
+  auto Nodes = parseMarkdown("just words", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+}
+
 } // namespace
\ No newline at end of file

>From 0af1c8e2999a20e2044cc337a8c4f0d8112d208b Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Fri, 12 Jun 2026 10:18:32 -0400
Subject: [PATCH 7/9] [clang-doc] Address review feedback: rename inline parser
 variables, simplify header docs

---
 .../clang-doc/support/Markdown.cpp            | 54 ++++++++++---------
 .../clang-doc/support/Markdown.h              | 25 +++------
 2 files changed, 34 insertions(+), 45 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f1af4f5430772..ef29daa76a166 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -143,7 +143,7 @@ static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
 // model (delimiter stacks, intraword underscore rules, links, autolinks).
 static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
   SmallVector<MDNode *> Nodes;
-  size_t TextStart = 0, I = 0, E = S.size();
+  size_t TextStart = 0, Pos = 0, E = S.size();
 
   auto flushText = [&](size_t End) {
     if (End > TextStart)
@@ -151,54 +151,56 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
           internString(S.substr(TextStart, End - TextStart), Arena)));
   };
 
-  while (I < E) {
-    char C = S[I];
+  while (Pos < E) {
+    char C = S[Pos];
 
-    // Inline code span: a run of N backticks closed by a run of N backticks.
+    // Inline code span: an opening backtick run closed by a run of the same
+    // length.
     if (C == '`') {
-      size_t N = countRun(S, I, '`');
-      size_t J = I + N;
-      while (J < E && countRun(S, J, '`') != N)
-        J += S[J] == '`' ? countRun(S, J, '`') : 1;
-      if (J < E) {
-        flushText(I);
-        StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+      size_t OpenLen = countRun(S, Pos, '`');
+      size_t ClosePos = Pos + OpenLen;
+      while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+        ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+      if (ClosePos < E) {
+        flushText(Pos);
+        StringRef Code =
+            trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
         Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
-        I = J + N;
-        TextStart = I;
+        Pos = ClosePos + OpenLen;
+        TextStart = Pos;
         continue;
       }
       // No closing run; leave the backticks as literal text.
-      I += N;
+      Pos += OpenLen;
       continue;
     }
 
     // Emphasis (*text*, _text_) and strong (**text**, __text__).
     if (C == '*' || C == '_') {
       // Strong binds the two-delimiter form before single-delimiter emphasis.
-      if (I + 1 < E && S[I + 1] == C) {
-        size_t Close = findClosingDelim(S, I + 2, C, 2);
+      if (Pos + 1 < E && S[Pos + 1] == C) {
+        size_t Close = findClosingDelim(S, Pos + 2, C, 2);
         if (Close != StringRef::npos) {
-          flushText(I);
-          StringRef Inner = S.substr(I + 2, Close - (I + 2));
+          flushText(Pos);
+          StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
           Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
-          I = Close + 2;
-          TextStart = I;
+          Pos = Close + 2;
+          TextStart = Pos;
           continue;
         }
       }
-      size_t Close = findClosingDelim(S, I + 1, C, 1);
+      size_t Close = findClosingDelim(S, Pos + 1, C, 1);
       if (Close != StringRef::npos) {
-        flushText(I);
-        StringRef Inner = S.substr(I + 1, Close - (I + 1));
+        flushText(Pos);
+        StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
         Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
-        I = Close + 1;
-        TextStart = I;
+        Pos = Close + 1;
+        TextStart = Pos;
         continue;
       }
     }
 
-    ++I;
+    ++Pos;
   }
 
   flushText(E);
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h 
b/clang-tools-extra/clang-doc/support/Markdown.h
index 3d457bcddfac6..60390465588c3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -9,20 +9,10 @@
 /// \file
 /// Standalone Markdown parsing library for the LLVM ecosystem.
 ///
-/// The parser takes plain paragraph text and returns a polymorphic tree of
-/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
-/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
-/// type carries exactly the fields it needs -- no overloaded Content field,
-/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
-/// downcasting; each concrete type provides classof() for this purpose.
-///
-/// See
-/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
-///
-/// Field ordering in each derived struct is chosen to minimize padding:
-/// 4-byte fields (like Level or Start) are declared before 16-byte fields
-/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
-/// base class's 4-byte Kind and the first derived field.
+/// The parser takes a single paragraph of plain text and returns a list of
+/// nodes describing the Markdown it found. Each kind of construct has its own
+/// node type, and every node shares a common MDNode base, so you can use
+/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
 ///
 /// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
 ///   TextNode       -- plain text run
@@ -165,9 +155,7 @@ struct ParagraphNode : MDNode {
   }
 };
 
-/// ATX heading: one to six leading # characters. Level is declared before
-/// Children to avoid padding between the base class's 4-byte Kind and the
-/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+/// ATX heading: one to six leading # characters.
 struct HeadingNode : MDNode {
   unsigned Level;                    // 1-6
   llvm::ArrayRef<MDNode *> Children; // inline content
@@ -226,8 +214,7 @@ struct UnorderedListNode : MDNode {
   }
 };
 
-/// Ordered (numbered) list. Start is the number on the first item. Start is
-/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+/// Ordered (numbered) list. Start is the number on the first item.
 struct OrderedListNode : MDNode {
   unsigned Start;
   llvm::ArrayRef<ListItemNode *> Items;

>From b76bfa182db40e7a358ffb7d42506aff24453e14 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Fri, 12 Jun 2026 11:50:32 -0400
Subject: [PATCH 8/9] [clang-doc] Add libFuzzer harness for parseMarkdown()

---
 clang-tools-extra/clang-doc/CMakeLists.txt    |  1 +
 .../clang-doc/fuzzer/CMakeLists.txt           | 21 +++++++++++++
 .../clang-doc/fuzzer/DummyMarkdownFuzzer.cpp  | 21 +++++++++++++
 .../clang-doc/fuzzer/FuzzMarkdown.cpp         | 30 +++++++++++++++++++
 4 files changed, 73 insertions(+)
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp

diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 22e2c8159e9f6..f64d1129ed4af 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(clangDoc
   )
 
 add_subdirectory(tool)
+add_subdirectory(fuzzer)
 
 if (LLVM_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)
diff --git a/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
new file mode 100644
index 0000000000000..5e6e943891052
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Resolve "support/Markdown.h" against the parent clang-doc directory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS
+  FuzzerCLI
+  Support
+  )
+
+# This fuzzer runs on oss-fuzz, so keep it around even if it looks unreferenced.
+# With a fuzzing engine configured (LLVM_USE_SANITIZE_COVERAGE or an external
+# LLVM_LIB_FUZZING_ENGINE) this builds a real fuzz target; otherwise DUMMY_MAIN
+# provides a main() so it still builds and can be replayed over saved inputs.
+add_llvm_fuzzer(clang-doc-markdown-fuzzer
+  FuzzMarkdown.cpp
+  DUMMY_MAIN DummyMarkdownFuzzer.cpp
+  )
+
+target_link_libraries(clang-doc-markdown-fuzzer
+  PRIVATE
+  clangDocSupport
+  )
diff --git a/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
new file mode 100644
index 0000000000000..61466e0fa4ef6
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
@@ -0,0 +1,21 @@
+//===-- DummyMarkdownFuzzer.cpp - Entry point to test the fuzzer ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of main so we can build and test the harness without linking
+// libFuzzer. Each command line argument is treated as a file to run the
+// harness on.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FuzzMutate/FuzzerCLI.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char *argv[]) {
+  return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput);
+}
diff --git a/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
new file mode 100644
index 0000000000000..e407b3baccf2e
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
@@ -0,0 +1,30 @@
+//===-- FuzzMarkdown.cpp - Fuzzer for the clang-doc Markdown parser -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a libFuzzer harness for parseMarkdown(). It feeds
+/// arbitrary bytes to the parser and checks that it never crashes. The parsed
+/// nodes are walked so the returned tree is exercised, not just allocated.
+///
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstddef>
+#include <cstdint>
+
+using namespace clang::doc::markdown;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  llvm::BumpPtrAllocator Arena;
+  llvm::StringRef Input(reinterpret_cast<const char *>(Data), Size);
+  for (const MDNode *Node : parseMarkdown(Input, Arena))
+    (void)Node->Kind;
+  return 0;
+}

>From 77e28993d7a167410fd1a1ee97d2824945b44063 Mon Sep 17 00:00:00 2001
From: Neil-N4 <[email protected]>
Date: Fri, 12 Jun 2026 18:47:47 -0400
Subject: [PATCH 9/9] [clang-doc] Address review feedback: rename
 findClosingDelim params, add table TODO, fix EOF newline

---
 .../clang-doc/support/Markdown.cpp            | 28 +++++++++++--------
 .../clang-doc/MarkdownParserTest.cpp          |  2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index ef29daa76a166..6a57cd7900ea2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -107,26 +107,27 @@ static StringRef trimCodeSpan(StringRef Code) {
   return Code;
 }
 
-// Finds the start index of a closing emphasis run of exactly Count copies of C,
-// searching forward from From. Requires non-whitespace immediately inside both
-// the opening and closing delimiters and non-empty content, a simplified take
-// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
-// closing run exists.
-static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+// Finds the start index of a closing emphasis run of exactly DelimLen copies of
+// DelimChar, searching forward from StartPos. Requires non-whitespace
+// immediately inside both the opening and closing delimiters and non-empty
+// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
+// StringRef::npos if no valid closing run exists.
+static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
+                               size_t DelimLen) {
   size_t E = S.size();
   // Opening delimiter is not left-flanking if whitespace follows it.
-  if (From >= E || isSpace(S[From]))
+  if (StartPos >= E || isSpace(S[StartPos]))
     return StringRef::npos;
-  for (size_t J = From; J + Count <= E; ++J) {
-    if (S[J] != C)
+  for (size_t J = StartPos; J + DelimLen <= E; ++J) {
+    if (S[J] != DelimChar)
       continue;
-    size_t Run = countRun(S, J, C);
-    if (Run != Count) {
+    size_t Run = countRun(S, J, DelimChar);
+    if (Run != DelimLen) {
       J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
       continue;
     }
     // Reject empty content and closing runs that are not right-flanking.
-    if (J == From || isSpace(S[J - 1]))
+    if (J == StartPos || isSpace(S[J - 1]))
       continue;
     return J;
   }
@@ -257,6 +258,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       SmallVector<StringRef> Rows;
+      // TODO: Rows are kept as raw line text for now. Table cells may contain
+      // inline content (emphasis, code spans, links), so each row may need to
+      // be split on '|' and parsed further into structured cells.
       while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
         Rows.push_back(internString(Reader.advance().trim(), Arena));
       auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ea72dacfb08e5..28bb9d567e6bc 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -301,4 +301,4 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
   EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
 }
 
-} // namespace
\ No newline at end of file
+} // namespace

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Reply via email to