https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 09:51:48 -0400 Subject: [PATCH 01/13] [clang-doc] Add standalone Markdown parsing library --- .../clang-doc/support/CMakeLists.txt | 3 +- .../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++ .../clang-doc/support/Markdown.h | 72 +++++++++ .../unittests/clang-doc/CMakeLists.txt | 4 +- .../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++ 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt index 8ac913ffbe998..acff865190ff9 100644 --- a/clang-tools-extra/clang-doc/support/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt @@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangDocSupport STATIC File.cpp + Markdown.cpp Utils.cpp - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp new file mode 100644 index 0000000000000..776150b939d27 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/DebugLog.h" + +#define DEBUG_TYPE "clang-doc-markdown" + +using namespace llvm; + +namespace clang::doc::markdown { + +static MDNode makeText(StringRef S) { + return {NodeKind::NK_Text, S, {}}; +} + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(StringRef Line) { + return Line.contains('-') && + Line.find_first_not_of("|-: ") == StringRef::npos; +} + +// Returns true if Line begins with a bullet list marker (-, *, or +) +// followed by a space. +static bool isListItem(StringRef Line) { + return Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ "); +} + +static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes, + BumpPtrAllocator &Arena) { + if (Nodes.empty()) + return {}; + MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); + std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); + return ArrayRef<MDNode>(Allocated, Nodes.size()); +} + +ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, + BumpPtrAllocator &Arena) { + if (ParagraphText.trim().empty()) + return {}; + + SmallVector<StringRef, 16> Lines; + ParagraphText.split(Lines, '\n'); + + SmallVector<MDNode> Nodes; + size_t I = 0, E = Lines.size(); + + while (I < E) { + StringRef Line = Lines[I].trim(); + + if (Line.empty()) { + ++I; + continue; + } + + // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be + // indented up to 3 spaces, the closing fence must use the same character + // and be at least as long as the opening fence, and the closing fence may + // only be followed by spaces. Doxygen specifics should be handled on a + // case-by-case basis. 
+ if (Line.starts_with("```") || Line.starts_with("~~~")) { + char Fence = Line[0]; + StringRef Lang = Line.drop_front(3).trim(); + SmallVector<MDNode> CodeLines; + ++I; + while (I < E) { + StringRef CodeLine = Lines[I].trim(); + if (CodeLine.size() >= 3 && + all_of(CodeLine.take_front(3), + [Fence](char C) { return C == Fence; })) + break; + CodeLines.push_back(makeText(Lines[I])); + ++I; + } + ++I; // skip closing fence + MDNode Code; + Code.Kind = NodeKind::NK_FencedCode; + Code.Content = Lang; + Code.Children = allocateNodes(CodeLines, Arena); + LDBG() << "emitting NK_FencedCode lang='" << Lang + << "' lines=" << CodeLines.size(); + Nodes.push_back(Code); + continue; + } + + // Pipe table: current line has | and next line is a separator row. + if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { + SmallVector<MDNode> Rows; + while (I < E && Lines[I].trim().contains('|')) { + Rows.push_back(makeText(Lines[I].trim())); + ++I; + } + MDNode Table; + Table.Kind = NodeKind::NK_Table; + Table.Content = {}; + Table.Children = allocateNodes(Rows, Arena); + LDBG() << "emitting NK_Table rows=" << Rows.size(); + Nodes.push_back(Table); + continue; + } + + // Unordered list item. + if (isListItem(Line)) { + SmallVector<MDNode> Items; + while (I < E) { + StringRef L = Lines[I].trim(); + if (!isListItem(L)) + break; + MDNode Item; + Item.Kind = NodeKind::NK_ListItem; + Item.Content = L.drop_front(2).trim(); + Item.Children = {}; + Items.push_back(Item); + ++I; + } + MDNode List; + List.Kind = NodeKind::NK_UnorderedList; + List.Content = {}; + List.Children = allocateNodes(Items, Arena); + LDBG() << "emitting NK_UnorderedList items=" << Items.size(); + Nodes.push_back(List); + continue; + } + + // Plain text fallback. 
+ Nodes.push_back(makeText(Line)); + ++I; + } + + LDBG() << "parseMarkdown done nodes=" << Nodes.size(); + return allocateNodes(Nodes, Arena); +} + +} // namespace clang::doc::markdown \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h new file mode 100644 index 0000000000000..890f764f937b1 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines a standalone Markdown parsing library for the LLVM +/// ecosystem. The parser takes plain text and returns a tree of typed nodes +/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// +/// This is a simple Markdown parser for use inside Clang-Doc's comment +/// pipeline. You give it a paragraph of text and an arena allocator, and it +/// gives back a list of typed nodes describing the Markdown structure it found. +/// +/// The main entry point is parseMarkdown(). If the text has no Markdown in it, +/// you get back an empty list and can fall back to plain-text output. If it +/// does, you get a tree of MDNode structs where each node has a kind, optional +/// content (like the language tag on a code fence), and optional children. +/// +/// All nodes are allocated in the arena you pass in. You own the arena and are +/// responsible for keeping it alive as long as you use the nodes. +/// +/// The parser handles fenced code blocks, pipe tables, and unordered lists. +/// Anything it does not recognize comes back as a plain text node. It will +/// never crash on bad input. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang::doc::markdown { + +enum class NodeKind { + // Block nodes + NK_Paragraph, + NK_FencedCode, + NK_Table, + NK_UnorderedList, + NK_OrderedList, + NK_ListItem, + NK_ThematicBreak, + // Inline nodes + NK_Text, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_SoftBreak, +}; + +struct MDNode { + NodeKind Kind; + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::ArrayRef<MDNode> Children; // arena allocated +}; + +/// Parses Markdown from a single comment paragraph's text. +/// Returns an empty ArrayRef if no Markdown constructs are found, +/// so generators can fall back to plain-text rendering at zero cost. +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); + +} // namespace clang::doc::markdown + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt index 01b34ec9a791e..b74207ac88fa7 100644 --- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt @@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests ClangDocTest.cpp GeneratorTest.cpp HTMLGeneratorTest.cpp + MarkdownParserTest.cpp MDGeneratorTest.cpp MergeTest.cpp SerializeTest.cpp @@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests target_link_libraries(ClangDocTests PRIVATE clangDoc + clangDocSupport LLVMTestingSupport - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp new file mode 100644 
index 0000000000000..8df5efc7f1d5f --- /dev/null +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "support/Markdown.h" +#include "llvm/Support/Allocator.h" +#include "gtest/gtest.h" + +using namespace clang::doc::markdown; + +namespace { + +TEST(MarkdownParserTest, EmptyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, WhitespaceOnlyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown(" \n \n", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("hello world", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); + EXPECT_EQ(Nodes[0].Content, "hello world"); +} + +TEST(MarkdownParserTest, FencedCodeBlock) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(Nodes[0].Content, "cpp"); + ASSERT_EQ(Nodes[0].Children.size(), 1u); +} + +TEST(MarkdownParserTest, FencedCodeBlockNoLang) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```\nsome code\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(Nodes[0].Content.empty()); +} + +TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); + // Unterminated fence should not 
crash and should produce a code node + // with whatever lines were found. + EXPECT_FALSE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PipeTable) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("a | b\nc | d", Arena); + // No separator row so should not be parsed as a table + for (const auto &Node : Nodes) + EXPECT_NE(Node.Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, UnorderedList) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(Nodes[0].Children.size(), 3u); + EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); + EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); + EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); +} + +TEST(MarkdownParserTest, MixedContent) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); + EXPECT_EQ(Nodes.size(), 3u); +} + +} // namespace \ No newline at end of file >From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 11:35:54 -0400 Subject: [PATCH 02/13] [clang-doc] Address review feedback: test fixture, raw strings, DEBUG_TYPE, EOF newlines --- .../clang-doc/support/Markdown.cpp | 4 +- .../clang-doc/support/Markdown.h | 2 +- .../clang-doc/MarkdownParserTest.cpp | 97 +++++++++++-------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 776150b939d27..9e008abf8b08d 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ 
-12,7 +12,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" -#define DEBUG_TYPE "clang-doc-markdown" +#define DEBUG_TYPE "clang-doc" using namespace llvm; @@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, return allocateNodes(Nodes, Arena); } -} // namespace clang::doc::markdown \ No newline at end of file +} // namespace clang::doc::markdown diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 890f764f937b1..09b79cc8f2437 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, } // namespace clang::doc::markdown -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 8df5efc7f1d5f..ff9bad88da136 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -14,80 +14,99 @@ using namespace clang::doc::markdown; namespace { -TEST(MarkdownParserTest, EmptyInput) { +struct MarkdownParserTest : public ::testing::Test { llvm::BumpPtrAllocator Arena; +}; + +TEST_F(MarkdownParserTest, EmptyInput) { auto Nodes = parseMarkdown("", Arena); EXPECT_TRUE(Nodes.empty()); } -TEST(MarkdownParserTest, WhitespaceOnlyInput) { - llvm::BumpPtrAllocator Arena; +TEST_F(MarkdownParserTest, WhitespaceOnlyInput) { auto Nodes = parseMarkdown(" \n \n", Arena); EXPECT_TRUE(Nodes.empty()); } -TEST(MarkdownParserTest, PlainText) { - llvm::BumpPtrAllocator Arena; +TEST_F(MarkdownParserTest, PlainText) { auto Nodes = parseMarkdown("hello world", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); - 
EXPECT_EQ(Nodes[0].Content, "hello world"); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_Text); + EXPECT_EQ(N.Content, "hello world"); } -TEST(MarkdownParserTest, FencedCodeBlock) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); +TEST_F(MarkdownParserTest, FencedCodeBlock) { + auto Nodes = parseMarkdown(R"(```cpp +int x = 0; +````)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(Nodes[0].Content, "cpp"); - ASSERT_EQ(Nodes[0].Children.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "cpp"); + ASSERT_EQ(N.Children.size(), 1u); } -TEST(MarkdownParserTest, FencedCodeBlockNoLang) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```\nsome code\n```", Arena); +TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { + auto Nodes = parseMarkdown(R"(``` +some code +```)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(Nodes[0].Content.empty()); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Content.empty()); } -TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); +TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + auto Nodes = parseMarkdown(R"(```cpp +int x = 0;)", + Arena); // Unterminated fence should not crash and should produce a code node // with whatever lines were found. 
EXPECT_FALSE(Nodes.empty()); } -TEST(MarkdownParserTest, PipeTable) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); +TEST_F(MarkdownParserTest, PipeTable) { + auto Nodes = parseMarkdown(R"(| A | B | +|---|---| +| 1 | 2 |)", + Arena); ASSERT_EQ(Nodes.size(), 1u); EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); } -TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("a | b\nc | d", Arena); - // No separator row so should not be parsed as a table +TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + auto Nodes = parseMarkdown(R"(a | b +c | d)", + Arena); + // No separator row so should not be parsed as a table. for (const auto &Node : Nodes) EXPECT_NE(Node.Kind, NodeKind::NK_Table); } -TEST(MarkdownParserTest, UnorderedList) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); +TEST_F(MarkdownParserTest, UnorderedList) { + auto Nodes = parseMarkdown(R"(- foo +- bar +- baz)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); - ASSERT_EQ(Nodes[0].Children.size(), 3u); - EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); - EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); - EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(N.Children.size(), 3u); + EXPECT_EQ(N.Children[0].Content, "foo"); + EXPECT_EQ(N.Children[1].Content, "bar"); + EXPECT_EQ(N.Children[2].Content, "baz"); } -TEST(MarkdownParserTest, MixedContent) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); +TEST_F(MarkdownParserTest, MixedContent) { + auto Nodes = parseMarkdown(R"(some text +``` +code +```` +- item)", + Arena); EXPECT_EQ(Nodes.size(), 3u); } >From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001 From: Neil-N4 
<[email protected]> Date: Wed, 10 Jun 2026 13:45:44 -0400 Subject: [PATCH 03/13] [clang-doc] Add CommonMark spec tests for fenced code blocks --- .../clang-doc/MarkdownParserTest.cpp | 112 +++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index ff9bad88da136..4ca979c1f1d24 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) { TEST_F(MarkdownParserTest, FencedCodeBlock) { auto Nodes = parseMarkdown(R"(```cpp int x = 0; -````)", +````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); const auto &N = Nodes[0]; @@ -51,7 +51,7 @@ int x = 0; TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { auto Nodes = parseMarkdown(R"(``` some code -```)", +```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); const auto &N = Nodes[0]; @@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) { TEST_F(MarkdownParserTest, MixedContent) { auto Nodes = parseMarkdown(R"(some text -``` +``````` code -```` +```````` - item)", Arena); EXPECT_EQ(Nodes.size(), 3u); } +// CommonMark §4.5 example 120: tilde fences work the same as backtick fences. +TEST_F(MarkdownParserTest, TildeFence) { + auto Nodes = parseMarkdown(R"(~~~ +int x = 0; +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Content.empty()); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 120: tilde fence with a language tag. 
+TEST_F(MarkdownParserTest, TildeFenceWithLang) { + auto Nodes = parseMarkdown(R"(~~~cpp +int x = 0; +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "cpp"); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 122: a tilde line does not close a backtick fence. +TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) { + auto Nodes = parseMarkdown(R"(``` +aaa +~~~ +````````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + // ~~~ is content, not a closing fence. + ASSERT_EQ(N.Children.size(), 2u); +} + +// CommonMark §4.5 example 130: a code block can be empty. +TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) { + auto Nodes = parseMarkdown(R"(``` +```````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Children.empty()); +} + +// CommonMark §4.5 example 129: a code block may contain only blank lines. +TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) { + auto Nodes = parseMarkdown("```\n\n \n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + ASSERT_EQ(N.Children.size(), 2u); +} + +// CommonMark §4.5 example 142: lang tag is captured from the info string. +TEST_F(MarkdownParserTest, InfoStringLangTag) { + auto Nodes = parseMarkdown(R"(```ruby +def foo(x) + return 3 +end +``````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "ruby"); + ASSERT_EQ(N.Children.size(), 3u); +} + +// CommonMark §4.5 example 146: tilde fence info string may contain backticks. 
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) { + auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~ +foo +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "aa ``` ~~~"); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 124: closing fence must be at least as long as the +// opening fence. +// TODO: our parser currently closes on the first line with 3 matching fence +// chars regardless of opening fence length. Fix as part of the CommonMark +// TODO in parseMarkdown(). +TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { + auto Nodes = parseMarkdown("````\naaa\n```", Arena); + // The ``` line should not close the ```` fence per CommonMark, but our + // parser currently treats it as a closing fence. This test documents the + // current (non-conformant) behavior. + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + ASSERT_EQ(N.Children.size(), 1u); +} + } // namespace \ No newline at end of file >From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 13:59:52 -0400 Subject: [PATCH 04/13] [clang-doc] Replace flat MDNode with typed node hierarchy using LLVM RTTI --- .../clang-doc/support/Markdown.cpp | 84 +++--- .../clang-doc/support/Markdown.h | 264 ++++++++++++++++-- .../clang-doc/MarkdownParserTest.cpp | 84 +++--- 3 files changed, 312 insertions(+), 120 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 9e008abf8b08d..bee15c3e23ec3 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -18,8 +18,24 @@ using namespace llvm; namespace clang::doc::markdown { -static MDNode makeText(StringRef S) { - return {NodeKind::NK_Text, S, {}}; +// Allocates a contiguous array of T in the 
arena and returns an ArrayRef. +template <typename T> +static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec, + BumpPtrAllocator &Arena) { + if (Vec.empty()) + return {}; + T *Allocated = Arena.Allocate<T>(Vec.size()); + std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated); + return ArrayRef<T>(Allocated, Vec.size()); +} + +// Interns a StringRef into the arena so it outlives the parse loop. +static StringRef internString(StringRef S, BumpPtrAllocator &Arena) { + if (S.empty()) + return {}; + char *Buf = Arena.Allocate<char>(S.size()); + std::copy(S.begin(), S.end(), Buf); + return StringRef(Buf, S.size()); } // A line is a table separator if it only contains |, -, :, and spaces, @@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) { Line.starts_with("+ "); } -static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes, - BumpPtrAllocator &Arena) { - if (Nodes.empty()) - return {}; - MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); - std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); - return ArrayRef<MDNode>(Allocated, Nodes.size()); -} - -ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, - BumpPtrAllocator &Arena) { +ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, + BumpPtrAllocator &Arena) { if (ParagraphText.trim().empty()) return {}; SmallVector<StringRef, 16> Lines; ParagraphText.split(Lines, '\n'); - SmallVector<MDNode> Nodes; + SmallVector<MDNode *> Nodes; size_t I = 0, E = Lines.size(); while (I < E) { @@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, // case-by-case basis. 
if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; - StringRef Lang = Line.drop_front(3).trim(); - SmallVector<MDNode> CodeLines; + StringRef Lang = internString(Line.drop_front(3).trim(), Arena); + SmallVector<StringRef> CodeLines; ++I; while (I < E) { StringRef CodeLine = Lines[I].trim(); @@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; })) break; - CodeLines.push_back(makeText(Lines[I])); + CodeLines.push_back(internString(Lines[I], Arena)); ++I; } ++I; // skip closing fence - MDNode Code; - Code.Kind = NodeKind::NK_FencedCode; - Code.Content = Lang; - Code.Children = allocateNodes(CodeLines, Arena); - LDBG() << "emitting NK_FencedCode lang='" << Lang + auto *Code = + new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena)); + LDBG() << "emitting FencedCodeNode lang='" << Lang << "' lines=" << CodeLines.size(); Nodes.push_back(Code); continue; @@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, // Pipe table: current line has | and next line is a separator row. if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { - SmallVector<MDNode> Rows; + SmallVector<StringRef> Rows; while (I < E && Lines[I].trim().contains('|')) { - Rows.push_back(makeText(Lines[I].trim())); + Rows.push_back(internString(Lines[I].trim(), Arena)); ++I; } - MDNode Table; - Table.Kind = NodeKind::NK_Table; - Table.Content = {}; - Table.Children = allocateNodes(Rows, Arena); - LDBG() << "emitting NK_Table rows=" << Rows.size(); + auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); + LDBG() << "emitting TableNode rows=" << Rows.size(); Nodes.push_back(Table); continue; } // Unordered list item. 
if (isListItem(Line)) { - SmallVector<MDNode> Items; + SmallVector<ListItemNode *> Items; while (I < E) { StringRef L = Lines[I].trim(); if (!isListItem(L)) break; - MDNode Item; - Item.Kind = NodeKind::NK_ListItem; - Item.Content = L.drop_front(2).trim(); - Item.Children = {}; + StringRef ItemText = internString(L.drop_front(2).trim(), Arena); + SmallVector<MDNode *> ItemChildren; + ItemChildren.push_back(new (Arena) TextNode(ItemText)); + auto *Item = + new (Arena) ListItemNode(allocateArray(ItemChildren, Arena)); Items.push_back(Item); ++I; } - MDNode List; - List.Kind = NodeKind::NK_UnorderedList; - List.Content = {}; - List.Children = allocateNodes(Items, Arena); - LDBG() << "emitting NK_UnorderedList items=" << Items.size(); + auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena)); + LDBG() << "emitting UnorderedListNode items=" << Items.size(); Nodes.push_back(List); continue; } // Plain text fallback. - Nodes.push_back(makeText(Line)); + Nodes.push_back(new (Arena) TextNode(internString(Line, Arena))); ++I; } LDBG() << "parseMarkdown done nodes=" << Nodes.size(); - return allocateNodes(Nodes, Arena); + return allocateArray(Nodes, Arena); } -} // namespace clang::doc::markdown +} // namespace clang::doc::markdown \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 09b79cc8f2437..3d457bcddfac6 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -7,30 +7,50 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines a standalone Markdown parsing library for the LLVM -/// ecosystem. The parser takes plain text and returns a tree of typed nodes -/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// Standalone Markdown parsing library for the LLVM ecosystem. 
/// -/// This is a simple Markdown parser for use inside Clang-Doc's comment -/// pipeline. You give it a paragraph of text and an arena allocator, and it -/// gives back a list of typed nodes describing the Markdown structure it found. +/// The parser takes plain paragraph text and returns a polymorphic tree of +/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator. +/// Node types form a closed class hierarchy rooted at MDNode. Each concrete +/// type carries exactly the fields it needs -- no overloaded Content field, +/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe +/// downcasting; each concrete type provides classof() for this purpose. /// -/// The main entry point is parseMarkdown(). If the text has no Markdown in it, -/// you get back an empty list and can fall back to plain-text output. If it -/// does, you get a tree of MDNode structs where each node has a kind, optional -/// content (like the language tag on a code fence), and optional children. +/// See +/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates /// -/// All nodes are allocated in the arena you pass in. You own the arena and are -/// responsible for keeping it alive as long as you use the nodes. +/// Field ordering in each derived struct is chosen to minimize padding: +/// 4-byte fields (like Level or Start) are declared before 16-byte fields +/// (ArrayRef, StringRef) so that no implicit padding is inserted between the +/// base class's 4-byte Kind and the first derived field. /// -/// The parser handles fenced code blocks, pipe tables, and unordered lists. -/// Anything it does not recognize comes back as a plain text node. It will -/// never crash on bad input. 
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.): +/// TextNode -- plain text run +/// SoftBreakNode -- soft line break +/// HardBreakNode -- hard line break (trailing spaces or backslash) +/// InlineCodeNode -- inline code span (`code`) +/// EmphasisNode -- emphasis (*text* or _text_) +/// StrongNode -- strong emphasis (**text** or __text__) +/// +/// Block nodes: +/// ParagraphNode -- sequence of inline nodes +/// HeadingNode -- ATX heading (# through ######), level 1-6 +/// FencedCodeNode -- fenced code block (``` or ~~~) +/// TableNode -- pipe table (raw row text; TODO: structured cells) +/// UnorderedListNode -- bullet list (-, *, +) +/// OrderedListNode -- numbered list with explicit start number +/// ListItemNode -- single item inside a list +/// BlockQuoteNode -- block quote (>) +/// ThematicBreakNode -- horizontal rule (---, ***, ___) +/// +/// All nodes are arena-allocated. The caller owns the arena and must keep it +/// alive for the lifetime of any returned nodes. The parser never crashes on +/// malformed input; unrecognized text falls back to TextNode. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -38,35 +58,217 @@ namespace clang::doc::markdown { +/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before +/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable +/// cheap range-based checks in classof() implementations. 
enum class NodeKind { + // Inline nodes + NK_Text, + NK_SoftBreak, + NK_HardBreak, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this + // Block nodes NK_Paragraph, + NK_Heading, NK_FencedCode, NK_Table, NK_UnorderedList, NK_OrderedList, NK_ListItem, + NK_BlockQuote, NK_ThematicBreak, - // Inline nodes - NK_Text, - NK_InlineCode, - NK_Emphasis, - NK_Strong, - NK_SoftBreak, + NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this }; +/// Base type for all Markdown AST nodes. Carries only the kind discriminator. +/// Nodes are arena-allocated and have no virtual destructor; use +/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting. struct MDNode { NodeKind Kind; - llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text - llvm::ArrayRef<MDNode> Children; // arena allocated + explicit MDNode(NodeKind K) : Kind(K) {} +}; + +//===----------------------------------------------------------------------===// +// Inline nodes +//===----------------------------------------------------------------------===// + +/// Plain text run. +struct TextNode : MDNode { + llvm::StringRef Text; + explicit TextNode(llvm::StringRef Text) + : MDNode(NodeKind::NK_Text), Text(Text) {} + static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; } +}; + +/// Soft line break -- a newline that does not end the paragraph. +struct SoftBreakNode : MDNode { + SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_SoftBreak; + } +}; + +/// Hard line break -- two trailing spaces or a backslash before a newline. +struct HardBreakNode : MDNode { + HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_HardBreak; + } +}; + +/// Inline code span: `code`. Code does not include the surrounding backticks. 
+struct InlineCodeNode : MDNode { + llvm::StringRef Code; + explicit InlineCodeNode(llvm::StringRef Code) + : MDNode(NodeKind::NK_InlineCode), Code(Code) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_InlineCode; + } +}; + +/// Emphasized text: *text* or _text_. +struct EmphasisNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Emphasis), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Emphasis; + } }; -/// Parses Markdown from a single comment paragraph's text. -/// Returns an empty ArrayRef if no Markdown constructs are found, -/// so generators can fall back to plain-text rendering at zero cost. -llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, - llvm::BumpPtrAllocator &Arena); +/// Strongly emphasized text: **text** or __text__. +struct StrongNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit StrongNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Strong), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Strong; + } +}; + +//===----------------------------------------------------------------------===// +// Block nodes +//===----------------------------------------------------------------------===// + +/// A paragraph -- sequence of inline nodes separated from other blocks by +/// blank lines. +struct ParagraphNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Paragraph), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Paragraph; + } +}; + +/// ATX heading: one to six leading # characters. Level is declared before +/// Children to avoid padding between the base class's 4-byte Kind and the +/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes. 
+struct HeadingNode : MDNode { + unsigned Level; // 1-6 + llvm::ArrayRef<MDNode *> Children; // inline content + HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Heading; + } +}; + +/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g. +/// "cpp"); empty when no language was specified. Lines contains the raw text +/// of each interior line, without the opening or closing fence. +/// +/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up +/// to 3 spaces; the closing fence must use the same character and be at least +/// as long as the opening fence; only spaces may follow the closing fence. +struct FencedCodeNode : MDNode { + llvm::StringRef Lang; + llvm::ArrayRef<llvm::StringRef> Lines; + FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines) + : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_FencedCode; + } +}; + +/// Pipe table. Rows contains the raw text of each row line including the +/// header and separator rows. +/// TODO: replace with a structured header/body/cell representation. +struct TableNode : MDNode { + llvm::ArrayRef<llvm::StringRef> Rows; + explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows) + : MDNode(NodeKind::NK_Table), Rows(Rows) {} + static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; } +}; + +/// A single list item. Children may contain block-level nodes for loose +/// lists, or a single inline sequence for tight lists. 
+struct ListItemNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit ListItemNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_ListItem), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_ListItem; + } +}; + +/// Unordered (bullet) list. Markers are -, *, or +. +struct UnorderedListNode : MDNode { + llvm::ArrayRef<ListItemNode *> Items; + explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items) + : MDNode(NodeKind::NK_UnorderedList), Items(Items) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_UnorderedList; + } +}; + +/// Ordered (numbered) list. Start is the number on the first item. Start is +/// declared before Items to avoid padding, keeping sizeof at 24 bytes. +struct OrderedListNode : MDNode { + unsigned Start; + llvm::ArrayRef<ListItemNode *> Items; + OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items) + : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_OrderedList; + } +}; + +/// Block quote (> ...). Children are block-level nodes inside the quote. +struct BlockQuoteNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_BlockQuote), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_BlockQuote; + } +}; + +/// Thematic break: a line of three or more ---, ***, or ___ characters. +struct ThematicBreakNode : MDNode { + ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_ThematicBreak; + } +}; + +//===----------------------------------------------------------------------===// +// Parser entry point +//===----------------------------------------------------------------------===// + +/// Parse Markdown from a single paragraph of plain text. 
Returns a list of +/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no +/// Markdown constructs are found, letting callers fall back to plain-text +/// rendering at zero cost. The parser never crashes on malformed input. +/// +/// The caller must keep Arena alive for the lifetime of any returned nodes. +llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); } // namespace clang::doc::markdown -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 4ca979c1f1d24..b61094f034375 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -8,9 +8,11 @@ #include "support/Markdown.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" #include "gtest/gtest.h" using namespace clang::doc::markdown; +using namespace llvm; namespace { @@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) { TEST_F(MarkdownParserTest, PlainText) { auto Nodes = parseMarkdown("hello world", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_Text); - EXPECT_EQ(N.Content, "hello world"); + auto *N = cast<TextNode>(Nodes[0]); + EXPECT_EQ(N->Text, "hello world"); } TEST_F(MarkdownParserTest, FencedCodeBlock) { @@ -42,10 +43,9 @@ int x = 0; ````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "cpp"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "cpp"); + ASSERT_EQ(N->Lines.size(), 1u); } TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { @@ -54,9 +54,8 @@ some code ```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const 
auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Content.empty()); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lang.empty()); } TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { @@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) { | 1 | 2 |)", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); + EXPECT_TRUE(isa<TableNode>(Nodes[0])); } TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { @@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { c | d)", Arena); // No separator row so should not be parsed as a table. - for (const auto &Node : Nodes) - EXPECT_NE(Node.Kind, NodeKind::NK_Table); + for (const auto *Node : Nodes) + EXPECT_FALSE(isa<TableNode>(Node)); } TEST_F(MarkdownParserTest, UnorderedList) { @@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) { - baz)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList); - ASSERT_EQ(N.Children.size(), 3u); - EXPECT_EQ(N.Children[0].Content, "foo"); - EXPECT_EQ(N.Children[1].Content, "bar"); - EXPECT_EQ(N.Children[2].Content, "baz"); + auto *N = cast<UnorderedListNode>(Nodes[0]); + ASSERT_EQ(N->Items.size(), 3u); + EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo"); + EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar"); + EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz"); } TEST_F(MarkdownParserTest, MixedContent) { @@ -117,10 +115,9 @@ int x = 0; ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Content.empty()); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lang.empty()); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 120: tilde fence with a language tag. 
@@ -130,10 +127,9 @@ int x = 0; ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "cpp"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "cpp"); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 122: a tilde line does not close a backtick fence. @@ -144,10 +140,9 @@ aaa ````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + auto *N = cast<FencedCodeNode>(Nodes[0]); // ~~~ is content, not a closing fence. - ASSERT_EQ(N.Children.size(), 2u); + ASSERT_EQ(N->Lines.size(), 2u); } // CommonMark §4.5 example 130: a code block can be empty. @@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) { ```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Children.empty()); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lines.empty()); } // CommonMark §4.5 example 129: a code block may contain only blank lines. TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) { auto Nodes = parseMarkdown("```\n\n \n```", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - ASSERT_EQ(N.Children.size(), 2u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + ASSERT_EQ(N->Lines.size(), 2u); } // CommonMark §4.5 example 142: lang tag is captured from the info string. @@ -179,10 +172,9 @@ end ``````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "ruby"); - ASSERT_EQ(N.Children.size(), 3u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "ruby"); + ASSERT_EQ(N->Lines.size(), 3u); } // CommonMark §4.5 example 146: tilde fence info string may contain backticks. 
@@ -192,10 +184,9 @@ foo ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "aa ``` ~~~"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "aa ``` ~~~"); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 124: closing fence must be at least as long as the @@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { // parser currently treats it as a closing fence. This test documents the // current (non-conformant) behavior. ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + ASSERT_EQ(N->Lines.size(), 1u); } } // namespace \ No newline at end of file >From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 02:33:44 -0400 Subject: [PATCH 05/13] [clang-doc] Introduce LineReader cursor for the Markdown parse loop Replace the raw size_t I = 0, E = Lines.size() index arithmetic in parseMarkdown() with a LineReader cursor that encapsulates the position and exposes peek(), peek(Offset), advance(), and atEnd(). The parse logic and emitted nodes are unchanged; this only removes manual index bookkeeping. All 18 MarkdownParserTest cases still pass. 
Co-Authored-By: Claude Opus 4.8 <[email protected]> --- .../clang-doc/support/Markdown.cpp | 73 ++++++++++++++----- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index bee15c3e23ec3..f171457e73046 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" +#include <cassert> #define DEBUG_TYPE "clang-doc" @@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) { Line.starts_with("+ "); } +// A forward cursor over the lines of a paragraph. Encapsulates the parse +// position so the loop can inspect the current or an upcoming line and consume +// lines without manual index arithmetic. Lines are stored untrimmed; callers +// trim where they need a normalized view. +class LineReader { +public: + explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {} + + // True once every line has been consumed. + bool atEnd() const { return Pos >= Lines.size(); } + + // The current line, untrimmed. Must not be called when atEnd(). + StringRef peek() const { + assert(!atEnd() && "peek past end of input"); + return Lines[Pos]; + } + + // The line Offset positions ahead of the cursor, or an empty StringRef when + // that position is past the end. peek(0) is the current line. + StringRef peek(size_t Offset) const { + size_t Target = Pos + Offset; + return Target < Lines.size() ? Lines[Target] : StringRef(); + } + + // Consume the current line and return it, untrimmed. Must not be called when + // atEnd(). 
+ StringRef advance() { + assert(!atEnd() && "advance past end of input"); + return Lines[Pos++]; + } + +private: + ArrayRef<StringRef> Lines; + size_t Pos = 0; +}; + ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, BumpPtrAllocator &Arena) { if (ParagraphText.trim().empty()) @@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, ParagraphText.split(Lines, '\n'); SmallVector<MDNode *> Nodes; - size_t I = 0, E = Lines.size(); + LineReader Reader(Lines); - while (I < E) { - StringRef Line = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef Line = Reader.peek().trim(); if (Line.empty()) { - ++I; + Reader.advance(); continue; } @@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; StringRef Lang = internString(Line.drop_front(3).trim(), Arena); + Reader.advance(); // consume opening fence SmallVector<StringRef> CodeLines; - ++I; - while (I < E) { - StringRef CodeLine = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef CodeLine = Reader.peek().trim(); if (CodeLine.size() >= 3 && all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; })) break; - CodeLines.push_back(internString(Lines[I], Arena)); - ++I; + CodeLines.push_back(internString(Reader.advance(), Arena)); } - ++I; // skip closing fence + if (!Reader.atEnd()) + Reader.advance(); // consume closing fence auto *Code = new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena)); LDBG() << "emitting FencedCodeNode lang='" << Lang @@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, } // Pipe table: current line has | and next line is a separator row. 
- if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { + if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) { SmallVector<StringRef> Rows; - while (I < E && Lines[I].trim().contains('|')) { - Rows.push_back(internString(Lines[I].trim(), Arena)); - ++I; - } + while (!Reader.atEnd() && Reader.peek().trim().contains('|')) + Rows.push_back(internString(Reader.advance().trim(), Arena)); auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); LDBG() << "emitting TableNode rows=" << Rows.size(); Nodes.push_back(Table); @@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // Unordered list item. if (isListItem(Line)) { SmallVector<ListItemNode *> Items; - while (I < E) { - StringRef L = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef L = Reader.peek().trim(); if (!isListItem(L)) break; StringRef ItemText = internString(L.drop_front(2).trim(), Arena); @@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena)); Items.push_back(Item); - ++I; + Reader.advance(); } auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena)); LDBG() << "emitting UnorderedListNode items=" << Items.size(); @@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // Plain text fallback. 
Nodes.push_back(new (Arena) TextNode(internString(Line, Arena))); - ++I; + Reader.advance(); } LDBG() << "parseMarkdown done nodes=" << Nodes.size(); >From 060bf63fe9f19fa45ef941f10594897351591d56 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 02:44:27 -0400 Subject: [PATCH 06/13] [clang-doc] Parse inline emphasis, strong, and code in Markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an inline pass over paragraph text that recognizes emphasis (*text* or _text_), strong (**text** or __text__), and inline code (`code`), emitting the EmphasisNode, StrongNode, and InlineCodeNode types already in the hierarchy. Emphasis and strong recurse into their content, and runs that match no construct stay plain TextNodes. Delimiter matching uses a simplified subset of the CommonMark §6 flanking rules: a delimiter opens only with non-whitespace inside it and closes only with non-whitespace before it, and code spans close on a backtick run of equal length. The full delimiter-stack model is left as a TODO. Adds 12 unit tests covering each construct plus the unmatched and unterminated cases. 
Co-Authored-By: Claude Opus 4.8 <[email protected]> --- .../clang-doc/support/Markdown.cpp | 121 +++++++++++++++++- .../clang-doc/MarkdownParserTest.cpp | 97 ++++++++++++++ 2 files changed, 216 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index f171457e73046..f1af4f5430772 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -8,6 +8,7 @@ #include "Markdown.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" @@ -89,6 +90,121 @@ class LineReader { size_t Pos = 0; }; +// Returns the number of consecutive copies of C starting at S[Start]. +static size_t countRun(StringRef S, size_t Start, char C) { + size_t I = Start; + while (I < S.size() && S[I] == C) + ++I; + return I - Start; +} + +// Strips one leading and one trailing space from a code span's content when +// both are present and the content is not all spaces, per CommonMark §6.1. +static StringRef trimCodeSpan(StringRef Code) { + if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' && + Code.find_first_not_of(' ') != StringRef::npos) + return Code.drop_front().drop_back(); + return Code; +} + +// Finds the start index of a closing emphasis run of exactly Count copies of C, +// searching forward from From. Requires non-whitespace immediately inside both +// the opening and closing delimiters and non-empty content, a simplified take +// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid +// closing run exists. +static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) { + size_t E = S.size(); + // Opening delimiter is not left-flanking if whitespace follows it. 
+ if (From >= E || isSpace(S[From])) + return StringRef::npos; + for (size_t J = From; J + Count <= E; ++J) { + if (S[J] != C) + continue; + size_t Run = countRun(S, J, C); + if (Run != Count) { + J += Run - 1; // Skip the whole run; the loop's ++J lands past it. + continue; + } + // Reject empty content and closing runs that are not right-flanking. + if (J == From || isSpace(S[J - 1])) + continue; + return J; + } + return StringRef::npos; +} + +// Parses the inline content of a single line into a sequence of inline nodes: +// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or +// _text_). Runs that match no construct become TextNodes. Emphasis and strong +// recurse so their content may itself contain inline constructs. Text with no +// markers yields a single TextNode. +// +// TODO: This covers the common cases but not the full CommonMark §6 inline +// model (delimiter stacks, intraword underscore rules, links, autolinks). +static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { + SmallVector<MDNode *> Nodes; + size_t TextStart = 0, I = 0, E = S.size(); + + auto flushText = [&](size_t End) { + if (End > TextStart) + Nodes.push_back(new (Arena) TextNode( + internString(S.substr(TextStart, End - TextStart), Arena))); + }; + + while (I < E) { + char C = S[I]; + + // Inline code span: a run of N backticks closed by a run of N backticks. + if (C == '`') { + size_t N = countRun(S, I, '`'); + size_t J = I + N; + while (J < E && countRun(S, J, '`') != N) + J += S[J] == '`' ? countRun(S, J, '`') : 1; + if (J < E) { + flushText(I); + StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N))); + Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena))); + I = J + N; + TextStart = I; + continue; + } + // No closing run; leave the backticks as literal text. + I += N; + continue; + } + + // Emphasis (*text*, _text_) and strong (**text**, __text__). 
+ if (C == '*' || C == '_') { + // Strong binds the two-delimiter form before single-delimiter emphasis. + if (I + 1 < E && S[I + 1] == C) { + size_t Close = findClosingDelim(S, I + 2, C, 2); + if (Close != StringRef::npos) { + flushText(I); + StringRef Inner = S.substr(I + 2, Close - (I + 2)); + Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena))); + I = Close + 2; + TextStart = I; + continue; + } + } + size_t Close = findClosingDelim(S, I + 1, C, 1); + if (Close != StringRef::npos) { + flushText(I); + StringRef Inner = S.substr(I + 1, Close - (I + 1)); + Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena))); + I = Close + 1; + TextStart = I; + continue; + } + } + + ++I; + } + + flushText(E); + return allocateArray(Nodes, Arena); +} + ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, BumpPtrAllocator &Arena) { if (ParagraphText.trim().empty()) @@ -168,8 +284,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, continue; } - // Plain text fallback. - Nodes.push_back(new (Arena) TextNode(internString(Line, Arena))); + // Plain text, scanned for inline constructs (emphasis, strong, code). 
+ for (MDNode *Inline : parseInline(Line, Arena)) + Nodes.push_back(Inline); Reader.advance(); } diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index b61094f034375..ea72dacfb08e5 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -204,4 +204,101 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { ASSERT_EQ(N->Lines.size(), 1u); } +TEST_F(MarkdownParserTest, EmphasisAsterisk) { + auto Nodes = parseMarkdown("an *important* word", Arena); + ASSERT_EQ(Nodes.size(), 3u); + EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an "); + auto *Em = cast<EmphasisNode>(Nodes[1]); + ASSERT_EQ(Em->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important"); + EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word"); +} + +TEST_F(MarkdownParserTest, EmphasisUnderscore) { + auto Nodes = parseMarkdown("_em_", Arena); + ASSERT_EQ(Nodes.size(), 1u); + auto *Em = cast<EmphasisNode>(Nodes[0]); + ASSERT_EQ(Em->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em"); +} + +TEST_F(MarkdownParserTest, StrongAsterisk) { + auto Nodes = parseMarkdown("**bold**", Arena); + ASSERT_EQ(Nodes.size(), 1u); + auto *St = cast<StrongNode>(Nodes[0]); + ASSERT_EQ(St->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold"); +} + +TEST_F(MarkdownParserTest, StrongUnderscore) { + auto Nodes = parseMarkdown("__bold__", Arena); + ASSERT_EQ(Nodes.size(), 1u); + auto *St = cast<StrongNode>(Nodes[0]); + ASSERT_EQ(St->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold"); +} + +// Two delimiters must be parsed as strong, not as nested emphasis. 
+TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) { + auto Nodes = parseMarkdown("**strong**", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_TRUE(isa<StrongNode>(Nodes[0])); +} + +TEST_F(MarkdownParserTest, InlineCode) { + auto Nodes = parseMarkdown("call `foo()` here", Arena); + ASSERT_EQ(Nodes.size(), 3u); + EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call "); + EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()"); + EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here"); +} + +// CommonMark §6.1: a doubled backtick fence lets the span contain a single +// backtick. +TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) { + auto Nodes = parseMarkdown("``a`b``", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b"); +} + +// Emphasis and strong recurse, so a code span inside emphasis is parsed. +TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) { + auto Nodes = parseMarkdown("*see `x`*", Arena); + ASSERT_EQ(Nodes.size(), 1u); + auto *Em = cast<EmphasisNode>(Nodes[0]); + ASSERT_EQ(Em->Children.size(), 2u); + EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see "); + EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x"); +} + +TEST_F(MarkdownParserTest, CodeSpanInsideStrong) { + auto Nodes = parseMarkdown("**a `b`**", Arena); + ASSERT_EQ(Nodes.size(), 1u); + auto *St = cast<StrongNode>(Nodes[0]); + ASSERT_EQ(St->Children.size(), 2u); + EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a "); + EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b"); +} + +// A delimiter with whitespace on the inside does not open emphasis. +TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) { + auto Nodes = parseMarkdown("a * b", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b"); +} + +// An unterminated code span leaves the backtick as literal text. 
+TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) { + auto Nodes = parseMarkdown("a `b c", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c"); +} + +// Inline parsing must not disturb plain text with no markers. +TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) { + auto Nodes = parseMarkdown("just words", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words"); +} + } // namespace \ No newline at end of file >From 0af1c8e2999a20e2044cc337a8c4f0d8112d208b Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 10:18:32 -0400 Subject: [PATCH 07/13] [clang-doc] Address review feedback: rename inline parser variables, simplify header docs --- .../clang-doc/support/Markdown.cpp | 54 ++++++++++--------- .../clang-doc/support/Markdown.h | 25 +++------ 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index f1af4f5430772..ef29daa76a166 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -143,7 +143,7 @@ static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) { // model (delimiter stacks, intraword underscore rules, links, autolinks). static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { SmallVector<MDNode *> Nodes; - size_t TextStart = 0, I = 0, E = S.size(); + size_t TextStart = 0, Pos = 0, E = S.size(); auto flushText = [&](size_t End) { if (End > TextStart) @@ -151,54 +151,56 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { internString(S.substr(TextStart, End - TextStart), Arena))); }; - while (I < E) { - char C = S[I]; + while (Pos < E) { + char C = S[Pos]; - // Inline code span: a run of N backticks closed by a run of N backticks. 
+ // Inline code span: an opening backtick run closed by a run of the same + // length. if (C == '`') { - size_t N = countRun(S, I, '`'); - size_t J = I + N; - while (J < E && countRun(S, J, '`') != N) - J += S[J] == '`' ? countRun(S, J, '`') : 1; - if (J < E) { - flushText(I); - StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N))); + size_t OpenLen = countRun(S, Pos, '`'); + size_t ClosePos = Pos + OpenLen; + while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen) + ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1; + if (ClosePos < E) { + flushText(Pos); + StringRef Code = + trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen))); Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena))); - I = J + N; - TextStart = I; + Pos = ClosePos + OpenLen; + TextStart = Pos; continue; } // No closing run; leave the backticks as literal text. - I += N; + Pos += OpenLen; continue; } // Emphasis (*text*, _text_) and strong (**text**, __text__). if (C == '*' || C == '_') { // Strong binds the two-delimiter form before single-delimiter emphasis. 
- if (I + 1 < E && S[I + 1] == C) { - size_t Close = findClosingDelim(S, I + 2, C, 2); + if (Pos + 1 < E && S[Pos + 1] == C) { + size_t Close = findClosingDelim(S, Pos + 2, C, 2); if (Close != StringRef::npos) { - flushText(I); - StringRef Inner = S.substr(I + 2, Close - (I + 2)); + flushText(Pos); + StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2)); Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena))); - I = Close + 2; - TextStart = I; + Pos = Close + 2; + TextStart = Pos; continue; } } - size_t Close = findClosingDelim(S, I + 1, C, 1); + size_t Close = findClosingDelim(S, Pos + 1, C, 1); if (Close != StringRef::npos) { - flushText(I); - StringRef Inner = S.substr(I + 1, Close - (I + 1)); + flushText(Pos); + StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1)); Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena))); - I = Close + 1; - TextStart = I; + Pos = Close + 1; + TextStart = Pos; continue; } } - ++I; + ++Pos; } flushText(E); diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 3d457bcddfac6..60390465588c3 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -9,20 +9,10 @@ /// \file /// Standalone Markdown parsing library for the LLVM ecosystem. /// -/// The parser takes plain paragraph text and returns a polymorphic tree of -/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator. -/// Node types form a closed class hierarchy rooted at MDNode. Each concrete -/// type carries exactly the fields it needs -- no overloaded Content field, -/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe -/// downcasting; each concrete type provides classof() for this purpose. 
-/// -/// See -/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates -/// -/// Field ordering in each derived struct is chosen to minimize padding: -/// 4-byte fields (like Level or Start) are declared before 16-byte fields -/// (ArrayRef, StringRef) so that no implicit padding is inserted between the -/// base class's 4-byte Kind and the first derived field. +/// The parser takes a single paragraph of plain text and returns a list of +/// nodes describing the Markdown it found. Each kind of construct has its own +/// node type, and every node shares a common MDNode base, so you can use +/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is. /// /// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.): /// TextNode -- plain text run @@ -165,9 +155,7 @@ struct ParagraphNode : MDNode { } }; -/// ATX heading: one to six leading # characters. Level is declared before -/// Children to avoid padding between the base class's 4-byte Kind and the -/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes. +/// ATX heading: one to six leading # characters. struct HeadingNode : MDNode { unsigned Level; // 1-6 llvm::ArrayRef<MDNode *> Children; // inline content @@ -226,8 +214,7 @@ struct UnorderedListNode : MDNode { } }; -/// Ordered (numbered) list. Start is the number on the first item. Start is -/// declared before Items to avoid padding, keeping sizeof at 24 bytes. +/// Ordered (numbered) list. Start is the number on the first item. 
struct OrderedListNode : MDNode { unsigned Start; llvm::ArrayRef<ListItemNode *> Items; >From b76bfa182db40e7a358ffb7d42506aff24453e14 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 11:50:32 -0400 Subject: [PATCH 08/13] [clang-doc] Add libFuzzer harness for parseMarkdown() --- clang-tools-extra/clang-doc/CMakeLists.txt | 1 + .../clang-doc/fuzzer/CMakeLists.txt | 21 +++++++++++++ .../clang-doc/fuzzer/DummyMarkdownFuzzer.cpp | 21 +++++++++++++ .../clang-doc/fuzzer/FuzzMarkdown.cpp | 30 +++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt create mode 100644 clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp create mode 100644 clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt index 22e2c8159e9f6..f64d1129ed4af 100644 --- a/clang-tools-extra/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/CMakeLists.txt @@ -44,6 +44,7 @@ target_link_libraries(clangDoc ) add_subdirectory(tool) +add_subdirectory(fuzzer) if (LLVM_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) diff --git a/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt new file mode 100644 index 0000000000000..5e6e943891052 --- /dev/null +++ b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt @@ -0,0 +1,21 @@ +# Resolve "support/Markdown.h" against the parent clang-doc directory. +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) + +set(LLVM_LINK_COMPONENTS + FuzzerCLI + Support + ) + +# This fuzzer runs on oss-fuzz, so keep it around even if it looks unreferenced. +# With a fuzzing engine configured (LLVM_USE_SANITIZE_COVERAGE or an external +# LLVM_LIB_FUZZING_ENGINE) this builds a real fuzz target; otherwise DUMMY_MAIN +# provides a main() so it still builds and can be replayed over saved inputs. 
+add_llvm_fuzzer(clang-doc-markdown-fuzzer + FuzzMarkdown.cpp + DUMMY_MAIN DummyMarkdownFuzzer.cpp + ) + +target_link_libraries(clang-doc-markdown-fuzzer + PRIVATE + clangDocSupport + ) diff --git a/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp new file mode 100644 index 0000000000000..61466e0fa4ef6 --- /dev/null +++ b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp @@ -0,0 +1,21 @@ +//===-- DummyMarkdownFuzzer.cpp - Entry point to test the fuzzer ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of main so we can build and test the harness without linking +// libFuzzer. Each command line argument is treated as a file to run the +// harness on. +// +//===----------------------------------------------------------------------===// + +#include "llvm/FuzzMutate/FuzzerCLI.h" + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size); + +int main(int argc, char *argv[]) { + return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput); +} diff --git a/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp new file mode 100644 index 0000000000000..e407b3baccf2e --- /dev/null +++ b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp @@ -0,0 +1,30 @@ +//===-- FuzzMarkdown.cpp - Fuzzer for the clang-doc Markdown parser -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements a libFuzzer harness for parseMarkdown(). It feeds +/// arbitrary bytes to the parser and checks that it never crashes. The parsed +/// nodes are walked so the returned tree is exercised, not just allocated. +/// +//===----------------------------------------------------------------------===// + +#include "support/Markdown.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include <cstddef> +#include <cstdint> + +using namespace clang::doc::markdown; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + llvm::BumpPtrAllocator Arena; + llvm::StringRef Input(reinterpret_cast<const char *>(Data), Size); + for (const MDNode *Node : parseMarkdown(Input, Arena)) + (void)Node->Kind; + return 0; +} >From 77e28993d7a167410fd1a1ee97d2824945b44063 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 18:47:47 -0400 Subject: [PATCH 09/13] [clang-doc] Address review feedback: rename findClosingDelim params, add table TODO, fix EOF newline --- .../clang-doc/support/Markdown.cpp | 28 +++++++++++-------- .../clang-doc/MarkdownParserTest.cpp | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index ef29daa76a166..6a57cd7900ea2 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -107,26 +107,27 @@ static StringRef trimCodeSpan(StringRef Code) { return Code; } -// Finds the start index of a closing emphasis run of exactly Count copies of C, -// searching forward from From. Requires non-whitespace immediately inside both -// the opening and closing delimiters and non-empty content, a simplified take -// on the CommonMark §6.2 flanking rules. 
Returns StringRef::npos if no valid -// closing run exists. -static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) { +// Finds the start index of a closing emphasis run of exactly DelimLen copies of +// DelimChar, searching forward from StartPos. Requires non-whitespace +// immediately inside both the opening and closing delimiters and non-empty +// content, a simplified take on the CommonMark §6.2 flanking rules. Returns +// StringRef::npos if no valid closing run exists. +static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar, + size_t DelimLen) { size_t E = S.size(); // Opening delimiter is not left-flanking if whitespace follows it. - if (From >= E || isSpace(S[From])) + if (StartPos >= E || isSpace(S[StartPos])) return StringRef::npos; - for (size_t J = From; J + Count <= E; ++J) { - if (S[J] != C) + for (size_t J = StartPos; J + DelimLen <= E; ++J) { + if (S[J] != DelimChar) continue; - size_t Run = countRun(S, J, C); - if (Run != Count) { + size_t Run = countRun(S, J, DelimChar); + if (Run != DelimLen) { J += Run - 1; // Skip the whole run; the loop's ++J lands past it. continue; } // Reject empty content and closing runs that are not right-flanking. - if (J == From || isSpace(S[J - 1])) + if (J == StartPos || isSpace(S[J - 1])) continue; return J; } @@ -257,6 +258,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // Pipe table: current line has | and next line is a separator row. if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) { SmallVector<StringRef> Rows; + // TODO: Rows are kept as raw line text for now. Table cells may contain + // inline content (emphasis, code spans, links), so each row may need to + // be split on '|' and parsed further into structured cells. 
while (!Reader.atEnd() && Reader.peek().trim().contains('|')) Rows.push_back(internString(Reader.advance().trim(), Arena)); auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index ea72dacfb08e5..28bb9d567e6bc 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -301,4 +301,4 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) { EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words"); } -} // namespace \ No newline at end of file +} // namespace >From f33ef2ce3f9292e10f1e1dd220a500070ef21bc5 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 22:29:15 -0400 Subject: [PATCH 10/13] [clang-doc] Address review feedback: make UnterminatedFence and MixedContent tests explicit --- .../unittests/clang-doc/MarkdownParserTest.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 28bb9d567e6bc..207ae938c299a 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -58,13 +58,17 @@ some code EXPECT_TRUE(N->Lang.empty()); } -TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { +TEST_F(MarkdownParserTest, UnterminatedFenceProducesCodeNode) { auto Nodes = parseMarkdown(R"(```cpp int x = 0;)", Arena); - // Unterminated fence should not crash and should produce a code node - // with whatever lines were found. - EXPECT_FALSE(Nodes.empty()); + // An unterminated fence should not crash. The parser falls back to emitting a + // FencedCodeNode with whatever lines were found before the end of input. 
+ ASSERT_EQ(Nodes.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "cpp"); + ASSERT_EQ(N->Lines.size(), 1u); + EXPECT_EQ(N->Lines[0], "int x = 0;"); } TEST_F(MarkdownParserTest, PipeTable) { @@ -105,7 +109,10 @@ code ```````` - item)", Arena); - EXPECT_EQ(Nodes.size(), 3u); + ASSERT_EQ(Nodes.size(), 3u); + EXPECT_TRUE(isa<TextNode>(Nodes[0])); + EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1])); + EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2])); } // CommonMark §4.5 example 120: tilde fences work the same as backtick fences. >From 4371be42e6ccb7a955301c77b5b732e45675347d Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 22:35:54 -0400 Subject: [PATCH 11/13] [clang-doc] Replace internString with llvm::StringSaver, matching Mustache pattern --- .../clang-doc/support/Markdown.cpp | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 6a57cd7900ea2..be2800bff5df7 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" +#include "llvm/Support/StringSaver.h" #include <cassert> #define DEBUG_TYPE "clang-doc" @@ -31,15 +32,6 @@ static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec, return ArrayRef<T>(Allocated, Vec.size()); } -// Interns a StringRef into the arena so it outlives the parse loop. -static StringRef internString(StringRef S, BumpPtrAllocator &Arena) { - if (S.empty()) - return {}; - char *Buf = Arena.Allocate<char>(S.size()); - std::copy(S.begin(), S.end(), Buf); - return StringRef(Buf, S.size()); -} - // A line is a table separator if it only contains |, -, :, and spaces, // and has at least one -. 
static bool isSepRow(StringRef Line) { @@ -142,14 +134,15 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar, // // TODO: This covers the common cases but not the full CommonMark §6 inline // model (delimiter stacks, intraword underscore rules, links, autolinks). -static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { +static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena, + StringSaver &Saver) { SmallVector<MDNode *> Nodes; size_t TextStart = 0, Pos = 0, E = S.size(); auto flushText = [&](size_t End) { if (End > TextStart) Nodes.push_back(new (Arena) TextNode( - internString(S.substr(TextStart, End - TextStart), Arena))); + Saver.save(S.substr(TextStart, End - TextStart)))); }; while (Pos < E) { @@ -166,7 +159,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { flushText(Pos); StringRef Code = trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen))); - Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena))); + Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code))); Pos = ClosePos + OpenLen; TextStart = Pos; continue; @@ -184,7 +177,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { if (Close != StringRef::npos) { flushText(Pos); StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2)); - Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena))); + Nodes.push_back(new (Arena) + StrongNode(parseInline(Inner, Arena, Saver))); Pos = Close + 2; TextStart = Pos; continue; @@ -194,7 +188,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) { if (Close != StringRef::npos) { flushText(Pos); StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1)); - Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena))); + Nodes.push_back(new (Arena) + EmphasisNode(parseInline(Inner, Arena, Saver))); Pos = Close + 1; TextStart = Pos; continue; @@ -213,6 +208,7 @@ ArrayRef<MDNode 
*> parseMarkdown(StringRef ParagraphText, if (ParagraphText.trim().empty()) return {}; + StringSaver Saver(Arena); SmallVector<StringRef, 16> Lines; ParagraphText.split(Lines, '\n'); @@ -234,7 +230,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // case-by-case basis. if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; - StringRef Lang = internString(Line.drop_front(3).trim(), Arena); + StringRef Lang = Saver.save(Line.drop_front(3).trim()); Reader.advance(); // consume opening fence SmallVector<StringRef> CodeLines; while (!Reader.atEnd()) { @@ -243,7 +239,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; })) break; - CodeLines.push_back(internString(Reader.advance(), Arena)); + CodeLines.push_back(Saver.save(Reader.advance())); } if (!Reader.atEnd()) Reader.advance(); // consume closing fence @@ -262,7 +258,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // inline content (emphasis, code spans, links), so each row may need to // be split on '|' and parsed further into structured cells. while (!Reader.atEnd() && Reader.peek().trim().contains('|')) - Rows.push_back(internString(Reader.advance().trim(), Arena)); + Rows.push_back(Saver.save(Reader.advance().trim())); auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); LDBG() << "emitting TableNode rows=" << Rows.size(); Nodes.push_back(Table); @@ -276,7 +272,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, StringRef L = Reader.peek().trim(); if (!isListItem(L)) break; - StringRef ItemText = internString(L.drop_front(2).trim(), Arena); + StringRef ItemText = Saver.save(L.drop_front(2).trim()); SmallVector<MDNode *> ItemChildren; ItemChildren.push_back(new (Arena) TextNode(ItemText)); auto *Item = @@ -291,7 +287,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, } // Plain text, scanned for inline constructs (emphasis, strong, code). 
- for (MDNode *Inline : parseInline(Line, Arena)) + for (MDNode *Inline : parseInline(Line, Arena, Saver)) Nodes.push_back(Inline); Reader.advance(); } >From 0b5f53715fc6e78a56145609893fa61f5cf4f353 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 22:59:09 -0400 Subject: [PATCH 12/13] [clang-doc] Address review feedback: fix comment accuracy and trim AI-sounding language --- clang-tools-extra/clang-doc/support/Markdown.h | 12 +++++------- .../unittests/clang-doc/MarkdownParserTest.cpp | 13 +++++-------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 60390465588c3..8c2055868671a 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -49,8 +49,7 @@ namespace clang::doc::markdown { /// Discriminator for all Markdown AST nodes. Inline kinds are grouped before -/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable -/// cheap range-based checks in classof() implementations. +/// block kinds. enum class NodeKind { // Inline nodes NK_Text, @@ -193,8 +192,7 @@ struct TableNode : MDNode { static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; } }; -/// A single list item. Children may contain block-level nodes for loose -/// lists, or a single inline sequence for tight lists. +/// A single list item. Children holds the item's inline content. struct ListItemNode : MDNode { llvm::ArrayRef<MDNode *> Children; explicit ListItemNode(llvm::ArrayRef<MDNode *> Children) @@ -248,9 +246,9 @@ struct ThematicBreakNode : MDNode { //===----------------------------------------------------------------------===// /// Parse Markdown from a single paragraph of plain text. Returns a list of -/// top-level block nodes allocated in Arena. 
Returns an empty ArrayRef if no -/// Markdown constructs are found, letting callers fall back to plain-text -/// rendering at zero cost. The parser never crashes on malformed input. +/// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty +/// or whitespace-only input; plain text with no Markdown constructs returns a +/// single TextNode. /// /// The caller must keep Arena alive for the lifetime of any returned nodes. llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText, diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 207ae938c299a..e2fd07159d446 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -196,16 +196,13 @@ foo ASSERT_EQ(N->Lines.size(), 1u); } -// CommonMark §4.5 example 124: closing fence must be at least as long as the -// opening fence. -// TODO: our parser currently closes on the first line with 3 matching fence -// chars regardless of opening fence length. Fix as part of the CommonMark -// TODO in parseMarkdown(). +// CommonMark §4.5 example 124: the closing fence must be at least as long as +// the opening fence. Our parser closes on the first line with 3 matching fence +// chars regardless of opening length, so this documents the current +// non-conformant behavior. +// TODO: fix as part of the CommonMark TODO in parseMarkdown(). TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { auto Nodes = parseMarkdown("````\naaa\n```", Arena); - // The ``` line should not close the ```` fence per CommonMark, but our - // parser currently treats it as a closing fence. This test documents the - // current (non-conformant) behavior. 
ASSERT_EQ(Nodes.size(), 1u); auto *N = cast<FencedCodeNode>(Nodes[0]); ASSERT_EQ(N->Lines.size(), 1u); >From 14f455ecde0305ec38e20ca6068b0d8f5f259776 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 23:07:02 -0400 Subject: [PATCH 13/13] [clang-doc] Wrap plain-text paragraph lines in ParagraphNode --- .../clang-doc/support/Markdown.cpp | 7 +- .../clang-doc/MarkdownParserTest.cpp | 67 +++++++++++++------ 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index be2800bff5df7..59e651d2b8b05 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -286,9 +286,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, continue; } - // Plain text, scanned for inline constructs (emphasis, strong, code). - for (MDNode *Inline : parseInline(Line, Arena, Saver)) - Nodes.push_back(Inline); + // Plain text line: scan for inline constructs (emphasis, strong, code) and + // wrap the result in a paragraph. 
+ auto Inlines = parseInline(Line, Arena, Saver); + Nodes.push_back(new (Arena) ParagraphNode(Inlines)); Reader.advance(); } diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index e2fd07159d446..63d978061b99b 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -33,8 +33,9 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) { TEST_F(MarkdownParserTest, PlainText) { auto Nodes = parseMarkdown("hello world", Arena); ASSERT_EQ(Nodes.size(), 1u); - auto *N = cast<TextNode>(Nodes[0]); - EXPECT_EQ(N->Text, "hello world"); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello world"); } TEST_F(MarkdownParserTest, FencedCodeBlock) { @@ -110,7 +111,7 @@ code - item)", Arena); ASSERT_EQ(Nodes.size(), 3u); - EXPECT_TRUE(isa<TextNode>(Nodes[0])); + EXPECT_TRUE(isa<ParagraphNode>(Nodes[0])); EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1])); EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2])); } @@ -210,18 +211,22 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { TEST_F(MarkdownParserTest, EmphasisAsterisk) { auto Nodes = parseMarkdown("an *important* word", Arena); - ASSERT_EQ(Nodes.size(), 3u); - EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an "); - auto *Em = cast<EmphasisNode>(Nodes[1]); + ASSERT_EQ(Nodes.size(), 1u); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 3u); + EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an "); + auto *Em = cast<EmphasisNode>(P->Children[1]); ASSERT_EQ(Em->Children.size(), 1u); EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important"); - EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word"); + EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " word"); } TEST_F(MarkdownParserTest, EmphasisUnderscore) { auto Nodes = parseMarkdown("_em_", Arena); ASSERT_EQ(Nodes.size(), 
1u); - auto *Em = cast<EmphasisNode>(Nodes[0]); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + auto *Em = cast<EmphasisNode>(P->Children[0]); ASSERT_EQ(Em->Children.size(), 1u); EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em"); } @@ -229,7 +234,9 @@ TEST_F(MarkdownParserTest, EmphasisUnderscore) { TEST_F(MarkdownParserTest, StrongAsterisk) { auto Nodes = parseMarkdown("**bold**", Arena); ASSERT_EQ(Nodes.size(), 1u); - auto *St = cast<StrongNode>(Nodes[0]); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + auto *St = cast<StrongNode>(P->Children[0]); ASSERT_EQ(St->Children.size(), 1u); EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold"); } @@ -237,7 +244,9 @@ TEST_F(MarkdownParserTest, StrongAsterisk) { TEST_F(MarkdownParserTest, StrongUnderscore) { auto Nodes = parseMarkdown("__bold__", Arena); ASSERT_EQ(Nodes.size(), 1u); - auto *St = cast<StrongNode>(Nodes[0]); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + auto *St = cast<StrongNode>(P->Children[0]); ASSERT_EQ(St->Children.size(), 1u); EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold"); } @@ -246,15 +255,19 @@ TEST_F(MarkdownParserTest, StrongUnderscore) { TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) { auto Nodes = parseMarkdown("**strong**", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_TRUE(isa<StrongNode>(Nodes[0])); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_TRUE(isa<StrongNode>(P->Children[0])); } TEST_F(MarkdownParserTest, InlineCode) { auto Nodes = parseMarkdown("call `foo()` here", Arena); - ASSERT_EQ(Nodes.size(), 3u); - EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call "); - EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()"); - EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here"); + ASSERT_EQ(Nodes.size(), 1u); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 3u); + 
EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "call "); + EXPECT_EQ(cast<InlineCodeNode>(P->Children[1])->Code, "foo()"); + EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " here"); } // CommonMark §6.1: a doubled backtick fence lets the span contain a single @@ -262,14 +275,18 @@ TEST_F(MarkdownParserTest, InlineCode) { TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) { auto Nodes = parseMarkdown("``a`b``", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b"); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "a`b"); } // Emphasis and strong recurse, so a code span inside emphasis is parsed. TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) { auto Nodes = parseMarkdown("*see `x`*", Arena); ASSERT_EQ(Nodes.size(), 1u); - auto *Em = cast<EmphasisNode>(Nodes[0]); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + auto *Em = cast<EmphasisNode>(P->Children[0]); ASSERT_EQ(Em->Children.size(), 2u); EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see "); EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x"); @@ -278,7 +295,9 @@ TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) { TEST_F(MarkdownParserTest, CodeSpanInsideStrong) { auto Nodes = parseMarkdown("**a `b`**", Arena); ASSERT_EQ(Nodes.size(), 1u); - auto *St = cast<StrongNode>(Nodes[0]); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + auto *St = cast<StrongNode>(P->Children[0]); ASSERT_EQ(St->Children.size(), 2u); EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a "); EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b"); @@ -288,21 +307,27 @@ TEST_F(MarkdownParserTest, CodeSpanInsideStrong) { TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) { auto Nodes = parseMarkdown("a * b", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b"); + auto *P = 
cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a * b"); } // An unterminated code span leaves the backtick as literal text. TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) { auto Nodes = parseMarkdown("a `b c", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c"); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a `b c"); } // Inline parsing must not disturb plain text with no markers. TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) { auto Nodes = parseMarkdown("just words", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words"); + auto *P = cast<ParagraphNode>(Nodes[0]); + ASSERT_EQ(P->Children.size(), 1u); + EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words"); } } // namespace _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
