https://github.com/Nerixyz updated https://github.com/llvm/llvm-project/pull/120843
>From 577d6d06224410a1da80cad2377041b8cd6db3ca Mon Sep 17 00:00:00 2001 From: Nerixyz <nerix...@outlook.de> Date: Sat, 21 Dec 2024 16:23:32 +0100 Subject: [PATCH] [Clang][Comments] Allow HTML tags across multiple lines --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/AST/CommentLexer.cpp | 69 +++++++++++- clang/test/AST/ast-dump-comment.cpp | 13 +++ clang/unittests/AST/CommentLexer.cpp | 154 ++++++++++++++++++++++++++ clang/unittests/AST/CommentParser.cpp | 23 ++-- 5 files changed, 244 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index edb2e4a10ded05..e0af9c15fdd07a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -873,6 +873,8 @@ Bug Fixes to AST Handling Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ +- HTML tags in comments that span multiple lines are now parsed correctly by Clang's comment parser. (#GH120843) + Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp index ec9a5b480aa295..804be89a8d4ddc 100644 --- a/clang/lib/AST/CommentLexer.cpp +++ b/clang/lib/AST/CommentLexer.cpp @@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { return BufferEnd; } +const char *skipHorizontalWhitespace(const char *BufferPtr, + const char *BufferEnd) { + for (; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHorizontalWhitespace(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; } @@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) { formTokenWithChars(T, TagNameEnd, tok::html_start_tag); T.setHTMLTagStartName(Name); - BufferPtr = skipWhitespace(BufferPtr, CommentEnd); + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); + if (BufferPtr == CommentEnd) { // in BCPL comments + State = LS_HTMLStartTag; + return; + } const char C = *BufferPtr; if (BufferPtr != CommentEnd && - (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) + (C == '>' || C == '/' || isVerticalWhitespace(C) || + isHTMLIdentifierStartingCharacter(C))) State = LS_HTMLStartTag; } void Lexer::lexHTMLStartTag(Token &T) { assert(State == LS_HTMLStartTag); + // Skip leading whitespace and comment decorations + while (isVerticalWhitespace(*BufferPtr)) { + BufferPtr = skipNewline(BufferPtr, CommentEnd); + + if (CommentState == LCS_InsideCComment) + skipLineStartingDecorations(); + + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); + if (BufferPtr == CommentEnd) { + // HTML starting tags must be defined in a single comment block. + // It's likely a user-error where they forgot to terminate the comment. + State = LS_Normal; + // Since at least one newline was skipped and one token needs to be lexed, + // return a newline. + formTokenWithChars(T, BufferPtr, tok::newline); + return; + } + } + const char *TokenPtr = BufferPtr; char C = *TokenPtr; if (isHTMLIdentifierCharacter(C)) { @@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) { // Now look ahead and return to normal state if we don't see any HTML tokens // ahead. - BufferPtr = skipWhitespace(BufferPtr, CommentEnd); + BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd); if (BufferPtr == CommentEnd) { - State = LS_Normal; return; } C = *BufferPtr; - if (!isHTMLIdentifierStartingCharacter(C) && + if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) && C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') { State = LS_Normal; return; @@ -774,8 +806,17 @@ void Lexer::lex(Token &T) { BufferPtr++; CommentState = LCS_InsideBCPLComment; - if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) + switch (State) { + case LS_VerbatimBlockFirstLine: + case LS_VerbatimBlockBody: + break; + case LS_HTMLStartTag: + BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd); + break; + default: State = LS_Normal; + break; + } CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); goto again; } @@ -807,6 +848,14 @@ void Lexer::lex(Token &T) { while(EndWhitespace != BufferEnd && *EndWhitespace != '/') EndWhitespace++; + // When lexing the start of an HTML tag (i.e. going through the attributes) + // there won't be any newlines generated. + if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) { + CommentState = LCS_BeforeComment; + BufferPtr = EndWhitespace; + goto again; + } + // Turn any whitespace between comments (and there is only whitespace // between them -- guaranteed by comment extraction) into a newline. We // have two newlines between C comments in total (first one was synthesized @@ -829,6 +878,14 @@ void Lexer::lex(Token &T) { BufferPtr += 2; assert(BufferPtr <= BufferEnd); + // When lexing the start of an HTML tag (i.e. going through the + // attributes) there won't be any newlines generated - whitespace still + // needs to be skipped. + if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) { + CommentState = LCS_BetweenComments; + goto again; + } + // Synthenize newline just after the C comment, regardless if there is // actually a newline. formTokenWithChars(T, BufferPtr, tok::newline); diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp index 9798295b420f9a..40c3edb62821bb 100644 --- a/clang/test/AST/ast-dump-comment.cpp +++ b/clang/test/AST/ast-dump-comment.cpp @@ -91,6 +91,19 @@ int Test_HTMLTagComment; // CHECK-NEXT: TextComment{{.*}} Text=" " // CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing +/// <a +/// href="foo" +/// >Aaa</a>b +int Test_HTMLTagMultilineBCPL; +// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL +// CHECK-NEXT: FullComment +// CHECK-NEXT: ParagraphComment +// CHECK-NEXT: TextComment{{.*}} Text=" " +// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo" +// CHECK-NEXT: TextComment{{.*}} Text="Aaa" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a" +// CHECK-NEXT: TextComment{{.*}} Text="b" + /// \verbatim /// Aaa /// \endverbatim diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp index 1e7bad89898f4c..22866f0eb23edc 100644 --- a/clang/unittests/AST/CommentLexer.cpp +++ b/clang/unittests/AST/CommentLexer.cpp @@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) { ASSERT_EQ(tok::newline, Toks[2].getKind()); } +TEST_F(CommentLexerTest, HTML20) { + const char *Source = "// <a\n" + "// \n" + "// href=\"foo\"\n" + "// \n" + "// bar>text</a>"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(11U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); + ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName()); + + ASSERT_EQ(tok::html_ident, Toks[2].getKind()); + ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent()); + + ASSERT_EQ(tok::html_equals, Toks[3].getKind()); + + ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind()); + ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString()); + + ASSERT_EQ(tok::html_ident, Toks[5].getKind()); + ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent()); + + ASSERT_EQ(tok::html_greater, Toks[6].getKind()); + + ASSERT_EQ(tok::text, Toks[7].getKind()); + ASSERT_EQ(StringRef("text"), Toks[7].getText()); + + ASSERT_EQ(tok::html_end_tag, Toks[8].getKind()); + ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName()); + + ASSERT_EQ(tok::html_greater, Toks[9].getKind()); + + ASSERT_EQ(tok::newline, Toks[10].getKind()); +} + +TEST_F(CommentLexerTest, HTML21) { + const char *Source = "/**\n" + " * <a\n" + " * \n" + " * href=\"foo\"\n" + " * \n" + " * bar>text</a>\n" + " */"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(15U, Toks.size()); + + ASSERT_EQ(tok::newline, Toks[0].getKind()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef(" "), Toks[1].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName()); + + ASSERT_EQ(tok::html_ident, Toks[3].getKind()); + ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent()); + + ASSERT_EQ(tok::html_equals, Toks[4].getKind()); + + ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind()); + ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString()); + + ASSERT_EQ(tok::html_ident, Toks[6].getKind()); + ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent()); + + ASSERT_EQ(tok::html_greater, Toks[7].getKind()); + + ASSERT_EQ(tok::text, Toks[8].getKind()); + ASSERT_EQ(StringRef("text"), Toks[8].getText()); + + ASSERT_EQ(tok::html_end_tag, Toks[9].getKind()); + ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName()); + + ASSERT_EQ(tok::html_greater, Toks[10].getKind()); + + ASSERT_EQ(tok::newline, Toks[11].getKind()); + + ASSERT_EQ(tok::text, Toks[12].getKind()); + ASSERT_EQ(StringRef(" "), Toks[12].getText()); + + ASSERT_EQ(tok::newline, Toks[13].getKind()); + + ASSERT_EQ(tok::newline, Toks[14].getKind()); +} + +TEST_F(CommentLexerTest, HTML22) { + const char *Source = "/**\n" + " * <a\n" + " */"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(6U, Toks.size()); + + ASSERT_EQ(tok::newline, Toks[0].getKind()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef(" "), Toks[1].getText()); + + ASSERT_EQ(tok::html_start_tag, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); + + ASSERT_EQ(tok::newline, Toks[4].getKind()); + + ASSERT_EQ(tok::newline, Toks[5].getKind()); +} + +TEST_F(CommentLexerTest, HTML23) { + // NOTE: "//<" is considered a comment start + const char *Source = "// <\n" + "// a\n" + "// >"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(7U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("<"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); + + ASSERT_EQ(tok::text, Toks[3].getKind()); + ASSERT_EQ(StringRef(" a"), Toks[3].getText()); + + ASSERT_EQ(tok::newline, Toks[4].getKind()); + + ASSERT_EQ(tok::text, Toks[5].getKind()); + ASSERT_EQ(StringRef(" >"), Toks[5].getText()); + + ASSERT_EQ(tok::newline, Toks[6].getKind()); +} + TEST_F(CommentLexerTest, NotAKnownHTMLTag1) { const char *Source = "// <tag>"; diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp index e0df182d430c36..aa08b6718e506f 100644 --- a/clang/unittests/AST/CommentParser.cpp +++ b/clang/unittests/AST/CommentParser.cpp @@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) { TEST_F(CommentParserTest, HTML1) { const char *Sources[] = { - "// <a", - "// <a>", - "// <a >" + "// <a", + "// <a>", + "// <a >", + "// <a\n// >", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) { TEST_F(CommentParserTest, HTML2) { const char *Sources[] = { - "// <br/>", - "// <br />" + "// <br/>", + "// <br />", + "// <br \n// />", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) { TEST_F(CommentParserTest, HTML3) { const char *Sources[] = { - "// <a href", - "// <a href ", - "// <a href>", - "// <a href >", + "// <a href", "// <a href ", "// <a href>", + "// <a href >", "// <a \n// href >", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { @@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) { TEST_F(CommentParserTest, HTML4) { const char *Sources[] = { - "// <a href=\"bbb\"", - "// <a href=\"bbb\">", + "// <a href=\"bbb\"", + "// <a href=\"bbb\">", + "// <a \n// href=\"bbb\">", }; for (size_t i = 0, e = std::size(Sources); i != e; i++) { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits