hokein updated this revision to Diff 462448.
hokein added a comment.

Fix a bug caused by my change, and add test.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D130460/new/

https://reviews.llvm.org/D130460

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Language.h
  clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
  clang-tools-extra/pseudo/lib/cxx/CXX.cpp
  clang-tools-extra/pseudo/lib/cxx/Recovery.cpp
  clang-tools-extra/pseudo/lib/cxx/cxx.bnf
  clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp
  clang-tools-extra/pseudo/unittests/CMakeLists.txt
  clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -48,13 +48,13 @@
                            testing::UnorderedElementsAreArray(Parents));
 }
 
-Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
-  EXPECT_GT(Begin, 0u);
-  const Token &Left = Code.tokens()[Begin - 1];
+Token::Index recoverBraces(const RecoveryParams &P) {
+  EXPECT_GT(P.Begin, 0u);
+  const Token &Left = P.Tokens.tokens()[P.Begin - 1];
   EXPECT_EQ(Left.Kind, tok::l_brace);
   if (const auto* Right = Left.pair()) {
     EXPECT_EQ(Right->Kind, tok::r_brace);
-    return Code.index(*Right);
+    return P.Tokens.index(*Right);
   }
   return Token::Invalid;
 }
@@ -613,7 +613,7 @@
   TestLang.Table = LRTable::buildSLR(TestLang.G);
   TestLang.RecoveryStrategies.try_emplace(
       extensionID("Skip"),
-      [](Token::Index Start, const TokenStream &) { return Start; });
+      [](const RecoveryParams& P) { return P.Begin; });
   clang::LangOptions LOptions;
   TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
 
@@ -641,7 +641,7 @@
   TestLang.Table = LRTable::buildSLR(TestLang.G);
   TestLang.RecoveryStrategies.try_emplace(
       extensionID("AcceptAnyTokenInstead"),
-      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
+      [](const RecoveryParams& P) { return P.Begin + 1; });
 
   const ForestNode &Parsed =
       glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
@@ -660,9 +660,9 @@
   )bnf");
   TestLang.Table = LRTable::buildSLR(TestLang.G);
   bool fallback_recovered = false;
-  auto fallback = [&](Token::Index Start, const TokenStream & Code) {
+  auto fallback = [&](const RecoveryParams& P) {
     fallback_recovered = true;
-    return Code.tokens().size();
+    return P.Tokens.tokens().size();
   };
   TestLang.RecoveryStrategies.try_emplace(
       extensionID("Fallback"),
@@ -694,7 +694,7 @@
   TestLang.Table = LRTable::buildSLR(TestLang.G);
   TestLang.RecoveryStrategies.try_emplace(
       extensionID("Skip"),
-      [](Token::Index Start, const TokenStream &) { return Start; });
+      [](const RecoveryParams& P) { return P.Begin; });
   clang::LangOptions LOptions;
   TokenStream Tokens = cook(lex("main", LOptions), LOptions);
 
Index: clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/unittests/CXXRecoveryTest.cpp
@@ -0,0 +1,110 @@
+//===--- CXXRecoveryTest.cpp - ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Bracket.h"
+#include "clang-pseudo/Language.h"
+#include "clang-pseudo/Token.h"
+#include "clang-pseudo/cxx/Recovery.h"
+
+#include "clang/Basic/LangOptions.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include "gtest/gtest.h"
+
+namespace clang::pseudo {
+
+namespace {
+
+std::pair<RecoveryParams, /*Expected End*/ Token::Index>
+extract(const llvm::Annotations &Test) {
+  assert(Test.points().size() == 2 && "Expect two ^s in the test annotations");
+  LangOptions Opts = clang::pseudo::genericLangOpts();
+  std::string Code = Test.code().str();
+  TokenStream Cook = cook(lex(Code, Opts), Opts);
+  pairBrackets(Cook);
+
+  RecoveryParams P{0, Token::Invalid, Cook};
+  auto StartOffset = [&](const Token &T) -> size_t {
+    return T.Data - Code.data();
+  };
+  for (const auto &T : Cook.tokens()) {
+    if (StartOffset(T) == Test.points().front()) {
+      P.Cursor = Cook.index(T);
+      break;
+    }
+  }
+  assert(P.Cursor != Token::Invalid && "no cursor position provided");
+  Token::Index Expect = Token::Invalid;
+  if (Test.points().back() > StartOffset(Cook.tokens().back()))
+    Expect = Cook.tokens().size(); // EOF
+  else {
+    for (const auto &T : Cook.tokens()) {
+      if (StartOffset(T) == Test.points().back()) {
+        Expect = Cook.index(T);
+      }
+    }
+    assert(Expect != Token::Invalid);
+  }
+  return {P, Expect};
+}
+
+TEST(CXXRecoveryTest, Declaration) {
+  // We have 3 different positions in the TokenStream:
+  //   - Begin: where we start to parse the code (always 0 in the test)
+  //   - Cursor: where the parse failed (specified by the first ^)
+  //   - End: expected ends of the range (specified by the second ^)
+  const char *Tests[] = {
+      R"cpp(
+        namespace ^??? { }
+        ^Foo foo;
+      )cpp",
+      R"cpp(
+        auto lambda = [] ^xxx () xxx {} ;
+        ^Foo foo;
+      )cpp",
+      R"cpp(
+        LLVM_DISCARD int ^bar() {}
+        ^int;
+      )cpp",
+      // Recovery should at least consume 1 token.
+      R"cpp(
+        int
+        ^int
+        ^
+      )cpp",
+      // FIXME: we skip too much, Foo is a better end point.
+      R"cpp(
+        LLVM_DISCARD int ^bar() {}
+        Foo foo();
+        ^int;
+      )cpp",
+      R"cpp(
+        void Foo::AcquireLease() ^LOCKS_EXCLUDED(mutex_) {}
+
+        Foo abc();
+        ^
+      )cpp",
+      // FIXME: skipp too much, we should end at private.
+      R"cpp(
+        class Foo {
+        public:
+          void abc() ^ERRROR
+        private:
+          int a;
+        };
+        ^
+      )cpp",
+  };
+  for (const auto *T : Tests) {
+    llvm::Annotations Code(T);
+    const auto &[RP, ExpectedEnd] = extract(Code);
+    EXPECT_EQ(pseudo::cxx::recoverNextDeclaration(RP), ExpectedEnd)
+        << Code.code();
+  }
+}
+} // namespace
+} // namespace clang::pseudo
Index: clang-tools-extra/pseudo/unittests/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -5,6 +5,7 @@
 add_custom_target(ClangPseudoUnitTests)
 add_unittest(ClangPseudoUnitTests ClangPseudoTests
   BracketTest.cpp
+  CXXRecoveryTest.cpp
   CXXTest.cpp
   DirectiveTreeTest.cpp
   DisambiguateTest.cpp
Index: clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/cxx/recovery-declarations.cpp
@@ -0,0 +1,30 @@
+// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
+
+this is not a valid declaration;
+// CHECK: declaration := <opaque>
+
+int x[0];
+// CHECK:      simple-declaration := decl-specifier-seq init-declarator-list ;
+// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[7]
+// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator [ constant-expression ]
+
+also_not_ok ??? @@@ (;
+// CHECK: declaration := <opaque>
+
+class Y{};
+// CHECK:      simple-declaration := decl-specifier-seq ;
+// CHECK-NEXT: ├─decl-specifier-seq~class-specifier := class-head { }
+
+// FIXME: support recovery of incomplete scopes
+// namespace foo {
+
+class class class class function
+// CHECK: declaration := <opaque>
+
+export {
+;
+?
+// CHECK:      declaration-seq := declaration-seq declaration
+// CHECK-NEXT: ├─declaration-seq~; := tok[34]
+// CHECK-NEXT: └─declaration := <opaque>
+}
Index: clang-tools-extra/pseudo/lib/cxx/cxx.bnf
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/cxx.bnf
+++ clang-tools-extra/pseudo/lib/cxx/cxx.bnf
@@ -304,8 +304,8 @@
 declaration-statement := block-declaration
 
 # gram.dcl
-declaration-seq := declaration
-declaration-seq := declaration-seq declaration
+declaration-seq := declaration [recover=NextDeclaration]
+declaration-seq := declaration-seq declaration [recover=NextDeclaration]
 declaration := block-declaration
 declaration := nodeclspec-function-declaration
 declaration := function-definition
@@ -554,7 +554,7 @@
 class-key := CLASS
 class-key := STRUCT
 class-key := UNION
-member-specification := member-declaration member-specification_opt
+member-specification := member-declaration [recover=NextDeclaration] member-specification_opt
 member-specification := access-specifier : member-specification_opt
 member-declaration := decl-specifier-seq member-declarator-list_opt ;
 member-declaration := member-declarator-list ;
Index: clang-tools-extra/pseudo/lib/cxx/Recovery.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/cxx/Recovery.cpp
@@ -0,0 +1,121 @@
+//===--- Recovery.cpp - ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/cxx/Recovery.h"
+#include "clang-pseudo/Language.h"
+#include "clang/Basic/TokenKinds.h"
+
+namespace clang::pseudo::cxx {
+
+Token::Index recoverNextDeclaration(const RecoveryParams &P) {
+  if (P.Begin == P.Tokens.tokens().size())
+    return Token::Invalid;
+  // Declarations that don't end in semicolons:
+  //   function-definition
+  //   EXPORT {...}
+  //   EXTERN string-literal {...}
+  //   INLINE_opt NAMESPACE IDENTIFIER {...}
+  //   INLINE_opt NAMESPACE {...}
+  //   NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER {...}
+  // Functions are hard to do anything about:
+  //  - if the declarator is good and the braces are present, parsing succeeds
+  //  - if the declarator is bad, it's hard to know it's a function
+  //  - if the braces are missing, we don't know where to recover
+  // So we only try to detect the others.
+  bool AcceptBraces = false;
+  auto ConsiderForBraces = [&](tok::TokenKind K) {
+    if (K == tok::kw_export || K == tok::kw_extern || K == tok::kw_namespace)
+      AcceptBraces = true;
+  };
+
+  // Some tokens are pretty good indicators of a declaration starting, when
+  // they appear at the beginning of a line.
+  auto LikelyStartsDeclaration = [&](tok::TokenKind K) -> bool {
+    switch (K) {
+    // Things you might be declaring.
+    case tok::kw_template:
+    case tok::kw_using:
+    case tok::kw_namespace:
+    case tok::kw_typedef:
+    case tok::kw_class:
+    case tok::kw_struct:
+    case tok::kw_union:
+    case tok::kw_friend:
+    // Types & other decl-specifiers
+    // FIXME: consider IDENTIFIER
+    case tok::kw_void:
+    case tok::kw_auto:
+    case tok::kw_int:
+    case tok::kw_char:
+    case tok::kw_float:
+    case tok::kw_double:
+    case tok::kw_long:
+    case tok::kw_signed:
+    case tok::kw_unsigned:
+    case tok::kw_const:
+    case tok::kw_volatile:
+    case tok::kw_static:
+    case tok::kw_extern:
+    case tok::kw_inline:
+    case tok::kw_explicit:
+    case tok::kw_virtual:
+    // Modules
+    case tok::kw_import:
+    case tok::kw_export:
+    // Class protection
+    case tok::kw_public:
+    case tok::kw_private:
+    case tok::kw_protected:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  // The indentation level where the declaration started hints at where a new
+  // sibling declaration might start.
+  unsigned OrigIndent = P.Tokens.tokens()[P.Begin].Indent;
+
+  // Walk over tokens at the appropriate bracket nesting level.
+  for (const Token *T = &P.Tokens.tokens()[P.Begin]; T->Kind != tok::eof;
+       T = &T->next()) {
+    auto TIndex = P.Tokens.index(*T);
+    // Handle (paired) brackets.
+    if (const Token *PairedT = T->pair()) {
+      // Closing bracket not paired within the recovery range.
+      auto PairedTIndex = P.Tokens.index(*PairedT);
+      if (PairedT < T) {
+        assert(PairedTIndex < P.Begin && "Why didn't we skip this one?");
+        assert(TIndex >= P.Cursor &&
+               "How did we parse an unmatched closing bracket?!");
+        return TIndex;
+      }
+      // Opening bracket: is this a semicolonless statement?
+      if (T->Kind == tok::l_brace && AcceptBraces && PairedTIndex >= P.Cursor)
+        return PairedTIndex + 1;
+      // Skip over opening/closing bracket pair.
+      T = PairedT;
+      continue;
+    }
+    ConsiderForBraces(T->Kind);
+
+    // Semicolons terminate statements.
+    if (T->Kind == tok::semi && TIndex >= P.Cursor)
+      return TIndex + 1;
+
+    // Stop if it looks like a new declaration is starting on the next line.
+    if (TIndex > P.Cursor // ensure cursor is moving forward after recovery
+        && (TIndex > 0 && T->Line != T->prev().Line) // start on the next line.
+        && T->Indent == OrigIndent && LikelyStartsDeclaration(T->Kind))
+      return TIndex;
+  }
+  // If we reached EOF, then assume the declaration runs right to EOF.
+  return P.Tokens.tokens().size();
+}
+
+} // namespace clang::pseudo::cxx
Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -9,6 +9,7 @@
 #include "clang-pseudo/cxx/CXX.h"
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/Language.h"
+#include "clang-pseudo/cxx/Recovery.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/CharInfo.h"
@@ -410,14 +411,14 @@
 #undef SYMBOL_GUARD
 }
 
-Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) {
-  assert(Begin > 0);
-  const Token &Left = Tokens.tokens()[Begin - 1];
+Token::Index recoverBrackets(const RecoveryParams &P) {
+  assert(P.Begin > 0);
+  const Token &Left = P.Tokens.tokens()[P.Begin - 1];
   assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren ||
          Left.Kind == tok::l_square);
   if (const Token *Right = Left.pair()) {
-    assert(Tokens.index(*Right) > Begin - 1);
-    return Tokens.index(*Right);
+    assert(P.Tokens.index(*Right) > P.Begin - 1);
+    return P.Tokens.index(*Right);
   }
   return Token::Invalid;
 }
@@ -425,6 +426,7 @@
 llvm::DenseMap<ExtensionID, RecoveryStrategy> buildRecoveryStrategies() {
   return {
       {Extension::Brackets, recoverBrackets},
+      {Extension::NextDeclaration, recoverNextDeclaration},
   };
 }
 
Index: clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
@@ -4,6 +4,7 @@
 
 add_clang_library(clangPseudoCXX
   CXX.cpp
+  Recovery.cpp
 
   DEPENDS
   cxx_gen
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -28,11 +28,12 @@
 namespace {
 
 Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
+                                  Token::Index Cursor,
                                   const TokenStream &Tokens,
                                   const Language &Lang) {
   assert(Strategy != 0);
   if (auto S = Lang.RecoveryStrategies.lookup(Strategy))
-    return S(Begin, Tokens);
+    return S({Begin, Cursor, Tokens});
   return Token::Invalid;
 }
 
@@ -142,7 +143,7 @@
       break;
 
     auto End = findRecoveryEndpoint(Option.Strategy, Option.Position,
-                                    Params.Code, Lang);
+                                    TokenIndex, Params.Code, Lang);
     // Recovery may not take the parse backwards.
     if (End == Token::Invalid || End < TokenIndex)
       continue;
Index: clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/cxx/Recovery.h
@@ -0,0 +1,33 @@
+//===--- Recovery.h - C++ specific recovery heuristics ------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_CXX_RECOVERY_H
+#define CLANG_PSEUDO_CXX_RECOVERY_H
+
+#include "clang-pseudo/Language.h"
+
+namespace clang::pseudo::cxx {
+
+// Heuristically finds the end of range of a declaration if parsing fails while
+// attempting to parse a declaration.
+//
+// Returns an *excluded* end of token range. Given the broken code below,
+//
+//    int a = ? garbage ?;
+//    double;
+//
+// We failed to parse the first declaration, the function returns the index of
+// `double` token. We skip up to the first `;` token and assume tokens in
+// [P.Begin, recoverNextDeclaration(P)) range was a declaration.
+//
+// Returns Token::Invalid if we can't find an end point.
+Token::Index recoverNextDeclaration(const RecoveryParams &P);
+
+} // namespace clang::pseudo::cxx
+
+#endif // CLANG_PSEUDO_CXX_RECOVERY_H
Index: clang-tools-extra/pseudo/include/clang-pseudo/Language.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Language.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Language.h
@@ -34,6 +34,14 @@
 // Return true if the guard is satisfied.
 using RuleGuard = llvm::function_ref<bool(const GuardParams &)>;
 
+struct RecoveryParams {
+  // The position in the stream where the node being recovered will start.
+  Token::Index Begin;
+  // The position in the stream where the parse failed.
+  // Begin <= Cursor <= End (for a valid recovery).
+  Token::Index Cursor;
+  const TokenStream &Tokens;
+};
 // A recovery strategy determines a region of code to skip when parsing fails.
 //
 // For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`,
@@ -43,7 +51,7 @@
 // The provided index is the token where the skipped region begins.
 // Returns the (excluded) end of the range, or Token::Invalid for no recovery.
 using RecoveryStrategy =
-    llvm::function_ref<Token::Index(Token::Index Start, const TokenStream &)>;
+    llvm::function_ref<Token::Index(const RecoveryParams &)>;
 
 // Specify a language that can be parsed by the pseduoparser.
 struct Language {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to