sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added a project: All.
sammccall requested review of this revision.
Herald added subscribers: cfe-commits, alextsao1999.
Herald added a project: clang-tools-extra.

Our GLR uses lookahead: only perform reductions that might be consumed by the
shift immediately following. However when shift fails and so reduce is followed
by recovery instead, this restriction is incorrect and leads to missing heads.

In turn this means certain recovery strategies can't be made to work. e.g.

  ns := NAMESPACE { namespace-body } [recover=Skip]
  ns-body := namespace_opt

When `namespace { namespace {` is parsed, we can recover the inner `ns` (using
the `Skip` strategy to ignore the missing `}`). However this `namespace` will
not be reduced to a `namespace-body` as EOF is not in the follow-set, and so we
are unable to recover the outer `ns`.

This patch fixes this by tracking which heads were produced by constrained
reduce, and discarding and rebuilding them before performing recovery.

This is a prerequisite for the `Skip` strategy mentioned above, though there are
some other limitations we need to address too.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D130523

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -604,6 +604,33 @@
             "[  5, end) └─} := tok[5]\n");
 }
 
+TEST_F(GLRTest, RecoverUnrestrictedReduce) {
+  // Here, ! is not in any rule and therefore not in the follow set of `word`.
+  // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
+
+  build(R"bnf(
+    _ := sentence
+
+    word := IDENTIFIER
+    sentence := word word [recover=AcceptAnyTokenInstead]
+  )bnf");
+
+  clang::LangOptions LOptions;
+  const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
+  TestLang.Table = LRTable::buildSLR(TestLang.G);
+  TestLang.RecoveryStrategies.try_emplace(
+      extensionID("AcceptAnyTokenInstead"),
+      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
+
+  const ForestNode &Parsed =
+      glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
+  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
+            "[  0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
+            "[  0,   1) ├─word := IDENTIFIER\n"
+            "[  0,   1) │ └─IDENTIFIER := tok[0]\n"
+            "[  1, end) └─word := <opaque>\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -43,7 +43,6 @@
                 unsigned &TokenIndex, const ParseParams &Params,
                 const Language &Lang,
                 std::vector<const GSS::Node *> &NewHeads) {
-  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
   // Describes a possibility to recover by forcibly interpreting a range of
   // tokens around the cursor as a nonterminal that we expected to see.
   struct PlaceholderRecovery {
@@ -597,6 +596,10 @@
   std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
                                                       /*ForestNode=*/nullptr,
                                                       {})};
+  // Invariant: Heads is partitioned by source: {shifted | reduced}.
+  // HeadsPartition is the index of the first head formed by reduction.
+  // We use this to discard and recreate the reduced heads during recovery.
+  unsigned HeadsPartition = 0;
   std::vector<const GSS::Node *> NextHeads;
   auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
     assert(NextHeads.empty() && "Running GC at the wrong time!");
@@ -619,8 +622,17 @@
     // If we weren't able to consume the token, try to skip over some tokens
     // so we can keep parsing.
     if (NextHeads.empty()) {
-      // FIXME: Heads may not be fully reduced, because our reductions were
-      // constrained by lookahead (but lookahead is meaningless to recovery).
+      // The reduction in the previous round was constrained by lookahead.
+      // On valid code this only rejects dead ends, but on broken code we should
+      // consider all possibilities.
+      //
+      // We discard all heads formed by reduction, and recreate them without
+      // this constraint. This may duplicate some nodes, but it's rare.
+      LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
+                                 "Re-reducing without lookahead.");
+      Heads.resize(HeadsPartition);
+      Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
+
       glrRecover(Heads, I, Params, Lang, NextHeads);
       if (NextHeads.empty())
         // FIXME: Ensure the `_ := start-symbol` rules have a fallback
@@ -632,6 +644,7 @@
     // Form nonterminals containing the token we just consumed.
     SymbolID Lookahead =
         I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
+    HeadsPartition = NextHeads.size();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -105,7 +105,11 @@
   bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
     assert(isToken(Terminal));
     assert(isNonterminal(Nonterminal));
-    return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
+    // tok::unknown is a sentinel value used in recovery: can follow anything.
+    if (tok::unknown)
+      return true;
+    return Terminal == tokenSymbol(tok::unknown) ||
+           FollowSets.test(tok::NUM_TOKENS * Nonterminal +
                            symbolToToken(Terminal));
   }
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to