[PATCH] D127357: [pseudo] wip/prototype: use LR0 instead of SLR1 table

Sam McCall via Phabricator via cfe-commits Tue, 21 Jun 2022 09:10:29 -0700

sammccall updated this revision to Diff 438732.
sammccall added a comment.

tweak & document fast-path



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D127357/new/

https://reviews.llvm.org/D127357

Files:
  clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
  clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/cxx/CXX.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/lr-build-basic.test
  clang-tools-extra/pseudo/test/lr-build-conflicts.test
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp
  clang-tools-extra/pseudo/unittests/GrammarTest.cpp
  clang-tools-extra/pseudo/unittests/LRTableTest.cpp

Index: clang-tools-extra/pseudo/unittests/LRTableTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/LRTableTest.cpp
+++ clang-tools-extra/pseudo/unittests/LRTableTest.cpp
@@ -17,36 +17,33 @@
 namespace pseudo {
 namespace {
 
+using testing::ElementsAre;
 using testing::IsEmpty;
 using testing::UnorderedElementsAre;
-using Action = LRTable::Action;
 
 TEST(LRTable, Builder) {
-  GrammarTable GTable;
-
+  GrammarTable GT;
   //           eof   semi  ...
   // +-------+----+-------+---
   // |state0 |    | s0,r0 |...
   // |state1 | acc|       |...
   // |state2 |    |  r1   |...
   // +-------+----+-------+---
-  std::vector<LRTable::Entry> Entries = {
-      {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
-      {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
-      {/* State */ 1, tokenSymbol(tok::eof), Action::reduce(2)},
-      {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
-  GrammarTable GT;
-  LRTable T = LRTable::buildForTests(GT, Entries);
-  EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
-  EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
-              UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
-  EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
-              UnorderedElementsAre(Action::reduce(2)));
-  EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
-  EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
-              UnorderedElementsAre(Action::reduce(1)));
+  LRTable::Builder Builder;
+  Builder.StateCount = 3;
+  Builder.Shift[{/*State=*/0, tokenSymbol(tok::semi)}] = 0;
+  Builder.Reduce.push_back({/*State=*/0, /*Rule=*/0});
+  Builder.Reduce.push_back({/*State=*/1, /*Rule=*/2});
+  Builder.Reduce.push_back({/*State=*/2, /*Rule=*/1});
+  LRTable T = std::move(Builder).build();
+  EXPECT_EQ(T.getShiftState(0, tokenSymbol(tok::eof)), llvm::None);
+  EXPECT_EQ(T.getShiftState(0, tokenSymbol(tok::semi)), LRTable::StateID{0});
+  EXPECT_THAT(T.getReduceRules(0), ElementsAre(0));
+  EXPECT_EQ(T.getShiftState(1, tokenSymbol(tok::semi)), llvm::None);
+  EXPECT_THAT(T.getReduceRules(1), ElementsAre(2));
+  EXPECT_THAT(T.getReduceRules(2), ElementsAre(1));
   // Verify the behaivor for other non-available-actions terminals.
-  EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
+  EXPECT_EQ(T.getShiftState(2, tokenSymbol(tok::kw_int)), llvm::None);
 }
 
 } // namespace
Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -142,66 +142,6 @@
                          "Unknown attribute 'unknown'"));
 }
 
-TEST_F(GrammarTest, FirstAndFollowSets) {
-  build(
-      R"bnf(
-_ := expr
-expr := expr - term
-expr := term
-term := IDENTIFIER
-term := ( expr )
-)bnf");
-  ASSERT_TRUE(Diags.empty());
-  auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
-    std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
-    for (SymbolID ID = 0; ID < Input.size(); ++ID)
-      Sets.emplace_back(ID, std::move(Input[ID]));
-    return Sets;
-  };
-
-  EXPECT_THAT(
-      ToPairs(firstSets(*G)),
-      UnorderedElementsAre(
-          Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
-          Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
-          Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
-  EXPECT_THAT(
-      ToPairs(followSets(*G)),
-      UnorderedElementsAre(
-          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
-          Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
-          Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
-
-  build(R"bnf(
-# A simplfied C++ decl-specifier-seq.
-_ := decl-specifier-seq
-decl-specifier-seq := decl-specifier decl-specifier-seq
-decl-specifier-seq := decl-specifier
-decl-specifier := simple-type-specifier
-decl-specifier := INLINE
-simple-type-specifier := INT
-   )bnf");
-  ASSERT_TRUE(Diags.empty());
-  EXPECT_THAT(
-      ToPairs(firstSets(*G)),
-      UnorderedElementsAre(
-          Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
-          Pair(id("decl-specifier-seq"),
-               UnorderedElementsAre(id("INLINE"), id("INT"))),
-          Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
-          Pair(id("decl-specifier"),
-               UnorderedElementsAre(id("INLINE"), id("INT")))));
-  EXPECT_THAT(
-      ToPairs(followSets(*G)),
-      UnorderedElementsAre(
-          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
-          Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
-          Pair(id("decl-specifier"),
-               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
-          Pair(id("simple-type-specifier"),
-               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
-}
-
 } // namespace
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -29,8 +29,9 @@
 
 namespace {
 
-using Action = LRTable::Action;
 using testing::AllOf;
+using testing::ElementsAre;
+using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
@@ -83,17 +84,10 @@
     return 0;
   }
 
-  NewHeadCallback captureNewHeads() {
-    return [this](const GSS::Node *NewHead) {
-      NewHeadResults.push_back(NewHead);
-    };
-  };
-
 protected:
   std::unique_ptr<Grammar> G;
   ForestArena Arena;
   GSS GSStack;
-  std::vector<const GSS::Node*> NewHeadResults;
 };
 
 TEST_F(GLRTest, ShiftMergingHeads) {
@@ -107,33 +101,35 @@
   //   0---1---4
   //   â---2---â
   //   â---3---5
+  LRTable::Builder LR;
+  LR.StateCount = 6;
+  LR.Shift[{1, tokenSymbol(tok::semi)}] = 4;
+  LR.Shift[{2, tokenSymbol(tok::semi)}] = 4;
+  LR.Shift[{3, tokenSymbol(tok::semi)}] = 5;
+
   auto *GSSNode0 =
       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
-  auto *GSSNode1 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+  auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
                                    /*Parents=*/{GSSNode0});
-  auto *GSSNode2 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+  auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
                                    /*Parents=*/{GSSNode0});
-  auto *GSSNode3 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+  auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
                                    /*Parents=*/{GSSNode0});
 
   buildGrammar({}, {}); // Create a fake empty grammar.
-  LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{});
+  LRTable T = std::move(LR).build();
 
   ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
-  std::vector<ParseStep> PendingShift = {
-      {GSSNode1, Action::shift(4)},
-      {GSSNode3, Action::shift(5)},
-      {GSSNode2, Action::shift(4)},
-  };
-  glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSStack},
-           captureNewHeads());
-
-  EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
-                                  AllOf(state(4), parsedSymbol(&SemiTerminal),
-                                        parents({GSSNode1, GSSNode2})),
-                                  AllOf(state(5), parsedSymbol(&SemiTerminal),
-                                        parents({GSSNode3}))))
-      << NewHeadResults;
+  std::vector<const GSS::Node *> NewHeads;
+  glrShift({GSSNode1, GSSNode3, GSSNode2}, SemiTerminal,
+           {*G, T, Arena, GSStack}, NewHeads);
+
+  EXPECT_THAT(NewHeads, UnorderedElementsAre(
+                            AllOf(state(4), parsedSymbol(&SemiTerminal),
+                                  parents({GSSNode1, GSSNode2})),
+                            AllOf(state(5), parsedSymbol(&SemiTerminal),
+                                  parents({GSSNode3}))))
+      << NewHeads;
 }
 
 TEST_F(GLRTest, ReduceConflictsSplitting) {
@@ -146,26 +142,28 @@
   buildGrammar({"class-name", "enum-name"},
                {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
 
-  LRTable Table = LRTable::buildForTests(
-      G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)},
-                   {/*State=*/0, id("enum-name"), Action::goTo(3)}});
+  LRTable::Builder LR;
+  LR.StateCount = 4;
+  LR.GoTo[{0, id("class-name")}] = 2;
+  LR.GoTo[{0, id("enum-name")}] = 3;
+  LR.Reduce.push_back({1, ruleFor("class-name")});
+  LR.Reduce.push_back({1, ruleFor("enum-name")});
+  LRTable Table = std::move(LR).build();
 
   const auto *GSSNode0 =
       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
   const auto *GSSNode1 =
-      GSStack.addNode(3, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
-
-  std::vector<ParseStep> PendingReduce = {
-      {GSSNode1, Action::reduce(ruleFor("class-name"))},
-      {GSSNode1, Action::reduce(ruleFor("enum-name"))}};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
-            captureNewHeads());
-  EXPECT_THAT(NewHeadResults,
-              testing::UnorderedElementsAre(
-                  AllOf(state(2), parsedSymbolID(id("class-name")),
-                        parents({GSSNode0})),
-                  AllOf(state(3), parsedSymbolID(id("enum-name")),
-                        parents({GSSNode0})))) << NewHeadResults;
+      GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
+
+  std::vector<const GSS::Node *> Heads = {GSSNode1};
+  glrReduce(Heads, {*G, Table, Arena, GSStack});
+  EXPECT_THAT(Heads, UnorderedElementsAre(
+                         GSSNode1,
+                         AllOf(state(2), parsedSymbolID(id("class-name")),
+                               parents({GSSNode0})),
+                         AllOf(state(3), parsedSymbolID(id("enum-name")),
+                               parents({GSSNode0}))))
+      << Heads;
 }
 
 TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
@@ -189,24 +187,26 @@
       /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
       /*Parents=*/{GSSNode2, GSSNode3});
 
-  LRTable Table = LRTable::buildForTests(
-      G->table(),
-      {{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)},
-       {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}});
-  std::vector<ParseStep> PendingReduce = {
-      {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
-            captureNewHeads());
-
-  EXPECT_THAT(NewHeadResults,
-              testing::UnorderedElementsAre(
-                  AllOf(state(5), parsedSymbolID(id("ptr-operator")),
-                        parents({GSSNode2})),
-                  AllOf(state(6), parsedSymbolID(id("ptr-operator")),
-                        parents({GSSNode3})))) << NewHeadResults;
+  LRTable::Builder LR;
+  LR.StateCount = 7;
+  LR.GoTo[{2, id("ptr-operator")}] = 5;
+  LR.GoTo[{3, id("ptr-operator")}] = 6;
+  LR.Reduce.push_back({4, ruleFor("ptr-operator")});
+  LRTable Table = std::move(LR).build();
+
+  std::vector<const GSS::Node *> Heads = {GSSNode4};
+  glrReduce(Heads, {*G, Table, Arena, GSStack});
+
+  EXPECT_THAT(Heads, UnorderedElementsAre(
+                         GSSNode4,
+                         AllOf(state(5), parsedSymbolID(id("ptr-operator")),
+                               parents({GSSNode2})),
+                         AllOf(state(6), parsedSymbolID(id("ptr-operator")),
+                               parents({GSSNode3}))))
+      << Heads;
   // Verify that the payload of the two new heads is shared, only a single
   // ptr-operator node is created in the forest.
-  EXPECT_EQ(NewHeadResults[0]->Payload, NewHeadResults[1]->Payload);
+  EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
 }
 
 TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
@@ -238,28 +238,26 @@
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
 
-  LRTable Table = LRTable::buildForTests(
-      G->table(),
-      {{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)},
-       {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}});
+  LRTable::Builder LR;
+  LR.StateCount = 6;
+  LR.GoTo[{1, id("type-name")}] = 5;
+  LR.GoTo[{2, id("type-name")}] = 5;
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
-  std::vector<ParseStep> PendingReduce = {
-      {
-          GSSNode3, Action::reduce(/*RuleID=*/0) // type-name := class-name
-      },
-      {
-          GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name
-      }};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
-            captureNewHeads());
+  LR.Reduce.push_back({3, /* type-name := class-name */0});
+  LR.Reduce.push_back({4, /* type-name := enum-name */1});
+  LRTable Table = std::move(LR).build();
+
+  std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
+  glrReduce(Heads, {*G, Table, Arena, GSStack});
 
   // Verify that the stack heads are joint at state 5 after reduces.
-  EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(AllOf(
-                                  state(5), parsedSymbolID(id("type-name")),
-                                  parents({GSSNode1, GSSNode2}))))
-      << NewHeadResults;
+  EXPECT_THAT(Heads, ElementsAre(GSSNode3, GSSNode4,
+                                          AllOf(state(5),
+                                                parsedSymbolID(id("type-name")),
+                                                parents({GSSNode1, GSSNode2}))))
+      << Heads;
   // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
-  EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G),
+  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(*G),
             "[  1, end) type-name := <ambiguous>\n"
             "[  1, end) ââtype-name := class-name\n"
             "[  1, end) â ââclass-name := <opaque>\n"
@@ -295,25 +293,22 @@
   const auto *GSSNode4 =
       GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
                       /*Parents=*/{GSSNode2});
-
-  LRTable Table = LRTable::buildForTests(
-      G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}});
+  LRTable::Builder LR;
+  LR.StateCount = 6;
+  LR.GoTo[{0, id("pointer")}] = 5;
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
-  std::vector<ParseStep> PendingReduce = {
-      {
-          GSSNode3, Action::reduce(/*RuleID=*/0) // pointer := class-name *
-      },
-      {
-          GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name *
-      }};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
-            captureNewHeads());
-
-  EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
-                                  AllOf(state(5), parsedSymbolID(id("pointer")),
-                                        parents({GSSNode0}))))
-      << NewHeadResults;
-  EXPECT_EQ(NewHeadResults.front()->Payload->dumpRecursive(*G),
+  LR.Reduce.push_back({3, /* pointer := class-name */0});
+  LR.Reduce.push_back({4, /* pointer := enum-name */1});
+  LRTable Table = std::move(LR).build();
+
+  std::vector<const GSS::Node *> Heads = { GSSNode3, GSSNode4 };
+  glrReduce(Heads, {*G, Table, Arena, GSStack});
+
+  EXPECT_THAT(Heads, ElementsAre(GSSNode3, GSSNode4,
+                                 AllOf(state(5), parsedSymbolID(id("pointer")),
+                                       parents({GSSNode0}))))
+      << Heads;
+  EXPECT_EQ(Heads.back()->Payload->dumpRecursive(*G),
             "[  0, end) pointer := <ambiguous>\n"
             "[  0, end) ââpointer := class-name *\n"
             "[  0,   1) â ââclass-name := <opaque>\n"
@@ -342,7 +337,7 @@
   )bnf");
   clang::LangOptions LOptions;
   const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
+  auto LRTable = LRTable::buildLR0(*G);
 
   const ForestNode &Parsed =
       glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
@@ -380,7 +375,7 @@
   )bnf");
   clang::LangOptions LOptions;
   const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
+  auto LRTable = LRTable::buildLR0(*G);
 
   const ForestNode &Parsed =
       glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
@@ -405,7 +400,7 @@
   // of the nonterminal `test` when the next token is `eof`, verify that the
   // parser stops at the right state.
   const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
+  auto LRTable = LRTable::buildLR0(*G);
 
   const ForestNode &Parsed =
       glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -108,7 +108,7 @@
       llvm::outs() << G->dump();
     if (PrintGraph)
       llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
-    auto LRTable = clang::pseudo::LRTable::buildSLR(*G);
+    auto LRTable = clang::pseudo::LRTable::buildLR0(*G);
     if (PrintTable)
       llvm::outs() << LRTable.dumpForTests(*G);
     if (PrintStatistics)
Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -35,12 +35,10 @@
 # TABLE-NEXT: State 1
 # TABLE-NEXT:     '-': shift state 3
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := IDENTIFIER'
-# TABLE-NEXT:     '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT:     reduce by rule 1 'expr := IDENTIFIER'
 # TABLE-NEXT: State 3
 # TABLE-NEXT:     'IDENTIFIER': shift state 2
 # TABLE-NEXT:     'expr': go to state 4
 # TABLE-NEXT: State 4
-# TABLE-NEXT:     'EOF': reduce by rule 0 'expr := expr - expr'
+# TABLE-NEXT:     reduce by rule 0 'expr := expr - expr'
 # TABLE-NEXT:     '-': shift state 3
-# TABLE-NEXT:     '-': reduce by rule 0 'expr := expr - expr'
Index: clang-tools-extra/pseudo/test/lr-build-basic.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-basic.test
+++ clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -23,6 +23,6 @@
 # TABLE-NEXT:     'id': go to state 2
 # TABLE-NEXT: State 1
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := id'
+# TABLE-NEXT:     reduce by rule 1 'expr := id'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'EOF': reduce by rule 0 'id := IDENTIFIER'
+# TABLE-NEXT:     reduce by rule 0 'id := IDENTIFIER'
Index: clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -9,130 +9,40 @@
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/LRGraph.h"
 #include "clang-pseudo/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
 #include <cstdint>
 
-namespace llvm {
-template <> struct DenseMapInfo<clang::pseudo::LRTable::Entry> {
-  using Entry = clang::pseudo::LRTable::Entry;
-  static inline Entry getEmptyKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-1), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static inline Entry getTombstoneKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-2), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static unsigned getHashValue(const Entry &I) {
-    return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
-  }
-  static bool isEqual(const Entry &LHS, const Entry &RHS) {
-    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
-           LHS.Act == RHS.Act;
-  }
-};
-} // namespace llvm
-
 namespace clang {
 namespace pseudo {
 
-class LRTable::Builder {
-public:
-  Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
-      : StartStates(StartStates) {}
-
-  bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
-  LRTable build(const GrammarTable &GT, unsigned NumStates) && {
-    // E.g. given the following parsing table with 3 states and 3 terminals:
-    //
-    //            a    b     c
-    // +-------+----+-------+-+
-    // |state0 |    | s0,r0 | |
-    // |state1 | acc|       | |
-    // |state2 |    |  r1   | |
-    // +-------+----+-------+-+
-    //
-    // The final LRTable:
-    //  - StateOffset: [s0] = 0, [s1] = 2, [s2] = 3, [sentinel] = 4
-    //  - Symbols:     [ b,   b,   a,  b]
-    //    Actions:     [ s0, r0, acc, r1]
-    //                   ~~~~~~ range for state 0
-    //                           ~~~~ range for state 1
-    //                                ~~ range for state 2
-    // First step, we sort all entries by (State, Symbol, Action).
-    std::vector<Entry> Sorted(Entries.begin(), Entries.end());
-    llvm::sort(Sorted, [](const Entry &L, const Entry &R) {
-      return std::forward_as_tuple(L.State, L.Symbol, L.Act.opaque()) <
-             std::forward_as_tuple(R.State, R.Symbol, R.Act.opaque());
-    });
-
-    LRTable Table;
-    Table.Actions.reserve(Sorted.size());
-    Table.Symbols.reserve(Sorted.size());
-    // We are good to finalize the States and Actions.
-    for (const auto &E : Sorted) {
-      Table.Actions.push_back(E.Act);
-      Table.Symbols.push_back(E.Symbol);
-    }
-    // Initialize the terminal and nonterminal offset, all ranges are empty by
-    // default.
-    Table.StateOffset = std::vector<uint32_t>(NumStates + 1, 0);
-    size_t SortedIndex = 0;
-    for (StateID State = 0; State < Table.StateOffset.size(); ++State) {
-      Table.StateOffset[State] = SortedIndex;
-      while (SortedIndex < Sorted.size() && Sorted[SortedIndex].State == State)
-        ++SortedIndex;
-    }
-    Table.StartStates = std::move(StartStates);
-    return Table;
-  }
-
-private:
-  llvm::DenseSet<Entry> Entries;
-  std::vector<std::pair<SymbolID, StateID>> StartStates;
-};
+LRTable LRTable::buildLR0(const Grammar &G) {
+  auto Graph = LRGraph::buildLR0(G);
+  assert(Graph.states().size() <= (1 << StateBits) &&
+         "Graph states execceds the maximum limit!");
 
-LRTable LRTable::buildForTests(const GrammarTable &GT,
-                               llvm::ArrayRef<Entry> Entries) {
-  StateID MaxState = 0;
-  for (const auto &Entry : Entries)
-    MaxState = std::max(MaxState, Entry.State);
-  Builder Build({});
-  for (const Entry &E : Entries)
-    Build.insert(E);
-  return std::move(Build).build(GT, /*NumStates=*/MaxState + 1);
-}
+  Builder B;
+  B.StartStates = Graph.startStates();
+  B.StateCount = Graph.states().size();
 
-LRTable LRTable::buildSLR(const Grammar &G) {
-  auto Graph = LRGraph::buildLR0(G);
-  Builder Build(Graph.startStates());
   for (const auto &T : Graph.edges()) {
-    Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
-    Build.insert({T.Src, T.Label, Act});
+    (isToken(T.Label) ? B.Shift : B.GoTo)
+        .try_emplace(StateSymbol{T.Src, T.Label}, T.Dst);
   }
-  assert(Graph.states().size() <= (1 << StateBits) &&
-         "Graph states execceds the maximum limit!");
-  auto FollowSets = followSets(G);
+
   for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
     for (const Item &I : Graph.states()[SID].Items) {
-      // If we've just parsed the start symbol, this means we successfully parse
-      // the input. We don't add the reduce action of `_ := start_symbol` in the
-      // LRTable (the GLR parser handles it specifically).
-      if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext())
-        continue;
       if (!I.hasNext()) {
-        // If we've reached the end of a rule A := ..., then we can reduce if
-        // the next token is in the follow set of A.
-        for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) {
-          assert(isToken(Follow));
-          Build.insert({SID, Follow, Action::reduce(I.rule())});
-        }
+        // If we've just parsed the start symbol, this means we successfully
+        // parse the input. We don't add the reduce action of `_ :=
+        // start_symbol` in the LRTable (the GLR parser handles it
+        // specifically).
+        if (G.lookupRule(I.rule()).Target == G.underscore())
+          continue;
+        // If we've reached the end of a rule A := ..., then we can reduce.
+        B.Reduce.push_back({SID, I.rule()});
       }
     }
   }
-  return std::move(Build).build(G.table(), Graph.states().size());
+  return LRTable(std::move(B));
 }
 
 } // namespace pseudo
Index: clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
@@ -10,35 +10,19 @@
 #include "clang-pseudo/Grammar.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace clang {
 namespace pseudo {
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
-  switch (A.kind()) {
-  case LRTable::Action::Shift:
-    return OS << llvm::formatv("shift state {0}", A.getShiftState());
-  case LRTable::Action::Reduce:
-    return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
-  case LRTable::Action::GoTo:
-    return OS << llvm::formatv("go to state {0}", A.getGoToState());
-  case LRTable::Action::Sentinel:
-    llvm_unreachable("unexpected Sentinel action kind!");
-  }
-  llvm_unreachable("unexpected action kind!");
-}
-
 std::string LRTable::dumpStatistics() const {
   return llvm::formatv(R"(
 Statistics of the LR parsing table:
-    number of states: {0}
-    number of actions: {1}
-    size of the table (bytes): {2}
+    number of actions: shift={0} reduce={1} goto={2}
+    size of the table (bytes): {3}
 )",
-                       StateOffset.size() - 1, Actions.size(), bytes())
+                       Shift.size(), Reduce.size(), Goto.size(), bytes())
       .str();
 }
 
@@ -46,66 +30,28 @@
   std::string Result;
   llvm::raw_string_ostream OS(Result);
   OS << "LRTable:\n";
-  for (StateID S = 0; S < StateOffset.size() - 1; ++S) {
+  for (StateID S = 0; S < Reduce.keys(); ++S) {
     OS << llvm::formatv("State {0}\n", S);
+    for (RuleID R : getReduceRules(S))
+      OS.indent(4) << llvm::formatv("reduce by rule {0} '{1}'\n", R,
+                                    G.dumpRule(R));
     for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
       SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
-      for (auto A : find(S, TokID)) {
-        if (A.kind() == LRTable::Action::Shift)
-          OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
-                                        G.symbolName(TokID), A.getShiftState());
-        else if (A.kind() == LRTable::Action::Reduce)
-          OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
-                                        G.symbolName(TokID), A.getReduceRule(),
-                                        G.dumpRule(A.getReduceRule()));
-      }
+      if (auto Next = getShiftState(S, TokID))
+        OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
+                                      G.symbolName(TokID), *Next);
     }
     for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
          ++NontermID) {
-      if (find(S, NontermID).empty())
-        continue;
-      OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
-                                    G.symbolName(NontermID),
-                                    getGoToState(S, NontermID));
+      auto It = Goto.find(StateSymbol{S, NontermID});
+      if (It != Goto.end())
+        OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
+                                      G.symbolName(NontermID), It->second);
     }
   }
   return OS.str();
 }
 
-llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
-                                                    SymbolID Terminal) const {
-  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
-  return find(State, Terminal);
-}
-
-LRTable::StateID LRTable::getGoToState(StateID State,
-                                       SymbolID Nonterminal) const {
-  assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
-  auto Result = find(State, Nonterminal);
-  assert(Result.size() == 1 && Result.front().kind() == Action::GoTo);
-  return Result.front().getGoToState();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
-  assert(Src + 1u < StateOffset.size());
-  std::pair<size_t, size_t> Range =
-      std::make_pair(StateOffset[Src], StateOffset[Src + 1]);
-  auto SymbolRange = llvm::makeArrayRef(Symbols.data() + Range.first,
-                                        Symbols.data() + Range.second);
-
-  assert(llvm::is_sorted(SymbolRange) &&
-         "subrange of the Symbols should be sorted!");
-  const LRTable::StateID *Start =
-      llvm::partition_point(SymbolRange, [&ID](SymbolID S) { return S < ID; });
-  if (Start == SymbolRange.end())
-    return {};
-  const LRTable::StateID *End = Start;
-  while (End != SymbolRange.end() && *End == ID)
-    ++End;
-  return llvm::makeArrayRef(&Actions[Start - Symbols.data()],
-                            /*length=*/End - Start);
-}
-
 LRTable::StateID LRTable::getStartState(SymbolID Target) const {
   assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
   auto It = llvm::partition_point(
@@ -117,5 +63,14 @@
   return It->second;
 }
 
+LRTable::LRTable(Builder B)
+    : Shift(std::move(B.Shift)), Reduce(std::move(B.Reduce), B.StateCount),
+      Goto(std::move(B.GoTo)), StartStates(std::move(B.StartStates)) {
+  assert(llvm::all_of(Shift,
+                      [&](auto &E) { return E.first.State < B.StateCount; }));
+  assert(llvm::all_of(Goto,
+                      [&](auto &E) { return E.first.State < B.StateCount; }));
+}
+
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -87,87 +87,6 @@
   return OS.str();
 }
 
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
-  std::vector<llvm::DenseSet<SymbolID>> FirstSets(
-      G.table().Nonterminals.size());
-  auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
-    assert(isNonterminal(Target));
-    if (isToken(First))
-      return FirstSets[Target].insert(First).second;
-    bool Changed = false;
-    for (SymbolID SID : FirstSets[First])
-      Changed |= FirstSets[Target].insert(SID).second;
-    return Changed;
-  };
-
-  // A rule S := T ... implies elements in FIRST(S):
-  //  - if T is a terminal, FIRST(S) contains T
-  //  - if T is a nonterminal, FIRST(S) contains FIRST(T)
-  // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
-  // end up being incomplete.
-  // We iterate until we hit a fixed point.
-  // (This isn't particularly efficient, but table building isn't on the
-  // critical path).
-  bool Changed = true;
-  while (Changed) {
-    Changed = false;
-    for (const auto &R : G.table().Rules)
-      // We only need to consider the first element because symbols are
-      // non-nullable.
-      Changed |= ExpandFirstSet(R.Target, R.seq().front());
-  }
-  return FirstSets;
-}
-
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
-  auto FirstSets = firstSets(G);
-  std::vector<llvm::DenseSet<SymbolID>> FollowSets(
-      G.table().Nonterminals.size());
-  // Expand the follow set of a nonterminal symbol Y by adding all from the
-  // given symbol set.
-  auto ExpandFollowSet = [&FollowSets](SymbolID Y,
-                                       const llvm::DenseSet<SymbolID> &ToAdd) {
-    assert(isNonterminal(Y));
-    bool Changed = false;
-    for (SymbolID F : ToAdd)
-      Changed |= FollowSets[Y].insert(F).second;
-    return Changed;
-  };
-  // Follow sets is computed based on the following 3 rules, the computation
-  // is completed at a fixed point where there is no more new symbols can be
-  // added to any of the follow sets.
-  //
-  // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
-  // augmented grammar, in our case it is '_'.
-  FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
-  bool Changed = true;
-  while (Changed) {
-    Changed = false;
-    for (const auto &R : G.table().Rules) {
-      // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
-      // FOLLOW(Y).
-      for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
-        if (isToken(R.seq()[I]))
-          continue;
-        // We only need to consider the next symbol because symbols are
-        // non-nullable.
-        SymbolID Next = R.seq()[I + 1];
-        if (isToken(Next))
-          // First set for a terminal is itself.
-          Changed |= ExpandFollowSet(R.seq()[I], {Next});
-        else
-          Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
-      }
-      // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
-      // FOLLOW(Z).
-      SymbolID Z = R.seq().back();
-      if (isNonterminal(Z))
-        Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
-    }
-  }
-  return FollowSets;
-}
-
 static llvm::ArrayRef<std::string> getTerminalNames() {
   static const auto &TerminalNames = []() {
     auto TerminalNames = new std::string[NumTerminals];
Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -25,7 +25,7 @@
 }
 
 const LRTable &getLRTable() {
-  static LRTable *Table = new LRTable(LRTable::buildSLR(getGrammar()));
+  static LRTable *Table = new LRTable(LRTable::buildLR0(getGrammar()));
   return *Table;
 }
 
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -46,67 +46,35 @@
   auto &GSS = Params.GSStack;
 
   // Lists of active shift, reduce actions.
-  std::vector<ParseStep> PendingShift, PendingReduce;
-  auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) {
-    for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) {
-      switch (Action.kind()) {
-      case LRTable::Action::Shift:
-        PendingShift.push_back({Head, Action});
-        break;
-      case LRTable::Action::Reduce:
-        PendingReduce.push_back({Head, Action});
-        break;
-      default:
-        llvm_unreachable("unexpected action kind!");
-      }
-    }
-  };
   StateID StartState = Params.Table.getStartState(StartSymbol);
-  std::vector<const GSS::Node *> NewHeads = {
+  std::vector<const GSS::Node *> Heads = {
       GSS.addNode(/*State=*/StartState,
                   /*ForestNode=*/nullptr, {})};
+  std::vector<const GSS::Node *> NextHeads;
   auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
-    assert(PendingShift.empty() && PendingReduce.empty() &&
-           "Running GC at the wrong time!");
-
+    assert(NextHeads.empty() && "Running GC at the wrong time!");
     if (++I != 20) // Run periodically to balance CPU and memory usage.
       return;
     I = 0;
 
     // We need to copy the list: Roots is consumed by the GC.
-    Roots = NewHeads;
+    Roots = Heads;
     GSS.gc(std::move(Roots));
   };
   for (const ForestNode &Terminal : Terminals) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
                                              G.symbolName(Terminal.symbol()),
                                              Terminal.symbol()));
-    for (const auto *Head : NewHeads)
-      AddSteps(Head, Terminal.symbol());
-    NewHeads.clear();
-    glrReduce(PendingReduce, Params,
-              [&](const GSS::Node * NewHead) {
-                // A reduce will enable more steps.
-                AddSteps(NewHead, Terminal.symbol());
-              });
-
-    glrShift(PendingShift, Terminal, Params,
-             [&](const GSS::Node *NewHead) { NewHeads.push_back(NewHead); });
+    glrShift(Heads, Terminal, Params, NextHeads);
+    std::swap(Heads, NextHeads);
+    glrReduce(Heads, Params);
+    NextHeads.clear();
     MaybeGC();
   }
-  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n"));
-  for (const auto *Heads : NewHeads)
-    AddSteps(Heads, tokenSymbol(tok::eof));
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
   // Collect new heads created from the final reduce.
-  std::vector<const GSS::Node*> Heads;
-  glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) {
-    Heads.push_back(NewHead);
-    // A reduce will enable more steps.
-    AddSteps(NewHead, tokenSymbol(tok::eof));
-  });
-
   const ForestNode *Result = nullptr;
   for (const auto *Head : Heads) {
     if (Head->State == AcceptState) {
@@ -138,42 +106,40 @@
 // After the shift action, the GSS is:
 //   0---1---2---4
 //       â---3---â
-void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NewTok,
-              const ParseParams &Params, NewHeadCallback NewHeadCB) {
+void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
+              const ForestNode &NewTok, const ParseParams &Params,
+              std::vector<const GSS::Node *> &NewHeads) {
   assert(NewTok.kind() == ForestNode::Terminal);
-  assert(llvm::all_of(PendingShift,
-                      [](const ParseStep &Step) {
-                        return Step.Action.kind() == LRTable::Action::Shift;
-                      }) &&
-         "Pending shift actions must be shift actions");
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("  Shift {0} ({1} active heads):\n",
                                            Params.G.symbolName(NewTok.symbol()),
-                                           PendingShift.size()));
+                                           OldHeads.size()));
 
   // We group pending shifts by their target state so we can merge them.
-  llvm::stable_sort(PendingShift, [](const ParseStep &L, const ParseStep &R) {
-    return L.Action.getShiftState() < R.Action.getShiftState();
-  });
-  auto Rest = llvm::makeArrayRef(PendingShift);
+  llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
+  for (const auto* H : OldHeads)
+    if (auto S = Params.Table.getShiftState(H->State, NewTok.symbol()))
+      Shifts.push_back({*S, H});
+  llvm::stable_sort(Shifts, llvm::less_first{});
+
+  auto Rest = llvm::makeArrayRef(Shifts);
   llvm::SmallVector<const GSS::Node *> Parents;
   while (!Rest.empty()) {
     // Collect the batch of PendingShift that have compatible shift states.
     // Their heads become TempParents, the parents of the new GSS node.
-    StateID NextState = Rest.front().Action.getShiftState();
+    StateID NextState = Rest.front().first;
 
     Parents.clear();
     for (const auto &Base : Rest) {
-      if (Base.Action.getShiftState() != NextState)
+      if (Base.first != NextState)
         break;
-      Parents.push_back(Base.Head);
+      Parents.push_back(Base.second);
     }
     Rest = Rest.drop_front(Parents.size());
 
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv("    --> S{0} ({1} heads)\n",
                                              NextState, Parents.size()));
-    NewHeadCB(Params.GSStack.addNode(NextState, &NewTok, Parents));
+    NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
   }
-  PendingShift.clear();
 }
 
 namespace {
@@ -231,8 +197,8 @@
 //   After reducing 3 by `pointer := class-name STAR` and
 //                  2 by`enum-name := class-name STAR`:
 //     0--5(pointer)       // 5 is goto(0, pointer)
-void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
-               NewHeadCallback NewHeadCB) {
+void glrReduce(std::vector<const GSS::Node *> &Heads,
+               const ParseParams &Params) {
   // There are two interacting complications:
   // 1.  Performing one reduce can unlock new reduces on the newly-created head.
   // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
@@ -291,6 +257,36 @@
   KeyedQueue<Family, PushSpec> Sequences;
 
   Sequence TempSequence;
+
+  // We treat Heads as a queue of Pop operations still to be performed.
+  // PoppedHeads is our position within it.
+  unsigned PoppedHeads = 0;
+  // In general the sequencing is complicated: each pop can yield multiple
+  // pending pushes that might run in a different order than we found them.
+  // However in trivial cases (only pop that yields only one push) we can
+  // bypass all these fancy queues and pop+push directly. This is very common.
+  auto PopAndPushTrivial = [&]() -> bool {
+    if (!Sequences.empty() || Heads.size() != PoppedHeads + 1)
+      return false;
+    const GSS::Node *Head = Heads.back();
+    auto Rules = Params.Table.getReduceRules(Head->State);
+    if (Rules.size() != 1)
+      return false;
+    const auto &Rule = Params.G.lookupRule(Rules.front());
+    const GSS::Node *Base = Head;
+    TempSequence.resize_for_overwrite(Rule.Size);
+    for (unsigned I = 0; I < Rule.Size; ++I) {
+      if (Base->parents().size() != 1)
+        return false;
+      TempSequence[Rule.Size - 1 - I] = Base->Payload;
+      Base = Base->parents().front();
+    }
+    const ForestNode *Parsed =
+        &Params.Forest.createSequence(Rule.Target, Rules.front(), TempSequence);
+    StateID NextState = Params.Table.getGoToState(Base->State, Rule.Target);
+    Heads.push_back(Params.GSStack.addNode(NextState, Parsed, {Base}));
+    return true;
+  };
   // Pop walks up the parent chain(s) for a reduction from Head by to Rule.
   // Once we reach the end, record the bases and sequences.
   auto Pop = [&](const GSS::Node *Head, RuleID RID) {
@@ -312,9 +308,12 @@
     DFS(Head, 0, DFS);
   };
   auto PopPending = [&] {
-    for (const ParseStep &Pending : PendingReduce)
-      Pop(Pending.Head, Pending.Action.getReduceRule());
-    PendingReduce.clear();
+    for (; PoppedHeads < Heads.size(); ++PoppedHeads) {
+      if (PopAndPushTrivial())
+        continue;
+      for (RuleID R : Params.Table.getReduceRules(Heads[PoppedHeads]->State))
+        Pop(Heads[PoppedHeads], R);
+    }
   };
 
   std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
@@ -379,9 +378,7 @@
       }
       BasesLeft = BasesLeft.drop_front(Parents.size());
 
-      // Invoking the callback for new heads, a real GLR parser may add new
-      // reduces to the PendingReduce queue!
-      NewHeadCB(Params.GSStack.addNode(NextState, Parsed, Parents));
+      Heads.push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
     }
     PopPending();
   }
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
@@ -38,9 +38,39 @@
 
 #include "clang-pseudo/Grammar.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Capacity.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 #include <vector>
 
+namespace clang {
+namespace pseudo {
+struct StateSymbol {
+  using StateID = uint16_t; // XXX
+  StateID State;
+  SymbolID Symbol;
+};
+} // namespace pseudo
+} // namespace clang
+namespace llvm {
+template <> struct llvm::DenseMapInfo<clang::pseudo::StateSymbol> {
+  using StateSymbol = clang::pseudo::StateSymbol;
+  static inline StateSymbol getEmptyKey() {
+    return StateSymbol{StateSymbol::StateID(-1), 0};
+  }
+  static inline StateSymbol getTombstoneKey() {
+    return StateSymbol{StateSymbol::StateID(-1), 1};
+  }
+  static unsigned getHashValue(const StateSymbol &Val) {
+    return (Val.State * 2754435761U) ^ Val.Symbol; // Knuth hash
+  }
+  static bool isEqual(const StateSymbol &LHS, const StateSymbol &RHS) {
+    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol;
+  }
+};
+} // namespace llvm
 namespace clang {
 namespace pseudo {
 
@@ -60,79 +90,26 @@
   using StateID = uint16_t;
   static constexpr unsigned StateBits = 13;
 
-  // Action represents the terminal and nonterminal actions, it combines the
-  // entry of the ACTION and GOTO tables from the LR literature.
-  class Action {
-  public:
-    enum Kind : uint8_t {
-      Sentinel = 0,
-      // Terminal actions, corresponding to entries of ACTION table.
-
-      // Shift to state n: move forward with the lookahead, and push state n
-      // onto the state stack.
-      // A shift is a forward transition, and the value n is the next state that
-      // the parser is to enter.
-      Shift,
-      // Reduce by a rule: pop the state stack.
-      Reduce,
-
-      // NOTE: there are no typical accept actions in the LRtable, accept
-      // actions are handled specifically in the parser -- if the parser
-      // reaches to a target state (which is goto(StartState, StartSymbol)) at
-      // the EOF token after a reduce, this indicates the input has been parsed
-      // as the StartSymbol successfully.
-
-      // Nonterminal actions, corresponding to entry of GOTO table.
-
-      // Go to state n: push state n onto the state stack.
-      // Similar to Shift, but it is a nonterminal forward transition.
-      GoTo,
-    };
-
-    static Action goTo(StateID S) { return Action(GoTo, S); }
-    static Action shift(StateID S) { return Action(Shift, S); }
-    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
-    static Action sentinel() { return Action(Sentinel, 0); }
-
-    StateID getShiftState() const {
-      assert(kind() == Shift);
-      return Value;
-    }
-    StateID getGoToState() const {
-      assert(kind() == GoTo);
-      return Value;
-    }
-    RuleID getReduceRule() const {
-      assert(kind() == Reduce);
-      return Value;
-    }
-    Kind kind() const { return static_cast<Kind>(K); }
-
-    bool operator==(const Action &L) const { return opaque() == L.opaque(); }
-    uint16_t opaque() const { return K << ValueBits | Value; };
-
-  private:
-    Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
-    static constexpr unsigned ValueBits = StateBits;
-    static constexpr unsigned KindBits = 3;
-    static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
-    static_assert(KindBits + ValueBits <= 16,
-                  "Must be able to store kind and value efficiently");
-    uint16_t K : KindBits;
-    // Either StateID or RuleID, depending on the Kind.
-    uint16_t Value : ValueBits;
-  };
-
-  // Returns all available actions for the given state on a terminal.
-  // Expected to be called by LR parsers.
-  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
-  // Returns the state after we reduce a nonterminal.
-  // Expected to be called by LR parsers.
-  StateID getGoToState(StateID State, SymbolID Nonterminal) const;
-
-  // Looks up available actions.
-  // Returns empty if no available actions in the table.
-  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+  // Return the list of reductions applicable from a given state.
+  // These correspond to items with the dot at the end.
+  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
+    return Reduce.find(State);
+  }
+  // Return the state to transition to after shifting a token Terminal.
+  // Returns None if this terminal is not legal here.
+  llvm::Optional<StateID> getShiftState(StateID From, SymbolID Terminal) const {
+    auto It = Shift.find(StateSymbol{From, Terminal});
+    if (It == Shift.end())
+      return llvm::None;
+    return It->second;
+  }
+  // Return the state to transition to after reducing a symbol Nonterminal.
+  // REQUIRES: this nonterminal is legal here.
+  StateID getGoToState(StateID From, SymbolID Nonterminal) const {
+    auto It = Goto.find(StateSymbol{From, Nonterminal});
+    assert(It != Goto.end());
+    return It->second;
+  }
 
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
@@ -146,47 +123,75 @@
   StateID getStartState(SymbolID StartSymbol) const;
 
   size_t bytes() const {
-    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
-           Symbols.capacity() * sizeof(SymbolID) +
-           StateOffset.capacity() * sizeof(uint32_t);
+    return sizeof(*this) + llvm::capacity_in_bytes(Shift) +
+           llvm::capacity_in_bytes(Goto) + Reduce.bytes();
   }
 
   std::string dumpStatistics() const;
   std::string dumpForTests(const Grammar &G) const;
 
-  // Build a SLR(1) parsing table.
-  static LRTable buildSLR(const Grammar &G);
+  static LRTable buildLR0(const Grammar &G);
+  struct Builder {
+    unsigned StateCount = 0;
+    std::vector<std::pair<StateID, RuleID>> Reduce;
+    llvm::DenseMap<StateSymbol, StateID> Shift;
+    llvm::DenseMap<StateSymbol, StateID> GoTo;
+    std::vector<std::pair<SymbolID, StateID>> StartStates;
 
-  class Builder;
-  // Represents an entry in the table, used for building the LRTable.
-  struct Entry {
-    StateID State;
-    SymbolID Symbol;
-    Action Act;
+    LRTable build() && { return LRTable(std::move(*this)); }
   };
-  // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
 
 private:
-  // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
-  // Our physical representation is quite different for compactness.
-
-  // Index is StateID, value is the offset into Symbols/Actions
-  // where the entries for this state begin.
-  // Give a state id, the corresponding half-open range of Symbols/Actions is
-  // [StateOffset[id], StateOffset[id+1]).
-  std::vector<uint32_t> StateOffset;
-  // Parallel to Actions, the value is SymbolID (columns of the matrix).
-  // Grouped by the StateID, and only subranges are sorted.
-  std::vector<SymbolID> Symbols;
-  // A flat list of available actions, sorted by (State, SymbolID).
-  std::vector<Action> Actions;
+  // A multimap from K => V, where Ks form a dense range [0, n).
+  // A flat array stores the values:
+  //   Values = [ values for k=0 | values for k=1 | values for k=2 | ... ]
+  // And another stores the index for each K:
+  //   Keys = [ index for k=0, index for k=1, ... , Values.size()]
+  // Lookup[k] is Values[ Keys[k]...Keys[k+1] ]
+  template <typename K, typename V> struct OffsetTable {
+    std::vector<uint32_t> Keys;
+    std::vector<V> Values;
+
+    OffsetTable(std::vector<std::pair<K, V>> &&Pairs, K Limit) {
+      assert(llvm::all_of(Pairs, [&](auto &P) { return P.first < Limit; }));
+      llvm::stable_sort(Pairs, llvm::less_first{});
+      Keys.reserve(Limit + 1);
+      Values.reserve(Pairs.size());
+      unsigned I = 0;
+      for (K Key = 0; Key < Limit; ++Key) {
+        Keys.push_back(Values.size());
+        while (I < Pairs.size() && Pairs[I].first == Key)
+          Values.push_back(Pairs[I++].second);
+      }
+      Keys.push_back(Values.size());
+      assert(Values.size() == Pairs.size());
+      assert(Keys.size() == Limit + 1);
+    }
+
+    size_t bytes() const {
+      return sizeof(*this) + llvm::capacity_in_bytes(Keys) +
+             llvm::capacity_in_bytes(Values);
+    }
+    size_t size() const { return Values.size(); }
+    size_t keys() const { return Keys.size() - 1; }
+
+    llvm::ArrayRef<V> find(K Key) const {
+      return llvm::makeArrayRef(&Values[Keys[Key]], &Values[Keys[Key + 1]]);
+    }
+  };
+
+  llvm::DenseMap<StateSymbol, StateID> Shift;
+  OffsetTable<StateID, RuleID> Reduce;
+  llvm::DenseMap<StateSymbol, StateID> Goto;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  LRTable(Builder B);
 };
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 
 } // namespace pseudo
 } // namespace clang
 
+namespace llvm {} // namespace llvm
+
 #endif // CLANG_PSEUDO_LRTABLE_H
Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -174,12 +174,6 @@
   // the augmented grammar.)
   SymbolID Underscore;
 };
-// For each nonterminal X, computes the set of terminals that begin strings
-// derived from X. (Known as FIRST sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
-// For each nonterminal X, computes the set of terminals that could immediately
-// follow X. (Known as FOLLOW sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
 
 // Storage for the underlying data of the Grammar.
 // It can be constructed dynamically (from compiling BNF file) or statically
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -132,34 +132,23 @@
 const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params,
                            SymbolID StartSymbol);
 
-// An active stack head can have multiple available actions (reduce/reduce
-// actions, reduce/shift actions).
-// A step is any one action applied to any one stack head.
-struct ParseStep {
-  // A specific stack head.
-  const GSS::Node *Head = nullptr;
-  // An action associated with the head.
-  LRTable::Action Action = LRTable::Action::sentinel();
-};
-// A callback is invoked whenever a new GSS head is created during the GLR
-// parsing process (glrShift, or glrReduce).
-using NewHeadCallback = std::function<void(const GSS::Node *)>;
 // Apply all PendingShift actions on a given GSS state, newly-created heads are
 // passed to the callback.
 //
 // When this function returns, PendingShift is empty.
 //
 // Exposed for testing only.
-void glrShift(std::vector<ParseStep> &PendingShift, const ForestNode &NextTok,
-              const ParseParams &Params, NewHeadCallback NewHeadCB);
+void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
+              const ForestNode &NextTok, const ParseParams &Params,
+              std::vector<const GSS::Node *> &NewHeads);
 // Applies PendingReduce actions, until no more reduce actions are available.
 //
 // When this function returns, PendingReduce is empty. Calls to NewHeadCB may
 // add elements to PendingReduce
 //
 // Exposed for testing only.
-void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
-               NewHeadCallback NewHeadCB);
+void glrReduce(std::vector<const GSS::Node *> &Heads,
+               const ParseParams &Params);
 
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
===================================================================
--- clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -25,7 +25,7 @@
 class Fuzzer {
   clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
   std::unique_ptr<Grammar> G;
-  LRTable T;
+  llvm::Optional<LRTable> T;
   bool Print;
 
 public:
@@ -44,7 +44,7 @@
         llvm::errs() << Diag << "\n";
       std::exit(1);
     }
-    T = LRTable::buildSLR(*G);
+    T = LRTable::buildLR0(*G);
   }
 
   void operator()(llvm::StringRef Code) {
@@ -58,9 +58,9 @@
 
     clang::pseudo::ForestArena Arena;
     clang::pseudo::GSS GSS;
-    auto &Root =
-        glrParse(ParseableStream, clang::pseudo::ParseParams{*G, T, Arena, GSS},
-                 *G->findNonterminal("translation-unit"));
+    auto &Root = glrParse(ParseableStream,
+                          clang::pseudo::ParseParams{*G, *T, Arena, GSS},
+                          *G->findNonterminal("translation-unit"));
     if (Print)
       llvm::outs() << Root.dumpRecursive(*G);
   }
Index: clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
===================================================================
--- clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
@@ -77,11 +77,11 @@
 }
 BENCHMARK(parseBNF);
 
-static void buildSLR(benchmark::State &State) {
+static void buildLR0(benchmark::State &State) {
   for (auto _ : State)
-    LRTable::buildSLR(*G);
+    LRTable::buildLR0(*G);
 }
-BENCHMARK(buildSLR);
+BENCHMARK(buildLR0);
 
 TokenStream lexAndPreprocess() {
   clang::LangOptions LangOpts = genericLangOpts();
@@ -129,7 +129,7 @@
 BENCHMARK(preprocess);
 
 static void glrParse(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
+  LRTable Table = clang::pseudo::LRTable::buildLR0(*G);
   SymbolID StartSymbol = *G->findNonterminal("translation-unit");
   TokenStream Stream = lexAndPreprocess();
   for (auto _ : State) {
@@ -143,7 +143,7 @@
 BENCHMARK(glrParse);
 
 static void full(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
+  LRTable Table = clang::pseudo::LRTable::buildLR0(*G);
   SymbolID StartSymbol = *G->findNonterminal("translation-unit");
   for (auto _ : State) {
     TokenStream Stream = lexAndPreprocess();

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D127357: [pseudo] wip/prototype: use LR0 instead of SLR1 table

Reply via email to