hokein updated this revision to Diff 435889.
hokein added a comment.

- add comments;
- add unittest and lit test;
- misc improvements;


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D127448/new/

https://reviews.llvm.org/D127448

Files:
  clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
  clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
  clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
  clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
  clang-tools-extra/pseudo/gen/Main.cpp
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/Language.h
  clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
  clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
  clang-tools-extra/pseudo/lib/CMakeLists.txt
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/cli/CLI.cpp
  clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
  clang-tools-extra/pseudo/lib/cxx.bnf
  clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
  clang-tools-extra/pseudo/lib/cxx/CXX.cpp
  clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
  clang-tools-extra/pseudo/tool/CMakeLists.txt
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -48,7 +48,15 @@
     std::vector<std::string> Diags;
     G = Grammar::parseBNF(GrammarBNF, Diags);
   }
-
+  // FIXME: move to TokenStream class.
+  TokenStream emptyTokenStream() {
+    TokenStream Empty;
+    Empty.finalize();
+    return Empty;
+  }
+  ParseLang getTestLang() {
+    return {*G, Table, Guards};
+  }
   void buildGrammar(std::vector<std::string> Nonterminals,
                     std::vector<std::string> Rules) {
     Nonterminals.push_back("_");
@@ -72,7 +80,13 @@
     ADD_FAILURE() << "No such symbol found: " << Name;
     return 0;
   }
-
+  ExtensionID extensionID(llvm::StringRef AttrValueName) const {
+    for (unsigned I = 0; I < G->table().AttributeValues.size(); ++I)
+      if (G->table().AttributeValues[I] == AttrValueName)
+        return static_cast<ExtensionID>(I);
+    ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
+    return 0;
+  }
   RuleID ruleFor(llvm::StringRef NonterminalName) const {
     auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange;
     if (RuleRange.End - RuleRange.Start == 1)
@@ -91,6 +105,8 @@
 
 protected:
   std::unique_ptr<Grammar> G;
+  LRTable Table;
+  llvm::DenseMap<ExtensionID, Guard> Guards;
   ForestArena Arena;
   GSS GSStack;
   std::vector<const GSS::Node*> NewHeadResults;
@@ -117,7 +133,7 @@
                                    /*Parents=*/{GSSNode0});
 
   buildGrammar({}, {}); // Create a fake empty grammar.
-  LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{});
+  Table = LRTable::buildForTests(G->table(), /*Entries=*/{});
 
   ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
   std::vector<ParseStep> PendingShift = {
@@ -125,7 +141,7 @@
       {GSSNode3, Action::shift(5)},
       {GSSNode2, Action::shift(4)},
   };
-  glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSStack},
+  glrShift(PendingShift, SemiTerminal,  {getTestLang(), Arena, GSStack},
            captureNewHeads());
 
   EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
@@ -146,7 +162,7 @@
   buildGrammar({"class-name", "enum-name"},
                {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
 
-  LRTable Table = LRTable::buildForTests(
+  Table = LRTable::buildForTests(
       G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)},
                    {/*State=*/0, id("enum-name"), Action::goTo(3)}});
 
@@ -158,7 +174,7 @@
   std::vector<ParseStep> PendingReduce = {
       {GSSNode1, Action::reduce(ruleFor("class-name"))},
       {GSSNode1, Action::reduce(ruleFor("enum-name"))}};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
+  glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(),
             captureNewHeads());
   EXPECT_THAT(NewHeadResults,
               testing::UnorderedElementsAre(
@@ -189,13 +205,13 @@
       /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
       /*Parents=*/{GSSNode2, GSSNode3});
 
-  LRTable Table = LRTable::buildForTests(
+  Table = LRTable::buildForTests(
       G->table(),
       {{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)},
        {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}});
   std::vector<ParseStep> PendingReduce = {
       {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
+  glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(),
             captureNewHeads());
 
   EXPECT_THAT(NewHeadResults,
@@ -238,7 +254,7 @@
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
 
-  LRTable Table = LRTable::buildForTests(
+  Table = LRTable::buildForTests(
       G->table(),
       {{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)},
        {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}});
@@ -250,7 +266,7 @@
       {
           GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name
       }};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
+  glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(),
             captureNewHeads());
 
   // Verify that the stack heads are joint at state 5 after reduces.
@@ -296,7 +312,7 @@
       GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
                       /*Parents=*/{GSSNode2});
 
-  LRTable Table = LRTable::buildForTests(
+  Table = LRTable::buildForTests(
       G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}});
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
   std::vector<ParseStep> PendingReduce = {
@@ -306,7 +322,7 @@
       {
           GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name *
       }};
-  glrReduce(PendingReduce, {*G, Table, Arena, GSStack},
+  glrReduce(PendingReduce,{getTestLang(), Arena, GSStack}, emptyTokenStream(),
             captureNewHeads());
 
   EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre(
@@ -340,12 +356,12 @@
     left-paren := {
     expr := IDENTIFIER
   )bnf");
+  Table = LRTable::buildSLR(*G);
   clang::LangOptions LOptions;
   const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
 
   const ForestNode &Parsed =
-      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
+      glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test"));
   // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
   // in the forest, see the `#1` and `=#1` in the dump string.
   EXPECT_EQ(Parsed.dumpRecursive(*G),
@@ -380,10 +396,10 @@
   )bnf");
   clang::LangOptions LOptions;
   const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
+  Table = LRTable::buildSLR(*G);
 
   const ForestNode &Parsed =
-      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
+      glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test"));
   EXPECT_EQ(Parsed.dumpRecursive(*G),
             "[  0, end) test := <ambiguous>\n"
             "[  0, end) ├─test := IDENTIFIER\n"
@@ -405,10 +421,10 @@
   // of the nonterminal `test` when the next token is `eof`, verify that the
   // parser stops at the right state.
   const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
-  auto LRTable = LRTable::buildSLR(*G);
+  Table = LRTable::buildSLR(*G);
 
   const ForestNode &Parsed =
-      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
+      glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test"));
   EXPECT_EQ(Parsed.dumpRecursive(*G),
             "[  0, end) test := IDENTIFIER test\n"
             "[  0,   1) ├─IDENTIFIER := tok[0]\n"
@@ -416,6 +432,36 @@
             "[  1, end)   └─IDENTIFIER := tok[1]\n");
 }
 
+TEST_F(GLRTest, GuardExtension) {
+  build(R"bnf(
+    _ := start
+
+    start := IDENTIFIER [guard=TestOnly]
+  )bnf");
+  Guards.insert(std::make_pair(
+      extensionID("TestOnly"),
+      [&](llvm::ArrayRef<const ForestNode *> RHS, const TokenStream &Tokens) {
+        assert(RHS.size() == 1 &&
+               RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
+        return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "test";
+      }));
+  clang::LangOptions LOptions;
+  Table = LRTable::buildSLR(*G);
+
+  std::string Input = "test";
+  const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
+  EXPECT_EQ(glrParse(Succeeded, {getTestLang(), Arena, GSStack}, id("start"))
+                .dumpRecursive(*G),
+            "[  0, end) start := IDENTIFIER [guard=TestOnly]\n"
+            "[  0, end) └─IDENTIFIER := tok[0]\n");
+
+  Input = "notest";
+  const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
+  EXPECT_EQ(glrParse(Failed, {getTestLang(), Arena, GSStack}, id("start"))
+                .dumpRecursive(*G),
+            "[  0, end) start := <opaque>\n");
+}
+
 TEST(GSSTest, GC) {
   //      ┌-A-┬-AB
   //      ├-B-┘
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/DirectiveTree.h"
 #include "clang-pseudo/GLR.h"
 #include "clang-pseudo/Token.h"
+#include "clang-pseudo/cli/CLI.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRGraph.h"
 #include "clang-pseudo/grammar/LRTable.h"
@@ -20,14 +21,11 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
 
-using clang::pseudo::Grammar;
 using clang::pseudo::TokenStream;
 using llvm::cl::desc;
 using llvm::cl::init;
 using llvm::cl::opt;
 
-static opt<std::string>
-    Grammar("grammar", desc("Parse and check a BNF grammar file."), init(""));
 static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar."));
 static opt<bool> PrintGraph("print-graph",
                             desc("Print the LR graph for the grammar"));
@@ -93,42 +91,34 @@
     pairBrackets(*ParseableStream);
   }
 
-  if (Grammar.getNumOccurrences()) {
-    std::string Text = readOrDie(Grammar);
-    std::vector<std::string> Diags;
-    auto G = Grammar::parseBNF(Text, Diags);
-
-    if (!Diags.empty()) {
-      llvm::errs() << llvm::join(Diags, "\n");
-      return 2;
-    }
-    llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n",
-                                  Grammar);
+  if (true) {
+    const auto &Lang = clang::pseudo::cli::getLanguage();
     if (PrintGrammar)
-      llvm::outs() << G->dump();
+      llvm::outs() << Lang.G.dump();
     if (PrintGraph)
-      llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
-    auto LRTable = clang::pseudo::LRTable::buildSLR(*G);
+      llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests(
+          Lang.G);
+
     if (PrintTable)
-      llvm::outs() << LRTable.dumpForTests(*G);
+      llvm::outs() << Lang.Table.dumpForTests(Lang.G);
     if (PrintStatistics)
-      llvm::outs() << LRTable.dumpStatistics();
+      llvm::outs() << Lang.Table.dumpStatistics();
 
     if (ParseableStream) {
       clang::pseudo::ForestArena Arena;
       clang::pseudo::GSS GSS;
       llvm::Optional<clang::pseudo::SymbolID> StartSymID =
-          G->findNonterminal(StartSymbol);
+          Lang.G.findNonterminal(StartSymbol);
       if (!StartSymID) {
         llvm::errs() << llvm::formatv(
-            "The start symbol {0} doesn't exit in the grammar!\n", Grammar);
+            "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol);
         return 2;
       }
-      auto &Root = glrParse(*ParseableStream,
-                            clang::pseudo::ParseParams{*G, LRTable, Arena, GSS},
-                            *StartSymID);
+      auto &Root =
+          glrParse(*ParseableStream,
+                   clang::pseudo::ParseParams{Lang, Arena, GSS}, *StartSymID);
       if (PrintForest)
-        llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true);
+        llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/true);
 
       if (PrintStatistics) {
         llvm::outs() << "Forest bytes: " << Arena.bytes()
Index: clang-tools-extra/pseudo/tool/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/tool/CMakeLists.txt
+++ clang-tools-extra/pseudo/tool/CMakeLists.txt
@@ -13,5 +13,6 @@
   PRIVATE
   clangPseudo
   clangPseudoGrammar
+  clangPseudoCLI
   )
 
Index: clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
@@ -0,0 +1,9 @@
+// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
+// Verify that the contextual-{final,override} rules are guarded conditionally:
+// there is no ambiguous parsing for the virt-specifier.
+class Foo {
+    void foo1() override;
+// CHECK: virt-specifier-seq~IDENTIFIER := tok[7]
+    void foo2() final;
+// CHECK: virt-specifier-seq~IDENTIFIER := tok[13]
+};
\ No newline at end of file
Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -7,16 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/cxx/CXX.h"
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Language.h"
+#include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
+#include <utility>
 
 namespace clang {
 namespace pseudo {
 namespace cxx {
-
+namespace {
 static const char *CXXBNF =
 #include "CXXBNF.inc"
     ;
-
 const Grammar &getGrammar() {
   static std::vector<std::string> Diags;
   static Grammar *G = Grammar::parseBNF(CXXBNF, Diags).release();
@@ -29,6 +32,29 @@
   return *Table;
 }
 
+bool guardOverride(llvm::ArrayRef<const ForestNode *> RHS,
+                   const TokenStream &Tokens) {
+  assert(RHS.size() == 1 &&
+         RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
+  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "override";
+}
+bool guardFinal(llvm::ArrayRef<const ForestNode *> RHS,
+                const TokenStream &Tokens) {
+  assert(RHS.size() == 1 &&
+         RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
+  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "final";
+}
+} // namespace
+
+const ParseLang &getLanguage() {
+  static llvm::DenseMap<ExtensionID, Guard> *Guards =
+      new llvm::DenseMap<ExtensionID, Guard>(
+          {{(ExtensionID)Extension::Override, guardOverride},
+           {(ExtensionID)Extension::Final, guardFinal}});
+  static ParseLang *L = new ParseLang{getGrammar(), getLRTable(), *Guards};
+  return *L;
+}
+
 } // namespace cxx
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/lib/cxx.bnf
===================================================================
--- clang-tools-extra/pseudo/lib/cxx.bnf
+++ clang-tools-extra/pseudo/lib/cxx.bnf
@@ -739,8 +739,8 @@
 
 #! Contextual keywords -- clang lexer always lexes them as identifier tokens.
 #! Placeholders for literal text in the grammar that lex as other things.
-contextual-override := IDENTIFIER
-contextual-final := IDENTIFIER
+contextual-override := IDENTIFIER [guard=Override]
+contextual-final := IDENTIFIER [guard=Final]
 contextual-zero := NUMERIC_CONSTANT
 module-keyword := IDENTIFIER
 import-keyword := IDENTIFIER
Index: clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
@@ -10,4 +10,5 @@
 
   LINK_LIBS
   clangPseudoGrammar
+  clangPseudo
   )
Index: clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
@@ -2,12 +2,11 @@
   Support
   )
 
-add_clang_library(clangPseudoCXX
-  CXX.cpp
-
-  DEPENDS
-  cxx_gen
+add_clang_library(clangPseudoCLI
+  CLI.cpp
 
   LINK_LIBS
+  clangPseudo
   clangPseudoGrammar
+  clangPseudoCXX
   )
Index: clang-tools-extra/pseudo/lib/cli/CLI.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/cli/CLI.cpp
@@ -0,0 +1,67 @@
+//===--- CLI.cpp -  ----------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "clang-pseudo/cli/CLI.h"
+#include "clang-pseudo/Language.h"
+#include "clang-pseudo/cxx/CXX.h"
+#include "clang-pseudo/grammar/LRTable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <utility>
+
+static llvm::cl::opt<std::string>
+    Grammar("grammar",
+            llvm::cl::desc(
+                "Specify a BNF grammar file path, or builtin language (cxx)."),
+            llvm::cl::init("cxx"));
+
+namespace clang {
+namespace pseudo {
+
+static bool alwaysAccept(llvm::ArrayRef<const ForestNode *> RHS,
+                         const TokenStream &) {
+  return true;
+}
+
+namespace cli {
+const ParseLang &getLanguage() {
+  if (::Grammar == "cxx")
+    return cxx::getLanguage();
+
+  static ParseLang *PL = [&]() {
+    // Read from a bnf grammar file.
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
+        llvm::MemoryBuffer::getFile(::Grammar);
+    if (std::error_code EC = GrammarText.getError()) {
+      llvm::errs() << "Error: can't read grammar file '" << ::Grammar
+                   << "': " << EC.message() << "\n";
+      std::exit(1);
+    }
+    std::vector<std::string> Diags;
+    auto G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags);
+    if (!Diags.empty()) {
+      for (const auto &Diag : Diags)
+        llvm::errs() << Diag << "\n";
+      std::exit(1);
+    }
+
+    LRTable *Table = new LRTable(LRTable::buildSLR(*G));
+    llvm::DenseMap<ExtensionID, Guard> *Guards =
+        new llvm::DenseMap<ExtensionID, Guard>();
+    for (ExtensionID ID = 1; ID < G->table().AttributeValues.size(); ++ID)
+      Guards->insert(std::make_pair(ID, alwaysAccept));
+    return new ParseLang{*G.release(), *Table, *Guards};
+  }();
+  return *PL;
+}
+} // namespace cli
+
+} // namespace pseudo
+} // namespace clang
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -41,14 +41,14 @@
                            SymbolID StartSymbol) {
   assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
   llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Tokens);
-  auto &G = Params.G;
+  auto &G = Params.Lang.G;
   (void)G;
   auto &GSS = Params.GSStack;
 
   // Lists of active shift, reduce actions.
   std::vector<ParseStep> PendingShift, PendingReduce;
   auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) {
-    for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) {
+    for (const auto &Action : Params.Lang.Table.getActions(Head->State, NextTok)) {
       switch (Action.kind()) {
       case LRTable::Action::Shift:
         PendingShift.push_back({Head, Action});
@@ -61,7 +61,7 @@
       }
     }
   };
-  StateID StartState = Params.Table.getStartState(StartSymbol);
+  StateID StartState = Params.Lang.Table.getStartState(StartSymbol);
   std::vector<const GSS::Node *> NewHeads = {
       GSS.addNode(/*State=*/StartState,
                   /*ForestNode=*/nullptr, {})};
@@ -84,7 +84,7 @@
     for (const auto *Head : NewHeads)
       AddSteps(Head, Terminal.symbol());
     NewHeads.clear();
-    glrReduce(PendingReduce, Params,
+    glrReduce(PendingReduce, Params, Tokens,
               [&](const GSS::Node * NewHead) {
                 // A reduce will enable more steps.
                 AddSteps(NewHead, Terminal.symbol());
@@ -98,10 +98,10 @@
   for (const auto *Heads : NewHeads)
     AddSteps(Heads, tokenSymbol(tok::eof));
 
-  StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
+  StateID AcceptState = Params.Lang.Table.getGoToState(StartState, StartSymbol);
   // Collect new heads created from the final reduce.
   std::vector<const GSS::Node*> Heads;
-  glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) {
+  glrReduce(PendingReduce, Params, Tokens, [&](const GSS::Node *NewHead) {
     Heads.push_back(NewHead);
     // A reduce will enable more steps.
     AddSteps(NewHead, tokenSymbol(tok::eof));
@@ -147,7 +147,7 @@
                       }) &&
          "Pending shift actions must be shift actions");
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("  Shift {0} ({1} active heads):\n",
-                                           Params.G.symbolName(NewTok.symbol()),
+                                           Params.Lang.G.symbolName(NewTok.symbol()),
                                            PendingShift.size()));
 
   // We group pending shifts by their target state so we can merge them.
@@ -232,6 +232,7 @@
 //                  2 by`enum-name := class-name STAR`:
 //     0--5(pointer)       // 5 is goto(0, pointer)
 void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
+               const TokenStream& Tokens,
                NewHeadCallback NewHeadCB) {
   // There are two interacting complications:
   // 1.  Performing one reduce can unlock new reduces on the newly-created head.
@@ -294,12 +295,18 @@
   // Pop walks up the parent chain(s) for a reduction from Head by to Rule.
   // Once we reach the end, record the bases and sequences.
   auto Pop = [&](const GSS::Node *Head, RuleID RID) {
-    LLVM_DEBUG(llvm::dbgs() << "  Pop " << Params.G.dumpRule(RID) << "\n");
-    const auto &Rule = Params.G.lookupRule(RID);
+    LLVM_DEBUG(llvm::dbgs() << "  Pop " << Params.Lang.G.dumpRule(RID) << "\n");
+    const auto &Rule = Params.Lang.G.lookupRule(RID);
     Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
     TempSequence.resize_for_overwrite(Rule.Size);
     auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
       if (I == Rule.Size) {
+        if (Rule.Guard) {
+          auto It = Params.Lang.Guards.find(Rule.Guard);
+          assert(It != Params.Lang.Guards.end() && "missing guard!");
+          if (!It->getSecond()(TempSequence, Tokens))
+            return;
+        }
         F.Start = TempSequence.front()->startTokenIndex();
         LLVM_DEBUG(llvm::dbgs() << "    --> base at S" << N->State << "\n");
         Sequences.emplace(F, PushSpec{N, TempSequence});
@@ -331,7 +338,7 @@
   while (!Sequences.empty()) {
     Family F = Sequences.top().first;
 
-    LLVM_DEBUG(llvm::dbgs() << "  Push " << Params.G.symbolName(F.Symbol)
+    LLVM_DEBUG(llvm::dbgs() << "  Push " << Params.Lang.G.symbolName(F.Symbol)
                             << " from token " << F.Start << "\n");
 
     // Grab the sequences and bases for this family.
@@ -344,7 +351,7 @@
       FamilySequences.emplace_back(Sequences.top().first.Rule,
                                    Sequences.top().second.Seq);
       FamilyBases.emplace_back(
-          Params.Table.getGoToState(Sequences.top().second.Base->State,
+          Params.Lang.Table.getGoToState(Sequences.top().second.Base->State,
                                     F.Symbol),
           Sequences.top().second.Base);
 
@@ -362,7 +369,7 @@
         SequenceNodes.size() == 1
             ? SequenceNodes.front()
             : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
-    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(Params.G) << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(Params.Lang.G) << "\n");
 
     // Bases for this family, deduplicate them, and group by the goTo State.
     sortAndUnique(FamilyBases);
Index: clang-tools-extra/pseudo/lib/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(cli)
 add_subdirectory(cxx)
 add_subdirectory(grammar)
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
@@ -28,7 +28,7 @@
 namespace clang {
 namespace pseudo {
 class LRTable;
-
+struct ParseLang; // Defined as a struct in clang-pseudo/Language.h.
 namespace cxx {
 // Symbol represents nonterminal symbols in the C++ grammar.
 // It provides a simple uniform way to access a particular nonterminal.
@@ -38,10 +38,13 @@
 #undef NONTERMINAL
 };
 
-// Returns the C++ grammar.
-const Grammar &getGrammar();
-// Returns the corresponding LRTable for the C++ grammar.
-const LRTable &getLRTable();
+enum class Extension : ExtensionID {
+#define EXTENSION(X, Y) X = Y,
+#include "CXXSymbols.inc"
+#undef EXTENSION
+};
+
+const ParseLang &getLanguage();
 
 } // namespace cxx
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
@@ -0,0 +1,31 @@
+//===--- CLI.h - Get grammar from various sources ----------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A library shared among different pseudoparser-based tools. It provides a
+// uniform way to get basic pieces of the parser (Grammar, LRTable, etc.) from
+// various grammar sources.
+// It defines a `--grammar` CLI flag, which supports 1) using a grammar from a
+// file (--grammar=/path/to/lang.bnf) or 2) using the prebuilt cxx language
+// (--grammar=cxx).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_CLI_CLI_H
+#define CLANG_PSEUDO_CLI_CLI_H
+
+namespace clang {
+namespace pseudo {
+struct ParseLang;
+namespace cli {
+// Returns the corresponding language from the '--grammar' command-line flag.
+const ParseLang &getLanguage();
+} // namespace cli
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_CLI_CLI_H
Index: clang-tools-extra/pseudo/include/clang-pseudo/Language.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/Language.h
@@ -0,0 +1,47 @@
+//===--- Language.h -------------------------------------------- -*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_LANGUAGE_H
+#define CLANG_PSEUDO_LANGUAGE_H
+
+#include "clang-pseudo/grammar/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace clang {
+namespace pseudo {
+class ForestNode;
+class TokenStream;
+class LRTable;
+
+// Interface for implementing the grammar "guard" attribute.
+//
+// It is used by the GLR parser to determine, at reduce time, whether the
+// reduction of a rule should be performed.
+//
+// Returns false if the reduction must not be performed (this parsing branch in
+// GLR will die).
+using Guard = llvm::function_ref<bool(llvm::ArrayRef<const ForestNode *> RHS,
+                                      const TokenStream &)>;
+
+// Specifies a language that can be parsed by the pseudoparser.
+// Manifest generated from a bnf grammar file.
+struct ParseLang {
+  const Grammar &G;
+  const LRTable &Table;
+  // Binding "guard" extension id to a piece of C++ code.
+  const llvm::DenseMap<ExtensionID, Guard> &Guards;
+
+  // FIXME: add clang::LangOptions.
+  // FIXME: add default start symbols.
+};
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_LANGUAGE_H
\ No newline at end of file
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -30,6 +30,7 @@
 #define CLANG_PSEUDO_GLR_H
 
 #include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Language.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "llvm/Support/Allocator.h"
@@ -113,12 +114,7 @@
 
 // Parameters for the GLR parsing.
 struct ParseParams {
-  // The grammar of the language we're going to parse.
-  const Grammar &G;
-  // The LR table which GLR uses to parse the input, should correspond to the
-  // Grammar G.
-  const LRTable &Table;
-
+  const ParseLang &Lang;
   // Arena for data structure used by the GLR algorithm.
   ForestArena &Forest;  // Storage for the output forest.
   GSS &GSStack;         // Storage for parsing stacks.
@@ -159,7 +155,7 @@
 //
 // Exposed for testing only.
 void glrReduce(std::vector<ParseStep> &PendingReduce, const ParseParams &Params,
-               NewHeadCallback NewHeadCB);
+               const TokenStream &Tokens, NewHeadCallback NewHeadCB);
 
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/gen/Main.cpp
===================================================================
--- clang-tools-extra/pseudo/gen/Main.cpp
+++ clang-tools-extra/pseudo/gen/Main.cpp
@@ -79,6 +79,14 @@
 
   switch (Emit) {
   case EmitSymbolList:
+    Out.os() << R"cpp(
+#ifndef NONTERMINAL
+#define NONTERMINAL(X, Y)
+#endif
+#ifndef EXTENSION
+#define EXTENSION(X, Y)
+#endif
+    )cpp";
     for (clang::pseudo::SymbolID ID = 0; ID < G->table().Nonterminals.size();
          ++ID) {
       std::string Name = G->symbolName(ID).str();
@@ -86,6 +94,16 @@
       std::replace(Name.begin(), Name.end(), '-', '_');
       Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n", Name, ID);
     }
+    for (clang::pseudo::ExtensionID AID = 1 /*skip the sentinel 0 value*/;
+         AID < G->table().AttributeValues.size(); ++AID) {
+      llvm::StringRef Name = G->table().AttributeValues[AID];
+      assert(!Name.empty());
+      Out.os() << llvm::formatv("EXTENSION({0}, {1})\n", Name, AID);
+    }
+    Out.os() << R"cpp(
+#undef NONTERMINAL
+#undef EXTENSION
+    )cpp";
     break;
   case EmitGrammarContent:
     for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
Index: clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
===================================================================
--- clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/GLR.h"
 #include "clang-pseudo/Token.h"
+#include "clang-pseudo/cli/CLI.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/LangOptions.h"
@@ -24,28 +25,11 @@
 
 class Fuzzer {
   clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
-  std::unique_ptr<Grammar> G;
   LRTable T;
   bool Print;
 
 public:
-  Fuzzer(llvm::StringRef GrammarPath, bool Print) : Print(Print) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
-        llvm::MemoryBuffer::getFile(GrammarPath);
-    if (std::error_code EC = GrammarText.getError()) {
-      llvm::errs() << "Error: can't read grammar file '" << GrammarPath
-                   << "': " << EC.message() << "\n";
-      std::exit(1);
-    }
-    std::vector<std::string> Diags;
-    G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags);
-    if (!Diags.empty()) {
-      for (const auto &Diag : Diags)
-        llvm::errs() << Diag << "\n";
-      std::exit(1);
-    }
-    T = LRTable::buildSLR(*G);
-  }
+  Fuzzer(bool Print) : Print(Print) {}
 
   void operator()(llvm::StringRef Code) {
     std::string CodeStr = Code.str(); // Must be null-terminated.
@@ -59,10 +43,11 @@
     clang::pseudo::ForestArena Arena;
     clang::pseudo::GSS GSS;
     auto &Root =
-        glrParse(ParseableStream, clang::pseudo::ParseParams{*G, T, Arena, GSS},
-                 *G->findNonterminal("translation-unit"));
+        glrParse(ParseableStream,
+                 clang::pseudo::ParseParams{cli::getLanguage(), Arena, GSS},
+                 *cli::getLanguage().G.findNonterminal("translation-unit"));
     if (Print)
-      llvm::outs() << Root.dumpRecursive(*G);
+      llvm::outs() << Root.dumpRecursive(cli::getLanguage().G);
   }
 };
 
@@ -75,16 +60,11 @@
 extern "C" {
 
 // Set up the fuzzer from command line flags:
-//  -grammar=<file> (required) - path to cxx.bnf
 //  -print                     - used for testing the fuzzer
 int LLVMFuzzerInitialize(int *Argc, char ***Argv) {
-  llvm::StringRef GrammarFile;
   bool PrintForest = false;
   auto ConsumeArg = [&](llvm::StringRef Arg) -> bool {
-    if (Arg.consume_front("-grammar=")) {
-      GrammarFile = Arg;
-      return true;
-    } else if (Arg == "-print") {
+    if (Arg == "-print") {
       PrintForest = true;
       return true;
     }
@@ -92,11 +72,7 @@
   };
   *Argc = std::remove_if(*Argv + 1, *Argv + *Argc, ConsumeArg) - *Argv;
 
-  if (GrammarFile.empty()) {
-    fprintf(stderr, "Fuzzer needs -grammar=/path/to/cxx.bnf\n");
-    exit(1);
-  }
-  clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(GrammarFile, PrintForest);
+  clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(PrintForest);
   return 0;
 }
 
Index: clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
+++ clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
@@ -11,5 +11,6 @@
 target_link_libraries(clang-pseudo-fuzzer
   PRIVATE
   clangPseudo
+  clangPseudoCLI
   clangPseudoGrammar
   )
Index: clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
+++ clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
@@ -3,6 +3,7 @@
 target_link_libraries(ClangPseudoBenchmark
   PRIVATE
   clangPseudo
+  clangPseudoCLI
   clangPseudoGrammar
   LLVMSupport
   )
Index: clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
===================================================================
--- clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
@@ -25,6 +25,7 @@
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/GLR.h"
 #include "clang-pseudo/Token.h"
+#include "clang-pseudo/cli/CLI.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/LangOptions.h"
@@ -39,9 +40,6 @@
 using llvm::cl::opt;
 using llvm::cl::Required;
 
-static opt<std::string> GrammarFile("grammar",
-                                    desc("Parse and check a BNF grammar file."),
-                                    Required);
 static opt<std::string> Source("source", desc("Source file"), Required);
 
 namespace clang {
@@ -49,11 +47,9 @@
 namespace bench {
 namespace {
 
-const std::string *GrammarText = nullptr;
 const std::string *SourceText = nullptr;
-const Grammar *G = nullptr;
 
-void setupGrammarAndSource() {
+void setupSource() {
   auto ReadFile = [](llvm::StringRef FilePath) -> std::string {
     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
         llvm::MemoryBuffer::getFile(FilePath);
@@ -64,22 +60,12 @@
     }
     return GrammarText.get()->getBuffer().str();
   };
-  GrammarText = new std::string(ReadFile(GrammarFile));
   SourceText = new std::string(ReadFile(Source));
-  std::vector<std::string> Diags;
-  G = Grammar::parseBNF(*GrammarText, Diags).release();
 }
 
-static void parseBNF(benchmark::State &State) {
-  std::vector<std::string> Diags;
-  for (auto _ : State)
-    Grammar::parseBNF(*GrammarText, Diags);
-}
-BENCHMARK(parseBNF);
-
 static void buildSLR(benchmark::State &State) {
   for (auto _ : State)
-    LRTable::buildSLR(*G);
+    LRTable::buildSLR(cli::getLanguage().G);
 }
 BENCHMARK(buildSLR);
 
@@ -129,13 +115,14 @@
 BENCHMARK(preprocess);
 
 static void glrParse(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
-  SymbolID StartSymbol = *G->findNonterminal("translation-unit");
+  SymbolID StartSymbol =
+      *cli::getLanguage().G.findNonterminal("translation-unit");
   TokenStream Stream = lexAndPreprocess();
   for (auto _ : State) {
     pseudo::ForestArena Forest;
     pseudo::GSS GSS;
-    pseudo::glrParse(Stream, ParseParams{*G, Table, Forest, GSS}, StartSymbol);
+    pseudo::glrParse(Stream, ParseParams{cli::getLanguage(), Forest, GSS},
+                     StartSymbol);
   }
   State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
                           SourceText->size());
@@ -143,14 +130,14 @@
 BENCHMARK(glrParse);
 
 static void full(benchmark::State &State) {
-  LRTable Table = clang::pseudo::LRTable::buildSLR(*G);
-  SymbolID StartSymbol = *G->findNonterminal("translation-unit");
+  SymbolID StartSymbol =
+      *cli::getLanguage().G.findNonterminal("translation-unit");
   for (auto _ : State) {
     TokenStream Stream = lexAndPreprocess();
     pseudo::ForestArena Forest;
     pseudo::GSS GSS;
-    pseudo::glrParse(lexAndPreprocess(), ParseParams{*G, Table, Forest, GSS},
-                     StartSymbol);
+    pseudo::glrParse(Stream, ParseParams{cli::getLanguage(), Forest, GSS},
+                     StartSymbol); // Reuse Stream; don't lex a second time.
   }
   State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
                           SourceText->size());
@@ -165,7 +152,7 @@
 int main(int argc, char *argv[]) {
   benchmark::Initialize(&argc, argv);
   llvm::cl::ParseCommandLineOptions(argc, argv);
-  clang::pseudo::bench::setupGrammarAndSource();
+  clang::pseudo::bench::setupSource();
   benchmark::RunSpecifiedBenchmarks();
   return 0;
 }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to