sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added a subscriber: mgorny.
Herald added a project: All.
sammccall requested review of this revision.
Herald added subscribers: cfe-commits, alextsao1999.
Herald added a project: clang-tools-extra.

Mostly mechanics here. Interesting decisions:

- apply disambiguation in-place instead of copying the forest debatable, but 
even the final tree size is significant
- split decide/apply into different functions - this allows the hard part 
(decide) to be tested non-destructively and combined with HTML forest easily
- add non-const accessors to forest to enable apply
- unit tests but no lit tests: my plan is to test actual C++ disambiguation 
heuristics with lit, generic disambiguation mechanics without the C++ grammar


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D132487

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
  clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/lib/CMakeLists.txt
  clang-tools-extra/pseudo/lib/Disambiguate.cpp
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp
  clang-tools-extra/pseudo/tool/HTMLForest.cpp
  clang-tools-extra/pseudo/unittests/CMakeLists.txt
  clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp

Index: clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
@@ -0,0 +1,111 @@
+//===--- DisambiguateTest.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Disambiguate.h"
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+namespace {
+using testing::ElementsAre;
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+// Common disambiguation test fixture.
+// This is the ambiguous forest representing parses of 'a * b;'.
+class DisambiguateTest : public ::testing::Test {
+protected:
+  // Greatly simplified C++ grammar.
+  enum Symbol : SymbolID {
+    Statement,
+    Declarator,
+    Expression,
+    DeclSpecifier,
+    Type,
+    Template,
+  };
+  enum Rule : RuleID {
+    /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */
+    Statement__DeclSpecifier_Declarator_Semi,
+    Declarator__Star_Declarator,
+    Declarator__Identifier,
+    Statement__Expression_Semi,
+    Expression__Expression_Star_Expression,
+    Expression__Identifier,
+    DeclSpecifier__Type,
+    DeclSpecifier__Template,
+    Type__Identifier,
+    Template__Identifier,
+  };
+
+  ForestArena Arena;
+  ForestNode &A = Arena.createTerminal(tok::identifier, 0);
+  ForestNode &Star = Arena.createTerminal(tok::star, 1);
+  ForestNode &B = Arena.createTerminal(tok::identifier, 2);
+  ForestNode &Semi = Arena.createTerminal(tok::semi, 3);
+
+  // Parse as multiplication expression.
+  ForestNode &AExpr =
+      Arena.createSequence(Expression, Expression__Identifier, &A);
+  ForestNode &BExpr =
+      Arena.createSequence(Expression, Expression__Identifier, &B);
+  ForestNode &Expr =
+      Arena.createSequence(Expression, Expression__Expression_Star_Expression,
+                           {&AExpr, &Star, &BExpr});
+  ForestNode &ExprStmt = Arena.createSequence(
+      Statement, Statement__Expression_Semi, {&Expr, &Semi});
+  // Parse as declaration (`a` may be CTAD or not).
+  ForestNode &AType =
+      Arena.createSequence(DeclSpecifier, DeclSpecifier__Type,
+                           &Arena.createSequence(Type, Type__Identifier, &A));
+  ForestNode &ATemplate = Arena.createSequence(
+      DeclSpecifier, DeclSpecifier__Template,
+      &Arena.createSequence(Template, Template__Identifier, &A));
+  ForestNode &DeclSpec =
+      Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate});
+  ForestNode &BDeclarator =
+      Arena.createSequence(Declarator, Declarator__Identifier, &B);
+  ForestNode &BPtr = Arena.createSequence(
+      Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator});
+  ForestNode &DeclStmt =
+      Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi,
+                           {&DeclSpec, &Star, &BDeclarator});
+  // Top-level ambiguity
+  ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt});
+};
+
+TEST_F(DisambiguateTest, Remove) {
+  Disambiguation D;
+  D.try_emplace(&Stmt, 1);     // statement is a declaration, not an expression
+  D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template
+  ForestNode *Root = &Stmt;
+  removeAmbiguities(Root, D);
+
+  EXPECT_EQ(Root, &DeclStmt);
+  EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator));
+}
+
+TEST_F(DisambiguateTest, DummyStrategy) {
+  Disambiguation D = disambiguate(&Stmt, {});
+  EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1)));
+
+  ForestNode *Root = &Stmt;
+  removeAmbiguities(Root, D);
+  EXPECT_EQ(Root, &DeclStmt);
+  EXPECT_THAT(DeclStmt.elements(),
+              ElementsAre(&ATemplate, &Star, &BDeclarator));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
Index: clang-tools-extra/pseudo/unittests/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -7,6 +7,7 @@
   BracketTest.cpp
   CXXTest.cpp
   DirectiveTreeTest.cpp
+  DisambiguateTest.cpp
   ForestTest.cpp
   GLRTest.cpp
   GrammarTest.cpp
Index: clang-tools-extra/pseudo/tool/HTMLForest.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/HTMLForest.cpp
+++ clang-tools-extra/pseudo/tool/HTMLForest.cpp
@@ -43,6 +43,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang-pseudo/Disambiguate.h"
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "llvm/ADT/StringExtras.h"
@@ -60,6 +61,7 @@
   const Grammar &G;
   const ForestNode &Root;
   const TokenStream &Stream;
+  const Disambiguation &Disambig;
 
   void write() {
     Out << "<!doctype html>\n";
@@ -150,7 +152,8 @@
           break;
         case ForestNode::Ambiguous:
           Out.attribute("kind", "ambiguous");
-          Out.attribute("selected", AssignID(N->children().front(), End));
+          Out.attribute("selected",
+                        AssignID(N->children()[Disambig.lookup(N)], End));
           break;
         case ForestNode::Opaque:
           Out.attribute("kind", "opaque");
@@ -180,8 +183,9 @@
 // We only accept the derived stream here.
 // FIXME: allow the original stream instead?
 void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G,
-                     const ForestNode &Root, const TokenStream &Stream) {
-  Writer{OS, G, Root, Stream}.write();
+                     const ForestNode &Root, const Disambiguation &Disambig,
+                     const TokenStream &Stream) {
+  Writer{OS, G, Root, Stream, Disambig}.write();
 }
 
 } // namespace pseudo
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -8,6 +8,7 @@
 
 #include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/DirectiveTree.h"
+#include "clang-pseudo/Disambiguate.h"
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/GLR.h"
 #include "clang-pseudo/Language.h"
@@ -45,6 +46,8 @@
 static opt<bool>
     StripDirectives("strip-directives",
                     desc("Strip directives and select conditional sections"));
+static opt<bool> Disambiguate("disambiguate",
+                              desc("Choose best tree from parse forest"));
 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
 static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
@@ -70,7 +73,8 @@
 namespace pseudo {
 // Defined in HTMLForest.cpp
 void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
-                     const ForestNode &Root, const TokenStream &);
+                     const ForestNode &Root, const Disambiguation &,
+                     const TokenStream &);
 namespace {
 
 struct NodeStats {
@@ -158,6 +162,9 @@
                  *StartSymID, Lang);
     if (PrintForest)
       llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+    clang::pseudo::Disambiguation Disambig;
+    if (Disambiguate)
+      Disambig = clang::pseudo::disambiguate(&Root, {});
 
     if (HTMLForest.getNumOccurrences()) {
       std::error_code EC;
@@ -167,7 +174,8 @@
                      << "\n";
         return 2;
       }
-      clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, *ParseableStream);
+      clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig,
+                                     *ParseableStream);
     }
 
     if (PrintStatistics) {
@@ -219,6 +227,14 @@
       llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
                                     100.0 * UnparsedTokens / Len);
     }
+
+    if (Disambiguate && PrintForest) {
+      ForestNode *DisambigRoot = &Root;
+      removeAmbiguities(DisambigRoot, Disambig);
+      llvm::outs() << "Disambiguated tree:\n";
+      llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+                                                  /*Abbreviated=*/ForestAbbrev);
+    }
   }
 
   return 0;
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -599,8 +599,8 @@
 
 } // namespace
 
-const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
-                           const Language &Lang) {
+ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
+                     const Language &Lang) {
   GLRReduce Reduce(Params, Lang);
   assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
   llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
@@ -687,7 +687,7 @@
     return Result;
   };
   if (auto *Result = SearchForAccept(Heads))
-    return *Result;
+    return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
   // We failed to parse the input, returning an opaque forest node for recovery.
   // FIXME: as above, we can add fallback error handling so this is impossible.
   return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
Index: clang-tools-extra/pseudo/lib/Disambiguate.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/Disambiguate.cpp
@@ -0,0 +1,48 @@
+//===--- Disambiguate.cpp - Find the best tree in the forest --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Disambiguate.h"
+
+namespace clang::pseudo {
+
+Disambiguation disambiguate(const ForestNode *Root,
+                            const DisambiguateParams &Params) {
+  // FIXME: this is a dummy placeholder strategy, implement a real one!
+  Disambiguation Result;
+  for (const ForestNode &N : Root->descendants()) {
+    if (N.kind() == ForestNode::Ambiguous)
+      Result.try_emplace(&N, 1);
+  }
+  return Result;
+}
+
+void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) {
+  std::vector<ForestNode **> Queue = {&Root};
+  while (!Queue.empty()) {
+    ForestNode **Next = Queue.back();
+    Queue.pop_back();
+    switch ((*Next)->kind()) {
+    case ForestNode::Sequence:
+      for (ForestNode *&Child : (*Next)->elements())
+        Queue.push_back(&Child);
+      break;
+    case ForestNode::Ambiguous: {
+      assert(D.count(*Next) != 0 && "disambiguation is incomplete!");
+      ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)];
+      *Next = ChosenChild;
+      Queue.push_back(Next);
+      break;
+    }
+    case ForestNode::Terminal:
+    case ForestNode::Opaque:
+      break;
+    }
+  }
+}
+
+} // namespace clang::pseudo
Index: clang-tools-extra/pseudo/lib/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -7,6 +7,7 @@
 add_clang_library(clangPseudo
   Bracket.cpp
   DirectiveTree.cpp
+  Disambiguate.cpp
   Forest.cpp
   GLR.cpp
   Lex.cpp
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -130,8 +130,8 @@
 // A rule `_ := StartSymbol` must exit for the chosen start symbol.
 //
 // If the parsing fails, we model it as an opaque node in the forest.
-const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
-                           const Language &Lang);
+ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
+                     const Language &Lang);
 
 // Shift a token onto all OldHeads, placing the results into NewHeads.
 //
Index: clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -80,6 +80,10 @@
     assert(kind() == Sequence);
     return children(Data >> RuleBits);
   }
+  llvm::MutableArrayRef<ForestNode *> elements() {
+    assert(kind() == Sequence);
+    return children(Data >> RuleBits);
+  }
 
   // Returns all possible interpretations of the code.
   // REQUIRES: this is an Ambiguous node.
@@ -87,6 +91,10 @@
     assert(kind() == Ambiguous);
     return children(Data);
   }
+  llvm::MutableArrayRef<ForestNode *> alternatives() {
+    assert(kind() == Ambiguous);
+    return children(Data);
+  }
 
   llvm::ArrayRef<const ForestNode *> children() const {
     switch (kind()) {
@@ -134,6 +142,10 @@
     return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
                               Num);
   }
+  llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
+    return llvm::makeMutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
+                                     Num);
+  }
 
   Token::Index StartIndex;
   Kind K : 4;
Index: clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
@@ -0,0 +1,64 @@
+//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A GLR parse forest represents every possible parse tree for the source code.
+//
+// Before we can do useful analysis/editing of the code, we need to pick a
+// single tree which we think is accurate. We use three main types of clues:
+//
+// A) Semantic language rules may restrict which parses are allowed.
+//    For example, `string string string X` is *grammatical* C++, but only a
+//    single type-name is allowed in a decl-specifier-sequence.
+//    Where possible, these interpretations are forbidden by guards.
+//    Sometimes this isn't possible, or we want our parser to be lenient.
+//
+// B) Some constructs are rarer, while others are common.
+//    For example `a<b>::c` is often a template specialization, and rarely a
+//    double comparison between a, b, and c.
+//
+// C) Identifier text hints whether they name types/values/templates etc.
+//    "std" is usually a namespace, a project index may also guide us.
+//    Hints may be within the document: if one occurrence of 'foo' is a variable
+//    then the others probably are too.
+//    (Text need not match: similar CaseStyle can be a weak hint, too).
+//
+//----------------------------------------------------------------------------//
+//
+// Mechanically, we replace each ambiguous node with its best alternative.
+//
+// "Best" is determined by assigning bonuses/penalties to nodes, to express
+// the clues of type A and B above. A forest node representing an unlikely
+// parse would apply a penalty to every subtree is is present in.
+// Disambiguation proceeds bottom-up, so that the score of each alternative
+// is known when a decision is made.
+//
+// Identifier-based hints within the document mean some nodes should be
+// *correlated*. Rather than resolve these simultaneously, we make the most
+// certain decisions first and use these results to update bonuses elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+
+namespace clang::pseudo {
+
+struct DisambiguateParams {};
+
+// Maps ambiguous nodes onto the index of their preferred alternative.
+using Disambiguation = llvm::DenseMap<const ForestNode *, unsigned>;
+
+// Resolve each ambiguous node in the forest.
+// Maps each ambiguous node to the index of the chosen alternative.
+// FIXME: current implementation is a placeholder and chooses arbitrarily.
+Disambiguation disambiguate(const ForestNode *Root,
+                            const DisambiguateParams &Params);
+
+// Remove all ambiguities from the forest, resolving them according to Disambig.
+void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig);
+
+} // namespace clang::pseudo
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to