sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added a subscriber: mgorny.
sammccall requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
The TokenStream class is the representation of the source code that will
be fed into the GLR parser.
This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.
Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and
comments, cooking raw_identifiers.
These will produce a suitable stream to feed into the parser proper.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D119162
Files:
clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
clang/include/clang/Tooling/Syntax/Pseudo/Token.h
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
clang/lib/Tooling/Syntax/Pseudo/Token.cpp
clang/test/Syntax/Inputs/example.c
clang/test/Syntax/lex.test
clang/tools/clang-pseudo/ClangPseudo.cpp
Index: clang/tools/clang-pseudo/ClangPseudo.cpp
===================================================================
--- clang/tools/clang-pseudo/ClangPseudo.cpp
+++ clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "clang/Basic/LangOptions.h"
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@@ -21,19 +24,31 @@
CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."),
init(""));
+static opt<std::string> Source("source", desc("Source file"));
+static opt<bool> PrintSource("print-source", desc("Print token stream"));
+static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
+static opt<bool>
+ PrintPPStructure("print-pp-structure",
+ desc("Print directive structure of source code"));
+
+// Reads the whole file at Path and returns its contents.
+// If the file can't be read, prints an error naming Path and exits with
+// status 1.
+static std::string readOrDie(llvm::StringRef Path) {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+ llvm::MemoryBuffer::getFile(Path);
+ if (std::error_code EC = Text.getError()) {
+ // Report the path we actually tried to open: this helper is shared by the
+ // grammar and source options.
+ llvm::errs() << "Error: can't read file '" << Path
+ << "': " << EC.message() << "\n";
+ ::exit(1);
+ }
+ return Text.get()->getBuffer().str();
+}
+
int main(int argc, char *argv[]) {
llvm::cl::ParseCommandLineOptions(argc, argv, "");
if (CheckGrammar.getNumOccurrences()) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(CheckGrammar);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << CheckGrammar
- << "': " << EC.message() << "\n";
- return 1;
- }
+ std::string Text = readOrDie(CheckGrammar);
std::vector<std::string> Diags;
- auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags);
+ auto RSpecs = Grammar::parseBNF(Text, Diags);
if (!Diags.empty()) {
llvm::errs() << llvm::join(Diags, "\n");
@@ -43,5 +58,20 @@
CheckGrammar);
return 0;
}
+
+ if (Source.getNumOccurrences()) {
+ std::string Text = readOrDie(Source);
+ clang::LangOptions LangOpts; // FIXME: use real options.
+ auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
+ auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
+
+ if (PrintPPStructure)
+ llvm::outs() << Structure;
+ if (PrintSource)
+ Stream.print(llvm::outs());
+ if (PrintTokens)
+ llvm::outs() << Stream;
+ }
+
return 0;
}
Index: clang/test/Syntax/lex.test
===================================================================
--- /dev/null
+++ clang/test/Syntax/lex.test
@@ -0,0 +1,38 @@
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
+ SOURCE: int is_debug() {
+SOURCE-NEXT: #ifndef NDEBUG
+SOURCE-NEXT: return 1; // in debug mode
+SOURCE-NEXT: #else
+SOURCE-NEXT: return 0;
+SOURCE-NEXT: #endif
+SOURCE-NEXT: }
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-tokens | FileCheck %s -check-prefix=TOKEN
+TOKEN: 0: raw_identifier 0:0 "int" flags=1
+TOKEN-NEXT: raw_identifier 0:0 "is_debug"
+TOKEN-NEXT: l_paren 0:0 "("
+TOKEN-NEXT: r_paren 0:0 ")"
+TOKEN-NEXT: l_brace 0:0 "{"
+TOKEN-NEXT: hash 1:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 1:0 "ifndef"
+TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
+TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 2:2 "1"
+TOKEN-NEXT: semi 2:2 ";"
+TOKEN-NEXT: comment 2:2 "// in debug mode"
+TOKEN-NEXT: hash 3:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 3:0 "else"
+TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 4:2 "0"
+TOKEN-NEXT: semi 4:2 ";"
+TOKEN-NEXT: hash 5:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 5:0 "endif"
+TOKEN-NEXT: r_brace 6:0 "}" flags=1
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
+ PPS: code (5 tokens)
+PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT: code (4 tokens)
+PPS-NEXT: #else (2 tokens)
+PPS-NEXT: code (3 tokens)
+PPS-NEXT: #endif (2 tokens)
+PPS-NEXT: code (1 tokens)
+
Index: clang/test/Syntax/Inputs/example.c
===================================================================
--- /dev/null
+++ clang/test/Syntax/Inputs/example.c
@@ -0,0 +1,7 @@
+int is_debug() {
+#ifndef NDEBUG
+ return 1; // in debug mode
+#else
+ return 0;
+#endif
+}
Index: clang/lib/Tooling/Syntax/Pseudo/Token.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Token.cpp
@@ -0,0 +1,94 @@
+//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Prints a single token as: <kind> <line>:<indent> "<escaped text>",
+// followed by " pair=<index>" if the token has a pair and " flags=<hex>" if
+// any flags are set.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+ OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
+ T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Pair != Token::Invalid)
+ OS << " pair=" << T.Pair;
+ // Same unpadded hex format as the TokenStream printer.
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ return OS;
+}
+
+// Prints a table with one row per token in the stream (sentinels excluded):
+// index, kind, line:indent, escaped text, then the optional pair and flags.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
+ OS << "Index Kind Line Text\n";
+ for (const auto &T : TS.tokens()) {
+ // Index and Line are 32-bit unsigned, so they need %u rather than %d.
+ // Indent is 8 bits wide and is promoted to int when passed to printf.
+ OS << llvm::format("%5u: %16s %4u:%-2d ", TS.index(T),
+ clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Pair != Token::Invalid)
+ OS << " pair=" << T.Pair;
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ OS << '\n';
+ }
+ return OS;
+}
+
+// Prints a token range as a half-open interval of indices: [Begin,End).
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
+ return OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
+}
+
+// Creates an unfinalized stream whose storage holds only the start sentinel.
+TokenStream::TokenStream(std::shared_ptr<void> Payload)
+ : Payload(std::move(Payload)) {
+ Storage.emplace_back();
+ Token &StartSentinel = Storage.back();
+ StartSentinel.Kind = clang::tok::eof;
+}
+
+// Appends the end sentinel and exposes the tokens between the two sentinels.
+void TokenStream::finalize() {
+ // The end sentinel sits on the line after the last token pushed so far.
+ const unsigned SentinelLine = Storage.back().Line + 1;
+ Storage.emplace_back();
+ Token &EndSentinel = Storage.back();
+ EndSentinel.Kind = tok::eof;
+ EndSentinel.Line = SentinelLine;
+
+ // Tokens views Storage without its first and last (sentinel) elements.
+ Tokens = MutableArrayRef<Token>(Storage).drop_front().drop_back();
+}
+
+// Writes the token text back out, approximating the original layout:
+// tokens on a new line are preceded by a newline and their indentation,
+// tokens on the same line are separated by one space unless their text was
+// adjacent in memory. A trailing newline is written if anything was printed.
+void TokenStream::print(llvm::raw_ostream &OS) const {
+ const Token *Prev = nullptr;
+ for (const Token &T : tokens()) {
+ StringRef Text = T.text();
+ if (Prev) {
+ if (Prev->Line != T.Line) {
+ OS << '\n';
+ OS.indent(T.Indent);
+ } else if (Prev->text().end() != Text.begin()) {
+ // There was something between the two tokens: keep them apart.
+ OS << ' ';
+ }
+ }
+ OS << Text;
+ Prev = &T;
+ }
+ if (Prev)
+ OS << '\n';
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
@@ -0,0 +1,197 @@
+//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+// Scans a token stream and groups its tokens into the chunks of a PPStructure:
+// runs of plain code, single directives, and conditional sections.
+// Tok is a cursor into the stream which the parse functions advance.
+class Parser {
+public:
+ explicit Parser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
+ // Parses the whole stream into result, treating it as the top level.
+ void parse(PPStructure *result) { parse(result, /*TopLevel=*/true); }
+
+private:
+ // Roles that a directive might take within a conditional block.
+ enum class Cond { None, If, Else, End };
+ // Maps a directive keyword to its role: #if-like directives open a
+ // conditional, #elif-like and #else start a new branch, #endif closes it.
+ static Cond classifyDirective(tok::PPKeywordKind kind) {
+ switch (kind) {
+ case clang::tok::pp_if:
+ case clang::tok::pp_ifdef:
+ case clang::tok::pp_ifndef:
+ return Cond::If;
+ case clang::tok::pp_elif:
+ case clang::tok::pp_elifdef:
+ case clang::tok::pp_elifndef:
+ case clang::tok::pp_else:
+ return Cond::Else;
+ case clang::tok::pp_endif:
+ return Cond::End;
+ default:
+ return Cond::None;
+ }
+ }
+
+ // Parses tokens starting at Tok into PP.
+ // If we reach an #endif or #else-like directive that ends PP, returns it.
+ // If TopLevel is true, such directives are stored as ordinary directive
+ // chunks instead, and we always return None.
+ llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
+ // Whether Tok is a `#` that begins a directive.
+ // AllowDirectiveAt remembers the token after a comment that started a
+ // PP-line: a `#` at exactly that position is still accepted.
+ auto StartsDirective =
+ [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
+ if (Tok->flag(LexFlags::StartsPPLine)) {
+ // If we considered a comment at the start of a PP-line, it doesn't
+ // start a directive but the directive can still start after it.
+ if (Tok->Kind == tok::comment)
+ AllowDirectiveAt = Tok + 1;
+ return Tok->Kind == tok::hash;
+ }
+ return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
+ };
+ while (Tok->Kind != tok::eof) {
+ // Consume consecutive directives.
+ while (StartsDirective()) {
+ PPStructure::Directive Directive;
+ parseDirective(&Directive);
+ Cond Kind = classifyDirective(Directive.Kind);
+ if (Kind == Cond::If) {
+ // An #if-like directive opens a conditional; it becomes the first
+ // branch and the rest of the section is parsed recursively.
+ PPStructure::Conditional Conditional;
+ Conditional.Branches.emplace_back();
+ Conditional.Branches.back().first = std::move(Directive);
+ parseConditional(&Conditional);
+ PP->Chunks.push_back(std::move(Conditional));
+ continue;
+ }
+ // Unexpected #else or #endif at top level; parse as normal directives.
+ if (Kind == Cond::None || TopLevel) {
+ PP->Chunks.push_back(std::move(Directive));
+ continue;
+ }
+ // Inside a conditional: hand the terminating directive to the caller.
+ assert(Kind == Cond::Else || Kind == Cond::End);
+ return std::move(Directive);
+ }
+ // Everything up to the next directive (or eof) is one code chunk.
+ const Token *Start = Tok;
+ while (Tok->Kind != tok::eof && !StartsDirective())
+ ++Tok;
+ if (Tok != Start)
+ PP->Chunks.push_back(PPStructure::Code{
+ Token::Range{Code.index(*Start), Code.index(*Tok)}});
+ }
+ return None;
+ }
+
+ // Parse the rest of a conditional section, after seeing the #if directive.
+ // Returns after consuming the #endif directive.
+ // If eof is reached first, C->End is left with an empty range at eof.
+ void parseConditional(PPStructure::Conditional *C) {
+ assert(C->Branches.size() == 1 &&
+ C->Branches.front().second.Chunks.empty() &&
+ "Should be ready to parse first branch body");
+ while (Tok->Kind != tok::eof) {
+ // Parse the body of the current (last) branch.
+ auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
+ if (!Terminator) {
+ assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
+ C->End.Tokens = Token::Range::empty(Code.index(*Tok));
+ return;
+ }
+ if (classifyDirective(Terminator->Kind) == Cond::End) {
+ C->End = std::move(*Terminator);
+ return;
+ }
+ // An #else-like directive starts the next branch.
+ assert(classifyDirective(Terminator->Kind) == Cond::Else &&
+ "ended branch unexpectedly");
+ C->Branches.emplace_back();
+ C->Branches.back().first = std::move(*Terminator);
+ }
+ }
+
+ // Parse a directive. Tok is the hash.
+ // On return, Tok is the first token after the directive: either eof or the
+ // next token that starts a PP-line.
+ void parseDirective(PPStructure::Directive *D) {
+ assert(Tok->Kind == tok::hash);
+ D->Tokens.Begin = Code.index(*Tok);
+ // Skip the hash and any comments on the same line that follow it.
+ do {
+ ++Tok;
+ } while (Tok->Kind == tok::comment && !Tok->flag(LexFlags::StartsPPLine));
+ // Technically directive names can be spelled with UCNs or split over lines.
+ // In practice, this never happens.
+ if (Tok->Kind == tok::raw_identifier)
+ D->Kind = Idents.get(Tok->text()).getPPKeywordID();
+ // The directive extends to the end of the line.
+ while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
+ ++Tok;
+ D->Tokens.End = Code.index(*Tok);
+ }
+
+ // The stream being parsed.
+ const TokenStream &Code;
+ // The current token; advanced as tokens are consumed.
+ const Token *Tok;
+ // Used to look up the PP keyword kind of directive names.
+ clang::IdentifierTable Idents;
+};
+
+} // namespace
+
+// Builds the directive structure of Code by running a Parser over it.
+PPStructure PPStructure::parse(const TokenStream &Code) {
+ PPStructure Structure;
+ Parser P(Code);
+ P.parse(&Structure);
+ return Structure;
+}
+
+// Returns the spelling of a preprocessor keyword kind, without the leading #.
+// Kinds that have no PPKEYWORD entry in TokenKinds.def yield "unknown".
+static llvm::StringLiteral ppKeywordName(tok::PPKeywordKind kind) {
+ switch (kind) {
+// Expands to one `case` per PPKEYWORD entry, returning the stringified name.
+#define PPKEYWORD(x) \
+ case tok::pp_##x: \
+ return #x;
+#include "clang/Basic/TokenKinds.def"
+ default:
+ return "unknown";
+ }
+}
+
+// Prints one line per chunk of PP to OS, each prefixed by Indent spaces.
+// Code and directive lines show their token counts; the bodies of conditional
+// branches are printed recursively with two more spaces of indentation.
+static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
+ unsigned Indent) {
+ // Prints "#<keyword> (<N> tokens)" at the current indentation.
+ auto DumpDirective = [&](const PPStructure::Directive &Directive) {
+ OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
+ ppKeywordName(Directive.Kind),
+ Directive.Tokens.size());
+ };
+
+ for (const auto &Chunk : PP.Chunks) {
+ switch (Chunk.kind()) {
+ case PPStructure::Chunk::K_Empty:
+ llvm_unreachable("invalid chunk");
+ case PPStructure::Chunk::K_Code: {
+ // Direct-initialization invokes Chunk's explicit conversion operator.
+ const PPStructure::Code &Code(Chunk);
+ OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n",
+ Code.Tokens.size());
+ break;
+ }
+ case PPStructure::Chunk::K_Directive: {
+ const PPStructure::Directive &Directive(Chunk);
+ DumpDirective(Directive);
+ break;
+ }
+ case PPStructure::Chunk::K_Conditional: {
+ const PPStructure::Conditional &Conditional(Chunk);
+ // Each branch is its introducing directive followed by its body.
+ for (const auto &Branch : Conditional.Branches) {
+ DumpDirective(Branch.first);
+ dump(OS, Branch.second, Indent + 2);
+ }
+ DumpDirective(Conditional.End);
+ break;
+ }
+ }
+ }
+}
+
+// Prints the structure as an indented outline, starting at column zero.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const PPStructure &PP) {
+ constexpr unsigned TopLevelIndent = 0;
+ dump(OS, PP, TopLevelIndent);
+ return OS;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
@@ -0,0 +1,77 @@
+//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Lexes Code with clang's raw lexer (comments retained) into a TokenStream.
+//
+// Each token records its text (pointing into Code), its zero-based line, and
+// the indentation of the line it is on. Tokens at the start of a line get
+// LexFlags::StartsPPLine; tokens that need cleaning or contain UCNs get
+// LexFlags::DirtyIdentifier. The returned stream is already finalized.
+TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
+ clang::SourceLocation Start;
+ // Tokenize using clang's lexer in raw mode.
+ // std::string guarantees null-termination, which the lexer needs.
+ clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
+ Code.data() + Code.size());
+ Lexer.SetCommentRetentionState(true);
+
+ // Token::Indent is narrower than our running count: saturate, don't wrap.
+ constexpr unsigned MaxIndent =
+ std::numeric_limits<decltype(Token::Indent)>::max();
+
+ TokenStream Result;
+ clang::Token CT;
+ unsigned LastOffset = 0;
+ unsigned Line = 0;
+ unsigned Indent = 0;
+ for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
+ Lexer.LexFromRawLexer(CT)) {
+ unsigned Offset =
+ CT.getLocation().getRawEncoding() - Start.getRawEncoding();
+
+ Token Tok;
+ Tok.Data = &Code[Offset];
+ Tok.Length = CT.getLength();
+ Tok.Kind = CT.getKind();
+
+ // Update current line number and indentation from raw source code.
+ // The scan starts at the previous token, so newlines inside it count too.
+ unsigned NewLineStart = 0;
+ for (unsigned i = LastOffset; i < Offset; ++i) {
+ if (Code[i] == '\n') {
+ NewLineStart = i + 1;
+ ++Line;
+ }
+ }
+ // Indentation isn't always well defined when lines are continued.
+ if ((NewLineStart || !LastOffset) && CT.isAtStartOfLine()) {
+ Indent = 0;
+ for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
+ if (c == ' ')
+ ++Indent;
+ else if (c == '\t')
+ Indent += 8;
+ else
+ break;
+ }
+ }
+ Tok.Indent = Indent < MaxIndent ? Indent : MaxIndent;
+ Tok.Line = Line;
+
+ if (CT.isAtStartOfLine())
+ Tok.setFlag(LexFlags::StartsPPLine);
+ if (CT.needsCleaning() || CT.hasUCN())
+ Tok.setFlag(LexFlags::DirtyIdentifier);
+
+ Result.push(Tok);
+ LastOffset = Offset;
+ }
+ Result.finalize();
+ return Result;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,7 +3,10 @@
add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
-
+ Lex.cpp
+ Preprocess.cpp
+ Token.cpp
+
LINK_LIBS
clangBasic
clangLex
Index: clang/include/clang/Tooling/Syntax/Pseudo/Token.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Token.h
@@ -0,0 +1,172 @@
+//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are the first level of abstraction above bytes used in pseudoparsing.
+// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
+// The tokens are wrapped into pseudo::Token, along with line/indent info.
+//
+// Unlike clang, we make multiple passes over the whole file, out-of-order.
+// Therefore we retain the whole token sequence in memory. (This is feasible as
+// we process one file at a time). pseudo::TokenStream holds such a stream.
+// The initial stream holds the raw tokens read from the file, later passes
+// operate on derived TokenStreams (e.g. with directives stripped).
+//
+// Similar facilities from clang that are *not* used:
+// - SourceManager: designed around multiple files and precise macro expansion.
+// - clang::Token: coupled to SourceManager, doesn't retain layout info.
+// (pseudo::Token is similar, but without SourceLocations).
+// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
+// (pseudo::TokenStream is similar, but a flat token list).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// A single C++ or preprocessor token.
+///
+/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
+/// SourceManager - we are not dealing with multiple files.
+struct Token {
+ /// An Index identifies a token within a stream.
+ using Index = uint32_t;
+ /// A sentinel Index indicating no token.
+ constexpr static Index Invalid = std::numeric_limits<Index>::max();
+ struct Range;
+
+ /// The token text.
+ ///
+ /// Typically from the original source file, but may have been synthesized.
+ StringRef text() const { return StringRef(Data, Length); }
+ /// Start of the token text, which this token does not own.
+ /// Null (with zero Length) until set, so text() is always well-defined.
+ const char *Data = nullptr;
+ /// Length of the token text in bytes.
+ uint32_t Length = 0;
+
+ /// Zero-based line number.
+ uint32_t Line = 0;
+ /// Width of whitespace before the first token on this line.
+ uint8_t Indent = 0;
+ /// Flags have some meaning defined by the function that produced this stream.
+ uint8_t Flags = 0;
+ // Helpers to get/set Flags based on `enum class`.
+ // Mask is treated as a bitmask, so each enumerator must be a distinct bit.
+ template <class T> bool flag(T Mask) const {
+ return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+ template <class T> void setFlag(T Mask) {
+ Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+
+ /// The type of token as determined by clang's lexer.
+ clang::tok::TokenKind Kind = clang::tok::unknown;
+ /// If this token is a bracket, the index of the matching bracket.
+ Index Pair = Invalid;
+
+ // Neighbouring tokens, found by pointer arithmetic: only meaningful for
+ // tokens stored contiguously, with a token on the requested side.
+ const Token &next() const { return *(this + 1); }
+ const Token &prev() const { return *(this - 1); }
+ Token &next() { return *(this + 1); }
+ Token &prev() { return *(this - 1); }
+};
+static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
+
+/// A half-open range of tokens within a stream.
+struct Token::Range {
+ /// Index of the first token in the range.
+ Token::Index Begin = 0;
+ /// Index one past the last token in the range.
+ Token::Index End = 0;
+
+ /// Number of tokens in the range.
+ uint32_t size() const { return End - Begin; }
+ /// An empty range positioned at Index.
+ static Range empty(unsigned Index) { return Range{Index, Index}; }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
+
+/// A complete sequence of Tokens representing a source file.
+///
+/// This may match a raw file from disk, or be derived from a previous stream.
+/// For example, stripping comments from a TokenStream results in a new stream.
+///
+/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
+/// int main ( ) ;
+/// eof kw_int ident l_paren r_paren semi eof
+/// front() back()
+/// 0 1 2 3 4 5
+class TokenStream {
+public:
+ /// Create an empty stream.
+ ///
+ /// Initially, the stream is mutable and not finalized.
+ /// It may only be read after finalize() is called.
+ ///
+ /// Payload is an opaque object which will be owned by the stream.
+ /// e.g. an allocator to hold backing storage for synthesized token text.
+ explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
+
+ /// Append a token to the stream, which must not be finalized.
+ void push(Token T) { Storage.push_back(std::move(T)); }
+
+ /// Finalize the token stream, allowing it to be read, but no longer written.
+ void finalize();
+
+ /// Returns the index of T within the stream.
+ ///
+ /// T must be within the stream or the end sentinel (not the start sentinel).
+ /// Indices are relative to tokens(): the first non-sentinel token is 0.
+ Token::Index index(const Token &T) const {
+ assert(&T != Storage.data() && "start sentinel");
+ assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
+ return &T - Tokens.data();
+ }
+
+ /// The tokens of the stream, excluding the sentinels.
+ MutableArrayRef<Token> tokens() { return Tokens; }
+ ArrayRef<Token> tokens() const { return Tokens; }
+ /// The tokens whose indices fall within R.
+ MutableArrayRef<Token> tokens(Token::Range R) {
+ return Tokens.slice(R.Begin, R.End);
+ }
+ ArrayRef<Token> tokens(Token::Range R) const {
+ return Tokens.slice(R.Begin, R.End);
+ }
+
+ /// May return the end sentinel if the stream is empty.
+ Token &front() { return Storage[1]; }
+ const Token &front() const { return Storage[1]; }
+
+ /// Print the tokens in this stream to the output stream.
+ ///
+ /// The presence of newlines/spaces is preserved, but not the quantity.
+ void print(llvm::raw_ostream &) const;
+
+private:
+ std::shared_ptr<void> Payload;
+
+ // View of Storage without the two sentinels; set by finalize().
+ MutableArrayRef<Token> Tokens;
+ // All tokens, including the start sentinel and (once finalized) the end one.
+ std::vector<Token> Storage;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
+
+/// Extracts a token stream from the source code.
+///
+/// The tokens will reference the data of the provided string.
+TokenStream lex(const std::string &, const clang::LangOptions &);
+/// Flags that lex() sets on the tokens it produces.
+///
+/// These are combined as bitmasks in Token::Flags, so each value must be a
+/// distinct, non-zero bit.
+enum class LexFlags : uint8_t {
+ /// The token is at the start of a line, so it may begin a PP directive.
+ StartsPPLine = 1 << 0,
+ /// The token's spelling needs cleaning or contains UCNs.
+ DirtyIdentifier = 1 << 1,
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
Index: clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
@@ -0,0 +1,145 @@
+//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
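+// Models the preprocessor structure of a raw token stream as a tree of plain
+// code, directives, and #if...#endif conditional sections.
+// PPStructure::parse only recognizes directives: it does not evaluate
+// conditions and does not recognize macro usage.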
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// Describes the structure of a source file, as seen by the preprocessor.
+///
+/// The structure is a tree, whose leaves are plain source code and directives,
+/// and whose internal nodes are #if...#endif sections.
+///
+/// (root)
+/// |-+ Directive #include <stdio.h>
+/// |-+ Code int main() {
+/// | ` printf("hello, ");
+/// |-+ Conditional -+ Directive #ifndef NDEBUG
+/// | |-+ Code printf("debug\n");
+/// | |-+ Directive #else
+/// | |-+ Code printf("production\n");
+/// | `-+ Directive #endif
+/// |-+ Code return 0;
+/// ` }
+///
+/// Unlike the clang preprocessor, we model the full tree explicitly.
+/// This class does not recognize macro usage, only directives.
+struct PPStructure {
+ /// A range of code containing no directives.
+ struct Code {
+ /// The tokens making up this run of code.
+ Token::Range Tokens;
+ };
+ /// A preprocessor directive.
+ struct Directive {
+ /// Raw tokens making up the directive, starting with `#`.
+ Token::Range Tokens;
+ /// Which directive this is.
+ /// Stays pp_not_keyword unless the parser identifies the directive name.
+ clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
+ };
+ /// A preprocessor conditional section.
+ ///
+ /// This starts with an #if, #ifdef, #ifndef etc directive.
+ /// It covers all #else branches, and spans until the matching #endif.
+ struct Conditional {
+ /// The sequence of directives that introduce top-level alternative parses.
+ ///
+ /// The first branch will have an #if type directive.
+ /// Subsequent branches will have #else type directives.
+ /// Each entry pairs the introducing directive with the branch's contents.
+ std::vector<std::pair<Directive, PPStructure>> Branches;
+ /// The directive terminating the conditional, should be #endif.
+ Directive End;
+ };
+
+ /// Some piece of the file. {One of Code, Directive, Conditional}.
+ class Chunk; // Defined below.
+ std::vector<Chunk> Chunks;
+
+ /// Extract preprocessor structure by examining the raw tokens.
+ static PPStructure parse(const TokenStream &);
+
+ /// Determine heuristically a set of conditional branches to take.
+ ///
+ /// Current heuristics (in preference order):
+ /// - respect constants: `#if 1`, `#elif false` etc.
+ /// - avoid paths that reach #error
+ /// - maximize non-comment tokens seen
+ /// - maximize number of directives seen
+ ///
+ /// Placeholder: not implemented yet, calling it hits llvm_unreachable.
+ void chooseBranches(const TokenStream &) {
+ llvm_unreachable("unimplemented");
+ }
+
+ /// Produce a derived token stream without directives and not-taken branches.
+ ///
+ /// Additionally, raw identifiers are "cooked", converting them to identifiers
+ /// or keywords according to the LangOptions.
+ ///
+ /// The input TokenStream should be the one this structure describes.
+ ///
+ /// Placeholder: not implemented yet, calling it hits llvm_unreachable.
+ TokenStream preprocess(const TokenStream &,
+ const clang::LangOptions &) const {
+ llvm_unreachable("unimplemented");
+ }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
+
+// FIXME: This approximates std::variant<Code, Directive, Conditional>.
+// Switch once we can use C++17.
+class PPStructure::Chunk {
+public:
+ enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
+ /// Which alternative this chunk holds, or K_Empty if it holds none.
+ Kind kind() const {
+ return CodeVariant ? K_Code
+ : DirectiveVariant ? K_Directive
+ : ConditionalVariant ? K_Conditional
+ : K_Empty;
+ }
+
+ // Chunks are move-only and must be created from one of the alternatives.
+ Chunk() = delete;
+ Chunk(const Chunk &) = delete;
+ Chunk(Chunk &&) = default;
+ Chunk &operator=(const Chunk &) = delete;
+ Chunk &operator=(Chunk &&) = default;
+ ~Chunk() = default;
+
+ // T => Chunk constructor.
+ Chunk(Code C) : CodeVariant(std::move(C)) {}
+ Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
+ Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
+
+ // Chunk => T& and const T& conversions.
+ // The chunk must hold a T: the matching Optional is dereferenced unchecked.
+#define CONVERSION(CONST, V) \
+ explicit operator CONST V &() CONST { return *V##Variant; }
+ CONVERSION(const, Code);
+ CONVERSION(, Code);
+ CONVERSION(const, Directive);
+ CONVERSION(, Directive);
+ CONVERSION(const, Conditional);
+ CONVERSION(, Conditional);
+#undef CONVERSION
+
+private:
+ // Wasteful, a union variant would be better!
+ // Each constructor sets exactly one of these.
+ llvm::Optional<Code> CodeVariant;
+ llvm::Optional<Directive> DirectiveVariant;
+ llvm::Optional<Conditional> ConditionalVariant;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits