mprobst updated this revision to Diff 57039.
mprobst added a comment.

- extract TokenAnalyzer.h and SortJavaScriptImports.h/cpp
- clean up imports/includes

http://reviews.llvm.org/D20198

Files:
  lib/Format/CMakeLists.txt
  lib/Format/Format.cpp
  lib/Format/FormatToken.h
  lib/Format/FormatTokenLexer.h
  lib/Format/SortJavaScriptImports.cpp
  lib/Format/SortJavaScriptImports.h
  lib/Format/TokenAnalyzer.h
  unittests/Format/CMakeLists.txt
  unittests/Format/SortImportsTestJS.cpp

Index: unittests/Format/SortImportsTestJS.cpp
===================================================================
--- /dev/null
+++ unittests/Format/SortImportsTestJS.cpp
@@ -0,0 +1,124 @@
+//===- unittest/Format/SortImportsTestJS.cpp - JS import sort unit tests --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FormatTestUtils.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Debug.h"
+#include "gtest/gtest.h"
+
+#define DEBUG_TYPE "format-test"
+
+namespace clang {
+namespace format {
+namespace {
+
+class SortImportsTestJS : public ::testing::Test {
+protected:
+  std::vector<tooling::Range> GetCodeRange(StringRef Code) {
+    return std::vector<tooling::Range>(1, tooling::Range(0, Code.size()));
+  }
+
+  std::string sort(StringRef Code, StringRef FileName = "input.js") {
+    auto Ranges = GetCodeRange(Code);
+    std::string Sorted =
+        applyAllReplacements(Code, sortIncludes(Style, Code, Ranges, FileName));
+    return applyAllReplacements(Sorted,
+                                reformat(Style, Sorted, Ranges, FileName));
+  }
+
+  void verifySort(llvm::StringRef Expected, llvm::StringRef Code) {
+    std::string Result = sort(Code);
+    EXPECT_EQ(Expected.str(), Result) << "Formatted:\n" << Result;
+  }
+
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
+};
+
+TEST_F(SortImportsTestJS, BasicSorting) {
+  verifySort("import {sym} from 'a';\n"
+             "import {sym} from 'b';\n"
+             "import {sym} from 'c';\n"
+             "\n"
+             "let x = 1;",
+             "import {sym} from 'a';\n"
+             "import {sym} from 'c';\n"
+             "import {sym} from 'b';\n"
+             "let x = 1;");
+}
+
+TEST_F(SortImportsTestJS, Comments) {
+  verifySort("/** @fileoverview This is a great file. */\n"
+             "// A very important import follows.\n"
+             "import {sym} from 'a'; /* more comments */\n"
+             "import {sym} from 'b'; // from //foo:bar\n",
+             "/** @fileoverview This is a great file. */\n"
+             "import {sym} from 'b'; // from //foo:bar\n"
+             "// A very important import follows.\n"
+             "import {sym} from 'a'; /* more comments */\n");
+}
+
+TEST_F(SortImportsTestJS, SortStar) {
+  verifySort("import * as foo from 'a';\n"
+             "import {sym} from 'a';\n"
+             "import * as bar from 'b';\n",
+             "import {sym} from 'a';\n"
+             "import * as foo from 'a';\n"
+             "import * as bar from 'b';\n");
+}
+
+TEST_F(SortImportsTestJS, AliasesSymbols) {
+  verifySort("import {sym1 as alias1} from 'b';\n"
+             "import {sym2 as alias2, sym3 as alias3} from 'c';\n",
+             "import {sym2 as alias2, sym3 as alias3} from 'c';\n"
+             "import {sym1 as alias1} from 'b';\n");
+}
+
+TEST_F(SortImportsTestJS, GroupImports) {
+  verifySort("import {a} from 'absolute';\n"
+             "\n"
+             "import {b} from '../parent';\n"
+             "import {b} from '../parent/nested';\n"
+             "\n"
+             "import {b} from './relative/path';\n"
+             "import {b} from './relative/path/nested';\n"
+             "\n"
+             "let x = 1;\n",
+             "import {b} from './relative/path/nested';\n"
+             "import {b} from './relative/path';\n"
+             "import {b} from '../parent/nested';\n"
+             "import {b} from '../parent';\n"
+             "import {a} from 'absolute';\n"
+             "let x = 1;\n");
+}
+
+TEST_F(SortImportsTestJS, Exports) {
+  verifySort("import {S} from 'bpath';\n"
+             "\n"
+             "import {T} from './cpath';\n"
+             "\n"
+             "export {A, B} from 'apath';\n"
+             "export {P} from '../parent';\n"
+             "export {R} from './relative';\n"
+             "export {S};\n"
+             "\n"
+             "let x = 1;\n"
+             "export y = 1;\n",
+             "export {R} from './relative';\n"
+             "import {T} from './cpath';\n"
+             "export {S};\n"
+             "export {A, B} from 'apath';\n"
+             "import {S} from 'bpath';\n"
+             "export {P} from '../parent';\n"
+             "let x = 1;\n"
+             "export y = 1;\n");
+}
+
+} // end namespace
+} // end namespace format
+} // end namespace clang
Index: unittests/Format/CMakeLists.txt
===================================================================
--- unittests/Format/CMakeLists.txt
+++ unittests/Format/CMakeLists.txt
@@ -9,6 +9,7 @@
   FormatTestJS.cpp
   FormatTestProto.cpp
   FormatTestSelective.cpp
+  SortImportsTestJS.cpp
   SortIncludesTest.cpp
   )
 
Index: lib/Format/TokenAnalyzer.h
===================================================================
--- /dev/null
+++ lib/Format/TokenAnalyzer.h
@@ -0,0 +1,216 @@
+//===--- TokenAnalyzer.h - Analyze Token Streams ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares an abstract TokenAnalyzer, and associated helper
+/// classes. TokenAnalyzer can be extended to generate replacements based on
+/// an annotated and pre-processed token stream.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+#define LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
+
+#include "AffectedRangeManager.h"
+#include "Encoding.h"
+#include "FormatToken.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnnotator.h"
+#include "UnwrappedLineParser.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+static StringRef getLanguageName(FormatStyle::LanguageKind Language) {
+  switch (Language) {
+  case FormatStyle::LK_Cpp:
+    return "C++";
+  case FormatStyle::LK_Java:
+    return "Java";
+  case FormatStyle::LK_JavaScript:
+    return "JavaScript";
+  case FormatStyle::LK_Proto:
+    return "Proto";
+  default:
+    return "Unknown";
+  }
+}
+
+class Environment {
+public:
+  Environment(SourceManager &SM, FileID ID, ArrayRef<CharSourceRange> Ranges)
+      : ID(ID), CharRanges(Ranges.begin(), Ranges.end()), SM(SM) {}
+
+  Environment(FileID ID, std::unique_ptr<FileManager> FileMgr,
+              std::unique_ptr<SourceManager> VirtualSM,
+              std::unique_ptr<DiagnosticsEngine> Diagnostics,
+              const std::vector<CharSourceRange> &CharRanges)
+      : ID(ID), CharRanges(CharRanges.begin(), CharRanges.end()),
+        SM(*VirtualSM), FileMgr(std::move(FileMgr)),
+        VirtualSM(std::move(VirtualSM)), Diagnostics(std::move(Diagnostics)) {}
+
+  // This sets up a virtual file system with file \p FileName containing
+  // \p Code.
+  static std::unique_ptr<Environment>
+  CreateVirtualEnvironment(StringRef Code, StringRef FileName,
+                           ArrayRef<tooling::Range> Ranges) {
+    // This is referenced by `FileMgr` and will be released by `FileMgr` when it
+    // is deleted.
+    IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
+        new vfs::InMemoryFileSystem);
+    // This is passed to `SM` by reference, so the pointer has to be stored in
+    // `Environment` so that `FileMgr` outlives this function's scope.
+    std::unique_ptr<FileManager> FileMgr(
+        new FileManager(FileSystemOptions(), InMemoryFileSystem));
+    // This is passed to `SM` by reference, so the pointer has to be stored in
+    // `Environment` for the same reason as above.
+    std::unique_ptr<DiagnosticsEngine> Diagnostics(new DiagnosticsEngine(
+        IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
+        new DiagnosticOptions));
+    // This will be stored as a reference, so the pointer has to be stored in
+    // `Environment` for the same reason as above.
+    std::unique_ptr<SourceManager> VirtualSM(
+        new SourceManager(*Diagnostics, *FileMgr));
+    InMemoryFileSystem->addFile(
+        FileName, 0, llvm::MemoryBuffer::getMemBuffer(
+                         Code, FileName, /*RequiresNullTerminator=*/false));
+    FileID ID = VirtualSM->createFileID(
+        FileMgr->getFile(FileName), SourceLocation(), clang::SrcMgr::C_User);
+    assert(ID.isValid());
+    SourceLocation StartOfFile = VirtualSM->getLocForStartOfFile(ID);
+    std::vector<CharSourceRange> CharRanges;
+    for (const tooling::Range &Range : Ranges) {
+      SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset());
+      SourceLocation End = Start.getLocWithOffset(Range.getLength());
+      CharRanges.push_back(CharSourceRange::getCharRange(Start, End));
+    }
+    return llvm::make_unique<Environment>(ID, std::move(FileMgr),
+                                          std::move(VirtualSM),
+                                          std::move(Diagnostics), CharRanges);
+  }
+
+  FileID getFileID() const { return ID; }
+
+  StringRef getFileName() const { return FileName; }
+
+  ArrayRef<CharSourceRange> getCharRanges() const { return CharRanges; }
+
+  const SourceManager &getSourceManager() const { return SM; }
+
+private:
+  FileID ID;
+  StringRef FileName;
+  SmallVector<CharSourceRange, 8> CharRanges;
+  SourceManager &SM;
+
+  // The order of these fields is important - they should be declared in the
+  // same order as they are created in `CreateVirtualEnvironment` so that they
+  // are deleted in the reverse order of their creation.
+  std::unique_ptr<FileManager> FileMgr;
+  std::unique_ptr<SourceManager> VirtualSM;
+  std::unique_ptr<DiagnosticsEngine> Diagnostics;
+};
+
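+// TokenAnalyzer lexes, parses and annotates the token stream of the file in
+// the given Environment and hands the annotated lines of each run to the
+// subclass' analyze(), collecting the replacements it produces.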
+class TokenAnalyzer : public UnwrappedLineConsumer {
+public:
+  TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
+      : Style(Style), Env(Env),
+        AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
+        UnwrappedLines(1),
+        Encoding(encoding::detectEncoding(
+            Env.getSourceManager().getBufferData(Env.getFileID()))) {
+    DEBUG(llvm::dbgs() << "File encoding: "
+                       << (Encoding == encoding::Encoding_UTF8 ? "UTF8"
+                                                               : "unknown")
+                       << "\n");
+    DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language)
+                       << "\n");
+  }
+
+  tooling::Replacements process() {
+    tooling::Replacements Result;
+    FormatTokenLexer Tokens(Env.getSourceManager(), Env.getFileID(), Style,
+                            Encoding);
+
+    UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(),
+                               *this);
+    Parser.parse();
+    assert(UnwrappedLines.rbegin()->empty());
+    for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE;
+         ++Run) {
+      DEBUG(llvm::dbgs() << "Run " << Run << "...\n");
+      SmallVector<AnnotatedLine *, 16> AnnotatedLines;
+
+      TokenAnnotator Annotator(Style, Tokens.getKeywords());
+      for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) {
+        AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i]));
+        Annotator.annotate(*AnnotatedLines.back());
+      }
+
+      tooling::Replacements RunResult =
+          analyze(Annotator, AnnotatedLines, Tokens, Result);
+
+      DEBUG({
+        llvm::dbgs() << "Replacements for run " << Run << ":\n";
+        for (tooling::Replacements::iterator I = RunResult.begin(),
+                                             E = RunResult.end();
+             I != E; ++I) {
+          llvm::dbgs() << I->toString() << "\n";
+        }
+      });
+      for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
+        delete AnnotatedLines[i];
+      }
+      Result.insert(RunResult.begin(), RunResult.end());
+    }
+    return Result;
+  }
+
+protected:
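+  // Implemented by subclasses to compute replacements for one run of
+  // annotated lines.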
+  virtual tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) = 0;
+
+  void consumeUnwrappedLine(const UnwrappedLine &TheLine) override {
+    assert(!UnwrappedLines.empty());
+    UnwrappedLines.back().push_back(TheLine);
+  }
+
+  void finishRun() override {
+    UnwrappedLines.push_back(SmallVector<UnwrappedLine, 16>());
+  }
+
+  FormatStyle Style;
+  // Stores the FileID, SourceManager and ranges to be processed.
+  const Environment &Env;
+  // AffectedRangeMgr stores ranges to be fixed.
+  AffectedRangeManager AffectedRangeMgr;
+  SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
+  encoding::Encoding Encoding;
+};
+
+} // end namespace format
+} // end namespace clang
+
+#endif
Index: lib/Format/SortJavaScriptImports.h
===================================================================
--- /dev/null
+++ lib/Format/SortJavaScriptImports.h
@@ -0,0 +1,47 @@
+//===--- SortJavaScriptImports.h - Sort ES6 Imports -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a sorter for JavaScript ES6 imports.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
+#define LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+namespace format {
+
+// Sort JavaScript ES6 imports/exports in ``Code``.
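+//
+// Imports are sorted into groups - absolute, parent-relative ('../') and
+// relative ('./') - each ordered by URL and separated by a blank line, with
+// exports following the imports. A minimal sketch of the resulting order,
+// using the paths from SortImportsTestJS.cpp:
+//
+//   import {a} from 'absolute';
+//
+//   import {b} from '../parent';
+//
+//   import {b} from './relative/path';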
+tooling::Replacements sortJavaScriptImports(const FormatStyle &Style,
+                                            StringRef Code,
+                                            ArrayRef<tooling::Range> Ranges,
+                                            StringRef FileName,
+                                            unsigned *Cursor = nullptr);
+
+} // end namespace format
+} // end namespace clang
+
+#endif
Index: lib/Format/SortJavaScriptImports.cpp
===================================================================
--- /dev/null
+++ lib/Format/SortJavaScriptImports.cpp
@@ -0,0 +1,301 @@
+//===--- SortJavaScriptImports.cpp - Sort ES6 Imports -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a sort operation for JavaScript ES6 imports.
+///
+//===----------------------------------------------------------------------===//
+
+#include "SortJavaScriptImports.h"
+#include "SortJavaScriptImports.h"
+#include "TokenAnalyzer.h"
+#include "TokenAnnotator.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include <string>
+
+#define DEBUG_TYPE "format-formatter"
+
+namespace clang {
+namespace format {
+
+class FormatTokenLexer;
+
+using clang::format::FormatStyle;
+
+// An imported symbol in a JavaScript ES6 import/export, possibly aliased.
+struct JsImportedSymbol {
+  StringRef Symbol;
+  StringRef Alias;
+};
+
+struct JsImportExport {
+  bool IsExport;
+  // JS imports are sorted into these categories, in order.
+  enum JsImportCategory {
+    ABSOLUTE,        // from 'something'
+    RELATIVE_PARENT, // from '../*'
+    RELATIVE,        // from './*'
+  };
+  JsImportCategory Category;
+  // Empty for `export {a, b};`.
+  StringRef URL;
+  // Prefix from "import * as prefix". Empty for symbol imports. Implies an
+  // empty names list.
+  StringRef Prefix;
+  // Symbols from `import {SymbolA, SymbolB, ...} from ...;`.
+  SmallVector<JsImportedSymbol, 1> Symbols;
+  // Textual position of the import/export, including preceding and trailing
+  // comments.
+  SourceLocation Start;
+  SourceLocation End;
+};
+
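+// Orders imports/exports for sorting: imports before exports, then by
+// category, then by URL (empty URLs, i.e. 'export {...};', sort last), with
+// '*' imports sorting before symbol imports from the same URL.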
+bool operator<(const JsImportExport &LHS, const JsImportExport &RHS) {
+  if (LHS.IsExport != RHS.IsExport)
+    return LHS.IsExport < RHS.IsExport;
+  if (LHS.Category != RHS.Category)
+    return LHS.Category < RHS.Category;
+  // NB: empty URLs sort *last* (for export {...};).
+  if (LHS.URL.empty() != RHS.URL.empty())
+    return LHS.URL.empty() < RHS.URL.empty();
+  if (LHS.URL != RHS.URL)
+    return LHS.URL < RHS.URL;
+  // NB: '*' imports (with prefix) sort before {a, b, ...} imports.
+  if (LHS.Prefix.empty() != RHS.Prefix.empty())
+    return LHS.Prefix.empty() < RHS.Prefix.empty();
+  if (LHS.Prefix != RHS.Prefix)
+    return LHS.Prefix > RHS.Prefix;
+  return false;
+}
+
+// JavaScriptImportSorter sorts JavaScript ES6 imports and exports. It is
+// implemented as a TokenAnalyzer because ES6 imports have substantial syntactic
+// structure, making it messy to sort them using regular expressions.
+class JavaScriptImportSorter : public TokenAnalyzer {
+public:
+  JavaScriptImportSorter(const Environment &Env, const FormatStyle &Style)
+      : TokenAnalyzer(Env, Style),
+        FileContents(Env.getSourceManager().getBufferData(Env.getFileID())) {}
+
+  tooling::Replacements
+  analyze(TokenAnnotator &Annotator,
+          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
+          FormatTokenLexer &Tokens, tooling::Replacements &Result) override {
+    AffectedRangeMgr.computeAffectedLines(AnnotatedLines.begin(),
+                                          AnnotatedLines.end());
+
+    const AdditionalKeywords &Keywords = Tokens.getKeywords();
+
+    SmallVector<JsImportExport, 16> Imports;
+    SourceLocation LastStart;
+    for (auto Line : AnnotatedLines) {
+      if (!Line->Affected)
+        break;
+      Current = Line->First;
+      LineEnd = Line->Last;
+      JsImportExport ImpExp;
+      skipComments();
+      if (LastStart.isInvalid() || Imports.empty()) {
+        // After the first file level comment, consider line comments to be part
+        // of the import that immediately follows them by using the previously
+        // set LastStart.
+        LastStart = Line->First->Tok.getLocation();
+      }
+      if (!Current)
+        continue; // Only comments on this line.
+      ImpExp.Start = LastStart;
+      LastStart = SourceLocation();
+      if (!parseImportExport(Keywords, ImpExp))
+        break;
+      ImpExp.End = LineEnd->Tok.getEndLoc();
+      DEBUG({
+        llvm::dbgs() << "Import: {"
+                     << "is_export: " << ImpExp.IsExport
+                     << ", cat: " << ImpExp.Category << ", url: " << ImpExp.URL
+                     << ", prefix: " << ImpExp.Prefix;
+        for (size_t i = 0; i < ImpExp.Symbols.size(); ++i)
+          llvm::dbgs() << ", " << ImpExp.Symbols[i].Symbol << " as "
+                       << ImpExp.Symbols[i].Alias;
+        llvm::dbgs() << ", text: " << getSourceText(ImpExp.Start, ImpExp.End);
+        llvm::dbgs() << "}\n";
+      });
+      Imports.push_back(ImpExp);
+    }
+
+    if (Imports.empty())
+      return Result;
+
+    SmallVector<unsigned, 16> Indices;
+    for (unsigned i = 0, e = Imports.size(); i != e; ++i)
+      Indices.push_back(i);
+    std::stable_sort(Indices.begin(), Indices.end(),
+                     [&](unsigned LHSI, unsigned RHSI) {
+                       return Imports[LHSI] < Imports[RHSI];
+                     });
+
+    bool OutOfOrder = false;
+    for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+      if (i != Indices[i]) {
+        OutOfOrder = true;
+        break;
+      }
+    }
+    if (!OutOfOrder)
+      return Result;
+
+    // Replace all existing import/export statements.
+    std::string ImportsText;
+    for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+      JsImportExport ImpExp = Imports[Indices[i]];
+      StringRef ImportStmt = getSourceText(ImpExp.Start, ImpExp.End);
+      ImportsText += ImportStmt;
+      ImportsText += "\n";
+      // Separate import groups (but not exports) with an extra line break.
+      if (i + 1 < e && !ImpExp.IsExport &&
+          ImpExp.Category != Imports[Indices[i + 1]].Category)
+        ImportsText += "\n";
+    }
+    SourceLocation InsertionPoint = Imports[0].Start;
+    SourceLocation End = Imports[Imports.size() - 1].End;
+    DEBUG(llvm::dbgs() << "Replacing imports:\n"
+                       << getSourceText(InsertionPoint, End) << "\nwith:\n"
+                       << ImportsText);
+    Result.insert(tooling::Replacement(
+        Env.getSourceManager(),
+        CharSourceRange::getCharRange(InsertionPoint, End), ImportsText));
+
+    return Result;
+  }
+
+private:
+  FormatToken *Current;
+  FormatToken *LineEnd;
+  StringRef FileContents;
+
+  void skipComments() { Current = skipComments(Current); }
+
+  FormatToken *skipComments(FormatToken *Tok) {
+    while (Tok && Tok->is(tok::comment))
+      Tok = Tok->Next;
+    return Tok;
+  }
+
+  bool nextToken() {
+    Current = Current->Next;
+    skipComments();
+    return Current && Current != LineEnd->Next;
+  }
+
+  StringRef getSourceText(SourceLocation Start, SourceLocation End) {
+    const SourceManager &SM = Env.getSourceManager();
+    return FileContents.substr(SM.getFileOffset(Start),
+                               SM.getFileOffset(End) - SM.getFileOffset(Start));
+  }
+
+  bool parseImportExport(const AdditionalKeywords &Keywords,
+                         JsImportExport &ImpExp) {
+    if (!Current || !Current->isOneOf(Keywords.kw_import, tok::kw_export))
+      return false;
+    ImpExp.IsExport = Current->is(tok::kw_export);
+    nextToken();
+
+    if (!parseImportExportSpecifier(Keywords, ImpExp) || !nextToken())
+      return false;
+    if (Current->is(Keywords.kw_from)) {
+      // Imports have a 'from' clause; exports might not.
+      if (!nextToken())
+        return false;
+      if (!Current->isStringLiteral())
+        return false;
+      // URL = TokenText without the quotes.
+      ImpExp.URL = Current->TokenText.substr(1, Current->TokenText.size() - 2);
+      if (ImpExp.URL.startswith("..")) {
+        ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE_PARENT;
+      } else if (ImpExp.URL.startswith(".")) {
+        ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE;
+      } else {
+        ImpExp.Category = JsImportExport::JsImportCategory::ABSOLUTE;
+      }
+    } else {
+      // Exports without a URL ('export {a, b};') are categorized as relative.
+      ImpExp.Category = JsImportExport::JsImportCategory::RELATIVE;
+    }
+    return true;
+  }
+
+  bool parseImportExportSpecifier(const AdditionalKeywords &Keywords,
+                                  JsImportExport &ImpExp) {
+    // * as prefix from '...';
+    if (Current->is(tok::star)) {
+      if (!nextToken())
+        return false;
+      if (!Current->is(Keywords.kw_as) || !nextToken())
+        return false;
+      if (!Current->is(tok::identifier))
+        return false;
+      ImpExp.Prefix = Current->TokenText;
+      return true;
+    }
+
+    if (!Current->is(tok::l_brace))
+      return false;
+
+    // {sym as alias, sym2 as ...} from '...';
+    if (!nextToken())
+      return false;
+    while (true) {
+      if (!Current->is(tok::identifier))
+        return false;
+
+      JsImportedSymbol Symbol;
+      Symbol.Symbol = Current->TokenText;
+      nextToken();
+
+      if (Current->is(Keywords.kw_as)) {
+        nextToken();
+        if (!Current->is(tok::identifier))
+          return false;
+        Symbol.Alias = Current->TokenText;
+        nextToken();
+      }
+      ImpExp.Symbols.push_back(Symbol);
+
+      if (Current->is(tok::r_brace))
+        return true;
+      if (!Current->is(tok::comma))
+        return false;
+      nextToken();
+    }
+  }
+};
+
+tooling::Replacements sortJavaScriptImports(const FormatStyle &Style,
+                                            StringRef Code,
+                                            ArrayRef<tooling::Range> Ranges,
+                                            StringRef FileName,
+                                            unsigned *Cursor) {
+  // TODO(martinprobst): Cursor support.
+  std::unique_ptr<Environment> Env =
+      Environment::CreateVirtualEnvironment(Code, FileName, Ranges);
+  JavaScriptImportSorter Sorter(*Env, Style);
+  return Sorter.process();
+}
+
+} // end namespace format
+} // end namespace clang
Index: lib/Format/FormatTokenLexer.h
===================================================================
--- /dev/null
+++ lib/Format/FormatTokenLexer.h
@@ -0,0 +1,684 @@
+//===--- FormatTokenLexer.h - Lex FormatTokens ---------------*- C++ ----*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains FormatTokenLexer, which tokenizes a source file
+/// into a token stream suitable for ClangFormat.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
+#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
+
+#include "FormatToken.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Format/Format.h"
+#include "llvm/Support/Regex.h"
+
+namespace clang {
+namespace format {
+
+class FormatTokenLexer {
+public:
+  FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
+                   const FormatStyle &Style, encoding::Encoding Encoding)
+      : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
+        LessStashed(false), Column(0), TrailingWhitespace(0),
+        SourceMgr(SourceMgr), ID(ID), Style(Style),
+        IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
+        Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
+        MacroBlockBeginRegex(Style.MacroBlockBegin),
+        MacroBlockEndRegex(Style.MacroBlockEnd) {
+    Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
+                        getFormattingLangOpts(Style)));
+    Lex->SetKeepWhitespaceMode(true);
+
+    for (const std::string &ForEachMacro : Style.ForEachMacros)
+      ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
+    std::sort(ForEachMacros.begin(), ForEachMacros.end());
+  }
+
+  ArrayRef<FormatToken *> lex() {
+    assert(Tokens.empty());
+    assert(FirstInLineIndex == 0);
+    do {
+      Tokens.push_back(getNextToken());
+      if (Style.Language == FormatStyle::LK_JavaScript)
+        tryParseJSRegexLiteral();
+      tryMergePreviousTokens();
+      if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
+        FirstInLineIndex = Tokens.size() - 1;
+    } while (Tokens.back()->Tok.isNot(tok::eof));
+    return Tokens;
+  }
+
+  const AdditionalKeywords &getKeywords() { return Keywords; }
+
+private:
+  void tryMergePreviousTokens() {
+    if (tryMerge_TMacro())
+      return;
+    if (tryMergeConflictMarkers())
+      return;
+    if (tryMergeLessLess())
+      return;
+
+    if (Style.Language == FormatStyle::LK_JavaScript) {
+      if (tryMergeTemplateString())
+        return;
+
+      static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
+      static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
+                                                     tok::equal};
+      static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
+                                                    tok::greaterequal};
+      static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
+      // FIXME: Investigate what token type gives the correct operator priority.
+      if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
+        return;
+      if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
+        return;
+      if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
+        return;
+      if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
+        return;
+    }
+  }
+
+  bool tryMergeLessLess() {
+    // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
+    if (Tokens.size() < 3)
+      return false;
+
+    bool FourthTokenIsLess = false;
+    if (Tokens.size() > 3)
+      FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
+
+    auto First = Tokens.end() - 3;
+    if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
+        First[0]->isNot(tok::less) || FourthTokenIsLess)
+      return false;
+
+    // Only merge if there currently is no whitespace between the two "<".
+    if (First[1]->WhitespaceRange.getBegin() !=
+        First[1]->WhitespaceRange.getEnd())
+      return false;
+
+    First[0]->Tok.setKind(tok::lessless);
+    First[0]->TokenText = "<<";
+    First[0]->ColumnWidth += 1;
+    Tokens.erase(Tokens.end() - 2);
+    return true;
+  }
+
+  bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType) {
+    if (Tokens.size() < Kinds.size())
+      return false;
+
+    SmallVectorImpl<FormatToken *>::const_iterator First =
+        Tokens.end() - Kinds.size();
+    if (!First[0]->is(Kinds[0]))
+      return false;
+    unsigned AddLength = 0;
+    for (unsigned i = 1; i < Kinds.size(); ++i) {
+      if (!First[i]->is(Kinds[i]) ||
+          First[i]->WhitespaceRange.getBegin() !=
+              First[i]->WhitespaceRange.getEnd())
+        return false;
+      AddLength += First[i]->TokenText.size();
+    }
+    Tokens.resize(Tokens.size() - Kinds.size() + 1);
+    First[0]->TokenText = StringRef(First[0]->TokenText.data(),
+                                    First[0]->TokenText.size() + AddLength);
+    First[0]->ColumnWidth += AddLength;
+    First[0]->Type = NewType;
+    return true;
+  }
+
+  // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
+  bool precedesOperand(FormatToken *Tok) {
+    // NB: This is not entirely correct, as an r_paren can introduce an operand
+    // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
+    // corner case to not matter in practice, though.
+    return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
+                        tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
+                        tok::colon, tok::question, tok::tilde) ||
+           Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
+                        tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
+                        tok::kw_typeof, Keywords.kw_instanceof,
+                        Keywords.kw_in) ||
+           Tok->isBinaryOperator();
+  }
+
+  bool canPrecedeRegexLiteral(FormatToken *Prev) {
+    if (!Prev)
+      return true;
+
+    // Regex literals can only follow after prefix unary operators, not after
+    // postfix unary operators. If the '++' is followed by a non-operand
+    // introducing token, the slash here is the operand and not the start of a
+    // regex.
+    if (Prev->isOneOf(tok::plusplus, tok::minusminus))
+      return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
+
+    // The previous token must introduce an operand location where regex
+    // literals can occur.
+    if (!precedesOperand(Prev))
+      return false;
+
+    return true;
+  }
+
+  // Tries to parse a JavaScript Regex literal starting at the current token,
+  // if that begins with a slash and is in a location where JavaScript allows
+  // regex literals. Changes the current token to a regex literal and updates
+  // its text if successful.
+  void tryParseJSRegexLiteral() {
+    FormatToken *RegexToken = Tokens.back();
+    if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
+      return;
+
+    FormatToken *Prev = nullptr;
+    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
+      // NB: Because previous pointers are not initialized yet, this cannot use
+      // Token.getPreviousNonComment.
+      if ((*I)->isNot(tok::comment)) {
+        Prev = *I;
+        break;
+      }
+    }
+
+    if (!canPrecedeRegexLiteral(Prev))
+      return;
+
+    // 'Manually' lex ahead in the current file buffer.
+    const char *Offset = Lex->getBufferLocation();
+    const char *RegexBegin = Offset - RegexToken->TokenText.size();
+    StringRef Buffer = Lex->getBuffer();
+    bool InCharacterClass = false;
+    bool HaveClosingSlash = false;
+    for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
+      // Regular expressions are terminated with a '/', which can only be
+      // escaped using '\' or a character class between '[' and ']'.
+      // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
+      switch (*Offset) {
+      case '\\':
+        // Skip the escaped character.
+        ++Offset;
+        break;
+      case '[':
+        InCharacterClass = true;
+        break;
+      case ']':
+        InCharacterClass = false;
+        break;
+      case '/':
+        if (!InCharacterClass)
+          HaveClosingSlash = true;
+        break;
+      }
+    }
+
+    RegexToken->Type = TT_RegexLiteral;
+    // Treat regex literals like other string_literals.
+    RegexToken->Tok.setKind(tok::string_literal);
+    RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
+    RegexToken->ColumnWidth = RegexToken->TokenText.size();
+
+    resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
+  }
+
+  bool tryMergeTemplateString() {
+    if (Tokens.size() < 2)
+      return false;
+
+    FormatToken *EndBacktick = Tokens.back();
+    // Backticks get lexed as tok::unknown tokens. If a template string contains
+    // a comment start, it gets lexed as a tok::comment, or tok::unknown if
+    // unterminated.
+    if (!EndBacktick->isOneOf(tok::comment, tok::string_literal,
+                              tok::char_constant, tok::unknown))
+      return false;
+    size_t CommentBacktickPos = EndBacktick->TokenText.find('`');
+    // Unknown token that's not actually a backtick, or a comment that doesn't
+    // contain a backtick.
+    if (CommentBacktickPos == StringRef::npos)
+      return false;
+
+    unsigned TokenCount = 0;
+    bool IsMultiline = false;
+    unsigned EndColumnInFirstLine =
+        EndBacktick->OriginalColumn + EndBacktick->ColumnWidth;
+    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; I++) {
+      ++TokenCount;
+      if (I[0]->IsMultiline)
+        IsMultiline = true;
+
+      // If there was a preceding template string, this must be the start of a
+      // template string, not the end.
+      if (I[0]->is(TT_TemplateString))
+        return false;
+
+      if (I[0]->isNot(tok::unknown) || I[0]->TokenText != "`") {
+        // Keep track of the rhs offset of the last token to wrap across lines -
+        // it's the rhs offset of the first line of the template string, used to
+        // determine its width.
+        if (I[0]->IsMultiline)
+          EndColumnInFirstLine = I[0]->OriginalColumn + I[0]->ColumnWidth;
+        // If the token has newlines, the token before it (if it exists) is the
+        // rhs end of the previous line.
+        if (I[0]->NewlinesBefore > 0 && (I + 1 != E)) {
+          EndColumnInFirstLine = I[1]->OriginalColumn + I[1]->ColumnWidth;
+          IsMultiline = true;
+        }
+        continue;
+      }
+
+      Tokens.resize(Tokens.size() - TokenCount);
+      Tokens.back()->Type = TT_TemplateString;
+      const char *EndOffset =
+          EndBacktick->TokenText.data() + 1 + CommentBacktickPos;
+      if (CommentBacktickPos != 0) {
+        // If the backtick was not the first character (e.g. in a comment),
+        // re-lex after the backtick position.
+        SourceLocation Loc = EndBacktick->Tok.getLocation();
+        resetLexer(SourceMgr.getFileOffset(Loc) + CommentBacktickPos + 1);
+      }
+      Tokens.back()->TokenText =
+          StringRef(Tokens.back()->TokenText.data(),
+                    EndOffset - Tokens.back()->TokenText.data());
+
+      unsigned EndOriginalColumn = EndBacktick->OriginalColumn;
+      if (EndOriginalColumn == 0) {
+        SourceLocation Loc = EndBacktick->Tok.getLocation();
+        EndOriginalColumn = SourceMgr.getSpellingColumnNumber(Loc);
+      }
+      // If the ` is further down within the token (e.g. in a comment).
+      EndOriginalColumn += CommentBacktickPos;
+
+      if (IsMultiline) {
+        // ColumnWidth is from backtick to last token in line.
+        // LastLineColumnWidth is 0 to backtick.
+        // x = `some content
+        //     until here`;
+        Tokens.back()->ColumnWidth =
+            EndColumnInFirstLine - Tokens.back()->OriginalColumn;
+        // +1 for the ` itself.
+        Tokens.back()->LastLineColumnWidth = EndOriginalColumn + 1;
+        Tokens.back()->IsMultiline = true;
+      } else {
+        // Token simply spans from start to end, +1 for the ` itself.
+        Tokens.back()->ColumnWidth =
+            EndOriginalColumn - Tokens.back()->OriginalColumn + 1;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool tryMerge_TMacro() {
+    if (Tokens.size() < 4)
+      return false;
+    FormatToken *Last = Tokens.back();
+    if (!Last->is(tok::r_paren))
+      return false;
+
+    FormatToken *String = Tokens[Tokens.size() - 2];
+    if (!String->is(tok::string_literal) || String->IsMultiline)
+      return false;
+
+    if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
+      return false;
+
+    FormatToken *Macro = Tokens[Tokens.size() - 4];
+    if (Macro->TokenText != "_T")
+      return false;
+
+    const char *Start = Macro->TokenText.data();
+    const char *End = Last->TokenText.data() + Last->TokenText.size();
+    String->TokenText = StringRef(Start, End - Start);
+    String->IsFirst = Macro->IsFirst;
+    String->LastNewlineOffset = Macro->LastNewlineOffset;
+    String->WhitespaceRange = Macro->WhitespaceRange;
+    String->OriginalColumn = Macro->OriginalColumn;
+    String->ColumnWidth = encoding::columnWidthWithTabs(
+        String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
+    String->NewlinesBefore = Macro->NewlinesBefore;
+    String->HasUnescapedNewline = Macro->HasUnescapedNewline;
+
+    Tokens.pop_back();
+    Tokens.pop_back();
+    Tokens.pop_back();
+    Tokens.back() = String;
+    return true;
+  }
+
+  bool tryMergeConflictMarkers() {
+    if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
+      return false;
+
+    // Conflict lines look like:
+    // <marker> <text from the vcs>
+    // For example:
+    // >>>>>>> /file/in/file/system at revision 1234
+    //
+    // We merge all tokens in a line that starts with a conflict marker
+    // into a single token with a special token type that the unwrapped line
+    // parser will use to correctly rebuild the underlying code.
+
+    FileID ID;
+    // Get the position of the first token in the line.
+    unsigned FirstInLineOffset;
+    std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
+        Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
+    StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
+    // Calculate the offset of the start of the current line.
+    auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
+    if (LineOffset == StringRef::npos) {
+      LineOffset = 0;
+    } else {
+      ++LineOffset;
+    }
+
+    auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
+    StringRef LineStart;
+    if (FirstSpace == StringRef::npos) {
+      LineStart = Buffer.substr(LineOffset);
+    } else {
+      LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
+    }
+
+    TokenType Type = TT_Unknown;
+    if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
+      Type = TT_ConflictStart;
+    } else if (LineStart == "|||||||" || LineStart == "=======" ||
+               LineStart == "====") {
+      Type = TT_ConflictAlternative;
+    } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
+      Type = TT_ConflictEnd;
+    }
+
+    if (Type != TT_Unknown) {
+      FormatToken *Next = Tokens.back();
+
+      Tokens.resize(FirstInLineIndex + 1);
+      // We do not need to build a complete token here, as we will skip it
+      // during parsing anyway (as we must not touch whitespace around conflict
+      // markers).
+      Tokens.back()->Type = Type;
+      Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
+
+      Tokens.push_back(Next);
+      return true;
+    }
+
+    return false;
+  }
+
+  FormatToken *getStashedToken() {
+    // Create a synthesized second '>' or '<' token.
+    Token Tok = FormatTok->Tok;
+    StringRef TokenText = FormatTok->TokenText;
+
+    unsigned OriginalColumn = FormatTok->OriginalColumn;
+    FormatTok = new (Allocator.Allocate()) FormatToken;
+    FormatTok->Tok = Tok;
+    SourceLocation TokLocation =
+        FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
+    FormatTok->Tok.setLocation(TokLocation);
+    FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
+    FormatTok->TokenText = TokenText;
+    FormatTok->ColumnWidth = 1;
+    FormatTok->OriginalColumn = OriginalColumn + 1;
+
+    return FormatTok;
+  }
+
+  FormatToken *getNextToken() {
+    if (GreaterStashed) {
+      GreaterStashed = false;
+      return getStashedToken();
+    }
+    if (LessStashed) {
+      LessStashed = false;
+      return getStashedToken();
+    }
+
+    FormatTok = new (Allocator.Allocate()) FormatToken;
+    readRawToken(*FormatTok);
+    SourceLocation WhitespaceStart =
+        FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
+    FormatTok->IsFirst = IsFirstToken;
+    IsFirstToken = false;
+
+    // Consume and record whitespace until we find a significant token.
+    unsigned WhitespaceLength = TrailingWhitespace;
+    while (FormatTok->Tok.is(tok::unknown)) {
+      StringRef Text = FormatTok->TokenText;
+      auto EscapesNewline = [&](int pos) {
+        // A '\r' here is just part of '\r\n'. Skip it.
+        if (pos >= 0 && Text[pos] == '\r')
+          --pos;
+        // See whether there is an odd number of '\' before this.
+        unsigned count = 0;
+        for (; pos >= 0; --pos, ++count)
+          if (Text[pos] != '\\')
+            break;
+        return count & 1;
+      };
+      // FIXME: This miscounts tok::unknown tokens that are not just
+      // whitespace, e.g. a '`' character.
+      for (int i = 0, e = Text.size(); i != e; ++i) {
+        switch (Text[i]) {
+        case '\n':
+          ++FormatTok->NewlinesBefore;
+          FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
+          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
+          Column = 0;
+          break;
+        case '\r':
+          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
+          Column = 0;
+          break;
+        case '\f':
+        case '\v':
+          Column = 0;
+          break;
+        case ' ':
+          ++Column;
+          break;
+        case '\t':
+          Column += Style.TabWidth - Column % Style.TabWidth;
+          break;
+        case '\\':
+          if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
+            FormatTok->Type = TT_ImplicitStringLiteral;
+          break;
+        default:
+          FormatTok->Type = TT_ImplicitStringLiteral;
+          break;
+        }
+        if (FormatTok->Type == TT_ImplicitStringLiteral)
+          break;
+      }
+
+      if (FormatTok->is(TT_ImplicitStringLiteral))
+        break;
+      WhitespaceLength += FormatTok->Tok.getLength();
+
+      readRawToken(*FormatTok);
+    }
+
+    // In case the token starts with escaped newlines, we want to
+    // take them into account as whitespace - this pattern is quite frequent
+    // in macro definitions.
+    // FIXME: Add a more explicit test.
+    while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
+           FormatTok->TokenText[1] == '\n') {
+      ++FormatTok->NewlinesBefore;
+      WhitespaceLength += 2;
+      FormatTok->LastNewlineOffset = 2;
+      Column = 0;
+      FormatTok->TokenText = FormatTok->TokenText.substr(2);
+    }
+
+    FormatTok->WhitespaceRange = SourceRange(
+        WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
+
+    FormatTok->OriginalColumn = Column;
+
+    TrailingWhitespace = 0;
+    if (FormatTok->Tok.is(tok::comment)) {
+      // FIXME: Add the trimmed whitespace to Column.
+      StringRef UntrimmedText = FormatTok->TokenText;
+      FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
+      TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
+    } else if (FormatTok->Tok.is(tok::raw_identifier)) {
+      IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
+      FormatTok->Tok.setIdentifierInfo(&Info);
+      FormatTok->Tok.setKind(Info.getTokenID());
+      if (Style.Language == FormatStyle::LK_Java &&
+          FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
+                             tok::kw_operator)) {
+        FormatTok->Tok.setKind(tok::identifier);
+        FormatTok->Tok.setIdentifierInfo(nullptr);
+      } else if (Style.Language == FormatStyle::LK_JavaScript &&
+                 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
+                                    tok::kw_operator)) {
+        FormatTok->Tok.setKind(tok::identifier);
+        FormatTok->Tok.setIdentifierInfo(nullptr);
+      }
+    } else if (FormatTok->Tok.is(tok::greatergreater)) {
+      FormatTok->Tok.setKind(tok::greater);
+      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
+      GreaterStashed = true;
+    } else if (FormatTok->Tok.is(tok::lessless)) {
+      FormatTok->Tok.setKind(tok::less);
+      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
+      LessStashed = true;
+    }
+
+    // Now FormatTok is the next non-whitespace token.
+
+    StringRef Text = FormatTok->TokenText;
+    size_t FirstNewlinePos = Text.find('\n');
+    if (FirstNewlinePos == StringRef::npos) {
+      // FIXME: ColumnWidth actually depends on the start column, we need to
+      // take this into account when the token is moved.
+      FormatTok->ColumnWidth =
+          encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
+      Column += FormatTok->ColumnWidth;
+    } else {
+      FormatTok->IsMultiline = true;
+      // FIXME: ColumnWidth actually depends on the start column, we need to
+      // take this into account when the token is moved.
+      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
+          Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
+
+      // The last line of the token always starts in column 0.
+      // Thus, the length can be precomputed even in the presence of tabs.
+      FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
+          Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
+          Encoding);
+      Column = FormatTok->LastLineColumnWidth;
+    }
+
+    if (Style.Language == FormatStyle::LK_Cpp) {
+      if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
+            Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
+                tok::pp_define) &&
+          std::find(ForEachMacros.begin(), ForEachMacros.end(),
+                    FormatTok->Tok.getIdentifierInfo()) !=
+              ForEachMacros.end()) {
+        FormatTok->Type = TT_ForEachMacro;
+      } else if (FormatTok->is(tok::identifier)) {
+        if (MacroBlockBeginRegex.match(Text)) {
+          FormatTok->Type = TT_MacroBlockBegin;
+        } else if (MacroBlockEndRegex.match(Text)) {
+          FormatTok->Type = TT_MacroBlockEnd;
+        }
+      }
+    }
+
+    return FormatTok;
+  }
+
+  FormatToken *FormatTok;
+  bool IsFirstToken;
+  bool GreaterStashed, LessStashed;
+  unsigned Column;
+  unsigned TrailingWhitespace;
+  std::unique_ptr<Lexer> Lex;
+  const SourceManager &SourceMgr;
+  FileID ID;
+  const FormatStyle &Style;
+  IdentifierTable IdentTable;
+  AdditionalKeywords Keywords;
+  encoding::Encoding Encoding;
+  llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
+  // Index (in 'Tokens') of the last token that starts a new line.
+  unsigned FirstInLineIndex;
+  SmallVector<FormatToken *, 16> Tokens;
+  SmallVector<IdentifierInfo *, 8> ForEachMacros;
+
+  bool FormattingDisabled;
+
+  llvm::Regex MacroBlockBeginRegex;
+  llvm::Regex MacroBlockEndRegex;
+
+  void readRawToken(FormatToken &Tok) {
+    Lex->LexFromRawLexer(Tok.Tok);
+    Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
+                              Tok.Tok.getLength());
+    // For formatting, treat unterminated string literals like normal string
+    // literals.
+    if (Tok.is(tok::unknown)) {
+      if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
+        Tok.Tok.setKind(tok::string_literal);
+        Tok.IsUnterminatedLiteral = true;
+      } else if (Style.Language == FormatStyle::LK_JavaScript &&
+                 Tok.TokenText == "''") {
+        Tok.Tok.setKind(tok::string_literal);
+      }
+    }
+
+    if (Style.Language == FormatStyle::LK_JavaScript &&
+        Tok.is(tok::char_constant)) {
+      Tok.Tok.setKind(tok::string_literal);
+    }
+
+    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
+                                 Tok.TokenText == "/* clang-format on */")) {
+      FormattingDisabled = false;
+    }
+
+    Tok.Finalized = FormattingDisabled;
+
+    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
+                                 Tok.TokenText == "/* clang-format off */")) {
+      FormattingDisabled = true;
+    }
+  }
+
+  void resetLexer(unsigned Offset) {
+    StringRef Buffer = SourceMgr.getBufferData(ID);
+    Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
+                        getFormattingLangOpts(Style), Buffer.begin(),
+                        Buffer.begin() + Offset, Buffer.end()));
+    Lex->SetKeepWhitespaceMode(true);
+    TrailingWhitespace = 0;
+  }
+};
+
+
+} // namespace format
+} // namespace clang
+
+#endif
Index: lib/Format/FormatToken.h
===================================================================
--- lib/Format/FormatToken.h
+++ lib/Format/FormatToken.h
@@ -535,6 +535,7 @@
     kw_NS_ENUM = &IdentTable.get("NS_ENUM");
     kw_NS_OPTIONS = &IdentTable.get("NS_OPTIONS");
 
+    kw_as = &IdentTable.get("as");
     kw_async = &IdentTable.get("async");
     kw_await = &IdentTable.get("await");
     kw_finally = &IdentTable.get("finally");
@@ -585,6 +586,7 @@
   IdentifierInfo *kw___except;
 
   // JavaScript keywords.
+  IdentifierInfo *kw_as;
   IdentifierInfo *kw_async;
   IdentifierInfo *kw_await;
   IdentifierInfo *kw_finally;
Index: lib/Format/Format.cpp
===================================================================
--- lib/Format/Format.cpp
+++ lib/Format/Format.cpp
@@ -16,7 +16,10 @@
 #include "clang/Format/Format.h"
 #include "AffectedRangeManager.h"
 #include "ContinuationIndenter.h"
+#include "SortJavaScriptImports.h"
 #include "TokenAnnotator.h"
+#include "FormatTokenLexer.h"
+#include "TokenAnalyzer.h"
 #include "UnwrappedLineFormatter.h"
 #include "UnwrappedLineParser.h"
 #include "WhitespaceManager.h"
@@ -782,827 +785,6 @@
 
 namespace {
 
-class FormatTokenLexer {
-public:
-  FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
-                   const FormatStyle &Style, encoding::Encoding Encoding)
-      : FormatTok(nullptr), IsFirstToken(true), GreaterStashed(false),
-        LessStashed(false), Column(0), TrailingWhitespace(0),
-        SourceMgr(SourceMgr), ID(ID), Style(Style),
-        IdentTable(getFormattingLangOpts(Style)), Keywords(IdentTable),
-        Encoding(Encoding), FirstInLineIndex(0), FormattingDisabled(false),
-        MacroBlockBeginRegex(Style.MacroBlockBegin),
-        MacroBlockEndRegex(Style.MacroBlockEnd) {
-    Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
-                        getFormattingLangOpts(Style)));
-    Lex->SetKeepWhitespaceMode(true);
-
-    for (const std::string &ForEachMacro : Style.ForEachMacros)
-      ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
-    std::sort(ForEachMacros.begin(), ForEachMacros.end());
-  }
-
-  ArrayRef<FormatToken *> lex() {
-    assert(Tokens.empty());
-    assert(FirstInLineIndex == 0);
-    do {
-      Tokens.push_back(getNextToken());
-      if (Style.Language == FormatStyle::LK_JavaScript)
-        tryParseJSRegexLiteral();
-      tryMergePreviousTokens();
-      if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
-        FirstInLineIndex = Tokens.size() - 1;
-    } while (Tokens.back()->Tok.isNot(tok::eof));
-    return Tokens;
-  }
-
-  const AdditionalKeywords &getKeywords() { return Keywords; }
-
-private:
-  void tryMergePreviousTokens() {
-    if (tryMerge_TMacro())
-      return;
-    if (tryMergeConflictMarkers())
-      return;
-    if (tryMergeLessLess())
-      return;
-
-    if (Style.Language == FormatStyle::LK_JavaScript) {
-      if (tryMergeTemplateString())
-        return;
-
-      static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
-      static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
-                                                     tok::equal};
-      static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
-                                                    tok::greaterequal};
-      static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
-      // FIXME: Investigate what token type gives the correct operator priority.
-      if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
-        return;
-      if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
-        return;
-    }
-  }
-
-  bool tryMergeLessLess() {
-    // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
-    if (Tokens.size() < 3)
-      return false;
-
-    bool FourthTokenIsLess = false;
-    if (Tokens.size() > 3)
-      FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
-
-    auto First = Tokens.end() - 3;
-    if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
-        First[0]->isNot(tok::less) || FourthTokenIsLess)
-      return false;
-
-    // Only merge if there currently is no whitespace between the two "<".
-    if (First[1]->WhitespaceRange.getBegin() !=
-        First[1]->WhitespaceRange.getEnd())
-      return false;
-
-    First[0]->Tok.setKind(tok::lessless);
-    First[0]->TokenText = "<<";
-    First[0]->ColumnWidth += 1;
-    Tokens.erase(Tokens.end() - 2);
-    return true;
-  }
-
-  bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType) {
-    if (Tokens.size() < Kinds.size())
-      return false;
-
-    SmallVectorImpl<FormatToken *>::const_iterator First =
-        Tokens.end() - Kinds.size();
-    if (!First[0]->is(Kinds[0]))
-      return false;
-    unsigned AddLength = 0;
-    for (unsigned i = 1; i < Kinds.size(); ++i) {
-      if (!First[i]->is(Kinds[i]) ||
-          First[i]->WhitespaceRange.getBegin() !=
-              First[i]->WhitespaceRange.getEnd())
-        return false;
-      AddLength += First[i]->TokenText.size();
-    }
-    Tokens.resize(Tokens.size() - Kinds.size() + 1);
-    First[0]->TokenText = StringRef(First[0]->TokenText.data(),
-                                    First[0]->TokenText.size() + AddLength);
-    First[0]->ColumnWidth += AddLength;
-    First[0]->Type = NewType;
-    return true;
-  }
-
-  // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
-  bool precedesOperand(FormatToken *Tok) {
-    // NB: This is not entirely correct, as an r_paren can introduce an operand
-    // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
-    // corner case to not matter in practice, though.
-    return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
-                        tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
-                        tok::colon, tok::question, tok::tilde) ||
-           Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
-                        tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
-                        tok::kw_typeof, Keywords.kw_instanceof,
-                        Keywords.kw_in) ||
-           Tok->isBinaryOperator();
-  }
-
-  bool canPrecedeRegexLiteral(FormatToken *Prev) {
-    if (!Prev)
-      return true;
-
-    // Regex literals can only follow after prefix unary operators, not after
-    // postfix unary operators. If the '++' is followed by a non-operand
-    // introducing token, the slash here is the operand and not the start of a
-    // regex.
-    if (Prev->isOneOf(tok::plusplus, tok::minusminus))
-      return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
-
-    // The previous token must introduce an operand location where regex
-    // literals can occur.
-    if (!precedesOperand(Prev))
-      return false;
-
-    return true;
-  }
-
-  // Tries to parse a JavaScript Regex literal starting at the current token,
-  // if that begins with a slash and is in a location where JavaScript allows
-  // regex literals. Changes the current token to a regex literal and updates
-  // its text if successful.
-  void tryParseJSRegexLiteral() {
-    FormatToken *RegexToken = Tokens.back();
-    if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
-      return;
-
-    FormatToken *Prev = nullptr;
-    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
-      // NB: Because previous pointers are not initialized yet, this cannot use
-      // Token.getPreviousNonComment.
-      if ((*I)->isNot(tok::comment)) {
-        Prev = *I;
-        break;
-      }
-    }
-
-    if (!canPrecedeRegexLiteral(Prev))
-      return;
-
-    // 'Manually' lex ahead in the current file buffer.
-    const char *Offset = Lex->getBufferLocation();
-    const char *RegexBegin = Offset - RegexToken->TokenText.size();
-    StringRef Buffer = Lex->getBuffer();
-    bool InCharacterClass = false;
-    bool HaveClosingSlash = false;
-    for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
-      // Regular expressions are terminated with a '/', which can only be
-      // escaped using '\' or a character class between '[' and ']'.
-      // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
-      switch (*Offset) {
-      case '\\':
-        // Skip the escaped character.
-        ++Offset;
-        break;
-      case '[':
-        InCharacterClass = true;
-        break;
-      case ']':
-        InCharacterClass = false;
-        break;
-      case '/':
-        if (!InCharacterClass)
-          HaveClosingSlash = true;
-        break;
-      }
-    }
-
-    RegexToken->Type = TT_RegexLiteral;
-    // Treat regex literals like other string_literals.
-    RegexToken->Tok.setKind(tok::string_literal);
-    RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
-    RegexToken->ColumnWidth = RegexToken->TokenText.size();
-
-    resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
-  }
-
-  bool tryMergeTemplateString() {
-    if (Tokens.size() < 2)
-      return false;
-
-    FormatToken *EndBacktick = Tokens.back();
-    // Backticks get lexed as tok::unknown tokens. If a template string contains
-    // a comment start, it gets lexed as a tok::comment, or tok::unknown if
-    // unterminated.
-    if (!EndBacktick->isOneOf(tok::comment, tok::string_literal,
-                              tok::char_constant, tok::unknown))
-      return false;
-    size_t CommentBacktickPos = EndBacktick->TokenText.find('`');
-    // Unknown token that's not actually a backtick, or a comment that doesn't
-    // contain a backtick.
-    if (CommentBacktickPos == StringRef::npos)
-      return false;
-
-    unsigned TokenCount = 0;
-    bool IsMultiline = false;
-    unsigned EndColumnInFirstLine =
-        EndBacktick->OriginalColumn + EndBacktick->ColumnWidth;
-    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; I++) {
-      ++TokenCount;
-      if (I[0]->IsMultiline)
-        IsMultiline = true;
-
-      // If there was a preceding template string, this must be the start of a
-      // template string, not the end.
-      if (I[0]->is(TT_TemplateString))
-        return false;
-
-      if (I[0]->isNot(tok::unknown) || I[0]->TokenText != "`") {
-        // Keep track of the rhs offset of the last token to wrap across lines -
-        // it's the rhs offset of the first line of the template string, used to
-        // determine its width.
-        if (I[0]->IsMultiline)
-          EndColumnInFirstLine = I[0]->OriginalColumn + I[0]->ColumnWidth;
-        // If the token has newlines, the token before it (if it exists) is the
-        // rhs end of the previous line.
-        if (I[0]->NewlinesBefore > 0 && (I + 1 != E)) {
-          EndColumnInFirstLine = I[1]->OriginalColumn + I[1]->ColumnWidth;
-          IsMultiline = true;
-        }
-        continue;
-      }
-
-      Tokens.resize(Tokens.size() - TokenCount);
-      Tokens.back()->Type = TT_TemplateString;
-      const char *EndOffset =
-          EndBacktick->TokenText.data() + 1 + CommentBacktickPos;
-      if (CommentBacktickPos != 0) {
-        // If the backtick was not the first character (e.g. in a comment),
-        // re-lex after the backtick position.
-        SourceLocation Loc = EndBacktick->Tok.getLocation();
-        resetLexer(SourceMgr.getFileOffset(Loc) + CommentBacktickPos + 1);
-      }
-      Tokens.back()->TokenText =
-          StringRef(Tokens.back()->TokenText.data(),
-                    EndOffset - Tokens.back()->TokenText.data());
-
-      unsigned EndOriginalColumn = EndBacktick->OriginalColumn;
-      if (EndOriginalColumn == 0) {
-        SourceLocation Loc = EndBacktick->Tok.getLocation();
-        EndOriginalColumn = SourceMgr.getSpellingColumnNumber(Loc);
-      }
-      // If the ` is further down within the token (e.g. in a comment).
-      EndOriginalColumn += CommentBacktickPos;
-
-      if (IsMultiline) {
-        // ColumnWidth is from backtick to last token in line.
-        // LastLineColumnWidth is 0 to backtick.
-        // x = `some content
-        //     until here`;
-        Tokens.back()->ColumnWidth =
-            EndColumnInFirstLine - Tokens.back()->OriginalColumn;
-        // +1 for the ` itself.
-        Tokens.back()->LastLineColumnWidth = EndOriginalColumn + 1;
-        Tokens.back()->IsMultiline = true;
-      } else {
-        // Token simply spans from start to end, +1 for the ` itself.
-        Tokens.back()->ColumnWidth =
-            EndOriginalColumn - Tokens.back()->OriginalColumn + 1;
-      }
-      return true;
-    }
-    return false;
-  }
-
-  bool tryMerge_TMacro() {
-    if (Tokens.size() < 4)
-      return false;
-    FormatToken *Last = Tokens.back();
-    if (!Last->is(tok::r_paren))
-      return false;
-
-    FormatToken *String = Tokens[Tokens.size() - 2];
-    if (!String->is(tok::string_literal) || String->IsMultiline)
-      return false;
-
-    if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
-      return false;
-
-    FormatToken *Macro = Tokens[Tokens.size() - 4];
-    if (Macro->TokenText != "_T")
-      return false;
-
-    const char *Start = Macro->TokenText.data();
-    const char *End = Last->TokenText.data() + Last->TokenText.size();
-    String->TokenText = StringRef(Start, End - Start);
-    String->IsFirst = Macro->IsFirst;
-    String->LastNewlineOffset = Macro->LastNewlineOffset;
-    String->WhitespaceRange = Macro->WhitespaceRange;
-    String->OriginalColumn = Macro->OriginalColumn;
-    String->ColumnWidth = encoding::columnWidthWithTabs(
-        String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
-    String->NewlinesBefore = Macro->NewlinesBefore;
-    String->HasUnescapedNewline = Macro->HasUnescapedNewline;
-
-    Tokens.pop_back();
-    Tokens.pop_back();
-    Tokens.pop_back();
-    Tokens.back() = String;
-    return true;
-  }
-
-  bool tryMergeConflictMarkers() {
-    if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
-      return false;
-
-    // Conflict lines look like:
-    // <marker> <text from the vcs>
-    // For example:
-    // >>>>>>> /file/in/file/system at revision 1234
-    //
-    // We merge all tokens in a line that starts with a conflict marker
-    // into a single token with a special token type that the unwrapped line
-    // parser will use to correctly rebuild the underlying code.
-
-    FileID ID;
-    // Get the position of the first token in the line.
-    unsigned FirstInLineOffset;
-    std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
-        Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
-    StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
-    // Calculate the offset of the start of the current line.
-    auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
-    if (LineOffset == StringRef::npos) {
-      LineOffset = 0;
-    } else {
-      ++LineOffset;
-    }
-
-    auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
-    StringRef LineStart;
-    if (FirstSpace == StringRef::npos) {
-      LineStart = Buffer.substr(LineOffset);
-    } else {
-      LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
-    }
-
-    TokenType Type = TT_Unknown;
-    if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
-      Type = TT_ConflictStart;
-    } else if (LineStart == "|||||||" || LineStart == "=======" ||
-               LineStart == "====") {
-      Type = TT_ConflictAlternative;
-    } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
-      Type = TT_ConflictEnd;
-    }
-
-    if (Type != TT_Unknown) {
-      FormatToken *Next = Tokens.back();
-
-      Tokens.resize(FirstInLineIndex + 1);
-      // We do not need to build a complete token here, as we will skip it
-      // during parsing anyway (as we must not touch whitespace around conflict
-      // markers).
-      Tokens.back()->Type = Type;
-      Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
-
-      Tokens.push_back(Next);
-      return true;
-    }
-
-    return false;
-  }
-
-  FormatToken *getStashedToken() {
-    // Create a synthesized second '>' or '<' token.
-    Token Tok = FormatTok->Tok;
-    StringRef TokenText = FormatTok->TokenText;
-
-    unsigned OriginalColumn = FormatTok->OriginalColumn;
-    FormatTok = new (Allocator.Allocate()) FormatToken;
-    FormatTok->Tok = Tok;
-    SourceLocation TokLocation =
-        FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
-    FormatTok->Tok.setLocation(TokLocation);
-    FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
-    FormatTok->TokenText = TokenText;
-    FormatTok->ColumnWidth = 1;
-    FormatTok->OriginalColumn = OriginalColumn + 1;
-
-    return FormatTok;
-  }
-
-  FormatToken *getNextToken() {
-    if (GreaterStashed) {
-      GreaterStashed = false;
-      return getStashedToken();
-    }
-    if (LessStashed) {
-      LessStashed = false;
-      return getStashedToken();
-    }
-
-    FormatTok = new (Allocator.Allocate()) FormatToken;
-    readRawToken(*FormatTok);
-    SourceLocation WhitespaceStart =
-        FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
-    FormatTok->IsFirst = IsFirstToken;
-    IsFirstToken = false;
-
-    // Consume and record whitespace until we find a significant token.
-    unsigned WhitespaceLength = TrailingWhitespace;
-    while (FormatTok->Tok.is(tok::unknown)) {
-      StringRef Text = FormatTok->TokenText;
-      auto EscapesNewline = [&](int pos) {
-        // A '\r' here is just part of '\r\n'. Skip it.
-        if (pos >= 0 && Text[pos] == '\r')
-          --pos;
-        // See whether there is an odd number of '\' before this.
-        unsigned count = 0;
-        for (; pos >= 0; --pos, ++count)
-          if (Text[pos] != '\\')
-            break;
-        return count & 1;
-      };
-      // FIXME: This miscounts tok::unknown tokens that are not just
-      // whitespace, e.g. a '`' character.
-      for (int i = 0, e = Text.size(); i != e; ++i) {
-        switch (Text[i]) {
-        case '\n':
-          ++FormatTok->NewlinesBefore;
-          FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
-          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
-          Column = 0;
-          break;
-        case '\r':
-          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
-          Column = 0;
-          break;
-        case '\f':
-        case '\v':
-          Column = 0;
-          break;
-        case ' ':
-          ++Column;
-          break;
-        case '\t':
-          Column += Style.TabWidth - Column % Style.TabWidth;
-          break;
-        case '\\':
-          if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
-            FormatTok->Type = TT_ImplicitStringLiteral;
-          break;
-        default:
-          FormatTok->Type = TT_ImplicitStringLiteral;
-          break;
-        }
-        if (FormatTok->Type == TT_ImplicitStringLiteral)
-          break;
-      }
-
-      if (FormatTok->is(TT_ImplicitStringLiteral))
-        break;
-      WhitespaceLength += FormatTok->Tok.getLength();
-
-      readRawToken(*FormatTok);
-    }
-
-    // In case the token starts with escaped newlines, we want to
-    // take them into account as whitespace - this pattern is quite frequent
-    // in macro definitions.
-    // FIXME: Add a more explicit test.
-    while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
-           FormatTok->TokenText[1] == '\n') {
-      ++FormatTok->NewlinesBefore;
-      WhitespaceLength += 2;
-      FormatTok->LastNewlineOffset = 2;
-      Column = 0;
-      FormatTok->TokenText = FormatTok->TokenText.substr(2);
-    }
-
-    FormatTok->WhitespaceRange = SourceRange(
-        WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
-
-    FormatTok->OriginalColumn = Column;
-
-    TrailingWhitespace = 0;
-    if (FormatTok->Tok.is(tok::comment)) {
-      // FIXME: Add the trimmed whitespace to Column.
-      StringRef UntrimmedText = FormatTok->TokenText;
-      FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
-      TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
-    } else if (FormatTok->Tok.is(tok::raw_identifier)) {
-      IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
-      FormatTok->Tok.setIdentifierInfo(&Info);
-      FormatTok->Tok.setKind(Info.getTokenID());
-      if (Style.Language == FormatStyle::LK_Java &&
-          FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
-                             tok::kw_operator)) {
-        FormatTok->Tok.setKind(tok::identifier);
-        FormatTok->Tok.setIdentifierInfo(nullptr);
-      } else if (Style.Language == FormatStyle::LK_JavaScript &&
-                 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
-                                    tok::kw_operator)) {
-        FormatTok->Tok.setKind(tok::identifier);
-        FormatTok->Tok.setIdentifierInfo(nullptr);
-      }
-    } else if (FormatTok->Tok.is(tok::greatergreater)) {
-      FormatTok->Tok.setKind(tok::greater);
-      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-      GreaterStashed = true;
-    } else if (FormatTok->Tok.is(tok::lessless)) {
-      FormatTok->Tok.setKind(tok::less);
-      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
-      LessStashed = true;
-    }
-
-    // Now FormatTok is the next non-whitespace token.
-
-    StringRef Text = FormatTok->TokenText;
-    size_t FirstNewlinePos = Text.find('\n');
-    if (FirstNewlinePos == StringRef::npos) {
-      // FIXME: ColumnWidth actually depends on the start column, we need to
-      // take this into account when the token is moved.
-      FormatTok->ColumnWidth =
-          encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
-      Column += FormatTok->ColumnWidth;
-    } else {
-      FormatTok->IsMultiline = true;
-      // FIXME: ColumnWidth actually depends on the start column, we need to
-      // take this into account when the token is moved.
-      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
-          Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
-
-      // The last line of the token always starts in column 0.
-      // Thus, the length can be precomputed even in the presence of tabs.
-      FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
-          Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
-          Encoding);
-      Column = FormatTok->LastLineColumnWidth;
-    }
-
-    if (Style.Language == FormatStyle::LK_Cpp) {
-      if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
-            Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
-                tok::pp_define) &&
-          std::find(ForEachMacros.begin(), ForEachMacros.end(),
-                    FormatTok->Tok.getIdentifierInfo()) !=
-              ForEachMacros.end()) {
-        FormatTok->Type = TT_ForEachMacro;
-      } else if (FormatTok->is(tok::identifier)) {
-        if (MacroBlockBeginRegex.match(Text)) {
-          FormatTok->Type = TT_MacroBlockBegin;
-        } else if (MacroBlockEndRegex.match(Text)) {
-          FormatTok->Type = TT_MacroBlockEnd;
-        }
-      }
-    }
-
-    return FormatTok;
-  }
-
-  FormatToken *FormatTok;
-  bool IsFirstToken;
-  bool GreaterStashed, LessStashed;
-  unsigned Column;
-  unsigned TrailingWhitespace;
-  std::unique_ptr<Lexer> Lex;
-  const SourceManager &SourceMgr;
-  FileID ID;
-  const FormatStyle &Style;
-  IdentifierTable IdentTable;
-  AdditionalKeywords Keywords;
-  encoding::Encoding Encoding;
-  llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
-  // Index (in 'Tokens') of the last token that starts a new line.
-  unsigned FirstInLineIndex;
-  SmallVector<FormatToken *, 16> Tokens;
-  SmallVector<IdentifierInfo *, 8> ForEachMacros;
-
-  bool FormattingDisabled;
-
-  llvm::Regex MacroBlockBeginRegex;
-  llvm::Regex MacroBlockEndRegex;
-
-  void readRawToken(FormatToken &Tok) {
-    Lex->LexFromRawLexer(Tok.Tok);
-    Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
-                              Tok.Tok.getLength());
-    // For formatting, treat unterminated string literals like normal string
-    // literals.
-    if (Tok.is(tok::unknown)) {
-      if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
-        Tok.Tok.setKind(tok::string_literal);
-        Tok.IsUnterminatedLiteral = true;
-      } else if (Style.Language == FormatStyle::LK_JavaScript &&
-                 Tok.TokenText == "''") {
-        Tok.Tok.setKind(tok::string_literal);
-      }
-    }
-
-    if (Style.Language == FormatStyle::LK_JavaScript &&
-        Tok.is(tok::char_constant)) {
-      Tok.Tok.setKind(tok::string_literal);
-    }
-
-    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
-                                 Tok.TokenText == "/* clang-format on */")) {
-      FormattingDisabled = false;
-    }
-
-    Tok.Finalized = FormattingDisabled;
-
-    if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
-                                 Tok.TokenText == "/* clang-format off */")) {
-      FormattingDisabled = true;
-    }
-  }
-
-  void resetLexer(unsigned Offset) {
-    StringRef Buffer = SourceMgr.getBufferData(ID);
-    Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
-                        getFormattingLangOpts(Style), Buffer.begin(),
-                        Buffer.begin() + Offset, Buffer.end()));
-    Lex->SetKeepWhitespaceMode(true);
-    TrailingWhitespace = 0;
-  }
-};
-
-static StringRef getLanguageName(FormatStyle::LanguageKind Language) {
-  switch (Language) {
-  case FormatStyle::LK_Cpp:
-    return "C++";
-  case FormatStyle::LK_Java:
-    return "Java";
-  case FormatStyle::LK_JavaScript:
-    return "JavaScript";
-  case FormatStyle::LK_Proto:
-    return "Proto";
-  default:
-    return "Unknown";
-  }
-}
-
-class Environment {
-public:
-  Environment(SourceManager &SM, FileID ID, ArrayRef<CharSourceRange> Ranges)
-      : ID(ID), CharRanges(Ranges.begin(), Ranges.end()), SM(SM) {}
-
-  Environment(FileID ID, std::unique_ptr<FileManager> FileMgr,
-              std::unique_ptr<SourceManager> VirtualSM,
-              std::unique_ptr<DiagnosticsEngine> Diagnostics,
-              const std::vector<CharSourceRange> &CharRanges)
-      : ID(ID), CharRanges(CharRanges.begin(), CharRanges.end()),
-        SM(*VirtualSM), FileMgr(std::move(FileMgr)),
-        VirtualSM(std::move(VirtualSM)), Diagnostics(std::move(Diagnostics)) {}
-
-  // This sets up a virtual file system with file \p FileName containing \p
-  // Code.
-  static std::unique_ptr<Environment>
-  CreateVirtualEnvironment(StringRef Code, StringRef FileName,
-                           ArrayRef<tooling::Range> Ranges) {
-    // This is referenced by `FileMgr` and will be released by `FileMgr` when it
-    // is deleted.
-    IntrusiveRefCntPtr<vfs::InMemoryFileSystem> InMemoryFileSystem(
-        new vfs::InMemoryFileSystem);
-    // This is passed to `SM` as a reference, so the pointer has to be stored
-    // in `Environment` so that `FileMgr` can outlive this function's scope.
-    std::unique_ptr<FileManager> FileMgr(
-        new FileManager(FileSystemOptions(), InMemoryFileSystem));
-    // This is passed to `SM` as a reference, so the pointer has to be stored
-    // by `Environment` for the same reason as above.
-    std::unique_ptr<DiagnosticsEngine> Diagnostics(new DiagnosticsEngine(
-        IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs),
-        new DiagnosticOptions));
-    // This will be stored as a reference, so the pointer has to be stored in
-    // `Environment` for the same reason as above.
-    std::unique_ptr<SourceManager> VirtualSM(
-        new SourceManager(*Diagnostics, *FileMgr));
-    InMemoryFileSystem->addFile(
-        FileName, 0, llvm::MemoryBuffer::getMemBuffer(
-                         Code, FileName, /*RequiresNullTerminator=*/false));
-    FileID ID = VirtualSM->createFileID(
-        FileMgr->getFile(FileName), SourceLocation(), clang::SrcMgr::C_User);
-    assert(ID.isValid());
-    SourceLocation StartOfFile = VirtualSM->getLocForStartOfFile(ID);
-    std::vector<CharSourceRange> CharRanges;
-    for (const tooling::Range &Range : Ranges) {
-      SourceLocation Start = StartOfFile.getLocWithOffset(Range.getOffset());
-      SourceLocation End = Start.getLocWithOffset(Range.getLength());
-      CharRanges.push_back(CharSourceRange::getCharRange(Start, End));
-    }
-    return llvm::make_unique<Environment>(ID, std::move(FileMgr),
-                                          std::move(VirtualSM),
-                                          std::move(Diagnostics), CharRanges);
-  }
-
-  FileID getFileID() const { return ID; }
-
-  StringRef getFileName() const { return FileName; }
-
-  ArrayRef<CharSourceRange> getCharRanges() const { return CharRanges; }
-
-  const SourceManager &getSourceManager() const { return SM; }
-
-private:
-  FileID ID;
-  StringRef FileName;
-  SmallVector<CharSourceRange, 8> CharRanges;
-  SourceManager &SM;
-
-  // The order of these fields is important - they should be declared in the
-  // same order as they are created in `CreateVirtualEnvironment` so that they
-  // can be deleted in the reverse order of their creation.
-  std::unique_ptr<FileManager> FileMgr;
-  std::unique_ptr<SourceManager> VirtualSM;
-  std::unique_ptr<DiagnosticsEngine> Diagnostics;
-};
-
-class TokenAnalyzer : public UnwrappedLineConsumer {
-public:
-  TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
-      : Style(Style), Env(Env),
-        AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
-        UnwrappedLines(1),
-        Encoding(encoding::detectEncoding(
-            Env.getSourceManager().getBufferData(Env.getFileID()))) {
-    DEBUG(llvm::dbgs() << "File encoding: "
-                       << (Encoding == encoding::Encoding_UTF8 ? "UTF8"
-                                                               : "unknown")
-                       << "\n");
-    DEBUG(llvm::dbgs() << "Language: " << getLanguageName(Style.Language)
-                       << "\n");
-  }
-
-  tooling::Replacements process() {
-    tooling::Replacements Result;
-    FormatTokenLexer Tokens(Env.getSourceManager(), Env.getFileID(), Style,
-                            Encoding);
-
-    UnwrappedLineParser Parser(Style, Tokens.getKeywords(), Tokens.lex(),
-                               *this);
-    Parser.parse();
-    assert(UnwrappedLines.rbegin()->empty());
-    for (unsigned Run = 0, RunE = UnwrappedLines.size(); Run + 1 != RunE;
-         ++Run) {
-      DEBUG(llvm::dbgs() << "Run " << Run << "...\n");
-      SmallVector<AnnotatedLine *, 16> AnnotatedLines;
-
-      TokenAnnotator Annotator(Style, Tokens.getKeywords());
-      for (unsigned i = 0, e = UnwrappedLines[Run].size(); i != e; ++i) {
-        AnnotatedLines.push_back(new AnnotatedLine(UnwrappedLines[Run][i]));
-        Annotator.annotate(*AnnotatedLines.back());
-      }
-
-      tooling::Replacements RunResult =
-          analyze(Annotator, AnnotatedLines, Tokens, Result);
-
-      DEBUG({
-        llvm::dbgs() << "Replacements for run " << Run << ":\n";
-        for (tooling::Replacements::iterator I = RunResult.begin(),
-                                             E = RunResult.end();
-             I != E; ++I) {
-          llvm::dbgs() << I->toString() << "\n";
-        }
-      });
-      for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
-        delete AnnotatedLines[i];
-      }
-      Result.insert(RunResult.begin(), RunResult.end());
-    }
-    return Result;
-  }
-
-protected:
-  virtual tooling::Replacements
-  analyze(TokenAnnotator &Annotator,
-          SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,
-          FormatTokenLexer &Tokens, tooling::Replacements &Result) = 0;
-
-  void consumeUnwrappedLine(const UnwrappedLine &TheLine) override {
-    assert(!UnwrappedLines.empty());
-    UnwrappedLines.back().push_back(TheLine);
-  }
-
-  void finishRun() override {
-    UnwrappedLines.push_back(SmallVector<UnwrappedLine, 16>());
-  }
-
-  FormatStyle Style;
-  // Stores the FileID, SourceManager and character ranges to be processed.
-  const Environment &Env;
-  // AffectedRangeMgr stores ranges to be fixed.
-  AffectedRangeManager AffectedRangeMgr;
-  SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
-  encoding::Encoding Encoding;
-};
-
 class Formatter : public TokenAnalyzer {
 public:
   Formatter(const Environment &Env, const FormatStyle &Style,
@@ -2038,6 +1220,8 @@
   tooling::Replacements Replaces;
   if (!Style.SortIncludes)
     return Replaces;
+  if (Style.Language == FormatStyle::LanguageKind::LK_JavaScript)
+    return sortJavaScriptImports(Style, Code, Ranges, FileName);
 
   unsigned Prev = 0;
   unsigned SearchFrom = 0;
Index: lib/Format/CMakeLists.txt
===================================================================
--- lib/Format/CMakeLists.txt
+++ lib/Format/CMakeLists.txt
@@ -6,6 +6,7 @@
   ContinuationIndenter.cpp
   Format.cpp
   FormatToken.cpp
+  SortJavaScriptImports.cpp
   TokenAnnotator.cpp
   UnwrappedLineFormatter.cpp
   UnwrappedLineParser.cpp
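
With this change, existing callers of sortIncludes() transparently get
JavaScript import sorting whenever the style's language is LK_JavaScript. A
minimal sketch of such a call site follows (the helper name and the "input.js"
file name are illustrative only, not part of this patch):

  #include "clang/Format/Format.h"
  #include "clang/Tooling/Core/Replacement.h"
  #include <string>
  #include <vector>

  // Sorts the ES6 imports of a JavaScript buffer and returns the rewritten
  // code. Error handling is omitted for brevity.
  static std::string sortJsImports(llvm::StringRef Code) {
    clang::format::FormatStyle Style = clang::format::getGoogleStyle(
        clang::format::FormatStyle::LK_JavaScript);
    // Sort the whole buffer; callers can restrict this to changed ranges.
    std::vector<clang::tooling::Range> Ranges(
        1, clang::tooling::Range(0, Code.size()));
    // With LK_JavaScript, sortIncludes() now forwards to
    // sortJavaScriptImports(), as added in the Format.cpp hunk above.
    clang::tooling::Replacements Replaces =
        clang::format::sortIncludes(Style, Code, Ranges, "input.js");
    return clang::tooling::applyAllReplacements(Code, Replaces);
  }
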
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
