https://github.com/HighCommander4 updated https://github.com/llvm/llvm-project/pull/78925
>From 47ebe97c199ade54a9538eaed02171d78e6a5a46 Mon Sep 17 00:00:00 2001 From: Nathan Ridge <zeratul...@hotmail.com> Date: Sun, 21 Jan 2024 02:45:16 -0500 Subject: [PATCH] [clang-format] Limit how much work guessLanguage() can do guessLanguage() uses UnwrappedLineParser to process different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large and the operation can hang for a long time and eventually OOM. (This has been observed particularly for single-header libraries such as miniaudio.h). This patch implements a limit on how many variants guessLanguage() analyzes, to avoid such a performance cliff. The limit is expressed as a maximum number of lines (summed over preprocessor variants) to process. This allows shorter files to have more variants processed than longer files. Fixes https://github.com/clangd/clangd/issues/719 Fixes https://github.com/clangd/clangd/issues/1384 Fixes https://github.com/llvm/llvm-project/issues/70945 This patch does not fix the broader problem of actually trying to format such large headers, which involves using UnwrappedLineParser from call sites other than guessLanguage(), though the approach in the patch could be extended to other call sites as well. --- clang/lib/Format/Format.cpp | 8 +++++++- clang/lib/Format/TokenAnalyzer.cpp | 8 +++++--- clang/lib/Format/TokenAnalyzer.h | 4 +++- clang/lib/Format/UnwrappedLineParser.cpp | 11 +++++++++-- clang/lib/Format/UnwrappedLineParser.h | 8 +++++++- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index f798d555bf9929..fbafce38d52b77 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer { class ObjCHeaderStyleGuesser : public TokenAnalyzer { public: ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style) - : TokenAnalyzer(Env, Style), IsObjC(false) {} + : TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {} std::pair<tooling::Replacements, unsigned> analyze(TokenAnnotator &Annotator, @@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer { bool isObjC() { return IsObjC; } private: + // Limit the number of variants of the file TokenAnalyzer processes for + // the purpose of guessing the language. An inaccurate guess is better than + // hanging for a long time or OOMing, which has been observed with real + // libraries which are single-header with many preprocessor branches. + static const unsigned MaxLinesToProcess = (1 << 20); + static bool guessIsObjC(const SourceManager &SourceManager, const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines, diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index bd648c430f9b0a..4f1195d5f82b55 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName, ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn), NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {} -TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) +TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style, + unsigned MaxLinesToProcess) : Style(Style), Env(Env), AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()), UnwrappedLines(1), Encoding(encoding::detectEncoding( - Env.getSourceManager().getBufferData(Env.getFileID()))) { + Env.getSourceManager().getBufferData(Env.getFileID()))), + MaxLinesToProcess(MaxLinesToProcess) { LLVM_DEBUG( llvm::dbgs() << "File encoding: " << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown") @@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) { SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end()); UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(), Env.getFirstStartColumn(), Tokens, *this, - Allocator, IdentTable); + Allocator, IdentTable, MaxLinesToProcess); Parser.parse(); assert(UnwrappedLines.back().empty()); unsigned Penalty = 0; diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 4086dab1c94c3a..65770d4be5ca28 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -87,7 +87,8 @@ class Environment { class TokenAnalyzer : public UnwrappedLineConsumer { public: - TokenAnalyzer(const Environment &Env, const FormatStyle &Style); + TokenAnalyzer(const Environment &Env, const FormatStyle &Style, + unsigned MaxLinesToProcess = 0); std::pair<tooling::Replacements, unsigned> process(bool SkipAnnotation = false); @@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer { AffectedRangeManager AffectedRangeMgr; SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines; encoding::Encoding Encoding; + unsigned MaxLinesToProcess; }; } // end namespace format diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 684609747a5513..3aa92cdae8de44 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser( const AdditionalKeywords &Keywords, unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, - IdentifierTable &IdentTable) + IdentifierTable &IdentTable, unsigned MaxLinesToProcess) : Line(new UnwrappedLine), MustBreakBeforeNextToken(false), CurrentLines(&Lines), Style(Style), Keywords(Keywords), CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr), @@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser( ? IG_Rejected : IG_Inited), IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn), - Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {} + Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable), + MaxLinesToProcess(MaxLinesToProcess) {} void UnwrappedLineParser::reset() { PPBranchLevel = -1; @@ -194,6 +195,7 @@ void UnwrappedLineParser::reset() { void UnwrappedLineParser::parse() { IndexedTokenSource TokenSource(AllTokens); Line->FirstStartColumn = FirstStartColumn; + size_t TotalLinesProcessed = 0; do { LLVM_DEBUG(llvm::dbgs() << "----\n"); reset(); @@ -235,6 +237,7 @@ void UnwrappedLineParser::parse() { Callback.consumeUnwrappedLine(Line); } Callback.finishRun(); + TotalLinesProcessed += ExpandedLines.size(); } LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n"); @@ -243,6 +246,10 @@ void UnwrappedLineParser::parse() { Callback.consumeUnwrappedLine(Line); } Callback.finishRun(); + TotalLinesProcessed += Lines.size(); + if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) { + break; + } Lines.clear(); while (!PPLevelBranchIndex.empty() && PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) { diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 739298690bbd76..f54ad6bbd79a20 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -111,7 +111,8 @@ class UnwrappedLineParser { unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, - IdentifierTable &IdentTable); + IdentifierTable &IdentTable, + unsigned MaxLinesToProcess = 0); void parse(); @@ -406,6 +407,11 @@ class UnwrappedLineParser { MacroExpander Macros; + // If set to a nonzero value, stop generating new runs after this + // many lines (summed over all the runs generated so far) have been + // processed. + unsigned MaxLinesToProcess; + friend class ScopedLineState; friend class CompoundStatementIndenter; }; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits