https://github.com/HighCommander4 created https://github.com/llvm/llvm-project/pull/78925
guessLanguage() uses UnwrappedLineParser to process different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large and the operation can hang for a long time and eventually OOM. (This has been observed particularly for single-header libraries such as miniaudio.h). This patch implements a limit on how many variants guessLanguage() analyzes, to avoid such a performance cliff. The limit is expressed as a maximum number of lines (summed over preprocessor variants) to process. This allows shorter files to have more variants processed than longer files. Fixes https://github.com/clangd/clangd/issues/719 Fixes https://github.com/clangd/clangd/issues/1384 Fixes https://github.com/llvm/llvm-project/issues/70945 This patch does not fix the broader problem of actually trying to format such large headers, which involves using UnwrappedLineParser from call sites other than guessLanguage(), though the approach in the patch could be extended to other call sites as well. >From 05c751db607874eddffc836d6d70da7418ee0589 Mon Sep 17 00:00:00 2001 From: Nathan Ridge <zeratul...@hotmail.com> Date: Sun, 21 Jan 2024 02:45:16 -0500 Subject: [PATCH] [clang-format] Limit how much work guessLanguage() can do guessLanguage() uses UnwrappedLineParser to process different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large and the operation can hang for a long time and eventually OOM. (This has been observed particularly for single-header libraries such as miniaudio.h). This patch implements a limit on how many variants guessLanguage() analyzes, to avoid such a performance cliff. The limit is expressed as a maximum number of lines (summed over preprocessor variants) to process. This allows shorter files to have more variants processed than longer files. Fixes https://github.com/clangd/clangd/issues/719 Fixes https://github.com/clangd/clangd/issues/1384 Fixes https://github.com/llvm/llvm-project/issues/70945 This patch does not fix the broader problem of actually trying to format such large headers, which involves using UnwrappedLineParser from call sites other than guessLanguage(), though the approach in the patch could be extended to other call sites as well. --- clang/lib/Format/Format.cpp | 8 +++++++- clang/lib/Format/TokenAnalyzer.cpp | 8 +++++--- clang/lib/Format/TokenAnalyzer.h | 4 +++- clang/lib/Format/UnwrappedLineParser.cpp | 16 ++++++++++++++-- clang/lib/Format/UnwrappedLineParser.h | 8 +++++++- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index f798d555bf9929c..fbafce38d52b775 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer { class ObjCHeaderStyleGuesser : public TokenAnalyzer { public: ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style) - : TokenAnalyzer(Env, Style), IsObjC(false) {} + : TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {} std::pair<tooling::Replacements, unsigned> analyze(TokenAnnotator &Annotator, @@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer { bool isObjC() { return IsObjC; } private: + // Limit the number of variants of the file TokenAnalyzer processes for + // the purpose of guessing the language. An inaccurate guess is better than + // hanging for a long time or OOMing, which has been observed with real + // libraries which are single-header with many preprocessor branches. + static const unsigned MaxLinesToProcess = (1 << 20); + static bool guessIsObjC(const SourceManager &SourceManager, const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines, diff --git a/clang/lib/Format/TokenAnalyzer.cpp b/clang/lib/Format/TokenAnalyzer.cpp index bd648c430f9b0a4..4f1195d5f82b554 100644 --- a/clang/lib/Format/TokenAnalyzer.cpp +++ b/clang/lib/Format/TokenAnalyzer.cpp @@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName, ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn), NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {} -TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style) +TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style, + unsigned MaxLinesToProcess) : Style(Style), Env(Env), AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()), UnwrappedLines(1), Encoding(encoding::detectEncoding( - Env.getSourceManager().getBufferData(Env.getFileID()))) { + Env.getSourceManager().getBufferData(Env.getFileID()))), + MaxLinesToProcess(MaxLinesToProcess) { LLVM_DEBUG( llvm::dbgs() << "File encoding: " << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown") @@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) { SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end()); UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(), Env.getFirstStartColumn(), Tokens, *this, - Allocator, IdentTable); + Allocator, IdentTable, MaxLinesToProcess); Parser.parse(); assert(UnwrappedLines.back().empty()); unsigned Penalty = 0; diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h index 4086dab1c94c3ad..65770d4be5ca280 100644 --- a/clang/lib/Format/TokenAnalyzer.h +++ b/clang/lib/Format/TokenAnalyzer.h @@ -87,7 +87,8 @@ class Environment { class TokenAnalyzer : public UnwrappedLineConsumer { public: - TokenAnalyzer(const Environment &Env, const FormatStyle &Style); + TokenAnalyzer(const Environment &Env, const FormatStyle &Style, + unsigned MaxLinesToProcess = 0); std::pair<tooling::Replacements, unsigned> process(bool SkipAnnotation = false); @@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer { AffectedRangeManager AffectedRangeMgr; SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines; encoding::Encoding Encoding; + unsigned MaxLinesToProcess; }; } // end namespace format diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 684609747a5513f..d925e19f8bf3bcc 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser( const AdditionalKeywords &Keywords, unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, - IdentifierTable &IdentTable) + IdentifierTable &IdentTable, unsigned MaxLinesToProcess) : Line(new UnwrappedLine), MustBreakBeforeNextToken(false), CurrentLines(&Lines), Style(Style), Keywords(Keywords), CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr), @@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser( ? IG_Rejected : IG_Inited), IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn), - Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {} + Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable), + MaxLinesToProcess(MaxLinesToProcess) {} void UnwrappedLineParser::reset() { PPBranchLevel = -1; @@ -194,6 +195,8 @@ void UnwrappedLineParser::reset() { void UnwrappedLineParser::parse() { IndexedTokenSource TokenSource(AllTokens); Line->FirstStartColumn = FirstStartColumn; + size_t TotalLinesProcessed = 0; + size_t LastReport = 0; do { LLVM_DEBUG(llvm::dbgs() << "----\n"); reset(); @@ -235,6 +238,7 @@ void UnwrappedLineParser::parse() { Callback.consumeUnwrappedLine(Line); } Callback.finishRun(); + TotalLinesProcessed += ExpandedLines.size(); } LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n"); @@ -243,6 +247,14 @@ void UnwrappedLineParser::parse() { Callback.consumeUnwrappedLine(Line); } Callback.finishRun(); + TotalLinesProcessed += Lines.size(); + if ((TotalLinesProcessed / 10000) > LastReport) { + llvm::errs() << "Processed " << TotalLinesProcessed << " lines\n"; + LastReport = (TotalLinesProcessed / 10000); + } + if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) { + break; + } Lines.clear(); while (!PPLevelBranchIndex.empty() && PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) { diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 739298690bbd765..f54ad6bbd79a209 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -111,7 +111,8 @@ class UnwrappedLineParser { unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, - IdentifierTable &IdentTable); + IdentifierTable &IdentTable, + unsigned MaxLinesToProcess = 0); void parse(); @@ -406,6 +407,11 @@ class UnwrappedLineParser { MacroExpander Macros; + // If set to a nonzero value, stop generating new runs after this + // many lines (summed over all the runs generated so far) have been + // processed. + unsigned MaxLinesToProcess; + friend class ScopedLineState; friend class CompoundStatementIndenter; }; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits