llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-static-analyzer-1 Author: None (NagyDonat) <details> <summary>Changes</summary> This commit improves alpha.security.ArrayBoundV2 in two connected areas: (1) It calls `markInteresting()` on the symbolic values that are responsible for the out of bounds access. (2) Its index-is-in-bounds assumptions are reported in note tags if they provide information about the value of an interesting symbol. This commit is limited to "display" changes: it introduces new diagnostic pieces (potentially to bugs found by other checkers), but ArrayBoundV2 will make the same assumptions and detect the same bugs before and after this change. As a minor unrelated change, this commit also updates/removes some very old comments which became obsolate due to my previous changes. --- Patch is 26.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78315.diff 3 Files Affected: - (modified) clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (+275-70) - (modified) clang/test/Analysis/out-of-bounds-diagnostics.c (+10) - (added) clang/test/Analysis/out-of-bounds-notes.c (+128) ``````````diff diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp index 6c7a1601402efa..4241fa3a2ce8af 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp @@ -33,7 +33,66 @@ using namespace taint; using llvm::formatv; namespace { -enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint }; +class StateUpdateReporter { + const SubRegion *Reg; + NonLoc ByteOffsetVal; + std::optional<QualType> ElementType = std::nullopt; + std::optional<int64_t> ElementSize = std::nullopt; + bool AssumedNonNegative = false; + std::optional<NonLoc> AssumedUpperBound = std::nullopt; + +public: + StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E, + CheckerContext &C) + : Reg(R), ByteOffsetVal(ByteOffsVal) { + initializeElementInfo(E, C); + } + + void initializeElementInfo(const Expr *E, CheckerContext &C) { + if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) { + const MemRegion *SubscriptBaseReg = + C.getSVal(ASE->getBase()).getAsRegion(); + if (!SubscriptBaseReg) + return; + SubscriptBaseReg = SubscriptBaseReg->StripCasts(); + if (!isa_and_nonnull<ElementRegion>(SubscriptBaseReg)) { + ElementType = ASE->getType(); + ElementSize = + C.getASTContext().getTypeSizeInChars(*ElementType).getQuantity(); + } + } + } + void recordNonNegativeAssumption() { AssumedNonNegative = true; } + void recordUpperBoundAssumption(NonLoc UpperBoundVal) { + AssumedUpperBound = UpperBoundVal; + } + + const NoteTag *createNoteTag(CheckerContext &C) const; + +private: + std::string getMessage(PathSensitiveBugReport &BR) const; + + /// Return true if information about the value of `Sym` can put constraints + /// on some symbol which is interesting within the bug report `BR`. + /// In particular, this returns true when `Sym` is interesting within `BR`; + /// but it also returns true if `Sym` is an expression that contains integer + /// constants and a single symbolic operand which is interesting (in `BR`). + /// We need to use this instead of plain `BR.isInteresting()` because if we + /// are analyzing code like + /// int array[10]; + /// int f(int arg) { + /// return array[arg] && array[arg + 10]; + /// } + /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not + /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough + /// to detect this out of bounds access). + static bool providesInformationAboutInteresting(SymbolRef Sym, + PathSensitiveBugReport &BR); + static bool providesInformationAboutInteresting(SVal SV, + PathSensitiveBugReport &BR) { + return providesInformationAboutInteresting(SV.getAsSymbol(), BR); + } +}; struct Messages { std::string Short, Full; @@ -54,11 +113,14 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>, void performCheck(const Expr *E, CheckerContext &C) const; - void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind, - NonLoc Offset, Messages Msgs) const; + void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs, + NonLoc Offset, bool IsTaintBug = false) const; static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); + static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, + NonLoc Offset, NonLoc Limit, + CheckerContext &C); static bool isInAddressOf(const Stmt *S, ASTContext &AC); public: @@ -133,12 +195,19 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { return std::nullopt; } -// TODO: once the constraint manager is smart enough to handle non simplified -// symbolic expressions remove this function. Note that this can not be used in -// the constraint manager as is, since this does not handle overflows. It is -// safe to assume, however, that memory offsets will not overflow. -// NOTE: callers of this function need to be aware of the effects of overflows -// and signed<->unsigned conversions! +// NOTE: This function is the "heart" of this checker. It simplifies +// inequalities with transformations that are valid (and very elementary) in +// pure mathematics, but become invalid if we use them in C++ number model +// where the calculations may overflow. +// Due to the overflow issues I think it's impossible (or at least not +// practical) to integrate this kind of simplification into the resolution of +// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function +// produces valid results if the arguments are memory offsets which are known +// to be << SIZE_MAX. +// TODO: This algorithm should be moved to a central location where it's +// available for other checkers that need to compare memory offsets. +// NOTE: When using the results of this function, don't forget that `evalBinOp` +// uses the evaluation rules of C++, so e.g. `(size_t)123 < -1`! static std::pair<NonLoc, nonloc::ConcreteInt> getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder) { @@ -239,13 +308,8 @@ static std::optional<int64_t> getConcreteValue(NonLoc SV) { return std::nullopt; } -static std::string getShortMsg(OOB_Kind Kind, std::string RegName) { - static const char *ShortMsgTemplates[] = { - "Out of bound access to memory preceding {0}", - "Out of bound access to memory after the end of {0}", - "Potential out of bound access to {0} with tainted offset"}; - - return formatv(ShortMsgTemplates[Kind], RegName); +static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { + return SV ? getConcreteValue(*SV) : std::nullopt; } static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { @@ -255,7 +319,28 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { Out << "Access of " << RegName << " at negative byte offset"; if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>()) Out << ' ' << ConcreteIdx->getValue(); - return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)}; + return {formatv("Out of bound access to memory preceding {0}", RegName), + std::string(Buf)}; +} + +/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if +/// it can be performed (`Divisor` is nonzero and there is no remainder). The +/// values `Val1` and `Val2` may be nullopt and in that case the corresponding +/// division is considered to be successful. +bool tryDividePair(std::optional<int64_t> &Val1, std::optional<int64_t> &Val2, + int64_t Divisor) { + if (!Divisor) + return false; + const bool Val1HasRemainder = Val1 && *Val1 % Divisor; + const bool Val2HasRemainder = Val2 && *Val2 % Divisor; + if (!Val1HasRemainder && !Val2HasRemainder) { + if (Val1) + *Val1 /= Divisor; + if (Val2) + *Val2 /= Divisor; + return true; + } + return false; } static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, @@ -268,18 +353,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, std::optional<int64_t> OffsetN = getConcreteValue(Offset); std::optional<int64_t> ExtentN = getConcreteValue(Extent); - bool UseByteOffsets = true; - if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) { - const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize; - const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize; - if (!OffsetHasRemainder && !ExtentHasRemainder) { - UseByteOffsets = false; - if (OffsetN) - *OffsetN /= ElemSize; - if (ExtentN) - *ExtentN /= ElemSize; - } - } + int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity(); + + bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize); SmallString<256> Buf; llvm::raw_svector_ostream Out(Buf); @@ -307,7 +383,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, Out << "s"; } - return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)}; + return { + formatv("Out of bound access to memory after the end of {0}", RegName), + std::string(Buf)}; } static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) { @@ -318,17 +396,87 @@ static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) { RegName, OffsetName)}; } -void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { - // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping - // some new logic here that reasons directly about memory region extents. - // Once that logic is more mature, we can bring it back to assumeInBound() - // for all clients to use. - // - // The algorithm we are using here for bounds checking is to see if the - // memory access is within the extent of the base region. Since we - // have some flexibility in defining the base region, we can achieve - // various levels of conservatism in our buffer overflow checking. +const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const { + // Don't create a note tag if we didn't assume anything: + if (!AssumedNonNegative && !AssumedUpperBound) + return nullptr; + + return C.getNoteTag( + [*this](PathSensitiveBugReport &BR) -> std::string { + return getMessage(BR); + }, + /*isPrunable=*/false); +} + +std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const { + bool ShouldReportNonNegative = AssumedNonNegative; + if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) { + if (AssumedUpperBound && + providesInformationAboutInteresting(*AssumedUpperBound, BR)) + ShouldReportNonNegative = false; + else + return ""; + } + + std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal); + std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound); + const bool UseIndex = + ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize); + + SmallString<256> Buf; + llvm::raw_svector_ostream Out(Buf); + Out << "Assuming "; + if (UseIndex) { + Out << "index "; + if (OffsetN) + Out << "'" << OffsetN << "' "; + } else if (AssumedUpperBound) { + Out << "byte offset "; + if (OffsetN) + Out << "'" << OffsetN << "' "; + } else { + Out << "offset "; + } + + Out << "is"; + if (ShouldReportNonNegative) { + Out << " non-negative"; + } + if (AssumedUpperBound) { + if (ShouldReportNonNegative) + Out << " and"; + Out << " less than "; + if (ExtentN) + Out << *ExtentN << ", "; + if (UseIndex && ElementType) + Out << "the number of '" << ElementType->getAsString() + << "' elements in "; + else + Out << "the extent of "; + Out << getRegionName(Reg); + } + return std::string(Out.str()); +} + +bool StateUpdateReporter::providesInformationAboutInteresting( + SymbolRef Sym, PathSensitiveBugReport &BR) { + if (!Sym) + return false; + for (SymbolRef PartSym : Sym->symbols()) { + // The interestingess mark may appear on any layer as we're stripping off + // the SymIntExpr, UnarySymExpr etc. layers... + if (BR.isInteresting(PartSym)) + return true; + // ...but if both sides of the expression are symbolic (i.e. unknown), then + // the analyzer can't use the combined result to constrain the operands. + if (isa<SymSymExpr>(PartSym)) + return false; + } + return false; +} + +void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { const SVal Location = C.getSVal(E); // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as @@ -350,6 +498,10 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { auto [Reg, ByteOffset] = *RawOffset; + // The state updates will be reported as a single note tag, which will be + // composed by this helper class. + StateUpdateReporter SUR(Reg, ByteOffset, E, C); + // CHECK LOWER BOUND const MemSpaceRegion *Space = Reg->getMemorySpace(); if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) { @@ -363,13 +515,22 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); - if (PrecedesLowerBound && !WithinLowerBound) { - // We know that the index definitely precedes the lower bound. - Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); - reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs); - return; + if (PrecedesLowerBound) { + // The offset may be invalid (negative)... + if (!WithinLowerBound) { + // ...and it cannot be valid (>= 0), so report an error. + Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); + reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset); + return; + } + // ...but it can be valid as well, so the checker will (optimistically) + // assume that it's valid and mention this in the note tag. + SUR.recordNonNegativeAssumption(); } + // Actually update the state. The "if" only fails in the extremely unlikely + // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // evalBinOpNN fails to evaluate the less-than operator. if (WithinLowerBound) State = WithinLowerBound; } @@ -381,32 +542,30 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); if (ExceedsUpperBound) { + // The offset may be invalid (>= Size)... if (!WithinUpperBound) { - // We know that the index definitely exceeds the upper bound. - if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) { - // ...but this is within an addressof expression, so we need to check - // for the exceptional case that `&array[size]` is valid. - auto [EqualsToThreshold, NotEqualToThreshold] = - compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize, - SVB, /*CheckEquality=*/true); - if (EqualsToThreshold && !NotEqualToThreshold) { - // We are definitely in the exceptional case, so return early - // instead of reporting a bug. - C.addTransition(EqualsToThreshold); - return; - } + // ...and it cannot be within bounds, so report an error, unless we can + // definitely determine that this is an idiomatic `&array[size]` + // expression that calculates the past-the-end pointer. + if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset, + *KnownSize, C)) { + // FIXME: this duplicates the `addTransition` at the end of the + // function, but `goto` is taboo nowdays. + C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C)); + return; } + Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize, Location); - reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs); + reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset); return; } + // ...and it can be valid as well... if (isTainted(State, ByteOffset)) { - // Both cases are possible, but the offset is tainted, so report. - std::string RegName = getRegionName(Reg); + // ...but it's tainted, so report an error. - // Diagnostic detail: "tainted offset" is always correct, but the - // common case is that 'idx' is tainted in 'arr[idx]' and then it's + // Diagnostic detail: saying "tainted offset" is always correct, but + // the common case is that 'idx' is tainted in 'arr[idx]' and then it's // nicer to say "tainted index". const char *OffsetName = "offset"; if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) @@ -414,33 +573,67 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { OffsetName = "index"; Messages Msgs = getTaintMsgs(Reg, OffsetName); - reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs); + reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, /*IsTaintBug=*/true); return; } + // ...and it isn't tainted, so the checker will (optimistically) assume + // that the offset is in bounds and mention this in the note tag. + SUR.recordUpperBoundAssumption(*KnownSize); } + // Actually update the state. The "if" only fails in the extremely unlikely + // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // evalBinOpNN fails to evaluate the less-than operator. if (WithinUpperBound) State = WithinUpperBound; } - C.addTransition(State); + // Add a transition, reporting the state updates that we accumulated. + C.addTransition(State, SUR.createNoteTag(C)); } void ArrayBoundCheckerV2::reportOOB(CheckerContext &C, - ProgramStateRef ErrorState, OOB_Kind Kind, - NonLoc Offset, Messages Msgs) const { + ProgramStateRef ErrorState, Messages Msgs, + NonLoc Offset, bool IsTaintBug) const { ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState); if (!ErrorNode) return; auto BR = std::make_unique<PathSensitiveBugReport>( - Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); + IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); + + // FIXME: ideally we would just call trackExpressionValue() and that would + // "do the right thing": mark the relevant symbols as interesting, track the + // control dependencies and statements storing the relevant values and add + // helpful diagnostic pieces. However, right now trackExpressionValue() is + // a heap of unreliable heuristics, so it would cause several issues: + // - Interestingness is not applied consistently, e.g. if `array[x+10]` + // causes an overflow, then `x` is not marked as interesting. + // - We get irrelevant diagnostic pieces, e.g. in the code + // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;` + // it places a "Storing uninitialized value" note on the `malloc` call + // (which is technically true, but irrelevant). + // If trackExpressionValue() becomes reliable, it should be applied instead + // of the manual markInteresting() calls. + + if (SymbolRef OffsetSym = Offset.getAsSymbol()) { + // If the offset is a symbolic value, iterate over its "parts" with + // `SymExpr::symbols()` and mark each of them as interesting. + // For example, if the offset is `x*4 + y` then we put interestingness onto + // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols + // `x` and `y`. + for (SymbolRef PartSym : OffsetSym->symbols()) + BR->markInteresting(PartSym); + } - // Track back the propagation of taintedness. - if (Kind == OOB_Taint) + if (IsTaintBug) { + // If the issue that we're reporting depends on the taintedness of the + // offset, then put interestingness onto symbols that could be the origin + // of the taint. for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset)) BR->markInteresting(Sym); + } C.emitReport(std::move(BR)); } @@ -476,6 +669,18 @@ bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) { return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; } +bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E, + ProgramStateRef State, + NonLoc Offset, NonLoc Limit, + CheckerContext &C) { + if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.ge... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/78315 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits