Author: Valeriy Savchenko Date: 2020-08-06T12:53:20+03:00 New Revision: 6ddef92474583ef3c183da9bdc8c8e81ec578fd8
URL: https://github.com/llvm/llvm-project/commit/6ddef92474583ef3c183da9bdc8c8e81ec578fd8 DIFF: https://github.com/llvm/llvm-project/commit/6ddef92474583ef3c183da9bdc8c8e81ec578fd8.diff LOG: [analyzer][tests] Understand when diagnostics change between builds Before the patch `SATest compare`, produced quite obscure results when something about the diagnostic have changed (i.e. its description or the name of the corresponding checker) because it was simply two lists of warnings, ADDED and REMOVED. It was up to the developer to match those warnings, understand that they are essentially the same, and figure out what caused the difference. This patch introduces another category of results: MODIFIED. It tries to match new warnings against the old ones and prints out clues on what is different between two builds. Differential Revision: https://reviews.llvm.org/D85311 Added: Modified: clang/utils/analyzer/CmpRuns.py Removed: ################################################################################ diff --git a/clang/utils/analyzer/CmpRuns.py b/clang/utils/analyzer/CmpRuns.py index f7f28d9dc72e..9d5e00767067 100644 --- a/clang/utils/analyzer/CmpRuns.py +++ b/clang/utils/analyzer/CmpRuns.py @@ -35,14 +35,16 @@ from collections import defaultdict from copy import copy from enum import Enum -from typing import (Any, cast, Dict, List, NamedTuple, Optional, Sequence, - TextIO, TypeVar, Tuple, Union) +from typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional, + Sequence, TextIO, TypeVar, Tuple, Union) Number = Union[int, float] Stats = Dict[str, Dict[str, Number]] Plist = Dict[str, Any] JSON = Dict[str, Any] +# Diff in a form: field -> (before, after) +JSONDiff = Dict[str, Tuple[str, str]] # Type for generics T = TypeVar('T') @@ -136,6 +138,9 @@ def get_category(self) -> str: def get_description(self) -> str: return self._data['description'] + def get_location(self) -> str: + return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}" + def get_issue_identifier(self) -> str: id = self.get_file_name() + "+" @@ -172,11 +177,32 @@ def get_readable_name(self) -> str: return f"{file_prefix}{funcname_postfix}:{line}:{col}" \ f", {self.get_category()}: {self.get_description()}" + KEY_FIELDS = ["check_name", "category", "description"] + + def is_similar_to(self, other: "AnalysisDiagnostic") -> bool: + # We consider two diagnostics similar only if at least one + # of the key fields is the same in both diagnostics. + return len(self.get_ diff s(other)) != len(self.KEY_FIELDS) + + def get_ diff s(self, other: "AnalysisDiagnostic") -> JSONDiff: + return {field: (self._data[field], other._data[field]) + for field in self.KEY_FIELDS + if self._data[field] != other._data[field]} + # Note, the data format is not an API and may change from one analyzer # version to another. def get_raw_data(self) -> Plist: return self._data + def __eq__(self, other: object) -> bool: + return hash(self) == hash(other) + + def __ne__(self, other: object) -> bool: + return hash(self) != hash(other) + + def __hash__(self) -> int: + return hash(self.get_issue_identifier()) + class AnalysisRun: def __init__(self, info: SingleRunInfo): @@ -283,12 +309,39 @@ def cmp_analysis_diagnostic(d): return d.get_issue_identifier() -PresentInBoth = Tuple[AnalysisDiagnostic, AnalysisDiagnostic] -PresentOnlyInOld = Tuple[AnalysisDiagnostic, None] -PresentOnlyInNew = Tuple[None, AnalysisDiagnostic] -ComparisonResult = List[Union[PresentInBoth, - PresentOnlyInOld, - PresentOnlyInNew]] +AnalysisDiagnosticPair = Tuple[AnalysisDiagnostic, AnalysisDiagnostic] + + +class ComparisonResult: + def __init__(self): + self.present_in_both: List[AnalysisDiagnostic] = [] + self.present_only_in_old: List[AnalysisDiagnostic] = [] + self.present_only_in_new: List[AnalysisDiagnostic] = [] + self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = [] + + def add_common(self, issue: AnalysisDiagnostic): + self.present_in_both.append(issue) + + def add_removed(self, issue: AnalysisDiagnostic): + self.present_only_in_old.append(issue) + + def add_added(self, issue: AnalysisDiagnostic): + self.present_only_in_new.append(issue) + + def add_changed(self, old_issue: AnalysisDiagnostic, + new_issue: AnalysisDiagnostic): + self.changed_between_new_and_old.append((old_issue, new_issue)) + + +GroupedDiagnostics = DefaultDict[str, List[AnalysisDiagnostic]] + + +def get_grouped_diagnostics(diagnostics: List[AnalysisDiagnostic] + ) -> GroupedDiagnostics: + result: GroupedDiagnostics = defaultdict(list) + for diagnostic in diagnostics: + result[diagnostic.get_location()].append(diagnostic) + return result def compare_results(results_old: AnalysisRun, results_new: AnalysisRun, @@ -302,52 +355,79 @@ def compare_results(results_old: AnalysisRun, results_new: AnalysisRun, each element {a,b} is None or a matching element from the respective run """ - res: ComparisonResult = [] + res = ComparisonResult() # Map size_before -> size_after path_ diff erence_data: List[float] = [] - # Quickly eliminate equal elements. - neq_old: List[AnalysisDiagnostic] = [] - neq_new: List[AnalysisDiagnostic] = [] - - diags_old = copy(results_old.diagnostics) - diags_new = copy(results_new.diagnostics) - - diags_old.sort(key=cmp_analysis_diagnostic) - diags_new.sort(key=cmp_analysis_diagnostic) - - while diags_old and diags_new: - a = diags_old.pop() - b = diags_new.pop() - - if a.get_issue_identifier() == b.get_issue_identifier(): - if a.get_path_length() != b.get_path_length(): - - if histogram == HistogramType.RELATIVE: - path_ diff erence_data.append( - float(a.get_path_length()) / b.get_path_length()) - - elif histogram == HistogramType.LOG_RELATIVE: - path_ diff erence_data.append( - log(float(a.get_path_length()) / b.get_path_length())) - - elif histogram == HistogramType.ABSOLUTE: - path_ diff erence_data.append( - a.get_path_length() - b.get_path_length()) - - res.append((a, b)) - - elif a.get_issue_identifier() > b.get_issue_identifier(): - diags_new.append(b) - neq_old.append(a) - - else: - diags_old.append(a) - neq_new.append(b) - - neq_old.extend(diags_old) - neq_new.extend(diags_new) + diags_old = get_grouped_diagnostics(results_old.diagnostics) + diags_new = get_grouped_diagnostics(results_new.diagnostics) + + locations_old = set(diags_old.keys()) + locations_new = set(diags_new.keys()) + + common_locations = locations_old & locations_new + + for location in common_locations: + old = diags_old[location] + new = diags_new[location] + + # Quadratic algorithms in this part are fine because 'old' and 'new' + # are most commonly of size 1. + for a in copy(old): + for b in copy(new): + if a.get_issue_identifier() == b.get_issue_identifier(): + a_path_len = a.get_path_length() + b_path_len = b.get_path_length() + + if a_path_len != b_path_len: + + if histogram == HistogramType.RELATIVE: + path_ diff erence_data.append( + float(a_path_len) / b_path_len) + + elif histogram == HistogramType.LOG_RELATIVE: + path_ diff erence_data.append( + log(float(a_path_len) / b_path_len)) + + elif histogram == HistogramType.ABSOLUTE: + path_ diff erence_data.append( + a_path_len - b_path_len) + + res.add_common(a) + old.remove(a) + new.remove(b) + + for a in copy(old): + for b in copy(new): + if a.is_similar_to(b): + res.add_changed(a, b) + old.remove(a) + new.remove(b) + + # Whatever is left in 'old' doesn't have a corresponding diagnostic + # in 'new', so we need to mark it as 'removed'. + for a in old: + res.add_removed(a) + + # Whatever is left in 'new' doesn't have a corresponding diagnostic + # in 'old', so we need to mark it as 'added'. + for b in new: + res.add_added(b) + + only_old_locations = locations_old - common_locations + for location in only_old_locations: + for a in diags_old[location]: + # These locations have been found only in the old build, so we + # need to mark all of therm as 'removed' + res.add_removed(a) + + only_new_locations = locations_new - common_locations + for location in only_new_locations: + for b in diags_new[location]: + # These locations have been found only in the new build, so we + # need to mark all of therm as 'added' + res.add_added(b) # FIXME: Add fuzzy matching. One simple and possible effective idea would # be to bin the diagnostics, print them in a normalized form (based solely @@ -355,11 +435,6 @@ def compare_results(results_old: AnalysisRun, results_new: AnalysisRun, # the basis for matching. This has the nice property that we don't depend # in any way on the diagnostic format. - for a in neq_old: - res.append((a, None)) - for b in neq_new: - res.append((None, b)) - if histogram: from matplotlib import pyplot pyplot.hist(path_ diff erence_data, bins=100) @@ -476,47 +551,55 @@ def dump_scan_build_results_ diff (dir_old: ResultsDirectory, # Open the verbose log, if given. if verbose_log: - auxLog: Optional[TextIO] = open(verbose_log, "w") + aux_log: Optional[TextIO] = open(verbose_log, "w") else: - auxLog = None + aux_log = None diff = compare_results(results_old, results_new, histogram) found_ diff s = 0 total_added = 0 total_removed = 0 - - for res in diff : - old, new = res - if old is None: - # TODO: mypy still doesn't understand that old and new can't be - # both Nones, we should introduce a better type solution - new = cast(AnalysisDiagnostic, new) - out.write(f"ADDED: {new.get_readable_name()}\n") - found_ diff s += 1 - total_added += 1 - if auxLog: - auxLog.write(f"('ADDED', {new.get_readable_name()}, " - f"{new.get_html_report()})\n") - - elif new is None: - out.write(f"REMOVED: {old.get_readable_name()}\n") - found_ diff s += 1 - total_removed += 1 - if auxLog: - auxLog.write(f"('REMOVED', {old.get_readable_name()}, " - f"{old.get_html_report()})\n") - else: - pass + total_modified = 0 + + for new in diff .present_only_in_new: + out.write(f"ADDED: {new.get_readable_name()}\n\n") + found_ diff s += 1 + total_added += 1 + if aux_log: + aux_log.write(f"('ADDED', {new.get_readable_name()}, " + f"{new.get_html_report()})\n") + + for old in diff .present_only_in_old: + out.write(f"REMOVED: {old.get_readable_name()}\n\n") + found_ diff s += 1 + total_removed += 1 + if aux_log: + aux_log.write(f"('REMOVED', {old.get_readable_name()}, " + f"{old.get_html_report()})\n") + + for old, new in diff .changed_between_new_and_old: + out.write(f"MODIFIED: {old.get_readable_name()}\n") + found_ diff s += 1 + total_modified += 1 + diff s = old.get_ diff s(new) + str_ diff s = [f" '{key}' changed: " + f"'{old_value}' -> '{new_value}'" + for key, (old_value, new_value) in diff s.items()] + out.write(",\n".join(str_ diff s) + "\n\n") + if aux_log: + aux_log.write(f"('MODIFIED', {old.get_readable_name()}, " + f"{old.get_html_report()})\n") total_reports = len(results_new.diagnostics) out.write(f"TOTAL REPORTS: {total_reports}\n") out.write(f"TOTAL ADDED: {total_added}\n") out.write(f"TOTAL REMOVED: {total_removed}\n") + out.write(f"TOTAL MODIFIED: {total_modified}\n") - if auxLog: - auxLog.write(f"('TOTAL NEW REPORTS', {total_reports})\n") - auxLog.write(f"('TOTAL DIFFERENCES', {found_ diff s})\n") - auxLog.close() + if aux_log: + aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n") + aux_log.write(f"('TOTAL DIFFERENCES', {found_ diff s})\n") + aux_log.close() # TODO: change to NamedTuple return found_ diff s, len(results_old.diagnostics), \ _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits