This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new a9967223329 fix error stats (#49582) a9967223329 is described below commit a9967223329eada676feee5cc48e119affc4c9a6 Author: Sun Chenyang <suncheny...@selectdb.com> AuthorDate: Fri Mar 28 12:21:56 2025 +0800 fix error stats (#49582) --- be/src/vec/common/schema_util.cpp | 75 ++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 7f775e2b76d..2f8a4de72e7 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -746,23 +746,48 @@ Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs, RowsetShare } std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats; RETURN_IF_ERROR(collect_path_stats(output, output_uid_to_path_stats)); - for (const auto& [uid, stats] : original_uid_to_path_stats) { - if (output_uid_to_path_stats.find(uid) == output_uid_to_path_stats.end()) { + for (const auto& [uid, stats] : output_uid_to_path_stats) { + if (original_uid_to_path_stats.find(uid) == original_uid_to_path_stats.end()) { return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid, tablet->tablet_id()); } - if (stats.size() != output_uid_to_path_stats.at(uid).size()) { - return Status::InternalError("Path stats size not match for uid {}, tablet_id {}", uid, - tablet->tablet_id()); + + // In input rowsets, some rowsets may have statistics values exceeding the maximum limit, + // which leads to inaccurate statistics + if (stats.size() > VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { + // When there is only one segment, we can ensure that the size of each path in output stats is accurate + if (output->num_segments() == 1) { + for (const auto& [path, size] : stats) { + if (original_uid_to_path_stats.at(uid).at(path) > size) { + return Status::InternalError( + "Path stats not smaller for uid {} with path `{}`, input size {}, " + "output " + "size {}, " + "tablet_id {}", + uid, path, original_uid_to_path_stats.at(uid).at(path), size, + tablet->tablet_id()); + } + } + } } - for (const auto& [path, size] : stats) { - if (output_uid_to_path_stats.at(uid).at(path) != size) { + // in this case, input stats is accurate, so we check the stats size and stats value + else { + if (stats.size() != original_uid_to_path_stats.at(uid).size()) { return Status::InternalError( - "Path stats not match for uid {} with path `{}`, input size {}, output " - "size {}, " - "tablet_id {}", - uid, path, size, output_uid_to_path_stats.at(uid).at(path), - tablet->tablet_id()); + "Path stats size not match for uid {}, tablet_id {}, input size {}, output " + "size {}", + uid, tablet->tablet_id(), original_uid_to_path_stats.at(uid).size(), + stats.size()); + } + for (const auto& [path, size] : stats) { + if (original_uid_to_path_stats.at(uid).at(path) != size) { + return Status::InternalError( + "Path stats not match for uid {} with path `{}`, input size {}, output " + "size {}, " + "tablet_id {}", + uid, path, original_uid_to_path_stats.at(uid).at(path), size, + tablet->tablet_id()); + } } } } @@ -831,49 +856,35 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, void calculate_variant_stats(const IColumn& encoded_sparse_column, segment_v2::VariantStatisticsPB* stats, size_t row_pos, size_t num_rows) { - size_t limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE - - stats->sparse_column_non_null_size().size(); // Cast input column to ColumnMap type since sparse column is stored as a map const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column); - // Map to store path frequencies - tracks how many times each path appears - std::unordered_map<StringRef, size_t> sparse_data_paths_statistics; - // Get the keys column which contains the paths as strings const auto& sparse_data_paths = assert_cast<const ColumnString*>(map_column.get_keys_ptr().get()); const auto& serialized_sparse_column_offsets = assert_cast<const ColumnArray::Offsets64&>(map_column.get_offsets()); + auto& count_map = *stats->mutable_sparse_column_non_null_size(); // Iterate through all paths in the sparse column for (size_t i = row_pos; i != row_pos + num_rows; ++i) { size_t offset = serialized_sparse_column_offsets[i - 1]; size_t end = serialized_sparse_column_offsets[i]; for (size_t j = offset; j != end; ++j) { auto path = sparse_data_paths->get_data_at(j); + + const auto& sparse_path = path.to_string(); // If path already exists in statistics, increment its count - if (auto it = sparse_data_paths_statistics.find(path); - it != sparse_data_paths_statistics.end()) { + if (auto it = count_map.find(sparse_path); it != count_map.end()) { ++it->second; } // If path doesn't exist and we haven't hit the max statistics size limit, // add it with count 1 - else if (sparse_data_paths_statistics.size() < limit) { - sparse_data_paths_statistics.emplace(path, 1); + else if (count_map.size() < VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { + count_map.emplace(sparse_path, 1); } } } - // Copy the collected statistics into the protobuf stats object - // This maps each path string to its frequency count - for (const auto& [path, size] : sparse_data_paths_statistics) { - const auto& sparse_path = path.to_string(); - auto& count_map = *stats->mutable_sparse_column_non_null_size(); - if (auto it = count_map.find(sparse_path); it != count_map.end()) { - it->second += size; - } else { - count_map.emplace(sparse_path, size); - } - } if (stats->sparse_column_non_null_size().size() > VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { throw doris::Exception( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org