github-actions[bot] commented on code in PR #63389:
URL: https://github.com/apache/doris/pull/63389#discussion_r3339430294


##########
be/src/storage/segment/segment.cpp:
##########
@@ -85,6 +90,93 @@ namespace doris::segment_v2 {
 
 class InvertedIndexIterator;
 
+namespace {
+
+Status build_segment_zonemap_context(Segment* segment, const Schema& schema,
+                                     const StorageReadOptions& read_options,
+                                     const VExprContextSPtrs& conjuncts, 
ZoneMapEvalContext* ctx) {
+    DORIS_CHECK(segment != nullptr);
+    DORIS_CHECK(ctx != nullptr);
+    std::set<int> slot_indexes;
+    for (const auto& conjunct : conjuncts) {
+        DORIS_CHECK(conjunct != nullptr);
+        auto root = conjunct->root();
+        DORIS_CHECK(root != nullptr);
+        if (!root->can_evaluate_zonemap_filter()) {
+            continue;
+        }
+        std::set<int> expr_slot_indexes;
+        root->collect_slot_column_ids(expr_slot_indexes);
+        if (expr_slot_indexes.size() == 1) {
+            slot_indexes.insert(*expr_slot_indexes.begin());
+        }
+    }
+    for (const int slot_index : slot_indexes) {
+        if (slot_index < 0 || cast_set<size_t>(slot_index) >= 
schema.num_column_ids()) {
+            continue;
+        }
+        const auto column_id = schema.column_id(cast_set<size_t>(slot_index));
+        const auto* field = schema.column(column_id);
+        DORIS_CHECK(field != nullptr);
+        if (!segment->can_apply_predicate_safely(
+                    column_id, schema, 
read_options.target_cast_type_for_variants, read_options)) {
+            continue;
+        }
+        auto data_type = segment->get_data_type_of(*field, read_options);
+        if (data_type == nullptr) {
+            continue;
+        }
+        ZoneMapEvalContext::SlotZoneMap slot_zone_map;
+        slot_zone_map.data_type = data_type;
+        std::shared_ptr<ColumnReader> reader;
+        Status st = segment->get_column_reader(*field, &reader, 
read_options.stats);
+        if (st.is<ErrorCode::NOT_FOUND>()) {
+            ctx->slots.emplace(slot_index, std::move(slot_zone_map));
+            continue;
+        }
+        RETURN_IF_ERROR(st);
+        if (reader != nullptr && reader->has_zone_map()) {
+            ZoneMap zone_map;
+            RETURN_IF_ERROR(reader->get_segment_zone_map(&zone_map));
+            slot_zone_map.zone_map = std::move(zone_map);
+        }
+        ctx->slots.emplace(slot_index, std::move(slot_zone_map));
+    }
+    return Status::OK();
+}
+
+bool storage_expr_slots_match_reader_schema(const StorageReadOptions& 
read_options) {
+    DORIS_CHECK(read_options.tablet_schema != nullptr);
+    const auto keys_type = read_options.tablet_schema->keys_type();
+    return keys_type == KeysType::DUP_KEYS ||
+           (keys_type == KeysType::UNIQUE_KEYS && 
read_options.enable_unique_key_merge_on_write);
+}
+
+Status prepare_segment_zonemap_exprs(const StorageReadOptions& read_options, 
const Schema& schema,
+                                     const VExprContextSPtrs** expr_ctxs,
+                                     VExprContextSPtrs* rebound_expr_ctxs) {
+    DORIS_CHECK(expr_ctxs != nullptr);
+    DORIS_CHECK(rebound_expr_ctxs != nullptr);
+    *expr_ctxs = &read_options.common_expr_ctxs_push_down;
+    if (storage_expr_slots_match_reader_schema(read_options)) {
+        return Status::OK();
+    }
+
+    rebound_expr_ctxs->reserve(read_options.common_expr_ctxs_push_down.size());
+    for (const auto& expr_ctx : read_options.common_expr_ctxs_push_down) {
+        VExprContextSPtr cloned_ctx;
+        RETURN_IF_ERROR(expr_ctx->clone(read_options.runtime_state, 
cloned_ctx));
+        rebound_expr_ctxs->emplace_back(std::move(cloned_ctx));

Review Comment:
   This does not create an isolated expression tree before the rebinding below. 
`VExprContext::clone()` constructs the new context with the same `_root` 
pointer (`vexpr_context.cpp:153`), so `rebind_storage_exprs_to_reader_schema()` 
mutates the original `read_options.common_expr_ctxs_push_down` tree shared by 
other segments/scanners. On AGG_KEYS or non-MOW UNIQUE_KEYS scans, concurrent 
segment iterators can rebind the same `VSlotRef` column ids while this 
segment-level zonemap evaluation is running, or leave the shared tree rebound 
for a different reader schema before later evaluation. This is distinct from 
the earlier slot-rebinding thread: rebinding is now attempted, but the cloned 
context is shallow. Please deep-clone the expression tree before rebinding, or 
skip this segment-level path whenever rebinding would be required.



##########
be/src/format/parquet/vparquet_reader.cpp:
##########
@@ -1139,8 +1153,10 @@ Status ParquetReader::_process_page_index_filter(
     // from https://github.com/apache/doris/pull/55795
     RETURN_IF_ERROR(parse_offset_index());
 
-    // Check if page index is needed for min-max filter.
-    if (!_enable_filter_by_min_max || push_down_pred.empty()) {
+    // Check if page index is needed for min-max or expr-zonemap filter.
+    const bool has_expr_zonemap_filter = 
expr_zonemap::is_expr_zonemap_filter_enabled(_state) &&
+                                         !_lazy_read_ctx.conjuncts.empty();

Review Comment:
   This gate is true for any conjunct, but 
`_process_expr_zonemap_page_filter()` later only keeps single-slot conjuncts 
whose root `can_evaluate_zonemap_filter()`. For an expression-only Parquet scan 
with an unsupported predicate such as `abs(k) = 1` and no ordinary 
`push_down_pred`, this still reads and parses the column-index buffer for every 
row group, then `ctxs_by_column` is empty and all pages are kept. Since 
`enable_expr_zonemap_filter` defaults to true, this is a hot-path regression. 
Please compute this flag from the same supported single-slot capability check 
used by the page filter before deciding to read the page index.



##########
be/src/format/parquet/vparquet_reader.cpp:
##########
@@ -1256,6 +1272,85 @@ Status ParquetReader::_process_page_index_filter(
         }
         RowRanges::ranges_intersection(*candidate_row_ranges, tmp_row_range, 
candidate_row_ranges);
     }
+    RETURN_IF_ERROR(_process_expr_zonemap_page_filter(&cached_page_index, 
candidate_row_ranges));
+    return Status::OK();
+}
+
+Status ParquetReader::_process_expr_zonemap_page_filter(
+        ParquetPredicate::CachedPageIndexStat* cached_page_index, RowRanges* 
candidate_row_ranges) {
+    DORIS_CHECK(cached_page_index != nullptr);
+    DORIS_CHECK(candidate_row_ranges != nullptr);
+    const auto& all_conjuncts = _lazy_read_ctx.conjuncts;
+    if (!expr_zonemap::is_expr_zonemap_filter_enabled(_state) || 
all_conjuncts.empty() ||
+        candidate_row_ranges->is_empty()) {
+        return Status::OK();
+    }
+
+    std::unordered_map<int, VExprContextSPtrs> ctxs_by_column;
+    for (const auto& conjunct : all_conjuncts) {
+        if (conjunct == nullptr || conjunct->root() == nullptr) {
+            continue;
+        }
+        std::set<int> column_ids;
+        conjunct->root()->collect_slot_column_ids(column_ids);
+        if (column_ids.size() == 1 && 
conjunct->root()->can_evaluate_zonemap_filter()) {
+            ctxs_by_column[*column_ids.begin()].emplace_back(conjunct);
+        }
+    }
+
+    for (const auto& [cid, conjuncts] : ctxs_by_column) {
+        if (cid < 0 || cid >= _tuple_descriptor->slots().size()) {
+            continue;
+        }
+        auto* slot = _tuple_descriptor->slots()[cid];
+        if (!_exists_in_file(slot->col_name()) || !_type_matches(cid)) {
+            continue;
+        }
+        ParquetPredicate::PageIndexStat* stat = nullptr;
+        if (!cached_page_index->get_stat_func(&stat, cid) || stat == nullptr 
|| !stat->available) {
+            continue;
+        }
+        RowRanges expr_ranges;
+        ZoneMapEvalStats page_stats;
+        for (int64_t page_id = 0; page_id < stat->num_of_pages; ++page_id) {
+            DORIS_CHECK(page_id < stat->ranges.size());
+            DORIS_CHECK(page_id < stat->has_null.size());
+            DORIS_CHECK(page_id < stat->is_all_null.size());
+            const auto& page_range = stat->ranges[page_id];
+            DORIS_CHECK(page_range.is_valid());
+
+            ZoneMapEvalContext ctx;
+            ZoneMapEvalContext::SlotZoneMap slot_zone_map;
+            slot_zone_map.data_type = slot->type();
+            segment_v2::ZoneMap zone_map;
+            zone_map.has_null = stat->has_null[page_id];
+            zone_map.has_not_null = !stat->is_all_null[page_id];
+            if (!stat->is_all_null[page_id]) {
+                DORIS_CHECK(page_id < stat->encoded_min_value.size());
+                DORIS_CHECK(page_id < stat->encoded_max_value.size());
+                Status status = ParquetPredicate::parse_min_max_value(
+                        stat->col_schema, stat->encoded_min_value[page_id],
+                        stat->encoded_max_value[page_id], 
*cached_page_index->ctz,
+                        &zone_map.min_value, &zone_map.max_value);
+                if (status.ok()) {
+                    slot_zone_map.zone_map = std::move(zone_map);
+                }
+            } else {
+                slot_zone_map.zone_map = std::move(zone_map);
+            }
+            ctx.slots.emplace(cid, std::move(slot_zone_map));
+            const auto result = 
VExprContext::evaluate_zonemap_filter(conjuncts, ctx);
+            page_stats.merge_page_eval_stats(ctx.stats);
+            if (result != ZoneMapFilterResult::kNoMatch) {
+                expr_ranges.add(page_range);
+            }
+        }
+        page_stats.accumulate_to(&_reader_statistics);
+        RowRanges::ranges_intersection(*candidate_row_ranges, expr_ranges, 
candidate_row_ranges);
+        if (candidate_row_ranges->is_empty()) {
+            return Status::OK();

Review Comment:
   When expression page pruning makes `candidate_row_ranges` empty here, 
`_next_row_group_reader()` counts the row group as filtered, but 
`filtered_row_groups_by_expr_zonemap` is only incremented in the 
row-group-level path. A row group whose row-group stats are too wide but whose 
pages are all eliminated by `starts_with`/`IN` will therefore show 
`ParquetExprZoneMapFilteredRowGroups = 0`, underreporting this feature's 
effect. Please attribute the empty-range case to expr-zonemap as well, or add a 
separate page-level expr-zonemap row-group counter.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to