wsjz commented on code in PR #12652: URL: https://github.com/apache/doris/pull/12652#discussion_r974933139
########## be/src/vec/exec/format/parquet/vparquet_reader.cpp: ########## @@ -232,28 +232,57 @@ Status ParquetReader::_process_page_index(tparquet::RowGroup& row_group) { int64_t bytes_read = 0; RETURN_IF_ERROR( _file_reader->readat(_page_index->_column_index_start, buffer_size, &bytes_read, buff)); - for (auto col_id : _include_column_ids) { - auto conjunct_iter = _slot_conjuncts.find(col_id); + + std::vector<RowRange> skipped_row_ranges; + for (auto& read_col : _read_columns) { + auto conjunct_iter = _slot_conjuncts.find(read_col._parquet_col_id); if (_slot_conjuncts.end() == conjunct_iter) { continue; } - auto& chunk = row_group.columns[col_id]; + auto& chunk = row_group.columns[read_col._parquet_col_id]; tparquet::ColumnIndex column_index; RETURN_IF_ERROR(_page_index->parse_column_index(chunk, buff, &column_index)); - const int num_of_page = column_index.null_pages.size(); - if (num_of_page <= 1) { + const int num_of_pages = column_index.null_pages.size(); + if (num_of_pages <= 0) { break; } auto& conjuncts = conjunct_iter->second; - std::vector<int> candidate_page_range; - _page_index->collect_skipped_page_range(conjuncts, candidate_page_range); + std::vector<int> skipped_page_range; + _page_index->collect_skipped_page_range(&column_index, conjuncts, skipped_page_range); + if (skipped_page_range.empty()) { + return Status::OK(); + } tparquet::OffsetIndex offset_index; RETURN_IF_ERROR(_page_index->parse_offset_index(chunk, buff, buffer_size, &offset_index)); - for (int page_id : candidate_page_range) { + for (int page_id : skipped_page_range) { RowRange skipped_row_range; _page_index->create_skipped_row_range(offset_index, row_group.num_rows, page_id, &skipped_row_range); - _skipped_row_ranges.emplace_back(skipped_row_range); + // use the union row range + skipped_row_ranges.emplace_back(skipped_row_range); + } + _col_offsets.emplace(read_col._parquet_col_id, offset_index); + } + if (skipped_row_ranges.empty()) { + return Status::OK(); + } + + std::sort(skipped_row_ranges.begin(), skipped_row_ranges.end(), + [](const RowRange& lhs, const RowRange& rhs) { + return std::tie(lhs.first_row, lhs.last_row) < + std::tie(rhs.first_row, rhs.last_row); + }); + int skip_end = -1; + for (auto& skip_range : skipped_row_ranges) { + VLOG_DEBUG << skip_range.first_row << " " << skip_range.last_row << " | "; Review Comment: https://github.com/apache/doris/pull/12771 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org