github-actions[bot] commented on code in PR #25138: URL: https://github.com/apache/doris/pull/25138#discussion_r1367781725
########## be/src/vec/exec/format/parquet/vparquet_column_reader.cpp: ########## @@ -476,86 +481,109 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) { Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) { - if (_chunk_reader->remaining_num_values() == 0) { - if (!_chunk_reader->has_next_page()) { - *eof = true; - *read_rows = 0; - return Status::OK(); - } - RETURN_IF_ERROR(_chunk_reader->next_page()); - } - if (_nested_column) { - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - return _read_nested_column(doris_column, type, select_vector, batch_size, read_rows, eof, - is_dict_filter); - } - - // generate the row ranges that should be read - std::list<RowRange> read_ranges; - _generate_read_ranges(_current_row_index, - _current_row_index + _chunk_reader->remaining_num_values(), read_ranges); - if (read_ranges.size() == 0) { - // skip the whole page - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = 0; - } else { - bool skip_whole_batch = false; - // Determining whether to skip page or batch will increase the calculation time. - // When the filtering effect is greater than 60%, it is possible to skip the page or batch. - if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { - // lazy read - size_t remaining_num_values = 0; - for (auto& range : read_ranges) { - remaining_num_values += range.last_row - range.first_row; - } - if (batch_size >= remaining_num_values && - select_vector.can_filter_all(remaining_num_values)) { - // We can skip the whole page if the remaining values is filtered by predicate columns - select_vector.skip(remaining_num_values); - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = remaining_num_values; - if (!_chunk_reader->has_next_page()) { - *eof = true; - } + bool need_convert = false; + auto& parquet_physical_type = _chunk_meta.meta_data.type; + auto& show_type = _field_schema->type.type; + + ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, + doris_column, type, &need_convert); + + do { + if (_chunk_reader->remaining_num_values() == 0) { + if (!_chunk_reader->has_next_page()) { + *eof = true; + *read_rows = 0; return Status::OK(); } - skip_whole_batch = - batch_size <= remaining_num_values && select_vector.can_filter_all(batch_size); - if (skip_whole_batch) { - select_vector.skip(batch_size); - } + RETURN_IF_ERROR(_chunk_reader->next_page()); } - // load page data to decode or skip values - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - size_t has_read = 0; - for (auto& range : read_ranges) { - // generate the skipped values - size_t skip_values = range.first_row - _current_row_index; - RETURN_IF_ERROR(_skip_values(skip_values)); - _current_row_index += skip_values; - // generate the read values - size_t read_values = - std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); - if (skip_whole_batch) { - RETURN_IF_ERROR(_skip_values(read_values)); - } else { - RETURN_IF_ERROR(_read_values(read_values, doris_column, type, select_vector, - is_dict_filter)); + if (_nested_column) { + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size, + read_rows, eof, is_dict_filter)); + break; + } + + // generate the row ranges that should be read + std::list<RowRange> read_ranges; + _generate_read_ranges(_current_row_index, + _current_row_index + _chunk_reader->remaining_num_values(), + read_ranges); + if (read_ranges.size() == 0) { + // skip the whole page + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = 0; + } else { + bool skip_whole_batch = false; + // Determining whether to skip page or batch will increase the calculation time. + // When the filtering effect is greater than 60%, it is possible to skip the page or batch. + if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { + // lazy read + size_t remaining_num_values = 0; + for (auto& range : read_ranges) { + remaining_num_values += range.last_row - range.first_row; + } + if (batch_size >= remaining_num_values && + select_vector.can_filter_all(remaining_num_values)) { + // We can skip the whole page if the remaining values is filtered by predicate columns + select_vector.skip(remaining_num_values); + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = remaining_num_values; + if (!_chunk_reader->has_next_page()) { + *eof = true; + } + break; + } + skip_whole_batch = batch_size <= remaining_num_values && + select_vector.can_filter_all(batch_size); + if (skip_whole_batch) { + select_vector.skip(batch_size); + } } - has_read += read_values; - _current_row_index += read_values; - if (has_read == batch_size) { - break; + // load page data to decode or skip values + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + size_t has_read = 0; + for (auto& range : read_ranges) { + // generate the skipped values + size_t skip_values = range.first_row - _current_row_index; + RETURN_IF_ERROR(_skip_values(skip_values)); + _current_row_index += skip_values; + // generate the read values + size_t read_values = + std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); + if (skip_whole_batch) { + RETURN_IF_ERROR(_skip_values(read_values)); + } else { + RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector, + is_dict_filter)); + } + has_read += read_values; + _current_row_index += read_values; + if (has_read == batch_size) { + break; + } } + *read_rows = has_read; } - *read_rows = has_read; - } - if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { - *eof = true; + if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { + *eof = true; + } + } while (0); Review Comment: warning: converting integer literal to bool, use bool literal instead [modernize-use-bool-literals] ```suggestion } while (false); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org