This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit e292ffd6fe938aeab8881e96b986482862aa674c Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Sat Aug 26 00:13:11 2023 +0800 [Fix](orc-reader) Fix incorrect result if null partition fields in orc file. (#23369) Fix incorrect results when partition fields exist in the ORC file with null values. ### Root Cause Theoretically, the underlying files of a Hive partitioned table should not contain the partition fields. However, we found that in some user scenarios the partition fields do exist in the underlying ORC/Parquet files and hold null values. As a result, the predicates pushed down on the partition fields are evaluated against those null values and filter rows incorrectly. ### Solution We handle this case by only reading non-partition fields. The Parquet reader already handles it this way; this PR does the same for the ORC reader. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 29 +++++++++-------- .../hive/test_external_catalog_hive.out | 36 ++++++++++++++++++++++ .../hive/test_external_catalog_hive.groovy | 11 +++++++ 3 files changed, 64 insertions(+), 12 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 99aa9cf5a3..d6b258d39a 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -632,19 +632,24 @@ bool OrcReader::_init_search_argument( for (int i = 0; i < root_type.getSubtypeCount(); ++i) { type_map.emplace(_get_field_name_lower_case(&root_type, i), root_type.getSubtype(i)); } - for (auto it = colname_to_value_range->begin(); it != colname_to_value_range->end(); ++it) { - auto type_it = type_map.find(it->first); - if (type_it != type_map.end()) { - std::visit( - [&](auto& range) { - std::vector<OrcPredicate> value_predicates = - value_range_to_predicate(range, type_it->second); - for (auto& range_predicate : value_predicates) { - predicates.emplace_back(range_predicate); - } - }, - it->second); + for (auto& col_name : _lazy_read_ctx.all_read_columns) { + auto iter = colname_to_value_range->find(col_name); + if (iter == 
colname_to_value_range->end()) { + continue; } + auto type_it = type_map.find(col_name); + if (type_it == type_map.end()) { + continue; + } + std::visit( + [&](auto& range) { + std::vector<OrcPredicate> value_predicates = + value_range_to_predicate(range, type_it->second); + for (auto& range_predicate : value_predicates) { + predicates.emplace_back(range_predicate); + } + }, + iter->second); } if (predicates.empty()) { return false; diff --git a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out index 0cb7db4732..124239d174 100644 --- a/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out +++ b/regression-test/data/external_table_p2/hive/test_external_catalog_hive.out @@ -126,3 +126,39 @@ Z6n2t4XA2n7CXTECJ,PE,iBbsCh0RE1Dd2A,z48 -- !null_expr_dict_filter_parquet -- 4844 4363 +-- !par_fields_in_file_orc1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet1 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet2 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet3 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_parquet4 -- +1 Alice 100.0 2023 8 +2 Bob 150.0 2023 8 + +-- !par_fields_in_file_orc5 -- + +-- !par_fields_in_file_parquet5 -- + diff --git a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy index 03d5cda5f1..1f32218b34 100644 --- a/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy +++ b/regression-test/suites/external_table_p2/hive/test_external_catalog_hive.groovy @@ -97,6 
+97,17 @@ suite("test_external_catalog_hive", "p2") { qt_null_expr_dict_filter_orc """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_orc WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ qt_null_expr_dict_filter_parquet """ select count(*), count(distinct user_no) from multi_catalog.dict_fitler_test_parquet WHERE partitions in ('2023-08-21') and actual_intf_type = 'type1' and (REUSE_FLAG<> 'y' or REUSE_FLAG is null); """ + // test par fields in file + qt_par_fields_in_file_orc1 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_parquet1 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 and month = 8 order by id; """ + qt_par_fields_in_file_orc2 """ select * from multi_catalog.par_fields_in_file_orc where year = 2023 order by id; """ + qt_par_fields_in_file_parquet2 """ select * from multi_catalog.par_fields_in_file_parquet where year = 2023 order by id; """ + qt_par_fields_in_file_orc3 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 order by id; """ + qt_par_fields_in_file_parquet3 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 order by id; """ + qt_par_fields_in_file_orc4 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_parquet4 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year >= 2022 order by id; """ + qt_par_fields_in_file_orc5 """ select * from multi_catalog.par_fields_in_file_orc where month = 8 and year = 2022 order by id; """ + qt_par_fields_in_file_parquet5 """ select * from multi_catalog.par_fields_in_file_parquet where month = 8 and year = 2022 order by id; """ // test remember last used database after switch / rename catalog sql """switch ${catalog_name};""" 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org