This is an automated email from the ASF dual-hosted git repository. ashingau pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 621235796de [fix](parquet) fix parquet reader missing column and filter missing column (#36189) 621235796de is described below commit 621235796deb409e70e5e112b4f5548da159ceee Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Wed Jun 19 16:18:50 2024 +0800 [fix](parquet) fix parquet reader missing column and filter missing column (#36189) ## Proposed changes follow https://github.com/apache/doris/pull/35583, fix parquet reader. --- .../exec/format/parquet/vparquet_group_reader.cpp | 4 + .../exec/format/parquet/vparquet_group_reader.h | 1 + be/src/vec/exec/format/parquet/vparquet_reader.cpp | 8 + .../hive/scripts/create_preinstalled_table.hql | 18 ++ .../docker-compose/hive/scripts/hive-metastore.sh | 7 - .../hive/test_hive_parquet_add_column.out | 185 +++++++++++++++++++++ .../hive/test_hive_parquet_add_column.groovy | 92 ++++++++++ 7 files changed, 308 insertions(+), 7 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 426810ccbfc..c41c8e9ef12 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -174,6 +174,10 @@ Status RowGroupReader::init( } } } + //For check missing column : missing column == xx, missing column is null,missing column is not null. + _filter_conjuncts.insert(_filter_conjuncts.end(), + _lazy_read_ctx.missing_columns_conjuncts.begin(), + _lazy_read_ctx.missing_columns_conjuncts.end()); RETURN_IF_ERROR(_rewrite_dict_predicates()); return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index d38f5a74adf..d9f7f2dbf34 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -94,6 +94,7 @@ public: std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>> partition_columns; std::unordered_map<std::string, VExprContextSPtr> predicate_missing_columns; + VExprContextSPtrs missing_columns_conjuncts; // lazy read missing columns or all missing columns std::unordered_map<std::string, VExprContextSPtr> missing_columns; // should turn off filtering by page index, lazy read and dict filter if having complex type diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index c3199b5de66..629f272ef72 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -426,6 +426,14 @@ Status ParquetReader::set_fill_columns( if (iter == predicate_columns.end()) { _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second); } else { + //For check missing column : missing column == xx, missing column is null,missing column is not null. + if (_slot_id_to_filter_conjuncts->find(iter->second.second) != + _slot_id_to_filter_conjuncts->end()) { + for (auto& ctx : _slot_id_to_filter_conjuncts->find(iter->second.second)->second) { + _lazy_read_ctx.missing_columns_conjuncts.emplace_back(ctx); + } + } + _lazy_read_ctx.predicate_missing_columns.emplace(kv.first, kv.second); _lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first); } diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql index 4a63b85f563..cdcc2698e9c 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql @@ -622,6 +622,24 @@ insert into `test_hive_orc_add_column` values(25,26,null,null,null); insert into `test_hive_orc_add_column` values(27,28,29,null,null); insert into `test_hive_orc_add_column` values(30,31,32,33,null); +CREATE TABLE `test_hive_parquet_add_column`( + id int, + col1 int +) +stored as parquet; +insert into `test_hive_parquet_add_column` values(1,2); +insert into `test_hive_parquet_add_column` values(3,4),(4,6); +alter table `test_hive_parquet_add_column` ADD COLUMNS (col2 int); +insert into `test_hive_parquet_add_column` values(7,8,9); +insert into `test_hive_parquet_add_column` values(10,11,null); +insert into `test_hive_parquet_add_column` values(12,13,null); +insert into `test_hive_parquet_add_column` values(14,15,16); +alter table `test_hive_parquet_add_column` ADD COLUMNS (col3 int,col4 string); +insert into `test_hive_parquet_add_column` values(17,18,19,20,"hello world"); +insert into `test_hive_parquet_add_column` values(21,22,23,24,"cywcywcyw"); +insert into `test_hive_parquet_add_column` values(25,26,null,null,null); +insert into `test_hive_parquet_add_column` values(27,28,29,null,null); +insert into `test_hive_parquet_add_column` values(30,31,32,33,null); CREATE TABLE `schema_evo_test_text`( id int, diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh index 09658778fdb..0157dc26d40 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh @@ -82,10 +82,3 @@ touch /mnt/SUCCESS while true; do sleep 1 done - -create catalog if not exists hive2_docker properties ( - 'type'='hms', - 'hive.metastore.uris' = 'thrift://172.21.0.101:9083', - 'fs.defaultFS' = 'hdfs://172.21.0.101:8020', - 'hadoop.username' = 'hadoop' -); diff --git a/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out b/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out new file mode 100644 index 00000000000..31462d22af3 --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out @@ -0,0 +1,185 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !parquet_add_col1 -- +1 2 \N \N \N +3 4 \N \N \N +4 6 \N \N \N +7 8 9 \N \N +10 11 \N \N \N +12 13 \N \N \N +14 15 16 \N \N +17 18 19 20 hello world +21 22 23 24 cywcywcyw +25 26 \N \N \N +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col2 -- + +-- !parquet_add_col3 -- + +-- !parquet_add_col4 -- +1 2 \N \N \N +3 4 \N \N \N +4 6 \N \N \N +10 11 \N \N \N +12 13 \N \N \N +25 26 \N \N \N + +-- !parquet_add_col5 -- +\N +\N +\N +\N +\N +\N + +-- !parquet_add_col6 -- +1 2 \N \N \N +3 4 \N \N \N +4 6 \N \N \N +7 8 9 \N \N +10 11 \N \N \N +12 13 \N \N \N +14 15 16 \N \N +25 26 \N \N \N +27 28 29 \N \N + +-- !parquet_add_col7 -- +\N +\N +\N +\N +\N +\N +\N +\N +\N + +-- !parquet_add_col8 -- +1 2 \N \N \N +3 4 \N \N \N +4 6 \N \N \N +7 8 9 \N \N +10 11 \N \N \N +12 13 \N \N \N +14 15 16 \N \N +25 26 \N \N \N +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col9 -- +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N + +-- !parquet_add_col10 -- +1 2 \N \N \N +3 4 \N \N \N +4 6 \N \N \N +7 8 9 \N \N +10 11 \N \N \N +12 13 \N \N \N +14 15 16 \N \N +17 18 19 20 hello world +21 22 23 24 cywcywcyw +25 26 \N \N \N +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col11 -- +2 +4 +6 +8 +11 +13 +15 +18 +22 +26 +28 +31 + +-- !parquet_add_col12 -- +7 8 9 \N \N +14 15 16 \N \N +17 18 19 20 hello world +21 22 23 24 cywcywcyw +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col13 -- +9 +16 +19 +23 +29 +32 + +-- !parquet_add_col14 -- +17 18 19 20 hello world +21 22 23 24 cywcywcyw +30 31 32 33 \N + +-- !parquet_add_col15 -- +20 +24 +33 + +-- !parquet_add_col16 -- +17 18 19 20 hello world +21 22 23 24 cywcywcyw + +-- !parquet_add_col17 -- +cywcywcyw +hello world + +-- !parquet_add_col18 -- +7 8 9 \N \N + +-- !parquet_add_col19 -- + +-- !parquet_add_col20 -- +7 8 9 \N \N +14 15 16 \N \N +17 18 19 20 hello world +21 22 23 24 cywcywcyw +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col21 -- +7 8 9 \N \N +14 15 16 \N \N +17 18 19 20 hello world +21 22 23 24 cywcywcyw +27 28 29 \N \N +30 31 32 33 \N + +-- !parquet_add_col22 -- + +-- !parquet_add_col23 -- +30 31 32 33 \N + +-- !parquet_add_col24 -- + +-- !parquet_add_col25 -- +17 18 19 20 hello world +21 22 23 24 cywcywcyw +30 31 32 33 \N + +-- !parquet_add_col26 -- + +-- !parquet_add_col27 -- +21 22 23 24 cywcywcyw + +-- !parquet_add_col28 -- +17 18 19 20 hello world +21 22 23 24 cywcywcyw + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy b/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy new file mode 100644 index 00000000000..4a0df8bdf39 --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_parquet_add_column", "all_types,p0,external,hive,external_docker,external_docker_hive") { + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("diable Hive test.") + return; + } + + try { + String hms_port = context.config.otherConfigs.get("hive3HmsPort") + String catalog_name = "hive3_test_parquet_add_column" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type"="hms", + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' + );""" + sql """use `${catalog_name}`.`default`""" + + qt_parquet_add_col1 """select * from test_hive_parquet_add_column order by id ;""" + qt_parquet_add_col2 """select * from test_hive_parquet_add_column where col1 is null order by id ;""" + qt_parquet_add_col3 """select col1 from test_hive_parquet_add_column where col1 is null;""" + qt_parquet_add_col4 """select * from test_hive_parquet_add_column where col2 is null order by id ;""" + qt_parquet_add_col5 """select col2 from test_hive_parquet_add_column where col2 is null;""" + qt_parquet_add_col6 """select * from test_hive_parquet_add_column where col3 is null order by id ;""" + qt_parquet_add_col7 """select col3 from test_hive_parquet_add_column where col3 is null;""" + qt_parquet_add_col8 """select * from test_hive_parquet_add_column where col4 is null order by id ;""" + qt_parquet_add_col9 """select col4 from test_hive_parquet_add_column where col4 is null;""" + qt_parquet_add_col10 """select * from test_hive_parquet_add_column where col1 is not null order by id ;""" + qt_parquet_add_col11 """select col1 from test_hive_parquet_add_column where col1 is not null order by col1;""" + qt_parquet_add_col12 """select * from test_hive_parquet_add_column where col2 is not null order by id ;""" + qt_parquet_add_col13 """select col2 from test_hive_parquet_add_column where col2 is not null order by col2;""" + qt_parquet_add_col14 """select * from test_hive_parquet_add_column where col3 is not null order by id ;""" + qt_parquet_add_col15 """select col3 from test_hive_parquet_add_column where col3 is not null order by col3;""" + qt_parquet_add_col16 """select * from test_hive_parquet_add_column where col4 is not null order by id ;""" + qt_parquet_add_col17 """select col4 from test_hive_parquet_add_column where col4 is not null order by col4;""" + qt_parquet_add_col18 """select * from test_hive_parquet_add_column where col2 = 9 order by id ;""" + qt_parquet_add_col19 """select * from test_hive_parquet_add_column where col2 = 190 order by id ;""" + qt_parquet_add_col20 """select * from test_hive_parquet_add_column where col2 - col1 = 1 order by id ;""" + qt_parquet_add_col21 """select * from test_hive_parquet_add_column where col2 - id = 2 order by id ;""" + qt_parquet_add_col22 """select * from test_hive_parquet_add_column where col2 - id = 3 order by id ;""" + qt_parquet_add_col23 """select * from test_hive_parquet_add_column where col3 = 33 order by id ;""" + qt_parquet_add_col24 """select * from test_hive_parquet_add_column where col3 = 330 order by id ;""" + qt_parquet_add_col25 """select * from test_hive_parquet_add_column where col3 - col1 = 2 order by id ;""" + qt_parquet_add_col26 """select * from test_hive_parquet_add_column where col3 - id != 3 order by id ;""" + qt_parquet_add_col27 """select * from test_hive_parquet_add_column where col1 + col2 + col3 = 23*3 order by id ;""" + qt_parquet_add_col28 """select * from test_hive_parquet_add_column where col1 + col2 + col3 != 32*3 order by id ; """ + + sql """drop catalog if exists ${catalog_name}""" + + } finally { + } + +} + + +// CREATE TABLE `test_hive_parquet_add_column`( +// id int, +// col1 int +// ) +// stored as parquet; +// insert into `test_hive_parquet_add_column` values(1,2); +// insert into `test_hive_parquet_add_column` values(3,4),(4,6); +// alter table `test_hive_parquet_add_column` ADD COLUMNS(col2 int); +// insert into `test_hive_parquet_add_column` values(7,8,9); +// insert into `test_hive_parquet_add_column` values(10,11,null); +// insert into `test_hive_parquet_add_column` values(12,13,null); +// insert into `test_hive_parquet_add_column` values(14,15,16); +// alter table `test_hive_parquet_add_column` ADD COLUMNS(col3 int,col4 string); +// insert into `test_hive_parquet_add_column` values(17,18,19,20,"hello world"); +// insert into `test_hive_parquet_add_column` values(21,22,23,24,"cywcywcyw"); +// insert into `test_hive_parquet_add_column` values(25,26,null,null,null); +// insert into `test_hive_parquet_add_column` values(27,28,29,null,null); +// insert into `test_hive_parquet_add_column` values(30,31,32,33,null); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org