This is an automated email from the ASF dual-hosted git repository.

ashingau pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 621235796de [fix](parquet) fix parquet reader missing column and 
filter missing column (#36189)
621235796de is described below

commit 621235796deb409e70e5e112b4f5548da159ceee
Author: Ashin Gau <ashin...@users.noreply.github.com>
AuthorDate: Wed Jun 19 16:18:50 2024 +0800

    [fix](parquet) fix parquet reader missing column and filter missing column 
(#36189)
    
    ## Proposed changes
    
    Follow-up to https://github.com/apache/doris/pull/35583: fix the parquet reader.
---
 .../exec/format/parquet/vparquet_group_reader.cpp  |   4 +
 .../exec/format/parquet/vparquet_group_reader.h    |   1 +
 be/src/vec/exec/format/parquet/vparquet_reader.cpp |   8 +
 .../hive/scripts/create_preinstalled_table.hql     |  18 ++
 .../docker-compose/hive/scripts/hive-metastore.sh  |   7 -
 .../hive/test_hive_parquet_add_column.out          | 185 +++++++++++++++++++++
 .../hive/test_hive_parquet_add_column.groovy       |  92 ++++++++++
 7 files changed, 308 insertions(+), 7 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 426810ccbfc..c41c8e9ef12 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -174,6 +174,10 @@ Status RowGroupReader::init(
             }
         }
     }
+    // Append the conjuncts on missing columns to the filter conjuncts, so that predicates such as 
`missing column == xx`, `missing column is null` and `missing column is not null` are checked.
+    _filter_conjuncts.insert(_filter_conjuncts.end(),
+                             _lazy_read_ctx.missing_columns_conjuncts.begin(),
+                             _lazy_read_ctx.missing_columns_conjuncts.end());
     RETURN_IF_ERROR(_rewrite_dict_predicates());
     return Status::OK();
 }
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
index d38f5a74adf..d9f7f2dbf34 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -94,6 +94,7 @@ public:
         std::unordered_map<std::string, std::tuple<std::string, const 
SlotDescriptor*>>
                 partition_columns;
         std::unordered_map<std::string, VExprContextSPtr> 
predicate_missing_columns;
+        VExprContextSPtrs missing_columns_conjuncts;
         // lazy read missing columns or all missing columns
         std::unordered_map<std::string, VExprContextSPtr> missing_columns;
         // should turn off filtering by page index, lazy read and dict filter 
if having complex type
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index c3199b5de66..629f272ef72 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -426,6 +426,14 @@ Status ParquetReader::set_fill_columns(
         if (iter == predicate_columns.end()) {
             _lazy_read_ctx.missing_columns.emplace(kv.first, kv.second);
         } else {
+            // Collect the filter conjuncts on this missing column, so that predicates such as 
`missing column == xx`, `missing column is null` and `missing column is not null` are checked.
+            if (_slot_id_to_filter_conjuncts->find(iter->second.second) !=
+                _slot_id_to_filter_conjuncts->end()) {
+                for (auto& ctx : 
_slot_id_to_filter_conjuncts->find(iter->second.second)->second) {
+                    _lazy_read_ctx.missing_columns_conjuncts.emplace_back(ctx);
+                }
+            }
+
             _lazy_read_ctx.predicate_missing_columns.emplace(kv.first, 
kv.second);
             
_lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first);
         }
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
index 4a63b85f563..cdcc2698e9c 100644
--- 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
@@ -622,6 +622,24 @@ insert into  `test_hive_orc_add_column` 
values(25,26,null,null,null);
 insert into  `test_hive_orc_add_column` values(27,28,29,null,null);
 insert into  `test_hive_orc_add_column` values(30,31,32,33,null);
 
+CREATE TABLE `test_hive_parquet_add_column`(
+  id int,
+  col1 int
+)
+stored as parquet;
+insert into  `test_hive_parquet_add_column` values(1,2);
+insert into  `test_hive_parquet_add_column` values(3,4),(4,6);
+alter table `test_hive_parquet_add_column` ADD COLUMNS (col2 int);
+insert into  `test_hive_parquet_add_column` values(7,8,9);
+insert into  `test_hive_parquet_add_column` values(10,11,null);
+insert into  `test_hive_parquet_add_column` values(12,13,null);
+insert into  `test_hive_parquet_add_column` values(14,15,16);
+alter table `test_hive_parquet_add_column` ADD COLUMNS (col3 int,col4 string);
+insert into  `test_hive_parquet_add_column` values(17,18,19,20,"hello world");
+insert into  `test_hive_parquet_add_column` values(21,22,23,24,"cywcywcyw");
+insert into  `test_hive_parquet_add_column` values(25,26,null,null,null);
+insert into  `test_hive_parquet_add_column` values(27,28,29,null,null);
+insert into  `test_hive_parquet_add_column` values(30,31,32,33,null);
 
 CREATE TABLE `schema_evo_test_text`(
   id int,
diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh 
b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index 09658778fdb..0157dc26d40 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -82,10 +82,3 @@ touch /mnt/SUCCESS
 while true; do
     sleep 1
 done
-
-create catalog if not exists hive2_docker properties (
-    'type'='hms',
-    'hive.metastore.uris' = 'thrift://172.21.0.101:9083',
-    'fs.defaultFS' = 'hdfs://172.21.0.101:8020',
-    'hadoop.username' = 'hadoop'
-);
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out 
b/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out
new file mode 100644
index 00000000000..31462d22af3
--- /dev/null
+++ 
b/regression-test/data/external_table_p0/hive/test_hive_parquet_add_column.out
@@ -0,0 +1,185 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !parquet_add_col1 --
+1      2       \N      \N      \N
+3      4       \N      \N      \N
+4      6       \N      \N      \N
+7      8       9       \N      \N
+10     11      \N      \N      \N
+12     13      \N      \N      \N
+14     15      16      \N      \N
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+25     26      \N      \N      \N
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col2 --
+
+-- !parquet_add_col3 --
+
+-- !parquet_add_col4 --
+1      2       \N      \N      \N
+3      4       \N      \N      \N
+4      6       \N      \N      \N
+10     11      \N      \N      \N
+12     13      \N      \N      \N
+25     26      \N      \N      \N
+
+-- !parquet_add_col5 --
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !parquet_add_col6 --
+1      2       \N      \N      \N
+3      4       \N      \N      \N
+4      6       \N      \N      \N
+7      8       9       \N      \N
+10     11      \N      \N      \N
+12     13      \N      \N      \N
+14     15      16      \N      \N
+25     26      \N      \N      \N
+27     28      29      \N      \N
+
+-- !parquet_add_col7 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !parquet_add_col8 --
+1      2       \N      \N      \N
+3      4       \N      \N      \N
+4      6       \N      \N      \N
+7      8       9       \N      \N
+10     11      \N      \N      \N
+12     13      \N      \N      \N
+14     15      16      \N      \N
+25     26      \N      \N      \N
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col9 --
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+\N
+
+-- !parquet_add_col10 --
+1      2       \N      \N      \N
+3      4       \N      \N      \N
+4      6       \N      \N      \N
+7      8       9       \N      \N
+10     11      \N      \N      \N
+12     13      \N      \N      \N
+14     15      16      \N      \N
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+25     26      \N      \N      \N
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col11 --
+2
+4
+6
+8
+11
+13
+15
+18
+22
+26
+28
+31
+
+-- !parquet_add_col12 --
+7      8       9       \N      \N
+14     15      16      \N      \N
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col13 --
+9
+16
+19
+23
+29
+32
+
+-- !parquet_add_col14 --
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+30     31      32      33      \N
+
+-- !parquet_add_col15 --
+20
+24
+33
+
+-- !parquet_add_col16 --
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+
+-- !parquet_add_col17 --
+cywcywcyw
+hello world
+
+-- !parquet_add_col18 --
+7      8       9       \N      \N
+
+-- !parquet_add_col19 --
+
+-- !parquet_add_col20 --
+7      8       9       \N      \N
+14     15      16      \N      \N
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col21 --
+7      8       9       \N      \N
+14     15      16      \N      \N
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+27     28      29      \N      \N
+30     31      32      33      \N
+
+-- !parquet_add_col22 --
+
+-- !parquet_add_col23 --
+30     31      32      33      \N
+
+-- !parquet_add_col24 --
+
+-- !parquet_add_col25 --
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+30     31      32      33      \N
+
+-- !parquet_add_col26 --
+
+-- !parquet_add_col27 --
+21     22      23      24      cywcywcyw
+
+-- !parquet_add_col28 --
+17     18      19      20      hello world
+21     22      23      24      cywcywcyw
+
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy
 
b/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy
new file mode 100644
index 00000000000..4a0df8bdf39
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_parquet_add_column.groovy
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_parquet_add_column", 
"all_types,p0,external,hive,external_docker,external_docker_hive") {
+
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    try {
+        String hms_port = context.config.otherConfigs.get("hive3HmsPort")
+        String catalog_name = "hive3_test_parquet_add_column"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        sql """use `${catalog_name}`.`default`"""
+
+        qt_parquet_add_col1 """select * from test_hive_parquet_add_column 
order by id ;"""
+        qt_parquet_add_col2 """select * from test_hive_parquet_add_column 
where col1 is null order by id ;"""
+        qt_parquet_add_col3 """select col1 from test_hive_parquet_add_column 
where col1 is null;"""
+        qt_parquet_add_col4 """select * from test_hive_parquet_add_column 
where col2 is null order by id ;"""
+        qt_parquet_add_col5 """select col2 from test_hive_parquet_add_column 
where col2 is null;"""
+        qt_parquet_add_col6 """select * from test_hive_parquet_add_column 
where col3 is null order by id ;"""
+        qt_parquet_add_col7 """select col3 from test_hive_parquet_add_column 
where col3 is null;"""
+        qt_parquet_add_col8 """select * from test_hive_parquet_add_column 
where col4 is null order by id ;"""
+        qt_parquet_add_col9 """select col4 from test_hive_parquet_add_column 
where col4 is null;"""
+        qt_parquet_add_col10 """select * from test_hive_parquet_add_column 
where col1 is not null order by id ;"""
+        qt_parquet_add_col11 """select col1 from test_hive_parquet_add_column 
where col1 is not null order by col1;"""
+        qt_parquet_add_col12 """select * from test_hive_parquet_add_column 
where col2 is not null order by id ;"""
+        qt_parquet_add_col13 """select col2 from test_hive_parquet_add_column 
where col2 is not null order by col2;"""
+        qt_parquet_add_col14 """select * from test_hive_parquet_add_column 
where col3 is not null order by id ;"""
+        qt_parquet_add_col15 """select col3 from test_hive_parquet_add_column 
where col3 is not null order by col3;"""
+        qt_parquet_add_col16 """select * from test_hive_parquet_add_column 
where col4 is not null order by id ;"""
+        qt_parquet_add_col17 """select col4 from test_hive_parquet_add_column 
where col4 is not null order by col4;"""
+        qt_parquet_add_col18 """select * from test_hive_parquet_add_column 
where col2 = 9 order by id ;"""
+        qt_parquet_add_col19 """select * from test_hive_parquet_add_column 
where col2 = 190 order by id ;"""
+        qt_parquet_add_col20 """select * from test_hive_parquet_add_column 
where col2 - col1 = 1 order by id ;"""
+        qt_parquet_add_col21 """select * from test_hive_parquet_add_column 
where col2 - id  = 2 order by id ;"""
+        qt_parquet_add_col22 """select * from test_hive_parquet_add_column 
where col2 - id  = 3 order by id ;"""
+        qt_parquet_add_col23 """select * from test_hive_parquet_add_column 
where col3 = 33 order by id ;"""
+        qt_parquet_add_col24 """select * from test_hive_parquet_add_column 
where col3 = 330 order by id ;"""
+        qt_parquet_add_col25 """select * from test_hive_parquet_add_column 
where col3 - col1 = 2 order by id ;"""
+        qt_parquet_add_col26 """select * from test_hive_parquet_add_column 
where col3 - id  != 3 order by id ;"""
+        qt_parquet_add_col27 """select * from test_hive_parquet_add_column 
where col1 + col2 + col3 = 23*3 order by id ;"""
+        qt_parquet_add_col28 """select * from test_hive_parquet_add_column 
where col1 + col2 + col3 != 32*3 order by id ; """
+
+        sql """drop catalog if exists ${catalog_name}"""
+
+    } finally {
+    }
+
+}
+
+
+// CREATE TABLE `test_hive_parquet_add_column`(
+//   id int,
+//   col1 int
+// )
+// stored as parquet;
+// insert into  `test_hive_parquet_add_column` values(1,2);
+// insert into  `test_hive_parquet_add_column` values(3,4),(4,6);
+// alter table `test_hive_parquet_add_column` ADD COLUMNS(col2 int);
+// insert into  `test_hive_parquet_add_column` values(7,8,9);
+// insert into  `test_hive_parquet_add_column` values(10,11,null);
+// insert into  `test_hive_parquet_add_column` values(12,13,null);
+// insert into  `test_hive_parquet_add_column` values(14,15,16);
+// alter table `test_hive_parquet_add_column` ADD COLUMNS(col3 int,col4 
string);
+// insert into  `test_hive_parquet_add_column` values(17,18,19,20,"hello 
world");
+// insert into  `test_hive_parquet_add_column` values(21,22,23,24,"cywcywcyw");
+// insert into  `test_hive_parquet_add_column` values(25,26,null,null,null);
+// insert into  `test_hive_parquet_add_column` values(27,28,29,null,null);
+// insert into  `test_hive_parquet_add_column` values(30,31,32,33,null);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to