This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch turn_on_late_mat_orc_complex_types
in repository https://gitbox.apache.org/repos/asf/doris.git

commit abb01ec505303dfe6b113db025771d1f4b7fcc10
Author: kakachen <che...@selectdb.com>
AuthorDate: Mon Mar 31 20:11:04 2025 +0800

    [opt](orc-reader)Turn on late materialization of orc complex types.
---
 be/src/apache-orc                                  |   2 +-
 be/src/vec/exec/format/orc/vorc_reader.cpp         |   6 +-
 .../orc_nested_types/create_table.hql              |  32 ++++
 .../multi_catalog/orc_nested_types/data.tar.gz     | Bin 0 -> 2965 bytes
 .../data/multi_catalog/orc_nested_types/run.sh     |  12 ++
 .../hive/test_orc_nested_types.groovy              | 183 +++++++++++++++++++++
 6 files changed, 232 insertions(+), 3 deletions(-)

diff --git a/be/src/apache-orc b/be/src/apache-orc
index f3349e01f34..8e3303a8c6d 160000
--- a/be/src/apache-orc
+++ b/be/src/apache-orc
@@ -1 +1 @@
-Subproject commit f3349e01f343ee84241427c5b46e8ecc3bed6d5b
+Subproject commit 8e3303a8c6d6eee03b09e05ab91b844fdc873764
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 9a467a4064a..58b287e0b9b 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1117,13 +1117,15 @@ Status OrcReader::set_fill_columns(
         }
     }
 
-    if (!_has_complex_type && _enable_lazy_mat && 
!_lazy_read_ctx.predicate_columns.first.empty() &&
+    if (_enable_lazy_mat && !_lazy_read_ctx.predicate_columns.first.empty() &&
         !_lazy_read_ctx.lazy_read_columns.empty()) {
         _lazy_read_ctx.can_lazy_read = true;
     }
 
-    if (_lazy_read_ctx.conjuncts.empty() || 
!_init_search_argument(_lazy_read_ctx.conjuncts)) {
+    if (_lazy_read_ctx.conjuncts.empty()) {
         _lazy_read_ctx.can_lazy_read = false;
+    } else {
+        _init_search_argument(_lazy_read_ctx.conjuncts);
     }
     try {
         _row_reader_options.range(_range_start_offset, _range_size);
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql
new file mode 100644
index 00000000000..a1a35827909
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql
@@ -0,0 +1,32 @@
+CREATE DATABASE IF NOT EXISTS multi_catalog;
+USE multi_catalog;
+-- ORC-backed fixture table whose columns cover nested ARRAY / MAP / STRUCT combinations.
+CREATE TABLE IF NOT EXISTS `nested_types1_orc` (
+    `id` INT,
+    `array_col` ARRAY<INT>,
+    `nested_array_col` ARRAY<ARRAY<INT>>,
+    `map_col` MAP<STRING, INT>,
+    `nested_map_col` MAP<STRING, ARRAY<INT>>,
+    `struct_col` STRUCT<`name`: STRING, `age`: INT>,
+    `array_struct_col` ARRAY<STRUCT<`name`: STRING, `age`: INT>>,
+    `map_struct_col` MAP<STRING, STRUCT<`name`: STRING, `age`: INT>>,
+    `complex_struct_col` STRUCT<
+        `a`: ARRAY<INT>,
+        `b`: MAP<STRING, ARRAY<INT>>,
+        `c`: STRUCT<
+            `x`: ARRAY<INT>,
+            `y`: STRING
+        >
+    >
+)
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+LOCATION
+  '/user/doris/suites/multi_catalog/nested_types1_orc';
+
+MSCK REPAIR TABLE nested_types1_orc;
+
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz
new file mode 100644
index 00000000000..d7be7822674
Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz differ
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh
new file mode 100644
index 00000000000..f3136eaa200
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -x
+
+CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+
+## mkdir and put data to hdfs
+cd "${CUR_DIR}" && rm -rf data/ && tar xzf data.tar.gz
+hadoop fs -mkdir -p /user/doris/suites/multi_catalog/
+hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/multi_catalog/
+
+# create table
+hive -f "${CUR_DIR}/create_table.hql"
diff --git a/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy b/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy
new file mode 100644
index 00000000000..63da0c2582f
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_orc_nested_types", "p0,external,hive,external_docker,external_docker_hive") {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+        String catalog_name = "${hivePrefix}_test_orc_nested_types"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        logger.info("catalog " + catalog_name + " created")
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+
+        sql """ use multi_catalog """
+        // NOTE(review): create_table.hql in this commit defines nested_types1_orc, not nested_types_example -- confirm.
+        // Simple queries: filter on the scalar id column, project a complex column
+        order_qt_nested_types_q1 """select array_col from nested_types_example where id = 1"""
+        order_qt_nested_types_q2 """select array_col from nested_types_example where id = 2"""
+        order_qt_nested_types_q3 """select array_col from nested_types_example where id = 3"""
+
+        // Array size queries
+        order_qt_nested_types_q4 """
+            SELECT id, size(array_col) as arr_size
+            FROM nested_types_example
+            ORDER BY id
+        """
+        order_qt_nested_types_q5 """
+            SELECT id, array_col
+            FROM nested_types_example
+            WHERE size(array_col) > 2
+            ORDER BY id
+        """
+
+        // Array element queries
+        order_qt_nested_types_q6 """
+            SELECT id, array_col[0] as first_elem, array_col[2] as third_elem
+            FROM nested_types_example
+            ORDER BY id
+        """
+        order_qt_nested_types_q7 """
+            SELECT id, array_col
+            FROM nested_types_example
+            WHERE array_contains(array_col, 1)
+            ORDER BY id
+        """
+
+        // Queries with compound / string predicates
+        order_qt_nested_types_q8 """
+            SELECT id, array_col, description
+            FROM nested_types_example
+            WHERE id > 1 AND size(array_col) < 3
+            ORDER BY id
+        """
+        order_qt_nested_types_q9 """
+            SELECT id, array_col, description
+            FROM nested_types_example
+            WHERE description LIKE '%Hello%'
+            ORDER BY id
+        """
+
+        // Array statistics queries
+        order_qt_nested_types_q10 """
+            SELECT
+                id,
+                array_min(array_col) as min_val,
+                array_max(array_col) as max_val
+            FROM nested_types_example
+            ORDER BY id
+        """
+
+        // Nested array queries
+        order_qt_nested_types_q11 """
+            SELECT
+                id,
+                nested_array_col,
+                size(nested_array_col) as outer_size
+            FROM nested_types_example
+            WHERE id = 1
+        """
+        order_qt_nested_types_q12 """
+            SELECT
+                id,
+                nested_array_col,
+                size(nested_array_col[0]) as inner_size
+            FROM nested_types_example
+            WHERE id = 2
+        """
+
+        // Array-of-struct queries
+        order_qt_nested_types_q13 """
+            SELECT
+                id,
+                array_struct_col,
+                size(array_struct_col) as struct_arr_size
+            FROM nested_types_example
+            WHERE description LIKE '%large%'
+        """
+        order_qt_nested_types_q14 """
+            SELECT
+                id,
+                item.name as name,
+                item.age as age
+            FROM nested_types_example
+            LATERAL VIEW EXPLODE(array_struct_col) tmp AS item
+            WHERE id = 1 AND item.age > 30
+        """
+
+        // Map column queries
+        order_qt_nested_types_q15 """
+            SELECT
+                id,
+                map_array_col,
+                size(map_array_col) as map_size
+            FROM nested_types_example
+            WHERE id = 2
+        """
+        order_qt_nested_types_q16 """
+            SELECT
+                id,
+                map_array_col['a'] as a_value
+            FROM nested_types_example
+            WHERE id = 1
+        """
+
+        // Complex struct queries: project one sub-field of complex_struct_col
+        order_qt_nested_types_q17 """
+            SELECT
+                id,
+                complex_struct_col.a as array_a
+            FROM nested_types_example
+            WHERE id = 1
+        """
+        order_qt_nested_types_q18 """
+            SELECT
+                id,
+                complex_struct_col.b as map_b
+            FROM nested_types_example
+            WHERE id = 2
+        """
+        order_qt_nested_types_q19 """
+            SELECT
+                id,
+                complex_struct_col.c as struct_c
+            FROM nested_types_example
+            WHERE id = 3
+        """
+
+        // Full-row query over every column
+        order_qt_nested_types_q20 """
+            SELECT *
+            FROM nested_types_example
+            ORDER BY id
+        """
+
+        sql """drop catalog ${catalog_name};"""
+    }
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to