This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch turn_on_late_mat_orc_complex_types in repository https://gitbox.apache.org/repos/asf/doris.git
commit abb01ec505303dfe6b113db025771d1f4b7fcc10 Author: kakachen <che...@selectdb.com> AuthorDate: Mon Mar 31 20:11:04 2025 +0800 [opt](orc-reader)Turn on late materialization of orc complex types. --- be/src/apache-orc | 2 +- be/src/vec/exec/format/orc/vorc_reader.cpp | 6 +- .../orc_nested_types/create_table.hql | 32 ++++ .../multi_catalog/orc_nested_types/data.tar.gz | Bin 0 -> 2965 bytes .../data/multi_catalog/orc_nested_types/run.sh | 12 ++ .../hive/test_orc_nested_types.groovy | 183 +++++++++++++++++++++ 6 files changed, 232 insertions(+), 3 deletions(-) diff --git a/be/src/apache-orc b/be/src/apache-orc index f3349e01f34..8e3303a8c6d 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit f3349e01f343ee84241427c5b46e8ecc3bed6d5b +Subproject commit 8e3303a8c6d6eee03b09e05ab91b844fdc873764 diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 9a467a4064a..58b287e0b9b 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1117,13 +1117,15 @@ Status OrcReader::set_fill_columns( } } - if (!_has_complex_type && _enable_lazy_mat && !_lazy_read_ctx.predicate_columns.first.empty() && + if (_enable_lazy_mat && !_lazy_read_ctx.predicate_columns.first.empty() && !_lazy_read_ctx.lazy_read_columns.empty()) { _lazy_read_ctx.can_lazy_read = true; } - if (_lazy_read_ctx.conjuncts.empty() || !_init_search_argument(_lazy_read_ctx.conjuncts)) { + if (_lazy_read_ctx.conjuncts.empty()) { _lazy_read_ctx.can_lazy_read = false; + } else { + _init_search_argument(_lazy_read_ctx.conjuncts); } try { _row_reader_options.range(_range_start_offset, _range_size); diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql new file mode 100644 index 00000000000..a1a35827909 --- /dev/null +++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/create_table.hql
@@ -0,0 +1,32 @@
+CREATE DATABASE IF NOT EXISTS multi_catalog;
+USE multi_catalog;
+-- ORC table with array, map and struct columns and nestings of them; IF NOT EXISTS keeps a rerun from failing here.
+CREATE TABLE IF NOT EXISTS `nested_types1_orc` (
+  `id` INT,
+  `array_col` ARRAY<INT>,
+  `nested_array_col` ARRAY<ARRAY<INT>>,
+  `map_col` MAP<STRING, INT>,
+  `nested_map_col` MAP<STRING, ARRAY<INT>>,
+  `struct_col` STRUCT<`name`: STRING, `age`: INT>,
+  `array_struct_col` ARRAY<STRUCT<`name`: STRING, `age`: INT>>,
+  `map_struct_col` MAP<STRING, STRUCT<`name`: STRING, `age`: INT>>,
+  `complex_struct_col` STRUCT<
+    `a`: ARRAY<INT>,
+    `b`: MAP<STRING, ARRAY<INT>>,
+    `c`: STRUCT<
+      `x`: ARRAY<INT>,
+      `y`: STRING
+    >
+  >
+)
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+LOCATION
+  '/user/doris/suites/multi_catalog/nested_types1_orc';
+
+MSCK REPAIR TABLE nested_types1_orc;
+
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz
new file mode 100644
index 00000000000..d7be7822674
Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/data.tar.gz differ
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh
new file mode 100644
index 00000000000..f3136eaa200
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_nested_types/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -x
+
+CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+
+# Unpack data.tar.gz next to this script and upload the extracted files to HDFS.
+cd "${CUR_DIR}" && rm -rf data/ && tar xzf data.tar.gz
+hadoop fs -mkdir -p /user/doris/suites/multi_catalog/
+hadoop fs -put "${CUR_DIR}"/data/* /user/doris/suites/multi_catalog/
+
+# Run the DDL script that sits beside this file.
+hive -f "${CUR_DIR}/create_table.hql"
diff --git a/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy b/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy
new file mode 100644
index 00000000000..63da0c2582f
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_orc_nested_types.groovy
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_orc_nested_types", "p0,external,hive,external_docker,external_docker_hive") {
+    // Runs nested-type queries through hive2 and hive3 HMS catalogs; skipped unless enableHiveTest is "true".
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+        String catalog_name = "${hivePrefix}_test_orc_nested_types"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        logger.info("catalog " + catalog_name + " created")
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+        sql """ use multi_catalog """
+        // NOTE(review): the create_table.hql in this commit defines `nested_types1_orc`, which has no `description` or
+        // `map_array_col` column; confirm `nested_types_example` is created elsewhere or retarget the queries below.
+        // Simple lookups by id.
+        order_qt_nested_types_q1 """select array_col from nested_types_example where id = 1"""
+        order_qt_nested_types_q2 """select array_col from nested_types_example where id = 2"""
+        order_qt_nested_types_q3 """select array_col from nested_types_example where id = 3"""
+
+        // Array size.
+        order_qt_nested_types_q4 """
+            SELECT id, size(array_col) as arr_size
+            FROM nested_types_example
+            ORDER BY id
+        """
+        order_qt_nested_types_q5 """
+            SELECT id, array_col
+            FROM nested_types_example
+            WHERE size(array_col) > 2
+            ORDER BY id
+        """
+
+        // Element access and membership. Doris array subscripts are 1-based, so [1] is the first element.
+        order_qt_nested_types_q6 """
+            SELECT id, array_col[1] as first_elem, array_col[3] as third_elem
+            FROM nested_types_example
+            ORDER BY id
+        """
+        order_qt_nested_types_q7 """
+            SELECT id, array_col
+            FROM nested_types_example
+            WHERE array_contains(array_col, 1)
+            ORDER BY id
+        """
+
+        // Compound predicates.
+        order_qt_nested_types_q8 """
+            SELECT id, array_col, description
+            FROM nested_types_example
+            WHERE id > 1 AND size(array_col) < 3
+            ORDER BY id
+        """
+        order_qt_nested_types_q9 """
+            SELECT id, array_col, description
+            FROM nested_types_example
+            WHERE description LIKE '%Hello%'
+            ORDER BY id
+        """
+
+        // Array aggregates.
+        order_qt_nested_types_q10 """
+            SELECT
+                id,
+                array_min(array_col) as min_val,
+                array_max(array_col) as max_val
+            FROM nested_types_example
+            ORDER BY id
+        """
+
+        // Nested arrays (the inner-array subscript is 1-based as well).
+        order_qt_nested_types_q11 """
+            SELECT
+                id,
+                nested_array_col,
+                size(nested_array_col) as outer_size
+            FROM nested_types_example
+            WHERE id = 1
+        """
+        order_qt_nested_types_q12 """
+            SELECT
+                id,
+                nested_array_col,
+                size(nested_array_col[1]) as inner_size
+            FROM nested_types_example
+            WHERE id = 2
+        """
+
+        // Arrays of structs.
+        order_qt_nested_types_q13 """
+            SELECT
+                id,
+                array_struct_col,
+                size(array_struct_col) as struct_arr_size
+            FROM nested_types_example
+            WHERE description LIKE '%large%'
+        """
+        order_qt_nested_types_q14 """
+            SELECT
+                id,
+                item.name as name,
+                item.age as age
+            FROM nested_types_example
+            LATERAL VIEW EXPLODE(array_struct_col) tmp AS item
+            WHERE id = 1 AND item.age > 30
+        """
+
+        // Map column.
+        order_qt_nested_types_q15 """
+            SELECT
+                id,
+                map_array_col,
+                size(map_array_col) as map_size
+            FROM nested_types_example
+            WHERE id = 2
+        """
+        order_qt_nested_types_q16 """
+            SELECT
+                id,
+                map_array_col['a'] as a_value
+            FROM nested_types_example
+            WHERE id = 1
+        """
+
+        // Fields of the complex struct column.
+        order_qt_nested_types_q17 """
+            SELECT
+                id,
+                complex_struct_col.a as array_a
+            FROM nested_types_example
+            WHERE id = 1
+        """
+        order_qt_nested_types_q18 """
+            SELECT
+                id,
+                complex_struct_col.b as map_b
+            FROM nested_types_example
+            WHERE id = 2
+        """
+        order_qt_nested_types_q19 """
+            SELECT
+                id,
+                complex_struct_col.c as struct_c
+            FROM nested_types_example
+            WHERE id = 3
+        """
+
+        // Every column of every row.
+        order_qt_nested_types_q20 """
+            SELECT *
+            FROM nested_types_example
+            ORDER BY id
+        """
+
+        sql """drop catalog ${catalog_name};"""
+    }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional
commands, e-mail: commits-h...@doris.apache.org