This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 16c5374af44 [fix](orc) Should not pass selection vector when decode child column of List or Map (#50136) 16c5374af44 is described below commit 16c5374af442e90900f5605d7ecc69181f4c84f1 Author: Socrates <suyit...@selectdb.com> AuthorDate: Sat Apr 19 06:06:44 2025 +0800 [fix](orc) Should not pass selection vector when decode child column of List or Map (#50136) ### What problem does this PR solve? Related PR: #18615 Problem Summary: This problem is similar to the one described in doris-thirdparty PR #256 (https://github.com/apache/doris-thirdparty/pull/256). When performing late materialization for LIST or MAP types, filters should not be applied directly to their child fields. These complex types rely on offsets to correctly map parent-child relationships within the columnar storage layout (e.g., in ORC or Parquet files). If filters are applied to the children of a LIST or MAP field, the offsets can become misaligned with the child data, leading to incorrect data being read—such as mismatched elements, missing values, or even runtime errors. This breaks the structural integrity of the nested data and can produce incorrect query results. 
```text mysql> select * from complex_data_orc; +------+--------------------------+-----------------+ | id | m | l | +------+--------------------------+-----------------+ | 1 | {"a":1, "b":2} | ["a", "b"] | | 2 | {"b":3, "c":4} | ["b"] | | 3 | {"c":5, "a":6, "b":7} | ["c", "a"] | | 4 | {"a":8, "c":9} | ["b", "c"] | | 5 | {"b":10, "a":11} | ["a"] | | 6 | {"c":12, "b":13} | ["c"] | | 7 | {"a":15} | ["a", "a"] | | 8 | {"b":17} | ["b", "b"] | | 9 | {"c":19} | ["c", "c"] | | 10 | {"a":20, "b":21, "c":22} | ["a", "b", "c"] | +------+--------------------------+-----------------+ 10 rows in set (0.02 sec) !!!WRONG RESULT: mysql> select * from complex_data_orc where id > 2; +------+--------------------------+----------------+ | id | m | l | +------+--------------------------+----------------+ | 3 | {"c":5, "a":6, "b":7} | ["c", "a"] | | 4 | {"a":8, "c":9} | ["b", "c"] | | 5 | {"b":10, "":11} | ["a"] | | 6 | {"":12, "":13} | ["c"] | | 7 | {"":15} | ["a", ""] | | 8 | {"":17} | ["", ""] | | 9 | {"":19} | ["", ""] | | 10 | {"a":20, "b":21, "c":22} | ["", "b", "c"] | +------+--------------------------+----------------+ 8 rows in set (0.02 sec) ``` To ensure correctness, filters should only be applied at the top level of the LIST or MAP, and their children should be read in full when late materialization occurs. 
After this pr: ```text mysql> select * from complex_data_orc where id > 2; +------+--------------------------+-----------------+ | id | m | l | +------+--------------------------+-----------------+ | 3 | {"c":5, "a":6, "b":7} | ["c", "a"] | | 4 | {"a":8, "c":9} | ["b", "c"] | | 5 | {"b":10, "a":11} | ["a"] | | 6 | {"c":12, "b":13} | ["c"] | | 7 | {"a":15} | ["a", "a"] | | 8 | {"b":17} | ["b", "b"] | | 9 | {"c":19} | ["c", "c"] | | 10 | {"a":20, "b":21, "c":22} | ["a", "b", "c"] | +------+--------------------------+-----------------+ 8 rows in set (1.41 sec) ``` --- be/src/vec/exec/format/orc/vorc_reader.cpp | 14 +++++++------- .../create_hive_orc_tables.hql | 11 +++++++++++ .../preinstalled_data/orc/complex_data_orc/000000_0 | Bin 0 -> 561 bytes .../external_table_p0/hive/test_hive_orc_predicate.out | Bin 585 -> 2463 bytes .../hive/test_hive_orc_predicate.groovy | 5 +++++ 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 709ea0bf327..364ad5ea8d2 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1724,7 +1724,7 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, ->get_nested_type()); const orc::Type* nested_orc_type = orc_column_type->getSubtype(0); std::string element_name = col_name + ".element"; - return _orc_column_to_doris_column<is_filter>( + return _orc_column_to_doris_column<false>( element_name, static_cast<ColumnArray&>(*data_column).get_data_ptr(), nested_type, nested_orc_type, orc_list->elements.get(), element_size); } @@ -1750,12 +1750,12 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, ColumnPtr& doris_value_column = doris_map.get_values_ptr(); std::string key_col_name = col_name + ".key"; std::string value_col_name = col_name + ".value"; - RETURN_IF_ERROR(_orc_column_to_doris_column<is_filter>(key_col_name, doris_key_column, - doris_key_type, 
orc_key_type, - orc_map->keys.get(), element_size)); - return _orc_column_to_doris_column<is_filter>(value_col_name, doris_value_column, - doris_value_type, orc_value_type, - orc_map->elements.get(), element_size); + RETURN_IF_ERROR(_orc_column_to_doris_column<false>(key_col_name, doris_key_column, + doris_key_type, orc_key_type, + orc_map->keys.get(), element_size)); + return _orc_column_to_doris_column<false>(value_col_name, doris_value_column, + doris_value_type, orc_value_type, + orc_map->elements.get(), element_size); } case TypeIndex::Struct: { if (orc_column_type->getKind() != orc::TypeKind::STRUCT) { diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql new file mode 100644 index 00000000000..d33061471ea --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql @@ -0,0 +1,11 @@ +create database if not exists multi_catalog; +use multi_catalog; + +CREATE TABLE complex_data_orc ( + id INT, + m MAP<STRING, INT>, + l ARRAY<STRING> +) +STORED AS ORC +LOCATION + '/user/doris/preinstalled_data/orc/complex_data_orc'; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0 new file mode 100644 index 00000000000..5e91793abf4 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0 differ diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out index d943f3f80c6..8060ddd620c 100644 Binary files a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out and 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out differ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy index cfb96441fb2..f898f2862dc 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy @@ -45,6 +45,11 @@ suite("test_hive_orc_predicate", "p0,external,hive,external_docker,external_dock qt_predicate_null_aware_equal_in_rt """select * from table_a inner join table_b on table_a.age <=> table_b.age and table_b.id in (1,3) order by table_a.id;""" + qt_lazy_materialization_for_list_type """ select l from complex_data_orc where id > 2 order by id; """ + qt_lazy_materialization_for_map_type """ select m from complex_data_orc where id > 2 order by id; """ + qt_lazy_materialization_for_list_and_map_type """ select * from complex_data_orc where id > 2 order by id; """ + qt_lazy_materialization_for_list_type2 """select t_struct_nested from `${catalog_name}`.`default`.orc_all_types_t where t_int=3;""" + sql """drop catalog if exists ${catalog_name}""" } finally { } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org