This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 16c5374af44 [fix](orc) Should not pass selection vector when decode 
child column of List or Map (#50136)
16c5374af44 is described below

commit 16c5374af442e90900f5605d7ecc69181f4c84f1
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Sat Apr 19 06:06:44 2025 +0800

    [fix](orc) Should not pass selection vector when decode child column of 
List or Map (#50136)
    
    ### What problem does this PR solve?
    Related PR: #18615
    
    Problem Summary:
    This issue is similar to the one addressed in
    https://github.com/apache/doris-thirdparty/pull/256
    When performing late materialization for LIST or MAP types, filters
    should not be applied directly to their child fields. These complex
    types rely on offsets to correctly map parent-child relationships within
    the columnar storage layout (e.g., in ORC or Parquet files).
    
    If filters are applied to the children of a LIST or MAP field, it may
    cause inconsistencies in the offset alignment, leading to incorrect data
    being read—such as mismatched elements, missing values, or even runtime
    errors. This breaks the structural integrity of the nested data and can
    produce incorrect query results.
    
    ```text
    mysql> select * from complex_data_orc;
    +------+--------------------------+-----------------+
    | id   | m                        | l               |
    +------+--------------------------+-----------------+
    |    1 | {"a":1, "b":2}           | ["a", "b"]      |
    |    2 | {"b":3, "c":4}           | ["b"]           |
    |    3 | {"c":5, "a":6, "b":7}    | ["c", "a"]      |
    |    4 | {"a":8, "c":9}           | ["b", "c"]      |
    |    5 | {"b":10, "a":11}         | ["a"]           |
    |    6 | {"c":12, "b":13}         | ["c"]           |
    |    7 | {"a":15}                 | ["a", "a"]      |
    |    8 | {"b":17}                 | ["b", "b"]      |
    |    9 | {"c":19}                 | ["c", "c"]      |
    |   10 | {"a":20, "b":21, "c":22} | ["a", "b", "c"] |
    +------+--------------------------+-----------------+
    10 rows in set (0.02 sec)
    
    !!!WRONG RESULT:
    mysql> select * from complex_data_orc where id > 2;
    +------+--------------------------+----------------+
    | id   | m                        | l              |
    +------+--------------------------+----------------+
    |    3 | {"c":5, "a":6, "b":7}    | ["c", "a"]     |
    |    4 | {"a":8, "c":9}           | ["b", "c"]     |
    |    5 | {"b":10, "":11}          | ["a"]          |
    |    6 | {"":12, "":13}           | ["c"]          |
    |    7 | {"":15}                  | ["a", ""]      |
    |    8 | {"":17}                  | ["", ""]       |
    |    9 | {"":19}                  | ["", ""]       |
    |   10 | {"a":20, "b":21, "c":22} | ["", "b", "c"] |
    +------+--------------------------+----------------+
    8 rows in set (0.02 sec)
    ```
    
    To ensure correctness, filters should only be applied at the top level
    of the LIST or MAP, and their children should be read in full when late
    materialization occurs.
    
    After this PR:
    ```text
    mysql> select * from complex_data_orc where id > 2;
    +------+--------------------------+-----------------+
    | id   | m                        | l               |
    +------+--------------------------+-----------------+
    |    3 | {"c":5, "a":6, "b":7}    | ["c", "a"]      |
    |    4 | {"a":8, "c":9}           | ["b", "c"]      |
    |    5 | {"b":10, "a":11}         | ["a"]           |
    |    6 | {"c":12, "b":13}         | ["c"]           |
    |    7 | {"a":15}                 | ["a", "a"]      |
    |    8 | {"b":17}                 | ["b", "b"]      |
    |    9 | {"c":19}                 | ["c", "c"]      |
    |   10 | {"a":20, "b":21, "c":22} | ["a", "b", "c"] |
    +------+--------------------------+-----------------+
    8 rows in set (1.41 sec)
    ```
---
 be/src/vec/exec/format/orc/vorc_reader.cpp             |  14 +++++++-------
 .../create_hive_orc_tables.hql                         |  11 +++++++++++
 .../preinstalled_data/orc/complex_data_orc/000000_0    | Bin 0 -> 561 bytes
 .../external_table_p0/hive/test_hive_orc_predicate.out | Bin 585 -> 2463 bytes
 .../hive/test_hive_orc_predicate.groovy                |   5 +++++
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 709ea0bf327..364ad5ea8d2 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1724,7 +1724,7 @@ Status OrcReader::_fill_doris_data_column(const 
std::string& col_name,
                         ->get_nested_type());
         const orc::Type* nested_orc_type = orc_column_type->getSubtype(0);
         std::string element_name = col_name + ".element";
-        return _orc_column_to_doris_column<is_filter>(
+        return _orc_column_to_doris_column<false>(
                 element_name, 
static_cast<ColumnArray&>(*data_column).get_data_ptr(), nested_type,
                 nested_orc_type, orc_list->elements.get(), element_size);
     }
@@ -1750,12 +1750,12 @@ Status OrcReader::_fill_doris_data_column(const 
std::string& col_name,
         ColumnPtr& doris_value_column = doris_map.get_values_ptr();
         std::string key_col_name = col_name + ".key";
         std::string value_col_name = col_name + ".value";
-        RETURN_IF_ERROR(_orc_column_to_doris_column<is_filter>(key_col_name, 
doris_key_column,
-                                                               doris_key_type, 
orc_key_type,
-                                                               
orc_map->keys.get(), element_size));
-        return _orc_column_to_doris_column<is_filter>(value_col_name, 
doris_value_column,
-                                                      doris_value_type, 
orc_value_type,
-                                                      orc_map->elements.get(), 
element_size);
+        RETURN_IF_ERROR(_orc_column_to_doris_column<false>(key_col_name, 
doris_key_column,
+                                                           doris_key_type, 
orc_key_type,
+                                                           
orc_map->keys.get(), element_size));
+        return _orc_column_to_doris_column<false>(value_col_name, 
doris_value_column,
+                                                  doris_value_type, 
orc_value_type,
+                                                  orc_map->elements.get(), 
element_size);
     }
     case TypeIndex::Struct: {
         if (orc_column_type->getKind() != orc::TypeKind::STRUCT) {
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql
new file mode 100644
index 00000000000..d33061471ea
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/create_hive_orc_tables.hql
@@ -0,0 +1,11 @@
+create database if not exists multi_catalog;
+use multi_catalog;
+
+CREATE TABLE complex_data_orc (
+  id INT,
+  m MAP<STRING, INT>,
+  l ARRAY<STRING>
+)
+STORED AS ORC
+LOCATION
+  '/user/doris/preinstalled_data/orc/complex_data_orc';
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0
new file mode 100644
index 00000000000..5e91793abf4
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc/complex_data_orc/000000_0
 differ
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out
index d943f3f80c6..8060ddd620c 100644
Binary files 
a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out and 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
index cfb96441fb2..f898f2862dc 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
@@ -45,6 +45,11 @@ suite("test_hive_orc_predicate", 
"p0,external,hive,external_docker,external_dock
 
             qt_predicate_null_aware_equal_in_rt """select * from table_a inner 
join table_b on table_a.age <=> table_b.age and table_b.id in (1,3) order by 
table_a.id;"""
 
+            qt_lazy_materialization_for_list_type """ select l from 
complex_data_orc where id > 2 order by id; """
+            qt_lazy_materialization_for_map_type """ select m from 
complex_data_orc where id > 2 order by id; """
+            qt_lazy_materialization_for_list_and_map_type """ select * from 
complex_data_orc where id > 2 order by id; """
+            qt_lazy_materialization_for_list_type2 """select t_struct_nested 
from `${catalog_name}`.`default`.orc_all_types_t where t_int=3;"""
+
             sql """drop catalog if exists ${catalog_name}"""
         } finally {
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to