This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1-lakehouse
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c8f733dbe83e4d856e42438819f8b9779698ed26
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Mon Dec 23 10:59:54 2024 +0800

    [Fix](ORC) Not push down fixed char type in orc reader  (#45484)
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  24 ++++++----
 be/src/vec/exec/format/orc/vorc_reader.h           |   4 +-
 be/src/vec/exec/scan/vfile_scanner.cpp             |  10 +----
 .../orc_predicate/orc_predicate_table.hql          |  16 +++++++
 .../data/multi_catalog/orc_predicate/run.sh        |   9 ++++
 .../hive/test_hive_orc_predicate.out               | Bin 0 -> 463 bytes
 .../hive/test_hive_orc_predicate.groovy            |  50 +++++++++++++++++++++
 7 files changed, 93 insertions(+), 20 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index dd010fe5bd8..765791daef8 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -142,7 +142,7 @@ void ORCFileInputStream::read(void* buf, uint64_t length, 
uint64_t offset) {
 OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state,
                      const TFileScanRangeParams& params, const TFileRangeDesc& 
range,
                      size_t batch_size, const std::string& ctz, io::IOContext* 
io_ctx,
-                     bool enable_lazy_mat, std::vector<orc::TypeKind>* 
unsupported_pushdown_types)
+                     bool enable_lazy_mat)
         : _profile(profile),
           _state(state),
           _scan_params(params),
@@ -155,8 +155,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* 
state,
           _enable_lazy_mat(enable_lazy_mat),
           _enable_filter_by_min_max(
                   state == nullptr ? true : 
state->query_options().enable_orc_filter_by_min_max),
-          _dict_cols_has_converted(false),
-          _unsupported_pushdown_types(unsupported_pushdown_types) {
+          _dict_cols_has_converted(false) {
     TimezoneUtils::find_cctz_time_zone(ctz, _time_zone);
     VecDateTimeValue t;
     t.from_unixtime(0, ctz);
@@ -460,7 +459,8 @@ static std::unordered_map<orc::TypeKind, 
orc::PredicateDataType> TYPEKIND_TO_PRE
         {orc::TypeKind::DOUBLE, orc::PredicateDataType::FLOAT},
         {orc::TypeKind::STRING, orc::PredicateDataType::STRING},
         {orc::TypeKind::BINARY, orc::PredicateDataType::STRING},
-        {orc::TypeKind::CHAR, orc::PredicateDataType::STRING},
+        // should not pust down CHAR type, because CHAR type is fixed length 
and will be padded
+        // {orc::TypeKind::CHAR, orc::PredicateDataType::STRING},
         {orc::TypeKind::VARCHAR, orc::PredicateDataType::STRING},
         {orc::TypeKind::DATE, orc::PredicateDataType::DATE},
         {orc::TypeKind::DECIMAL, orc::PredicateDataType::DECIMAL},
@@ -492,8 +492,9 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const 
orc::Type* type,
             [[fallthrough]];
         case orc::TypeKind::BINARY:
             [[fallthrough]];
-        case orc::TypeKind::CHAR:
-            [[fallthrough]];
+        // should not pust down CHAR type, because CHAR type is fixed length 
and will be padded
+        // case orc::TypeKind::CHAR:
+        //     [[fallthrough]];
         case orc::TypeKind::VARCHAR: {
             return std::make_tuple(true, orc::Literal(literal_data.data, 
literal_data.size));
         }
@@ -589,7 +590,15 @@ std::tuple<bool, orc::Literal, orc::PredicateDataType> 
OrcReader::_make_orc_lite
     auto literal_data = literal->get_column_ptr()->get_data_at(0);
     auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()];
     auto slot_type = slot->type();
-    switch (slot_type.type) {
+    auto primitive_type = slot_type.type;
+    auto src_type = OrcReader::convert_to_doris_type(orc_type).type;
+    // should not down predicate for string type change from other type
+    if (src_type != primitive_type && !is_string_type(src_type) && 
is_string_type(primitive_type)) {
+        LOG(WARNING) << "Unsupported Push Down Schema Changed Column " << 
primitive_type << " to "
+                     << src_type;
+        return std::make_tuple(false, orc::Literal(false), 
orc::PredicateDataType::LONG);
+    }
+    switch (primitive_type) {
 #define M(NAME)                                                                
\
     case TYPE_##NAME: {                                                        
\
         auto [valid, orc_literal] = convert_to_orc_literal<TYPE_##NAME>(       
\
@@ -602,7 +611,6 @@ std::tuple<bool, orc::Literal, orc::PredicateDataType> 
OrcReader::_make_orc_lite
     M(INT)                          \
     M(BIGINT)                       \
     M(LARGEINT)                     \
-    M(CHAR)                         \
     M(DATE)                         \
     M(DATETIME)                     \
     M(DATEV2)                       \
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h 
b/be/src/vec/exec/format/orc/vorc_reader.h
index aa6432d9948..88f4854f056 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -129,8 +129,7 @@ public:
 
     OrcReader(RuntimeProfile* profile, RuntimeState* state, const 
TFileScanRangeParams& params,
               const TFileRangeDesc& range, size_t batch_size, const 
std::string& ctz,
-              io::IOContext* io_ctx, bool enable_lazy_mat = true,
-              std::vector<orc::TypeKind>* unsupported_pushdown_types = 
nullptr);
+              io::IOContext* io_ctx, bool enable_lazy_mat = true);
 
     OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
               const std::string& ctz, io::IOContext* io_ctx, bool 
enable_lazy_mat = true);
@@ -639,7 +638,6 @@ private:
     std::unique_ptr<StringDictFilterImpl> _string_dict_filter;
     bool _dict_cols_has_converted = false;
     bool _has_complex_type = false;
-    std::vector<orc::TypeKind>* _unsupported_pushdown_types;
 
     // resolve schema change
     std::unordered_map<std::string, 
std::unique_ptr<converter::ColumnTypeConverter>> _converters;
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp 
b/be/src/vec/exec/scan/vfile_scanner.cpp
index 7c65faac81c..f4376cc1e70 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -893,17 +893,9 @@ Status VFileScanner::_get_next_reader() {
             break;
         }
         case TFileFormatType::FORMAT_ORC: {
-            std::vector<orc::TypeKind>* unsupported_pushdown_types = nullptr;
-            if (range.__isset.table_format_params &&
-                range.table_format_params.table_format_type == "paimon") {
-                static std::vector<orc::TypeKind> paimon_unsupport_type =
-                        std::vector<orc::TypeKind> {orc::TypeKind::CHAR};
-                unsupported_pushdown_types = &paimon_unsupport_type;
-            }
             std::unique_ptr<OrcReader> orc_reader = OrcReader::create_unique(
                     _profile, _state, *_params, range, 
_state->query_options().batch_size,
-                    _state->timezone(), _io_ctx.get(), 
_state->query_options().enable_orc_lazy_mat,
-                    unsupported_pushdown_types);
+                    _state->timezone(), _io_ctx.get(), 
_state->query_options().enable_orc_lazy_mat);
             orc_reader->set_push_down_agg_type(_get_push_down_agg_type());
             if (push_down_predicates) {
                 RETURN_IF_ERROR(_process_late_arrival_conjuncts());
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
new file mode 100644
index 00000000000..a946b25ff1a
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
@@ -0,0 +1,16 @@
+CREATE DATABASE IF NOT EXISTS multi_catalog;
+USE multi_catalog;
+
+create table fixed_char_table (
+  i int,
+  c char(2)
+) stored as orc;
+
+insert into fixed_char_table values(1,'a'),(2,'b '), (3,'cd');
+
+create table type_changed_table (
+  id int,
+  name string 
+) stored as orc;
+insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie');
+ALTER TABLE type_changed_table CHANGE COLUMN id id STRING;
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh
 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh
new file mode 100755
index 00000000000..f934ff3009c
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -x
+
+CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+
+# create table
+hive -f "${CUR_DIR}"/orc_predicate_table.hql
+
+
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out
new file mode 100644
index 00000000000..f42bb629550
Binary files /dev/null and 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
new file mode 100644
index 00000000000..2dd647aa2c1
--- /dev/null
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_orc_predicate", 
"p0,external,hive,external_docker,external_docker_hive") {
+
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        try {
+            String hms_port = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+            String catalog_name = "${hivePrefix}_test_predicate"
+            String externalEnvIp = 
context.config.otherConfigs.get("externalEnvIp")
+
+            sql """drop catalog if exists ${catalog_name}"""
+            sql """create catalog if not exists ${catalog_name} properties (
+                "type"="hms",
+                'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+            );"""
+            sql """use `${catalog_name}`.`multi_catalog`"""
+
+            qt_predicate_fixed_char1 """ select * from fixed_char_table where 
c = 'a';"""
+            qt_predicate_fixed_char2 """ select * from fixed_char_table where 
c = 'a ';"""
+
+            qt_predicate_changed_type1 """ select * from type_changed_table 
where id = '1';"""
+            qt_predicate_changed_type2 """ select * from type_changed_table 
where id = '2';"""
+            qt_predicate_changed_type3 """ select * from type_changed_table 
where id = '3';"""
+
+            sql """drop catalog if exists ${catalog_name}"""
+        } finally {
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to