This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1-lakehouse in repository https://gitbox.apache.org/repos/asf/doris.git
commit da212850cc9824ad2e6186b294d0cb2e063f02f3 Author: morningman <morning...@163.com> AuthorDate: Sat Feb 15 11:20:02 2025 +0800 Revert "branch-2.1: [Fix](ORC) Not push down fixed char type in orc reader #45484 (#45525)" This reverts commit 7d32e4f71ff5ea6700af11223bd7970c572cb6d6. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 30 ++++++++----- be/src/vec/exec/format/orc/vorc_reader.h | 4 +- be/src/vec/exec/scan/vfile_scanner.cpp | 10 ++++- .../orc_predicate/orc_predicate_table.hql | 16 ------- .../data/multi_catalog/orc_predicate/run.sh | 9 ---- .../hive/test_hive_orc_predicate.out | Bin 463 -> 0 bytes .../hive/test_hive_orc_predicate.groovy | 50 --------------------- 7 files changed, 32 insertions(+), 87 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 996effd554e..b175ce7ace1 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -143,7 +143,7 @@ void ORCFileInputStream::read(void* buf, uint64_t length, uint64_t offset) { OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, - bool enable_lazy_mat) + bool enable_lazy_mat, std::vector<orc::TypeKind>* unsupported_pushdown_types) : _profile(profile), _state(state), _scan_params(params), @@ -156,7 +156,8 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, _enable_lazy_mat(enable_lazy_mat), _enable_filter_by_min_max( state == nullptr ? true : state->query_options().enable_orc_filter_by_min_max), - _dict_cols_has_converted(false) { + _dict_cols_has_converted(false), + _unsupported_pushdown_types(unsupported_pushdown_types) { TimezoneUtils::find_cctz_time_zone(ctz, _time_zone); VecDateTimeValue t; t.from_unixtime(0, ctz); @@ -452,8 +453,7 @@ static std::unordered_map<orc::TypeKind, orc::PredicateDataType> TYPEKIND_TO_PRE {orc::TypeKind::DOUBLE, orc::PredicateDataType::FLOAT}, {orc::TypeKind::STRING, orc::PredicateDataType::STRING}, {orc::TypeKind::BINARY, orc::PredicateDataType::STRING}, - // should not pust down CHAR type, because CHAR type is fixed length and will be padded - // {orc::TypeKind::CHAR, orc::PredicateDataType::STRING}, + {orc::TypeKind::CHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::VARCHAR, orc::PredicateDataType::STRING}, {orc::TypeKind::DATE, orc::PredicateDataType::DATE}, {orc::TypeKind::DECIMAL, orc::PredicateDataType::DECIMAL}, @@ -483,9 +483,8 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type, con [[fallthrough]]; case orc::TypeKind::BINARY: [[fallthrough]]; - // should not pust down CHAR type, because CHAR type is fixed length and will be padded - // case orc::TypeKind::CHAR: - // [[fallthrough]]; + case orc::TypeKind::CHAR: + [[fallthrough]]; case orc::TypeKind::VARCHAR: { StringRef* string_value = (StringRef*)value; return std::make_tuple(true, orc::Literal(string_value->data, string_value->size)); @@ -561,7 +560,8 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type, con template <PrimitiveType primitive_type> std::vector<OrcPredicate> value_range_to_predicate( - const ColumnValueRange<primitive_type>& col_val_range, const orc::Type* type) { + const ColumnValueRange<primitive_type>& col_val_range, const orc::Type* type, + std::vector<orc::TypeKind>* unsupported_pushdown_types) { std::vector<OrcPredicate> predicates; PrimitiveType src_type = OrcReader::convert_to_doris_type(type).type; @@ -572,6 +572,16 @@ std::vector<OrcPredicate> value_range_to_predicate( } } + if (unsupported_pushdown_types != nullptr) { + for (vector<orc::TypeKind>::iterator it = unsupported_pushdown_types->begin(); + it != unsupported_pushdown_types->end(); ++it) { + if (*it == type->getKind()) { + // Unsupported type + return predicates; + } + } + } + orc::PredicateDataType predicate_data_type; auto type_it = TYPEKIND_TO_PREDICATE_TYPE.find(type->getKind()); if (type_it == TYPEKIND_TO_PREDICATE_TYPE.end()) { @@ -713,8 +723,8 @@ bool OrcReader::_init_search_argument( } std::visit( [&](auto& range) { - std::vector<OrcPredicate> value_predicates = - value_range_to_predicate(range, type_it->second); + std::vector<OrcPredicate> value_predicates = value_range_to_predicate( + range, type_it->second, _unsupported_pushdown_types); for (auto& range_predicate : value_predicates) { predicates.emplace_back(range_predicate); } diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 95afe21e144..b286b714ad9 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -133,7 +133,8 @@ public: OrcReader(RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, - io::IOContext* io_ctx, bool enable_lazy_mat = true); + io::IOContext* io_ctx, bool enable_lazy_mat = true, + std::vector<orc::TypeKind>* unsupported_pushdown_types = nullptr); OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true); @@ -618,6 +619,7 @@ private: std::unique_ptr<StringDictFilterImpl> _string_dict_filter; bool _dict_cols_has_converted = false; bool _has_complex_type = false; + std::vector<orc::TypeKind>* _unsupported_pushdown_types; // resolve schema change std::unordered_map<std::string, std::unique_ptr<converter::ColumnTypeConverter>> _converters; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index f4376cc1e70..7c65faac81c 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -893,9 +893,17 @@ Status VFileScanner::_get_next_reader() { break; } case TFileFormatType::FORMAT_ORC: { + std::vector<orc::TypeKind>* unsupported_pushdown_types = nullptr; + if (range.__isset.table_format_params && + range.table_format_params.table_format_type == "paimon") { + static std::vector<orc::TypeKind> paimon_unsupport_type = + std::vector<orc::TypeKind> {orc::TypeKind::CHAR}; + unsupported_pushdown_types = &paimon_unsupport_type; + } std::unique_ptr<OrcReader> orc_reader = OrcReader::create_unique( _profile, _state, *_params, range, _state->query_options().batch_size, - _state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat); + _state->timezone(), _io_ctx.get(), _state->query_options().enable_orc_lazy_mat, + unsupported_pushdown_types); orc_reader->set_push_down_agg_type(_get_push_down_agg_type()); if (push_down_predicates) { RETURN_IF_ERROR(_process_late_arrival_conjuncts()); diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql deleted file mode 100644 index a946b25ff1a..00000000000 --- a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql +++ /dev/null @@ -1,16 +0,0 @@ -CREATE DATABASE IF NOT EXISTS multi_catalog; -USE multi_catalog; - -create table fixed_char_table ( - i int, - c char(2) -) stored as orc; - -insert into fixed_char_table values(1,'a'),(2,'b '), (3,'cd'); - -create table type_changed_table ( - id int, - name string -) stored as orc; -insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'); -ALTER TABLE type_changed_table CHANGE COLUMN id id STRING; diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh deleted file mode 100755 index f934ff3009c..00000000000 --- a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -x - -CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" - -# create table -hive -f "${CUR_DIR}"/orc_predicate_table.hql - - diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out deleted file mode 100644 index f42bb629550..00000000000 Binary files a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out and /dev/null differ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy deleted file mode 100644 index d9b6357ca0a..00000000000 --- a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("test_hive_orc_predicate", "p0,external,hive,external_docker,external_docker_hive") { - - String enabled = context.config.otherConfigs.get("enableHiveTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("disable Hive test.") - return; - } - - for (String hivePrefix : ["hive2", "hive3"]) { - try { - String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") - String catalog_name = "${hivePrefix}_test_predicate" - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - - sql """drop catalog if exists ${catalog_name}""" - sql """create catalog if not exists ${catalog_name} properties ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' - );""" - sql """use `${catalog_name}`.`multi_catalog`""" - - qt_predicate_fixed_char1 """ select * from fixed_char_table where c = 'a';""" - qt_predicate_fixed_char2 """ select * from fixed_char_table where c = 'a ';""" - - qt_predicate_changed_type1 """ select * from type_changed_table where id = '1';""" - qt_predicate_changed_type2 """ select * from type_changed_table where id = '2';""" - qt_predicate_changed_type3 """ select * from type_changed_table where id = '3';""" - - sql """drop catalog if exists ${catalog_name}""" - } finally { - } - } -} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org