This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 3e707e9dc4a [fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
3e707e9dc4a is described below

commit 3e707e9dc4a79ff8b1b8b2fe41d095bddf67b67c
Author: wuwenchi <wuwenchi...@hotmail.com>
AuthorDate: Wed Jan 24 12:03:57 2024 +0800

    [fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
---
 be/src/vec/exec/format/parquet/decoder.cpp         | 24 ++++++++++
 be/src/vec/exec/format/parquet/decoder.h           |  2 +
 .../format/parquet/fix_length_dict_decoder.hpp     |  2 +
 .../format/parquet/fix_length_plain_decoder.cpp    |  2 +
 .../paimon/paimon_timestamp_types.out              | 13 +++++
 .../paimon/paimon_timestamp_types.groovy           | 55 ++++++++++++++++++++++
 6 files changed, 98 insertions(+)

diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp
index 0a158176091..aa16eaf4b6c 100644
--- a/be/src/vec/exec/format/parquet/decoder.cpp
+++ b/be/src/vec/exec/format/parquet/decoder.cpp
@@ -184,4 +184,28 @@ void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) {
         _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
     }
 }
+
+/**
+ * Some frameworks, like Paimon, may write non-standard parquet files: the timestamp field has
+ * no logicalType or converted_type to indicate its precision, so we have to reset the time mask.
+ */
+void Decoder::reset_time_scale_if_missing(int scale) {
+    const auto& schema = _field_schema->parquet_schema;
+    if (!schema.__isset.logicalType && !schema.__isset.converted_type) {
+        int ts_scale = 9;
+        if (scale <= 3) {
+            ts_scale = 3;
+        } else if (scale <= 6) {
+            ts_scale = 6;
+        }
+        _decode_params->second_mask = common::exp10_i64(ts_scale);
+        _decode_params->scale_to_nano_factor = common::exp10_i64(9 - ts_scale);
+
+        // The missing parquet metadata makes it impossible for us to know the time zone
+        // information, so we default to UTC here.
+        if (_decode_params->ctz == nullptr) {
+            _decode_params->ctz = const_cast<cctz::time_zone*>(&_decode_params->utc0);
+        }
+    }
+}
 } // namespace doris::vectorized
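The heart of the fix is the bucketing in reset_time_scale_if_missing: a declared scale of 1-3 is widened to milliseconds, 4-6 to microseconds, and anything else to nanoseconds, and the two derived factors then split a raw INT64 timestamp into whole seconds and sub-second nanoseconds. Below is a minimal standalone sketch of that arithmetic, not the decoder's actual code path: exp10_i64 is modeled locally, and the seconds/nanos split is an assumption read off the field names second_mask and scale_to_nano_factor.

    #include <cstdint>
    #include <cstdio>

    // Local stand-in for common::exp10_i64 (assumed to return 10^n for 0 <= n <= 9).
    static int64_t exp10_i64(int n) {
        int64_t v = 1;
        while (n-- > 0) v *= 10;
        return v;
    }

    int main() {
        // Example input: a timestamp with declared scale 6 (microseconds),
        // 2024-01-24 04:03:57.123456 UTC, stored as a raw INT64.
        int64_t raw = 1706069037123456LL;
        int scale = 6;

        // Same bucketing as the patch: 1-3 -> 3, 4-6 -> 6, otherwise 9.
        int ts_scale = scale <= 3 ? 3 : (scale <= 6 ? 6 : 9);

        int64_t second_mask = exp10_i64(ts_scale);              // 1000000
        int64_t scale_to_nano_factor = exp10_i64(9 - ts_scale); // 1000

        int64_t secs = raw / second_mask;                           // 1706069037
        int64_t nanos = (raw % second_mask) * scale_to_nano_factor; // 123456000
        printf("secs=%lld nanos=%lld\n", (long long)secs, (long long)nanos);
        return 0;
    }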
diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h
index acd9965bad8..f3da9429dda 100644
--- a/be/src/vec/exec/format/parquet/decoder.h
+++ b/be/src/vec/exec/format/parquet/decoder.h
@@ -99,6 +99,8 @@ public:
     template <typename DecimalPrimitiveType>
     void init_decimal_converter(DataTypePtr& data_type);
 
+    void reset_time_scale_if_missing(int scale);
+
     // Write the decoded values batch to doris's column
     virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                  ColumnSelectVector& select_vector, bool is_dict_filter) = 0;
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index a30c2dff3d1..3ca6193a4a1 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -107,9 +107,11 @@ public:
         // Spark can set the timestamp precision by the following configuration:
         // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
         if constexpr (std::is_same_v<T, ParquetInt96>) {
+            reset_time_scale_if_missing(9);
             return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                     doris_column, select_vector);
         } else if constexpr (std::is_same_v<T, Int64>) {
+            reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
             return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                     doris_column, select_vector);
         }
diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
index af464c15545..c496cc175c9 100644
--- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
@@ -118,9 +118,11 @@ Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
     // Spark can set the timestamp precision by the following configuration:
     // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
     if (_physical_type == tparquet::Type::INT96) {
+        reset_time_scale_if_missing(9);
        return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                 doris_column, select_vector);
     } else if (_physical_type == tparquet::Type::INT64) {
+        reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
         return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                 doris_column, select_vector);
     }
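Both call sites pass the best scale hint available. INT96 is the legacy Impala/Hive timestamp layout and always carries nanoseconds, so those branches pass a fixed 9; an INT64 column implies no particular precision, so those branches fall back to the scale of the Doris column being filled. A hypothetical condensed view of that choice (names and types invented for illustration):

    // Hypothetical sketch of the scale hint chosen at the two call sites above.
    enum class PhysicalType { INT96, INT64 };

    int scale_hint(PhysicalType type, int doris_column_scale) {
        // INT96 timestamps are nanoseconds by definition, so the hint is fixed;
        // for INT64 the column's declared scale is the only clue left.
        return type == PhysicalType::INT96 ? 9 : doris_column_scale;
    }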
diff --git a/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
new file mode 100644
index 00000000000..641424b160e
--- /dev/null
+++ b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !c1 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c2 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c3 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
+-- !c4 --
+1 5432-08-30T05:43:21.100 5432-08-30T05:43:21.120 5432-08-30T05:43:21.123 5432-08-30T05:43:21.123400 5432-08-30T05:43:21.123450 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456 5432-08-30T05:43:21.123456
+
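The expected row makes the bucketing visible: the table has nine timestamp columns with declared scales 1 through 9, and the rendered fractional digits come out as three for scales 1-3 and six for everything above. Scales 7-9 also show six digits, presumably because Doris's DATETIMEV2 type tops out at microsecond precision; that cap is an inference, not something this patch states. A tiny sketch of the mapping under that assumption:

    #include <cstdio>

    int main() {
        // Fractional digits seen in the expected row for each declared scale
        // (assumption: display precision is capped at 6 by DATETIMEV2).
        for (int scale = 1; scale <= 9; ++scale) {
            int digits = scale <= 3 ? 3 : 6;
            printf("ts(%d) -> %d fractional digits\n", scale, digits);
        }
        return 0;
    }

The four identical result blocks (!c1 through !c4) line up with the four qt_ checks in the suite below: the same ORC and Parquet tables, read once with the JNI scanner forced on and once with it off.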
+ +suite("paimon_timestamp_types", "p2,external,paimon,external_remote,external_remote_paimon") { + + def ts_orc = """select * from ts_orc""" + def ts_parquet = """select * from ts_parquet""" + + String enabled = context.config.otherConfigs.get("enableExternalPaimonTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String catalog_name = "paimon_timestamp_catalog" + String user_name = context.config.otherConfigs.get("extHiveHmsUser") + String hiveHost = context.config.otherConfigs.get("extHiveHmsHost") + String hivePort = context.config.otherConfigs.get("extHdfsPort") + + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + "type" = "paimon", + "paimon.catalog.type" = "filesystem", + "warehouse" = "hdfs://${hiveHost}/${hivePort}/paimon/paimon1", + "hadoop.username" = "${user_name}" + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """use db1;""" + logger.info("use db1") + + sql """set force_jni_scanner=true""" + qt_c1 ts_orc + qt_c2 ts_parquet + + sql """set force_jni_scanner=false""" + qt_c3 ts_orc + qt_c4 ts_parquet + + } +} + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org