morningman commented on code in PR #32873: URL: https://github.com/apache/doris/pull/32873#discussion_r1554820218
########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop Review Comment: ```suggestion // Then, it will be converted to destination column, so this column can be reuse in next loop ``` ########## 
be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop + ColumnPtr _cached_src_column = nullptr; + // The column type generated from file meta(eg. 
parquet footer) + DataTypePtr _cached_src_type = nullptr; + // Error message to show unsupported converter if support() return false; + std::string _error_msg; + +public: + /** + * Get the converter to change column type + * @param src_type colum type from file meta data + * @param dst_type column type from FE planner(the changed column type) + */ + static std::unique_ptr<ColumnTypeConverter> get_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type); + + ColumnTypeConverter() = default; + virtual ~ColumnTypeConverter() = default; + + /** + * Converter source column to destination column. If the converter is not consistent, + * the source column is `_cached_src_column`, otherwise, `src_col` and `dst_col` are the + * same column, and with nothing to do. + */ + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual bool support() { return true; } + + virtual bool is_consistent() { return false; } + + /** + * Get the column to read data from file with the type from file meta data. + * If the converter is not consistent, the returned column is `_cached_src_column`. + * For performance reasons, the null map of `_cached_src_column` is a reference from + * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * + * According to the hive standard, if certain values fail to be converted(eg. string `row1` to in value), + * these values are replaced by nulls. + */ + ColumnPtr get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type); + + /** + * Get the column type from file meta data. 
+ */ + const DataTypePtr& get_type() { return _cached_src_type; } + + std::string get_error_msg() { return _error_msg; }; +}; + +/** + * No type conversion occurred, or compatible type conversion + * + * Compatible type conversion: + * conversion within string, char and varchar + * conversion from decimal(p1, s1) to decimal(p2, s2), because the scale change of decimal type is resolved in decode process + */ +class ConsistentConverter : public ColumnTypeConverter { + bool is_consistent() override { return true; } +}; + +/** + * Unsupported type change, eg. from int to date + */ +class UnsupportedConverter : public ColumnTypeConverter { +public: + UnsupportedConverter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + std::string src_type_str = std::string(getTypeName( + DataTypeFactory::instance().create_data_type(src_type, false)->get_type_id())); + std::string dst_type_str = + std::string(getTypeName(remove_nullable(dst_type)->get_type_id())); + _error_msg = src_type_str + " => " + dst_type_str; + } + + bool support() override { return false; } + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + return Status::InternalError("Unsupported type change: {}", _error_msg); + } +}; + +template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType> +class NumericToNumericConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::CppType; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + 
to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + data[start_idx + i] = static_cast<DstCppType>(src_data[i]); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class NumericToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { + string value = int128_to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } else { + string value = std::to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class DecimalToStringConverter : public ColumnTypeConverter { +private: + int _scale; + +public: + DecimalToStringConverter(int scale) : _scale(scale) {} + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + std::string value = src_data[i].to_string(_scale); + string_col.insert_data(value.data(), value.size()); + } + + return 
Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class TimeToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcCppType = typename PrimitiveTypeTraits<SrcPrimitiveType>::CppType; + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + char buf[50]; + for (int i = 0; i < rows; ++i) { + char* end = (reinterpret_cast<const SrcCppType&>(src_data[i])).to_string(buf); + string_col.insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastString {}; + +template <> +struct SafeCastString<TYPE_BOOLEAN> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BOOLEAN>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int == 0 ? 
0 : 1; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_TINYINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_TINYINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int8>::max() && + cast_to_int >= std::numeric_limits<int8>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_SMALLINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_SMALLINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int16>::max() && + cast_to_int >= std::numeric_limits<int16>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_INT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_INT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_BIGINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_LARGEINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_LARGEINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct 
SafeCastString<TYPE_FLOAT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_FLOAT>::ColumnType::value_type* value) { + float cast_to_float = 0; + bool can_cast = safe_strtof(std::string(startptr, buffer_size), &cast_to_float); + *value = cast_to_float; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_DOUBLE> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DOUBLE>::ColumnType::value_type* value) { + double cast_to_double = 0; + bool can_cast = safe_strtod(std::string(startptr, buffer_size), &cast_to_double); + *value = cast_to_double; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_DATETIME> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATETIME>::ColumnType::value_type* value) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_datetime_text_impl<Int64>(*value, buffer); + } +}; + +template <> +struct SafeCastString<TYPE_DATETIMEV2> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATETIMEV2>::ColumnType::value_type* value, int scale) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_datetime_v2_text_impl<UInt64>(*value, buffer, scale); + } +}; + +template <> +struct SafeCastString<TYPE_DATE> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATE>::ColumnType::value_type* value) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_date_text_impl<Int64>(*value, buffer); + } +}; + +template <> +struct SafeCastString<TYPE_DATEV2> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATEV2>::ColumnType::value_type* value) { + ReadBuffer 
buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_date_v2_text_impl<UInt32>(*value, buffer); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastDecimalString { + using CppType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType::value_type; + + static bool safe_cast_string(const char* startptr, const int buffer_size, CppType* value, + int precision, int scale) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_decimal_text_impl<DstPrimitiveType, CppType>( + *value, buffer, precision, scale) == StringParser::PARSE_SUCCESS; + } +}; + +template <PrimitiveType DstPrimitiveType> +class CastStringConverter : public ColumnTypeConverter { +private: + DataTypePtr _dst_type_desc; + +public: + CastStringConverter() = default; + CastStringConverter(DataTypePtr dst_type_desc) : _dst_type_desc(dst_type_desc) {} + + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType::value_type; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + ColumnPtr from_col = remove_nullable(src_col); + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + null_map = &reinterpret_cast<vectorized::ColumnNullable*>(dst_col.get()) + ->get_null_map_data(); + } + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& string_col = static_cast<ColumnString&>(*from_col->assume_mutable().get()); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType*>(to_col.get())->get_data(); + for (int i = 0; i < rows; ++i) { + DstCppType value; + auto string_value = string_col.get_data_at(i); + bool can_cast = false; + if constexpr (is_decimal_type_const<DstPrimitiveType>()) { + can_cast = SafeCastDecimalString<DstPrimitiveType>::safe_cast_string( 
+ string_value.data, string_value.size, &value, + _dst_type_desc->get_precision(), _dst_type_desc->get_scale()); + } else if constexpr (DstPrimitiveType == TYPE_DATETIMEV2) { + can_cast = SafeCastString<TYPE_DATETIMEV2>::safe_cast_string( + string_value.data, string_value.size, &value, _dst_type_desc->get_scale()); + } else { + can_cast = SafeCastString<DstPrimitiveType>::safe_cast_string( + string_value.data, string_value.size, &value); + } + if (can_cast) { + data[start_idx + i] = value; + } else { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast string '{}'", Review Comment: ```suggestion return Status::InternalError("Failed to cast string '{}' to not null column", ``` ########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop + ColumnPtr _cached_src_column = nullptr; + // The column type generated from file meta(eg. parquet footer) + DataTypePtr _cached_src_type = nullptr; + // Error message to show unsupported converter if support() return false; + std::string _error_msg; + +public: + /** + * Get the converter to change column type + * @param src_type colum type from file meta data + * @param dst_type column type from FE planner(the changed column type) + */ + static std::unique_ptr<ColumnTypeConverter> get_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type); + + ColumnTypeConverter() = default; + virtual ~ColumnTypeConverter() = default; + + /** + * Converter source column to destination column. If the converter is not consistent, + * the source column is `_cached_src_column`, otherwise, `src_col` and `dst_col` are the + * same column, and with nothing to do. 
+ */ + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual bool support() { return true; } + + virtual bool is_consistent() { return false; } + + /** + * Get the column to read data from file with the type from file meta data. + * If the converter is not consistent, the returned column is `_cached_src_column`. + * For performance reasons, the null map of `_cached_src_column` is a reference from + * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * + * According to the hive standard, if certain values fail to be converted(eg. string `row1` to in value), Review Comment: ```suggestion * According to the hive standard, if certain values fail to be converted(eg. string `row1` to int value), ``` ########## be/src/vec/data_types/data_type_struct.cpp: ########## @@ -355,14 +355,7 @@ std::optional<size_t> DataTypeStruct::try_get_position_by_name(const String& nam } String DataTypeStruct::get_name_by_position(size_t i) const { - if (i == 0 || i > names.size()) { Review Comment: Why is this check being removed? 
########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -193,195 +106,189 @@ struct ConvertParams { } template <typename DecimalPrimitiveType> - void init_decimal_converter(DataTypePtr& data_type) { + void init_decimal_converter(int dst_scale) { if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { return; } auto scale = field_schema->parquet_schema.scale; - auto* decimal_type = static_cast<DataTypeDecimal<DecimalPrimitiveType>*>( - const_cast<IDataType*>(remove_nullable(data_type).get())); - auto dest_scale = decimal_type->get_scale(); - if (dest_scale > scale) { + if (dst_scale > scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale); - } else if (dest_scale < scale) { + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dst_scale - scale); + } else if (dst_scale < scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale); + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dst_scale); } else { decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; decimal_scale.scale_factor = 1; } } }; -/* -* parquet_physical_type : The type of data stored in parquet. -* Read data into columns returned by get_column according to the physical type of parquet. -* show_type : The data format that should be displayed. -* doris_column : What type of column does the upper layer need to put the data in. -* -* example : -* In hive, if decimal is stored as FIXED_LENBYTE_ARRAY in parquet, -* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;` -* to convert this column to string type. -* parquet_type : FIXED_LEN_BYTE_ARRAY. -* ans_data_type : ColumnInt8 -* show_type : Decimal. -* doris_column : ColumnString. 
-*/ -ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); - -struct ColumnConvert { - virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } - - virtual ~ColumnConvert() = default; - - void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) { - src_col = remove_nullable(src_col); - dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); - } +/** + * Convert parquet physical column to logical column + * In parquet document(https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), + * Logical or converted type is the data type of column, physical type is the stored type of column chunk. + * eg, decimal type can be stored as INT32, INT64, BYTE_ARRAY, FIXED_LENGTH_BYTE_ARRAY. + * So there is a convert process from physical type to logical type. + * In addition, Schema change will bring about a change in logical type. + * + * In previous implementations, physical and logical conversion were mixed together, Review Comment: No need to add this in comment ########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop + ColumnPtr _cached_src_column = nullptr; + // The column type generated from file meta(eg. parquet footer) + DataTypePtr _cached_src_type = nullptr; + // Error message to show unsupported converter if support() return false; + std::string _error_msg; + +public: + /** + * Get the converter to change column type + * @param src_type colum type from file meta data + * @param dst_type column type from FE planner(the changed column type) + */ + static std::unique_ptr<ColumnTypeConverter> get_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type); + + ColumnTypeConverter() = default; + virtual ~ColumnTypeConverter() = default; + + /** + * Converter source column to destination column. If the converter is not consistent, + * the source column is `_cached_src_column`, otherwise, `src_col` and `dst_col` are the + * same column, and with nothing to do. 
+ */ + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual bool support() { return true; } + + virtual bool is_consistent() { return false; } + + /** + * Get the column to read data from file with the type from file meta data. + * If the converter is not consistent, the returned column is `_cached_src_column`. + * For performance reasons, the null map of `_cached_src_column` is a reference from + * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * + * According to the hive standard, if certain values fail to be converted(eg. string `row1` to in value), + * these values are replaced by nulls. + */ + ColumnPtr get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type); + + /** + * Get the column type from file meta data. + */ + const DataTypePtr& get_type() { return _cached_src_type; } + + std::string get_error_msg() { return _error_msg; }; +}; + +/** + * No type conversion occurred, or compatible type conversion + * + * Compatible type conversion: + * conversion within string, char and varchar + * conversion from decimal(p1, s1) to decimal(p2, s2), because the scale change of decimal type is resolved in decode process + */ +class ConsistentConverter : public ColumnTypeConverter { + bool is_consistent() override { return true; } +}; + +/** + * Unsupported type change, eg. 
from int to date + */ +class UnsupportedConverter : public ColumnTypeConverter { +public: + UnsupportedConverter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + std::string src_type_str = std::string(getTypeName( + DataTypeFactory::instance().create_data_type(src_type, false)->get_type_id())); + std::string dst_type_str = + std::string(getTypeName(remove_nullable(dst_type)->get_type_id())); + _error_msg = src_type_str + " => " + dst_type_str; + } + + bool support() override { return false; } + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + return Status::InternalError("Unsupported type change: {}", _error_msg); + } +}; + +template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType> +class NumericToNumericConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::CppType; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + data[start_idx + i] = static_cast<DstCppType>(src_data[i]); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class NumericToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr 
to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { + string value = int128_to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } else { + string value = std::to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class DecimalToStringConverter : public ColumnTypeConverter { +private: + int _scale; + +public: + DecimalToStringConverter(int scale) : _scale(scale) {} + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + std::string value = src_data[i].to_string(_scale); + string_col.insert_data(value.data(), value.size()); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class TimeToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcCppType = typename PrimitiveTypeTraits<SrcPrimitiveType>::CppType; + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + char buf[50]; + for (int i = 0; i < rows; ++i) { + char* end = (reinterpret_cast<const SrcCppType&>(src_data[i])).to_string(buf); + string_col.insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastString {}; + +template <> +struct SafeCastString<TYPE_BOOLEAN> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BOOLEAN>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int == 0 ? 0 : 1; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_TINYINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_TINYINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int8>::max() && + cast_to_int >= std::numeric_limits<int8>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_SMALLINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_SMALLINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int16>::max() && + cast_to_int >= std::numeric_limits<int16>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_INT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_INT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + 
+template <> +struct SafeCastString<TYPE_BIGINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_LARGEINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_LARGEINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); Review Comment: still using `64` for largeint? ########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop + ColumnPtr _cached_src_column = nullptr; + // The column type generated from file meta(eg. parquet footer) + DataTypePtr _cached_src_type = nullptr; + // Error message to show unsupported converter if support() return false; + std::string _error_msg; + +public: + /** + * Get the converter to change column type + * @param src_type column type from file meta data + * @param dst_type column type from FE planner(the changed column type) + */ + static std::unique_ptr<ColumnTypeConverter> get_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type); + + ColumnTypeConverter() = default; + virtual ~ColumnTypeConverter() = default; + + /** + * Convert source column to destination column. If the converter is not consistent, + * the source column is `_cached_src_column`, otherwise, `src_col` and `dst_col` are the + * same column, and with nothing to do. 
+ */ + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual bool support() { return true; } + + virtual bool is_consistent() { return false; } + + /** + * Get the column to read data from file with the type from file meta data. + * If the converter is not consistent, the returned column is `_cached_src_column`. + * For performance reasons, the null map of `_cached_src_column` is a reference from + * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * + * According to the hive standard, if certain values fail to be converted(eg. string `row1` to int value), + * these values are replaced by nulls. + */ + ColumnPtr get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type); + + /** + * Get the column type from file meta data. + */ + const DataTypePtr& get_type() { return _cached_src_type; } + + std::string get_error_msg() { return _error_msg; }; +}; + +/** + * No type conversion occurred, or compatible type conversion + * + * Compatible type conversion: + * conversion within string, char and varchar + * conversion from decimal(p1, s1) to decimal(p2, s2), because the scale change of decimal type is resolved in decode process + */ +class ConsistentConverter : public ColumnTypeConverter { + bool is_consistent() override { return true; } +}; + +/** + * Unsupported type change, eg. 
from int to date + */ +class UnsupportedConverter : public ColumnTypeConverter { +public: + UnsupportedConverter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + std::string src_type_str = std::string(getTypeName( + DataTypeFactory::instance().create_data_type(src_type, false)->get_type_id())); + std::string dst_type_str = + std::string(getTypeName(remove_nullable(dst_type)->get_type_id())); + _error_msg = src_type_str + " => " + dst_type_str; + } + + bool support() override { return false; } + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + return Status::InternalError("Unsupported type change: {}", _error_msg); + } +}; + +template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType> +class NumericToNumericConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::CppType; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + data[start_idx + i] = static_cast<DstCppType>(src_data[i]); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class NumericToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr 
to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { + string value = int128_to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } else { + string value = std::to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class DecimalToStringConverter : public ColumnTypeConverter { +private: + int _scale; + +public: + DecimalToStringConverter(int scale) : _scale(scale) {} + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + std::string value = src_data[i].to_string(_scale); + string_col.insert_data(value.data(), value.size()); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class TimeToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcCppType = typename PrimitiveTypeTraits<SrcPrimitiveType>::CppType; + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + char buf[50]; + for (int i = 0; i < rows; ++i) { + char* end = (reinterpret_cast<const SrcCppType&>(src_data[i])).to_string(buf); + string_col.insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastString {}; + +template <> +struct SafeCastString<TYPE_BOOLEAN> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BOOLEAN>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int == 0 ? 0 : 1; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_TINYINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_TINYINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int8>::max() && + cast_to_int >= std::numeric_limits<int8>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_SMALLINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_SMALLINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int16>::max() && + cast_to_int >= std::numeric_limits<int16>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_INT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_INT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + 
+template <> +struct SafeCastString<TYPE_BIGINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_LARGEINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_LARGEINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_FLOAT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_FLOAT>::ColumnType::value_type* value) { + float cast_to_float = 0; + bool can_cast = safe_strtof(std::string(startptr, buffer_size), &cast_to_float); + *value = cast_to_float; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_DOUBLE> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DOUBLE>::ColumnType::value_type* value) { + double cast_to_double = 0; + bool can_cast = safe_strtod(std::string(startptr, buffer_size), &cast_to_double); + *value = cast_to_double; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_DATETIME> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATETIME>::ColumnType::value_type* value) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_datetime_text_impl<Int64>(*value, buffer); + } +}; + +template <> +struct SafeCastString<TYPE_DATETIMEV2> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATETIMEV2>::ColumnType::value_type* value, int scale) { + 
ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_datetime_v2_text_impl<UInt64>(*value, buffer, scale); + } +}; + +template <> +struct SafeCastString<TYPE_DATE> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATE>::ColumnType::value_type* value) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_date_text_impl<Int64>(*value, buffer); + } +}; + +template <> +struct SafeCastString<TYPE_DATEV2> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_DATEV2>::ColumnType::value_type* value) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_date_v2_text_impl<UInt32>(*value, buffer); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastDecimalString { + using CppType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType::value_type; + + static bool safe_cast_string(const char* startptr, const int buffer_size, CppType* value, + int precision, int scale) { + ReadBuffer buffer(reinterpret_cast<const unsigned char*>(startptr), buffer_size); + return read_decimal_text_impl<DstPrimitiveType, CppType>( + *value, buffer, precision, scale) == StringParser::PARSE_SUCCESS; + } +}; + +template <PrimitiveType DstPrimitiveType> +class CastStringConverter : public ColumnTypeConverter { +private: + DataTypePtr _dst_type_desc; + +public: + CastStringConverter() = default; + CastStringConverter(DataTypePtr dst_type_desc) : _dst_type_desc(dst_type_desc) {} + + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType::value_type; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + ColumnPtr from_col = remove_nullable(src_col); + NullMap* null_map = nullptr; + if (dst_col->is_nullable()) { + 
null_map = &reinterpret_cast<vectorized::ColumnNullable*>(dst_col.get()) + ->get_null_map_data(); + } + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& string_col = static_cast<ColumnString&>(*from_col->assume_mutable().get()); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType*>(to_col.get())->get_data(); + for (int i = 0; i < rows; ++i) { + DstCppType value; + auto string_value = string_col.get_data_at(i); + bool can_cast = false; + if constexpr (is_decimal_type_const<DstPrimitiveType>()) { + can_cast = SafeCastDecimalString<DstPrimitiveType>::safe_cast_string( + string_value.data, string_value.size, &value, + _dst_type_desc->get_precision(), _dst_type_desc->get_scale()); + } else if constexpr (DstPrimitiveType == TYPE_DATETIMEV2) { + can_cast = SafeCastString<TYPE_DATETIMEV2>::safe_cast_string( + string_value.data, string_value.size, &value, _dst_type_desc->get_scale()); + } else { + can_cast = SafeCastString<DstPrimitiveType>::safe_cast_string( + string_value.data, string_value.size, &value); + } + if (can_cast) { + data[start_idx + i] = value; + } else { + if (null_map == nullptr) { + return Status::InternalError("Failed to cast string '{}'", + string_value.to_string()); + } else { + (*null_map)[start_idx + i] = 1; + } + } + } + + return Status::OK(); + } +}; + +// only support date & datetime v2 +template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType> +class V2DateConverter : public ColumnTypeConverter { Review Comment: ```suggestion class DateV2Converter : public ColumnTypeConverter { ``` ########## be/src/vec/exec/format/orc/vorc_reader.cpp: ########## @@ -532,6 +532,14 @@ std::vector<OrcPredicate> value_range_to_predicate( std::vector<orc::TypeKind>* unsupported_pushdown_types) { std::vector<OrcPredicate> predicates; + PrimitiveType src_type = OrcReader::convert_to_doris_type(type).type; + if 
(src_type != primitive_type) { + if (!(is_string_type(src_type) && is_string_type(primitive_type))) { Review Comment: why? ########## be/src/vec/exec/format/column_type_convert.cpp: ########## @@ -0,0 +1,330 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/column_type_convert.h" + +namespace doris::vectorized::converter { + +#define FOR_LOGICAL_NUMERIC_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) + +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) + +#define FOR_LOGICAL_TIME_TYPES(M) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +#define FOR_ALL_LOGICAL_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +static bool _is_numeric_type(PrimitiveType type) { + switch (type) { + case TYPE_BOOLEAN: + case TYPE_TINYINT: + case TYPE_SMALLINT: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_LARGEINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + return true; + default: + return false; + } +} + +static bool _is_decimal_type(doris::PrimitiveType type) { + switch (type) { + case TYPE_DECIMALV2: + case TYPE_DECIMAL32: + case TYPE_DECIMAL64: + case TYPE_DECIMAL128I: + case TYPE_DECIMAL256: + return true; + default: + return false; + } +} + +ColumnPtr ColumnTypeConverter::get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type) { + if (is_consistent()) { + if (_cached_src_type == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + } + return dst_column; + } + + if (_cached_src_column == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + _cached_src_column = + 
DataTypeFactory::instance().create_data_type(src_type, false)->create_column(); + } + // remove the old cached data + _cached_src_column->assume_mutable()->clear(); + + if (dst_type->is_nullable()) { + // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will + // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. + // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. + auto doris_nullable_column = + const_cast<ColumnNullable*>(static_cast<const ColumnNullable*>(dst_column.get())); + return ColumnNullable::create(_cached_src_column, + doris_nullable_column->get_null_map_column_ptr()); + } + + return _cached_src_column; +} + +static std::unique_ptr<ColumnTypeConverter> _numeric_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type) { + PrimitiveType src_primitive_type = src_type.type; + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: { \ + switch (dst_primitive_type) { \ + case TYPE_BOOLEAN: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_BOOLEAN>>(); \ + case TYPE_TINYINT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_TINYINT>>(); \ + case TYPE_SMALLINT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_SMALLINT>>(); \ + case TYPE_INT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_INT>>(); \ + case TYPE_BIGINT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_BIGINT>>(); \ + case TYPE_LARGEINT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_LARGEINT>>(); \ + case TYPE_FLOAT: \ + return std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_FLOAT>>(); \ + case TYPE_DOUBLE: \ + return 
std::make_unique<NumericToNumericConverter<SRC_PTYPE, TYPE_DOUBLE>>(); \ + default: \ + return std::make_unique<UnsupportedConverter>(src_type, dst_type); \ + } \ + } + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } +} + +static std::unique_ptr<ColumnTypeConverter> _to_string_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type) { + PrimitiveType src_primitive_type = src_type.type; + // numeric type to string, using native std::to_string + if (_is_numeric_type(src_primitive_type)) { + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: \ + return std::make_unique<NumericToStringConverter<SRC_PTYPE>>(); + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } + } else if (_is_decimal_type(src_primitive_type)) { // decimal type to string + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: \ + return std::make_unique<DecimalToStringConverter<SRC_PTYPE>>(src_type.scale); + FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } + } else if (is_date_type(src_primitive_type)) { // date and datetime type to string + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: \ + return std::make_unique<TimeToStringConverter<SRC_PTYPE>>(); + FOR_LOGICAL_TIME_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } + } + return std::make_unique<UnsupportedConverter>(src_type, dst_type); +} + +static std::unique_ptr<ColumnTypeConverter> _cast_string_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type) { + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + switch (dst_primitive_type) { +#define 
DISPATCH(DST_PTYPE) \ + case DST_PTYPE: \ + return std::make_unique<CastStringConverter<DST_PTYPE>>(remove_nullable(dst_type)); + FOR_ALL_LOGICAL_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } +} + +static std::unique_ptr<ColumnTypeConverter> _numeric_to_decimal_converter( + const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + PrimitiveType src_primitive_type = src_type.type; + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + int scale = remove_nullable(dst_type)->get_scale(); + switch (src_primitive_type) { +#define DISPATCH(SRC_PTYPE) \ + case SRC_PTYPE: { \ + switch (dst_primitive_type) { \ + case TYPE_DECIMALV2: \ + return std::make_unique<NumericToDecimalConverter<SRC_PTYPE, TYPE_DECIMALV2>>(scale); \ + case TYPE_DECIMAL32: \ + return std::make_unique<NumericToDecimalConverter<SRC_PTYPE, TYPE_DECIMAL32>>(scale); \ + case TYPE_DECIMAL64: \ + return std::make_unique<NumericToDecimalConverter<SRC_PTYPE, TYPE_DECIMAL64>>(scale); \ + case TYPE_DECIMAL128I: \ + return std::make_unique<NumericToDecimalConverter<SRC_PTYPE, TYPE_DECIMAL128I>>( \ + scale); \ + case TYPE_DECIMAL256: \ + return std::make_unique<NumericToDecimalConverter<SRC_PTYPE, TYPE_DECIMAL256>>(scale); \ + default: \ + return std::make_unique<UnsupportedConverter>(src_type, dst_type); \ + } \ + } + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } +} + +static std::unique_ptr<ColumnTypeConverter> _decimal_to_numeric_converter( + const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + PrimitiveType src_primitive_type = src_type.type; + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + int scale = src_type.scale; + switch (dst_primitive_type) { +#define DISPATCH(DST_PTYPE) \ + case DST_PTYPE: { \ + switch (src_primitive_type) 
{ \ + case TYPE_DECIMALV2: \ + return std::make_unique<DecimalToNumericConverter<TYPE_DECIMALV2, DST_PTYPE>>(scale); \ + case TYPE_DECIMAL32: \ + return std::make_unique<DecimalToNumericConverter<TYPE_DECIMAL32, DST_PTYPE>>(scale); \ + case TYPE_DECIMAL64: \ + return std::make_unique<DecimalToNumericConverter<TYPE_DECIMAL64, DST_PTYPE>>(scale); \ + case TYPE_DECIMAL128I: \ + return std::make_unique<DecimalToNumericConverter<TYPE_DECIMAL128I, DST_PTYPE>>( \ + scale); \ + case TYPE_DECIMAL256: \ + return std::make_unique<DecimalToNumericConverter<TYPE_DECIMAL256, DST_PTYPE>>(scale); \ + default: \ + return std::make_unique<UnsupportedConverter>(src_type, dst_type); \ + } \ + } + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + default: + return std::make_unique<UnsupportedConverter>(src_type, dst_type); + } +} + +std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( + const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + PrimitiveType src_primitive_type = src_type.type; + PrimitiveType dst_primitive_type = + remove_nullable(dst_type)->get_type_as_type_descriptor().type; + if (src_primitive_type == dst_primitive_type) { + return std::make_unique<ConsistentConverter>(); + } + if (is_string_type(src_primitive_type) && is_string_type(dst_primitive_type)) { + return std::make_unique<ConsistentConverter>(); + } + + // from numeric type to numeric type, use native static cast + // example: float -> int + if (_is_numeric_type(src_primitive_type) && _is_numeric_type(dst_primitive_type)) { + return _numeric_converter(src_type, dst_type); + } + + // change to string type + // example: decimal -> string + if (is_string_type(dst_primitive_type)) { + return _to_string_converter(src_type, dst_type); + } + + // string type to other type + // example: string -> date + if (is_string_type(src_primitive_type)) { + return _cast_string_converter(src_type, dst_type); Review Comment: ```suggestion return _from_string_converter(src_type, dst_type); ``` -- 
This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org