AshinGau commented on code in PR #32873: URL: https://github.com/apache/doris/pull/32873#discussion_r1555135831
########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "gutil/strings/numbers.h" +#include "vec/columns/column_string.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/io/io_helper.h" + +namespace doris::vectorized::converter { + +template <PrimitiveType type> +constexpr bool is_decimal_type_const() { + return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || + type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256; +} + +/** + * Unified schema change interface for all format readers: + * + * First, read the data according to the column type of the file into source column + * Second, convert source column to the destination column with type planned by FE + */ +class ColumnTypeConverter { +protected: + // The cached column to read data according to the column type of the file + // Then, it will be converted destination column, so this column can be reuse in next loop + ColumnPtr _cached_src_column = nullptr; + // The column type generated from file meta(eg. 
parquet footer) + DataTypePtr _cached_src_type = nullptr; + // Error message to show unsupported converter if support() return false; + std::string _error_msg; + +public: + /** + * Get the converter to change column type + * @param src_type colum type from file meta data + * @param dst_type column type from FE planner(the changed column type) + */ + static std::unique_ptr<ColumnTypeConverter> get_converter(const TypeDescriptor& src_type, + const DataTypePtr& dst_type); + + ColumnTypeConverter() = default; + virtual ~ColumnTypeConverter() = default; + + /** + * Converter source column to destination column. If the converter is not consistent, + * the source column is `_cached_src_column`, otherwise, `src_col` and `dst_col` are the + * same column, and with nothing to do. + */ + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual bool support() { return true; } + + virtual bool is_consistent() { return false; } + + /** + * Get the column to read data from file with the type from file meta data. + * If the converter is not consistent, the returned column is `_cached_src_column`. + * For performance reasons, the null map of `_cached_src_column` is a reference from + * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * + * According to the hive standard, if certain values fail to be converted(eg. string `row1` to in value), + * these values are replaced by nulls. + */ + ColumnPtr get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type); + + /** + * Get the column type from file meta data. 
+ */ + const DataTypePtr& get_type() { return _cached_src_type; } + + std::string get_error_msg() { return _error_msg; }; +}; + +/** + * No type conversion occurred, or compatible type conversion + * + * Compatible type conversion: + * conversion within string, char and varchar + * conversion from decimal(p1, s1) to decimal(p2, s2), because the scale change of decimal type is resolved in decode process + */ +class ConsistentConverter : public ColumnTypeConverter { + bool is_consistent() override { return true; } +}; + +/** + * Unsupported type change, eg. from int to date + */ +class UnsupportedConverter : public ColumnTypeConverter { +public: + UnsupportedConverter(const TypeDescriptor& src_type, const DataTypePtr& dst_type) { + std::string src_type_str = std::string(getTypeName( + DataTypeFactory::instance().create_data_type(src_type, false)->get_type_id())); + std::string dst_type_str = + std::string(getTypeName(remove_nullable(dst_type)->get_type_id())); + _error_msg = src_type_str + " => " + dst_type_str; + } + + bool support() override { return false; } + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + return Status::InternalError("Unsupported type change: {}", _error_msg); + } +}; + +template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType> +class NumericToNumericConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + using DstCppType = typename PrimitiveTypeTraits<DstPrimitiveType>::CppType; + using DstColumnType = typename PrimitiveTypeTraits<DstPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + 
to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + data[start_idx + i] = static_cast<DstCppType>(src_data[i]); + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class NumericToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + if constexpr (SrcPrimitiveType == TYPE_LARGEINT) { + string value = int128_to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } else { + string value = std::to_string(src_data[i]); + string_col.insert_data(value.data(), value.size()); + } + } + + return Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class DecimalToStringConverter : public ColumnTypeConverter { +private: + int _scale; + +public: + DecimalToStringConverter(int scale) : _scale(scale) {} + + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + for (int i = 0; i < rows; ++i) { + std::string value = src_data[i].to_string(_scale); + string_col.insert_data(value.data(), value.size()); + } + + return 
Status::OK(); + } +}; + +template <PrimitiveType SrcPrimitiveType> +class TimeToStringConverter : public ColumnTypeConverter { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using SrcCppType = typename PrimitiveTypeTraits<SrcPrimitiveType>::CppType; + using SrcColumnType = typename PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_col); + MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + + size_t rows = from_col->size(); + auto& src_data = static_cast<const SrcColumnType*>(from_col.get())->get_data(); + auto& string_col = static_cast<ColumnString&>(*to_col.get()); + char buf[50]; + for (int i = 0; i < rows; ++i) { + char* end = (reinterpret_cast<const SrcCppType&>(src_data[i])).to_string(buf); + string_col.insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + +template <PrimitiveType DstPrimitiveType> +struct SafeCastString {}; + +template <> +struct SafeCastString<TYPE_BOOLEAN> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BOOLEAN>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int == 0 ? 
0 : 1; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_TINYINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_TINYINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int8>::max() && + cast_to_int >= std::numeric_limits<int8>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_SMALLINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_SMALLINT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast && cast_to_int <= std::numeric_limits<int16>::max() && + cast_to_int >= std::numeric_limits<int16>::min(); + } +}; + +template <> +struct SafeCastString<TYPE_INT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_INT>::ColumnType::value_type* value) { + int32 cast_to_int = 0; + bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_BIGINT> { + static bool safe_cast_string(const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); + *value = cast_to_int; + return can_cast; + } +}; + +template <> +struct SafeCastString<TYPE_LARGEINT> { + static bool safe_cast_string( + const char* startptr, const int buffer_size, + PrimitiveTypeTraits<TYPE_LARGEINT>::ColumnType::value_type* value) { + int64 cast_to_int = 0; + bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int); Review Comment: Use `read_int_text_impl` instead. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org