AshinGau commented on code in PR #32873:
URL: https://github.com/apache/doris/pull/32873#discussion_r1555135831


##########
be/src/vec/exec/format/column_type_convert.h:
##########
@@ -0,0 +1,539 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "gutil/strings/numbers.h"
+#include "vec/columns/column_string.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_factory.hpp"
+#include "vec/io/io_helper.h"
+
+namespace doris::vectorized::converter {
+
+template <PrimitiveType type>
+constexpr bool is_decimal_type_const() {
+    return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == 
TYPE_DECIMAL64 ||
+           type == TYPE_DECIMAL128I || type == TYPE_DECIMAL256;
+}
+
+/**
+ * Unified schema change interface for all format readers:
+ *
+ * First, read the data into the source column according to the column type of the file.
+ * Second, convert the source column to the destination column with the type planned by FE.
+ */
+class ColumnTypeConverter {
+protected:
+    // The cached column to read data according to the column type of the file
+    // Then it is converted to the destination column, so this column can be reused in the next loop
+    ColumnPtr _cached_src_column = nullptr;
+    // The column type generated from file meta(eg. parquet footer)
+    DataTypePtr _cached_src_type = nullptr;
+    // Error message describing the unsupported conversion when support() returns false
+    std::string _error_msg;
+
+public:
+    /**
+     * Get the converter to change column type
+     * @param src_type column type from file meta data
+     * @param dst_type column type from FE planner(the changed column type)
+     */
+    static std::unique_ptr<ColumnTypeConverter> get_converter(const 
TypeDescriptor& src_type,
+                                                              const 
DataTypePtr& dst_type);
+
+    ColumnTypeConverter() = default;
+    virtual ~ColumnTypeConverter() = default;
+
+    /**
+     * Convert the source column to the destination column. If the converter is not consistent,
+     * the source column is `_cached_src_column`; otherwise, `src_col` and `dst_col` are the
+     * same column, and there is nothing to do.
+     */
+    virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { 
return Status::OK(); }
+
+    virtual bool support() { return true; }
+
+    virtual bool is_consistent() { return false; }
+
+    /**
+     * Get the column to read data from file with the type from file meta data.
+     * If the converter is not consistent, the returned column is `_cached_src_column`.
+     * For performance reasons, the null map of `_cached_src_column` is a reference to
+     * the null map of `dst_column`, so there is no need to convert the null map in `convert()`.
+     *
+     * According to the Hive standard, values that fail to be converted
+     * (e.g. string `row1` to an int value) are replaced by nulls.
+     */
+    ColumnPtr get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column,
+                         const DataTypePtr& dst_type);
+
+    /**
+     * Get the column type from file meta data.
+     */
+    const DataTypePtr& get_type() { return _cached_src_type; }
+
+    std::string get_error_msg() { return _error_msg; };
+};
+
+/**
+ * No type conversion occurred, or compatible type conversion
+ *
+ * Compatible type conversion:
+ * conversion within string, char and varchar
+ * conversion from decimal(p1, s1) to decimal(p2, s2), because the scale change of the
+ * decimal type is resolved in the decode process
+ */
+class ConsistentConverter : public ColumnTypeConverter {
+    bool is_consistent() override { return true; }
+};
+
+/**
+ * Unsupported type change, eg. from int to date
+ */
+class UnsupportedConverter : public ColumnTypeConverter {
+public:
+    UnsupportedConverter(const TypeDescriptor& src_type, const DataTypePtr& 
dst_type) {
+        std::string src_type_str = std::string(getTypeName(
+                DataTypeFactory::instance().create_data_type(src_type, 
false)->get_type_id()));
+        std::string dst_type_str =
+                
std::string(getTypeName(remove_nullable(dst_type)->get_type_id()));
+        _error_msg = src_type_str + " => " + dst_type_str;
+    }
+
+    bool support() override { return false; }
+
+    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
+        return Status::InternalError("Unsupported type change: {}", 
_error_msg);
+    }
+};
+
+template <PrimitiveType SrcPrimitiveType, PrimitiveType DstPrimitiveType>
+class NumericToNumericConverter : public ColumnTypeConverter {
+    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
+        using SrcColumnType = typename 
PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType;
+        using DstCppType = typename 
PrimitiveTypeTraits<DstPrimitiveType>::CppType;
+        using DstColumnType = typename 
PrimitiveTypeTraits<DstPrimitiveType>::ColumnType;
+        ColumnPtr from_col = remove_nullable(src_col);
+        MutableColumnPtr to_col = 
remove_nullable(dst_col->get_ptr())->assume_mutable();
+
+        size_t rows = from_col->size();
+        auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data();
+        size_t start_idx = to_col->size();
+        to_col->resize(start_idx + rows);
+        auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data();
+        for (int i = 0; i < rows; ++i) {
+            data[start_idx + i] = static_cast<DstCppType>(src_data[i]);
+        }
+
+        return Status::OK();
+    }
+};
+
+template <PrimitiveType SrcPrimitiveType>
+class NumericToStringConverter : public ColumnTypeConverter {
+    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
+        using SrcColumnType = typename 
PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType;
+        ColumnPtr from_col = remove_nullable(src_col);
+        MutableColumnPtr to_col = 
remove_nullable(dst_col->get_ptr())->assume_mutable();
+
+        size_t rows = from_col->size();
+        auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data();
+        auto& string_col = static_cast<ColumnString&>(*to_col.get());
+        for (int i = 0; i < rows; ++i) {
+            if constexpr (SrcPrimitiveType == TYPE_LARGEINT) {
+                string value = int128_to_string(src_data[i]);
+                string_col.insert_data(value.data(), value.size());
+            } else {
+                string value = std::to_string(src_data[i]);
+                string_col.insert_data(value.data(), value.size());
+            }
+        }
+
+        return Status::OK();
+    }
+};
+
+template <PrimitiveType SrcPrimitiveType>
+class DecimalToStringConverter : public ColumnTypeConverter {
+private:
+    int _scale;
+
+public:
+    DecimalToStringConverter(int scale) : _scale(scale) {}
+
+    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
+        using SrcColumnType = typename 
PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType;
+        ColumnPtr from_col = remove_nullable(src_col);
+        MutableColumnPtr to_col = 
remove_nullable(dst_col->get_ptr())->assume_mutable();
+
+        size_t rows = from_col->size();
+        auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data();
+        auto& string_col = static_cast<ColumnString&>(*to_col.get());
+        for (int i = 0; i < rows; ++i) {
+            std::string value = src_data[i].to_string(_scale);
+            string_col.insert_data(value.data(), value.size());
+        }
+
+        return Status::OK();
+    }
+};
+
+template <PrimitiveType SrcPrimitiveType>
+class TimeToStringConverter : public ColumnTypeConverter {
+    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
+        using SrcCppType = typename 
PrimitiveTypeTraits<SrcPrimitiveType>::CppType;
+        using SrcColumnType = typename 
PrimitiveTypeTraits<SrcPrimitiveType>::ColumnType;
+        ColumnPtr from_col = remove_nullable(src_col);
+        MutableColumnPtr to_col = 
remove_nullable(dst_col->get_ptr())->assume_mutable();
+
+        size_t rows = from_col->size();
+        auto& src_data = static_cast<const 
SrcColumnType*>(from_col.get())->get_data();
+        auto& string_col = static_cast<ColumnString&>(*to_col.get());
+        char buf[50];
+        for (int i = 0; i < rows; ++i) {
+            char* end = (reinterpret_cast<const 
SrcCppType&>(src_data[i])).to_string(buf);
+            string_col.insert_data(buf, end - buf);
+        }
+
+        return Status::OK();
+    }
+};
+
+template <PrimitiveType DstPrimitiveType>
+struct SafeCastString {};
+
+template <>
+struct SafeCastString<TYPE_BOOLEAN> {
+    static bool safe_cast_string(const char* startptr, const int buffer_size,
+                                 
PrimitiveTypeTraits<TYPE_BOOLEAN>::ColumnType::value_type* value) {
+        int32 cast_to_int = 0;
+        bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int);
+        *value = cast_to_int == 0 ? 0 : 1;
+        return can_cast;
+    }
+};
+
+template <>
+struct SafeCastString<TYPE_TINYINT> {
+    static bool safe_cast_string(const char* startptr, const int buffer_size,
+                                 
PrimitiveTypeTraits<TYPE_TINYINT>::ColumnType::value_type* value) {
+        int32 cast_to_int = 0;
+        bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int);
+        *value = cast_to_int;
+        return can_cast && cast_to_int <= std::numeric_limits<int8>::max() &&
+               cast_to_int >= std::numeric_limits<int8>::min();
+    }
+};
+
+template <>
+struct SafeCastString<TYPE_SMALLINT> {
+    static bool safe_cast_string(
+            const char* startptr, const int buffer_size,
+            PrimitiveTypeTraits<TYPE_SMALLINT>::ColumnType::value_type* value) 
{
+        int32 cast_to_int = 0;
+        bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int);
+        *value = cast_to_int;
+        return can_cast && cast_to_int <= std::numeric_limits<int16>::max() &&
+               cast_to_int >= std::numeric_limits<int16>::min();
+    }
+};
+
+template <>
+struct SafeCastString<TYPE_INT> {
+    static bool safe_cast_string(const char* startptr, const int buffer_size,
+                                 
PrimitiveTypeTraits<TYPE_INT>::ColumnType::value_type* value) {
+        int32 cast_to_int = 0;
+        bool can_cast = safe_strto32(startptr, buffer_size, &cast_to_int);
+        *value = cast_to_int;
+        return can_cast;
+    }
+};
+
+template <>
+struct SafeCastString<TYPE_BIGINT> {
+    static bool safe_cast_string(const char* startptr, const int buffer_size,
+                                 
PrimitiveTypeTraits<TYPE_BIGINT>::ColumnType::value_type* value) {
+        int64 cast_to_int = 0;
+        bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int);
+        *value = cast_to_int;
+        return can_cast;
+    }
+};
+
+template <>
+struct SafeCastString<TYPE_LARGEINT> {
+    static bool safe_cast_string(
+            const char* startptr, const int buffer_size,
+            PrimitiveTypeTraits<TYPE_LARGEINT>::ColumnType::value_type* value) 
{
+        int64 cast_to_int = 0;
+        bool can_cast = safe_strto64(startptr, buffer_size, &cast_to_int);

Review Comment:
   Use `read_int_text_impl` instead of `safe_strto64` here: the target type is LARGEINT, but `safe_strto64` parses into an `int64`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to