This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new d4a4c172ea [Improve](serde)update serialize and deserialize text for data type (#21109) d4a4c172ea is described below commit d4a4c172ea8112ccbb0ff870fa515667a114a3e9 Author: amory <wangqian...@selectdb.com> AuthorDate: Wed Jul 26 10:06:16 2023 +0800 [Improve](serde)update serialize and deserialize text for data type (#21109) --- be/src/vec/data_types/data_type_date_time.h | 4 +- be/src/vec/data_types/data_type_decimal.h | 2 +- .../vec/data_types/serde/data_type_array_serde.h | 11 + .../vec/data_types/serde/data_type_bitmap_serde.h | 11 + .../data_types/serde/data_type_date64_serde.cpp | 109 ++++++++ .../vec/data_types/serde/data_type_date64_serde.h | 13 + .../serde/data_type_datetimev2_serde.cpp | 48 ++++ .../data_types/serde/data_type_datetimev2_serde.h | 7 + .../data_types/serde/data_type_datev2_serde.cpp | 38 +++ .../vec/data_types/serde/data_type_datev2_serde.h | 6 + .../data_types/serde/data_type_decimal_serde.cpp | 33 +++ .../vec/data_types/serde/data_type_decimal_serde.h | 12 +- .../serde/data_type_fixedlengthobject_serde.h | 10 + .../vec/data_types/serde/data_type_hll_serde.cpp | 22 ++ be/src/vec/data_types/serde/data_type_hll_serde.h | 4 + be/src/vec/data_types/serde/data_type_jsonb.h | 62 ----- .../vec/data_types/serde/data_type_jsonb_serde.cpp | 29 ++- .../vec/data_types/serde/data_type_jsonb_serde.h | 6 + be/src/vec/data_types/serde/data_type_map_serde.h | 11 + .../data_types/serde/data_type_nullable_serde.cpp | 38 +++ .../data_types/serde/data_type_nullable_serde.h | 6 + .../data_types/serde/data_type_number_serde.cpp | 60 +++++ .../vec/data_types/serde/data_type_number_serde.h | 7 + .../vec/data_types/serde/data_type_object_serde.h | 11 + .../serde/data_type_quantilestate_serde.h | 10 + be/src/vec/data_types/serde/data_type_serde.h | 19 ++ .../data_types/serde/data_type_string_serde.cpp | 19 ++ .../vec/data_types/serde/data_type_string_serde.h | 6 + 
.../vec/data_types/serde/data_type_struct_serde.h | 11 + be/test/vec/data_types/from_string_test.cpp | 279 +++++++++++++++++++++ .../data_types/serde/data_type_serde_text_test.cpp | 256 +++++++++++++++++++ be/test/vec/data_types/serde_utils.h | 53 ++++ 32 files changed, 1145 insertions(+), 68 deletions(-) diff --git a/be/src/vec/data_types/data_type_date_time.h b/be/src/vec/data_types/data_type_date_time.h index 2e63bc99b7..f096d003dd 100644 --- a/be/src/vec/data_types/data_type_date_time.h +++ b/be/src/vec/data_types/data_type_date_time.h @@ -84,7 +84,9 @@ public: std::string to_string(const IColumn& column, size_t row_num) const override; - DataTypeSerDeSPtr get_serde() const override { return std::make_shared<DataTypeDate64SerDe>(); } + DataTypeSerDeSPtr get_serde() const override { + return std::make_shared<DataTypeDateTimeSerDe>(); + } Field get_field(const TExprNode& node) const override { VecDateTimeValue value; diff --git a/be/src/vec/data_types/data_type_decimal.h b/be/src/vec/data_types/data_type_decimal.h index 50bfce56eb..21d899e620 100644 --- a/be/src/vec/data_types/data_type_decimal.h +++ b/be/src/vec/data_types/data_type_decimal.h @@ -249,7 +249,7 @@ public: void to_string(const IColumn& column, size_t row_num, BufferWritable& ostr) const override; Status from_string(ReadBuffer& rb, IColumn* column) const override; DataTypeSerDeSPtr get_serde() const override { - return std::make_shared<DataTypeDecimalSerDe<T>>(scale); + return std::make_shared<DataTypeDecimalSerDe<T>>(scale, precision); }; /// Decimal specific diff --git a/be/src/vec/data_types/serde/data_type_array_serde.h b/be/src/vec/data_types/serde/data_type_array_serde.h index 37516b45b0..28a90dc114 100644 --- a/be/src/vec/data_types/serde/data_type_array_serde.h +++ b/be/src/vec/data_types/serde/data_type_array_serde.h @@ -38,6 +38,17 @@ class DataTypeArraySerDe : public DataTypeSerDe { public: DataTypeArraySerDe(const DataTypeSerDeSPtr& _nested_serde) : nested_serde(_nested_serde) {} + void 
serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support serialize array column to buffer"; + } + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support deserialize from buffer to array"; + return Status::NotSupported("Not support deserialize from buffer to array"); + } + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override { LOG(FATAL) << "Not support write array column to pb"; diff --git a/be/src/vec/data_types/serde/data_type_bitmap_serde.h b/be/src/vec/data_types/serde/data_type_bitmap_serde.h index 01988b50bd..9f1d8f8a71 100644 --- a/be/src/vec/data_types/serde/data_type_bitmap_serde.h +++ b/be/src/vec/data_types/serde/data_type_bitmap_serde.h @@ -33,6 +33,17 @@ class Arena; class DataTypeBitMapSerDe : public DataTypeSerDe { public: + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support serialize bitmap column to buffer"; + } + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support deserialize from buffer to bitmap"; + return Status::NotSupported("Not support deserialize from buffer to bitmap"); + } + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override; Status read_column_from_pb(IColumn& column, const PValues& arg) const override; diff --git a/be/src/vec/data_types/serde/data_type_date64_serde.cpp b/be/src/vec/data_types/serde/data_type_date64_serde.cpp index 6133d73b08..e360a86172 100644 --- a/be/src/vec/data_types/serde/data_type_date64_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_date64_serde.cpp @@ -23,10 +23,119 @@ #include "gutil/casts.h" #include 
"vec/columns/column_const.h" +#include "vec/io/io_helper.h" namespace doris { namespace vectorized { +void DataTypeDate64SerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + Int64 int_val = assert_cast<const ColumnInt64&>(*ptr).get_element(row_num); + if (options.date_olap_format) { + tm time_tm; + memset(&time_tm, 0, sizeof(time_tm)); + time_tm.tm_mday = static_cast<int>(int_val & 31); + time_tm.tm_mon = static_cast<int>(int_val >> 5 & 15) - 1; + time_tm.tm_year = static_cast<int>(int_val >> 9) - 1900; + char buf[20] = {'\0'}; + strftime(buf, sizeof(buf), "%Y-%m-%d", &time_tm); + std::string s = std::string(buf); + bw.write(s.c_str(), s.length()); + } else { + doris::vectorized::VecDateTimeValue value = + binary_cast<Int64, doris::vectorized::VecDateTimeValue>(int_val); + + char buf[64]; + char* pos = value.to_string(buf); + bw.write(buf, pos - buf - 1); + } + bw.commit(); +} + +Status DataTypeDate64SerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& column_data = assert_cast<ColumnInt64&>(column); + Int64 val = 0; + if (options.date_olap_format) { + tm time_tm; + char* res = strptime(rb.position(), "%Y-%m-%d", &time_tm); + if (nullptr != res) { + val = (time_tm.tm_year + 1900) * 16 * 32 + (time_tm.tm_mon + 1) * 32 + time_tm.tm_mday; + } else { + // 1400 - 01 - 01 + val = 716833; + } + } else if (!read_date_text_impl<Int64>(val, rb)) { + return Status::InvalidArgument("parse date fail, string: '{}'", + std::string(rb.position(), rb.count()).c_str()); + } + column_data.insert_value(val); + return Status::OK(); +} + +void DataTypeDateTimeSerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto result = 
check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + Int64 int_val = assert_cast<const ColumnInt64&>(*ptr).get_element(row_num); + if (options.date_olap_format) { + tm time_tm; + int64 part1 = (int_val / 1000000L); + int64 part2 = (int_val - part1 * 1000000L); + time_tm.tm_year = static_cast<int>((part1 / 10000L) % 10000) - 1900; + time_tm.tm_mon = static_cast<int>((part1 / 100) % 100) - 1; + time_tm.tm_mday = static_cast<int>(part1 % 100); + + time_tm.tm_hour = static_cast<int>((part2 / 10000L) % 10000); + time_tm.tm_min = static_cast<int>((part2 / 100) % 100); + time_tm.tm_sec = static_cast<int>(part2 % 100); + char buf[20] = {'\0'}; + strftime(buf, 20, "%Y-%m-%d %H:%M:%S", &time_tm); + std::string s = std::string(buf); + bw.write(s.c_str(), s.length()); + } else { + doris::vectorized::VecDateTimeValue value = + binary_cast<Int64, doris::vectorized::VecDateTimeValue>(int_val); + + char buf[64]; + char* pos = value.to_string(buf); + bw.write(buf, pos - buf - 1); + } + bw.commit(); +} + +Status DataTypeDateTimeSerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& column_data = assert_cast<ColumnInt64&>(column); + Int64 val = 0; + if (options.date_olap_format) { + tm time_tm; + char* res = strptime(rb.position(), "%Y-%m-%d %H:%M:%S", &time_tm); + if (nullptr != res) { + val = ((time_tm.tm_year + 1900) * 10000L + (time_tm.tm_mon + 1) * 100L + + time_tm.tm_mday) * + 1000000L + + time_tm.tm_hour * 10000L + time_tm.tm_min * 100L + time_tm.tm_sec; + } else { + // 1400 - 01 - 01 + val = 14000101000000L; + } + } else if (!read_datetime_text_impl<Int64>(val, rb)) { + return Status::InvalidArgument("parse datetime fail, string: '{}'", + std::string(rb.position(), rb.count()).c_str()); + } + column_data.insert_value(val); + return Status::OK(); +} + void DataTypeDate64SerDe::write_column_to_arrow(const IColumn& column, const NullMap* null_map, 
arrow::ArrayBuilder* array_builder, int start, int end) const { diff --git a/be/src/vec/data_types/serde/data_type_date64_serde.h b/be/src/vec/data_types/serde/data_type_date64_serde.h index 94a4618d5c..a52cac5d15 100644 --- a/be/src/vec/data_types/serde/data_type_date64_serde.h +++ b/be/src/vec/data_types/serde/data_type_date64_serde.h @@ -42,6 +42,11 @@ namespace vectorized { class Arena; class DataTypeDate64SerDe : public DataTypeNumberSerDe<Int64> { + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; void write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, int end) const override; @@ -57,5 +62,13 @@ private: Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, int row_idx, bool col_const) const; }; + +class DataTypeDateTimeSerDe : public DataTypeDate64SerDe { + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; +}; } // namespace vectorized } // namespace doris \ No newline at end of file diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp index 6800486dd3..b6ddc5744d 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp @@ -23,10 +23,58 @@ #include "gutil/casts.h" #include "vec/columns/column_const.h" +#include "vec/io/io_helper.h" namespace doris { namespace vectorized { +void DataTypeDateTimeV2SerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, 
+ const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + UInt64 int_val = assert_cast<const ColumnUInt64&>(*ptr).get_element(row_num); + DateV2Value<DateTimeV2ValueType> val = + binary_cast<UInt64, DateV2Value<DateTimeV2ValueType>>(int_val); + + if (options.date_olap_format) { + std::string format = "%Y-%m-%d %H:%i:%s.%f"; + char buf[30]; + val.to_format_string(format.c_str(), format.size(), buf); + std::string s = std::string(buf); + bw.write(s.c_str(), s.length()); + } else { + char buf[64]; + char* pos = val.to_string(buf); + bw.write(buf, pos - buf - 1); + } + bw.commit(); +} + +Status DataTypeDateTimeV2SerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& column_data = assert_cast<ColumnUInt64&>(column); + UInt64 val = 0; + if (options.date_olap_format) { + doris::vectorized::DateV2Value<doris::vectorized::DateTimeV2ValueType> datetimev2_value; + std::string date_format = "%Y-%m-%d %H:%i:%s.%f"; + if (datetimev2_value.from_date_format_str(date_format.data(), date_format.size(), + rb.position(), rb.count())) { + val = datetimev2_value.to_date_int_val(); + } else { + val = doris::vectorized::MIN_DATETIME_V2; + } + + } else if (!read_datetime_v2_text_impl<UInt64>(val, rb)) { + return Status::InvalidArgument("parse date fail, string: '{}'", + std::string(rb.position(), rb.count()).c_str()); + } + column_data.insert_value(val); + return Status::OK(); +} + void DataTypeDateTimeV2SerDe::write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, int end) const { diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h index 0b943e6010..b0309198d0 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h +++ 
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h @@ -44,6 +44,13 @@ class Arena; class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe<UInt64> { public: DataTypeDateTimeV2SerDe(int scale) : scale(scale) {}; + + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; + void write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, int end) const override; diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp index a538ec01f4..6d9d4f3417 100644 --- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp @@ -23,10 +23,48 @@ #include "gutil/casts.h" #include "vec/columns/column_const.h" +#include "vec/io/io_helper.h" namespace doris { namespace vectorized { +void DataTypeDateV2SerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + UInt32 int_val = assert_cast<const ColumnUInt32&>(*ptr).get_element(row_num); + DateV2Value<DateV2ValueType> val = binary_cast<UInt32, DateV2Value<DateV2ValueType>>(int_val); + + char buf[64]; + char* pos = val.to_string(buf); + // DateTime to_string the end is /0 + bw.write(buf, pos - buf - 1); + bw.commit(); +} + +Status DataTypeDateV2SerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& column_data = assert_cast<ColumnUInt32&>(column); + UInt32 val = 0; + if (options.date_olap_format) { + tm time_tm; + char* res = strptime(rb.position(), "%Y-%m-%d", &time_tm); + if 
(nullptr != res) { + val = ((time_tm.tm_year + 1900) << 9) | ((time_tm.tm_mon + 1) << 5) | time_tm.tm_mday; + } else { + val = doris::vectorized::MIN_DATE_V2; + } + } else if (!read_date_v2_text_impl<UInt32>(val, rb)) { + return Status::InvalidArgument("parse date fail, string: '{}'", + std::string(rb.position(), rb.count()).c_str()); + } + column_data.insert_value(val); + return Status::OK(); +} + void DataTypeDateV2SerDe::write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, int end) const { diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.h b/be/src/vec/data_types/serde/data_type_datev2_serde.h index 5d2baf7704..689ed08092 100644 --- a/be/src/vec/data_types/serde/data_type_datev2_serde.h +++ b/be/src/vec/data_types/serde/data_type_datev2_serde.h @@ -42,6 +42,12 @@ namespace vectorized { class Arena; class DataTypeDateV2SerDe : public DataTypeNumberSerDe<UInt32> { + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; + void write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, int end) const override; diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp index 5e46c996e7..fa230e917d 100644 --- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp @@ -26,11 +26,44 @@ #include "gutil/casts.h" #include "vec/columns/column_decimal.h" #include "vec/common/arithmetic_overflow.h" +#include "vec/io/io_helper.h" namespace doris { namespace vectorized { +template <typename T> +void DataTypeDecimalSerDe<T>::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + 
const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + auto& col = assert_cast<const ColumnDecimal<T>&>(*ptr); + if constexpr (!IsDecimalV2<T>) { + T value = col.get_element(row_num); + auto decimal_str = value.to_string(scale); + bw.write(decimal_str.data(), decimal_str.size()); + } else { + auto length = col.get_element(row_num).to_string(buf, scale, scale_multiplier); + bw.write(buf, length); + } + bw.commit(); +} +template <typename T> +Status DataTypeDecimalSerDe<T>::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& column_data = assert_cast<ColumnDecimal<T>&>(column).get_data(); + T val = 0; + if (!read_decimal_text_impl<T>(val, rb, precision, scale)) { + return Status::InvalidArgument("parse decimal fail, string: '{}'", + std::string(rb.position(), rb.count()).c_str()); + } + column_data.emplace_back(val); + return Status::OK(); +} + template <typename T> void DataTypeDecimalSerDe<T>::write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int start, diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.h b/be/src/vec/data_types/serde/data_type_decimal_serde.h index 2b64a00eca..21c5f681a6 100644 --- a/be/src/vec/data_types/serde/data_type_decimal_serde.h +++ b/be/src/vec/data_types/serde/data_type_decimal_serde.h @@ -47,10 +47,17 @@ class DataTypeDecimalSerDe : public DataTypeSerDe { static_assert(IsDecimalNumber<T>); public: - DataTypeDecimalSerDe(int scale_) + DataTypeDecimalSerDe(int scale_, int precision_) : scale(scale_), + precision(precision_), scale_multiplier(decimal_scale_multiplier<typename T::NativeType>(scale)) {} + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, 
ReadBuffer& rb, + const FormatOptions& options) const override; + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override; Status read_column_from_pb(IColumn& column, const PValues& arg) const override; @@ -76,7 +83,8 @@ private: int row_idx, bool col_const) const; int scale; - const T::NativeType scale_multiplier; + int precision; + const typename T::NativeType scale_multiplier; mutable char buf[T::max_string_length()]; }; diff --git a/be/src/vec/data_types/serde/data_type_fixedlengthobject_serde.h b/be/src/vec/data_types/serde/data_type_fixedlengthobject_serde.h index 69fe53e241..ca1795181e 100644 --- a/be/src/vec/data_types/serde/data_type_fixedlengthobject_serde.h +++ b/be/src/vec/data_types/serde/data_type_fixedlengthobject_serde.h @@ -36,6 +36,16 @@ class Arena; class DataTypeFixedLengthObjectSerDe : public DataTypeSerDe { public: + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support serialize FixedLengthObject column to buffer"; + } + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support deserialize FixedLengthObject column from buffer"; + return Status::NotSupported("Not support deserialize FixedLengthObject column from buffer"); + } + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override { return Status::NotSupported("Not support write FixedLengthObject column to pb"); diff --git a/be/src/vec/data_types/serde/data_type_hll_serde.cpp b/be/src/vec/data_types/serde/data_type_hll_serde.cpp index 72052d47cf..52b59cb3c0 100644 --- a/be/src/vec/data_types/serde/data_type_hll_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_hll_serde.cpp @@ -37,6 +37,28 @@ namespace doris { namespace vectorized { class IColumn; +void 
DataTypeHLLSerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto col_row = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = col_row.first; + row_num = col_row.second; + auto& data = const_cast<HyperLogLog&>(assert_cast<const ColumnHLL&>(*ptr).get_element(row_num)); + std::unique_ptr<char[]> buf = std::make_unique<char[]>(data.max_serialized_size()); + size_t size = data.serialize((uint8*)buf.get()); + bw.write(buf.get(), size); + bw.commit(); +} + +Status DataTypeHLLSerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& data_column = assert_cast<ColumnHLL&>(column); + + HyperLogLog hyper_log_log(Slice(rb.to_string())); + data_column.insert_value(hyper_log_log); + return Status::OK(); +} + Status DataTypeHLLSerDe::write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const { auto ptype = result.mutable_type(); diff --git a/be/src/vec/data_types/serde/data_type_hll_serde.h b/be/src/vec/data_types/serde/data_type_hll_serde.h index 981b197ca8..65f56cd3d0 100644 --- a/be/src/vec/data_types/serde/data_type_hll_serde.h +++ b/be/src/vec/data_types/serde/data_type_hll_serde.h @@ -33,6 +33,10 @@ class Arena; class DataTypeHLLSerDe : public DataTypeSerDe { public: + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override; Status read_column_from_pb(IColumn& column, const PValues& arg) const override; diff --git a/be/src/vec/data_types/serde/data_type_jsonb.h b/be/src/vec/data_types/serde/data_type_jsonb.h deleted file mode 100644 index d274a34571..0000000000 --- 
a/be/src/vec/data_types/serde/data_type_jsonb.h +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once -#include <gen_cpp/types.pb.h> -#include <glog/logging.h> -#include <stddef.h> -#include <stdint.h> - -#include "data_type_number_serde.h" -#include "vec/core/types.h" - -namespace doris { -class JsonbOutStream; - -namespace vectorized { -class Arena; - -class DataTypeTimeSerDe : public DataTypeNumberSerDe<Float64> { - Status write_column_to_mysql(const IColumn& column, std::vector<MysqlRowBuffer<false>>& result, - int row_idx, int start, int end, bool col_const) const override { - return _write_date_time_column_to_mysql(column, result, row_idx, start, end, col_const); - } - Status write_column_to_mysql(const IColumn& column, std::vector<MysqlRowBuffer<true>>& result, - int row_idx, int start, int end, bool col_const) const override { - return _write_date_time_column_to_mysql(column, result, row_idx, start, end, col_const); - } - -private: - template <bool is_binary_format> - Status _write_date_time_column_to_mysql(const IColumn& column, - std::vector<MysqlRowBuffer<is_binary_format>>& result, - int row_idx, int start, int end, bool col_const) const { - int buf_ret = 0; - 
auto& data = assert_cast<const ColumnVector<Float64>&>(column).get_data(); - for (int i = start; i < end; ++i) { - if (0 != buf_ret) { - return Status::InternalError("pack mysql buffer failed."); - } - const auto col_index = index_check_const(i, col_const); - buf_ret = result[row_idx].push_time(data[col_index]); - ++row_idx; - } - return Status::OK(); - } -}; -} // namespace vectorized -} // namespace doris \ No newline at end of file diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp index ff42a06c9d..c85c5c4abe 100644 --- a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp @@ -17,7 +17,8 @@ #include "data_type_jsonb_serde.h" -#include <arrow/array/builder_binary.h> +#include "arrow/array/builder_binary.h" +#include "runtime/jsonb_value.h" namespace doris { namespace vectorized { @@ -54,6 +55,30 @@ Status DataTypeJsonbSerDe::write_column_to_mysql(const IColumn& column, return _write_column_to_mysql(column, row_buffer, row_idx, col_const); } +void DataTypeJsonbSerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + const StringRef& s = assert_cast<const ColumnString&>(*ptr).get_data_at(row_num); + if (s.size > 0) { + bw.write(s.data, s.size); + bw.commit(); + } +} + +Status DataTypeJsonbSerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + JsonBinaryValue value; + RETURN_IF_ERROR(value.from_json_string(rb.position(), rb.count())); + + auto& column_string = assert_cast<ColumnString&>(column); + column_string.insert_data(value.value(), value.size()); + return Status::OK(); +} + void DataTypeJsonbSerDe::write_column_to_arrow(const IColumn& column, const NullMap* null_map, 
arrow::ArrayBuilder* array_builder, int start, int end) const { @@ -74,4 +99,4 @@ void DataTypeJsonbSerDe::write_column_to_arrow(const IColumn& column, const Null } } // namespace vectorized -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.h b/be/src/vec/data_types/serde/data_type_jsonb_serde.h index 9bf523504c..1d612eeb69 100644 --- a/be/src/vec/data_types/serde/data_type_jsonb_serde.h +++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.h @@ -42,6 +42,12 @@ class DataTypeJsonbSerDe : public DataTypeStringSerDe { arrow::ArrayBuilder* array_builder, int start, int end) const override; + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; + private: template <bool is_binary_format> Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, diff --git a/be/src/vec/data_types/serde/data_type_map_serde.h b/be/src/vec/data_types/serde/data_type_map_serde.h index 67474d0676..b415ec4d27 100644 --- a/be/src/vec/data_types/serde/data_type_map_serde.h +++ b/be/src/vec/data_types/serde/data_type_map_serde.h @@ -39,6 +39,17 @@ public: DataTypeMapSerDe(const DataTypeSerDeSPtr& _key_serde, const DataTypeSerDeSPtr& _value_serde) : key_serde(_key_serde), value_serde(_value_serde) {} + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support serialize map column to buffer"; + } + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support deserialize from buffer to map"; + return Status::NotSupported("Not support deserialize from buffer to map"); + } + Status 
write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override { LOG(FATAL) << "Not support write map column to pb"; diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp index 654890f501..b40acdfd0d 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp @@ -38,6 +38,44 @@ namespace doris { namespace vectorized { class Arena; +void DataTypeNullableSerDe::serialize_one_cell_to_text(const IColumn& column, int row_num, + BufferWritable& bw, + const FormatOptions& options) const { + auto result = check_column_const_set_readability(column, row_num); + ColumnPtr ptr = result.first; + row_num = result.second; + + const auto& col_null = assert_cast<const ColumnNullable&>(*ptr); + if (col_null.is_null_at(row_num)) { + bw.write("NULL", 4); + bw.commit(); + } else { + nested_serde->serialize_one_cell_to_text(col_null.get_nested_column(), row_num, bw, + options); + } +} + +Status DataTypeNullableSerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const { + auto& null_column = assert_cast<ColumnNullable&>(column); + // TODO(Amory) make null literal configurable + if (rb.count() == 4 && *(rb.position()) == 'N' && *(rb.position() + 1) == 'U' && + *(rb.position() + 2) == 'L' && *(rb.position() + 3) == 'L') { + null_column.insert_data(nullptr, 0); + return Status::OK(); + } + auto st = nested_serde->deserialize_one_cell_from_text(null_column.get_nested_column(), rb, + options); + if (!st.ok()) { + // fill null if fail + null_column.insert_data(nullptr, 0); // 0 is meaningless here + return Status::OK(); + } + // fill not null if succ + null_column.get_null_map_data().push_back(0); + return Status::OK(); +} + Status DataTypeNullableSerDe::write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const { int row_count = end - start; 
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h b/be/src/vec/data_types/serde/data_type_nullable_serde.h
index 1c7dc1b8a5..4ff7630ddf 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h
@@ -35,6 +35,12 @@ class DataTypeNullableSerDe : public DataTypeSerDe {
 public:
     DataTypeNullableSerDe(const DataTypeSerDeSPtr& _nested_serde) : nested_serde(_nested_serde) {}
 
+    void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw,
+                                    const FormatOptions& options) const override;
+
+    Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                          const FormatOptions& options) const override;
+
     Status write_column_to_pb(const IColumn& column, PValues& result, int start,
                               int end) const override;
     Status read_column_from_pb(IColumn& column, const PValues& arg) const override;
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index 081ad569c3..d887d90d2d 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -22,6 +22,9 @@
 #include <type_traits>
 
 #include "gutil/casts.h"
+#include "gutil/strings/numbers.h"
+#include "util/mysql_global.h"
+#include "vec/io/io_helper.h"
 
 namespace doris {
 namespace vectorized {
@@ -97,6 +100,63 @@ void DataTypeNumberSerDe<T>::write_column_to_arrow(const IColumn& column, const
     }
 }
 
+// Parses one text cell from `rb` and appends it to `column` (a ColumnVector<T>).
+//   - UInt128:       rejected, not supported yet.
+//   - float/double:  read_float_text_fast_impl.
+//   - uint8_t:       try_read_bool_text (this width carries the bool type).
+//   - other ints:    read_int_text_impl.
+// Returns InvalidArgument when the reader reports a parse failure; nothing is appended
+// by this function in that case.
+template <typename T>
+Status DataTypeNumberSerDe<T>::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                                              const FormatOptions& options) const {
+    // Checked cast, consistent with serialize_one_cell_to_text below; an unchecked
+    // reinterpret_cast would silently accept a column of the wrong type.
+    auto& column_data = assert_cast<ColumnType&>(column);
+    if constexpr (std::is_same_v<T, UInt128>) {
+        // TODO: support for Uint128
+        return Status::InvalidArgument("uint128 is not support");
+    } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        T val = 0;
+        if (!read_float_text_fast_impl(val, rb)) {
+            return Status::InvalidArgument("parse number fail, string: '{}'",
+                                           std::string(rb.position(), rb.count()));
+        }
+        column_data.insert_value(val);
+    } else if constexpr (std::is_same_v<T, uint8_t>) {
+        // Note: here we should handle the bool type
+        // This branch must stay ahead of the generic integral branch, which would also match.
+        T val = 0;
+        if (!try_read_bool_text(val, rb)) {
+            return Status::InvalidArgument("parse boolean fail, string: '{}'",
+                                           std::string(rb.position(), rb.count()));
+        }
+        column_data.insert_value(val);
+    } else if constexpr (std::is_integral_v<T>) {
+        T val = 0;
+        if (!read_int_text_impl(val, rb)) {
+            return Status::InvalidArgument("parse number fail, string: '{}'",
+                                           std::string(rb.position(), rb.count()));
+        }
+        column_data.insert_value(val);
+    } else {
+        DCHECK(false);
+    }
+    return Status::OK();
+}
+
+// Writes the value at `row_num` of `column` as text into `bw` and commits it.
+//   - UInt128: int128_to_string.
+//   - float:   gutil FloatToBuffer (see the comment in that branch).
+//   - other integral or IEC 559 types: bw.write_number.
+template <typename T>
+void DataTypeNumberSerDe<T>::serialize_one_cell_to_text(const IColumn& column, int row_num,
+                                                        BufferWritable& bw,
+                                                        const FormatOptions& options) const {
+    // The helper returns the column and row index that are actually read below.
+    auto result = check_column_const_set_readability(column, row_num);
+    ColumnPtr ptr = result.first;
+    row_num = result.second;
+    auto data = assert_cast<const ColumnVector<T>&>(*ptr).get_element(row_num);
+    if constexpr (std::is_same_v<T, UInt128>) {
+        std::string hex = int128_to_string(data);
+        bw.write(hex.data(), hex.size());
+    } else if constexpr (std::is_same_v<T, float>) {
+        // fmt::format_to maybe get inaccurate results at float type, so we use gutil implement.
+        char buf[MAX_FLOAT_STR_LENGTH + 2];
+        int len = FloatToBuffer(data, MAX_FLOAT_STR_LENGTH + 2, buf);
+        bw.write(buf, len);
+    } else if constexpr (std::is_integral_v<T> || std::numeric_limits<T>::is_iec559) {
+        bw.write_number(data);
+    }
+    bw.commit();
+}
+
 template <typename T>
 void DataTypeNumberSerDe<T>::read_column_from_arrow(IColumn& column,
                                                     const arrow::Array* arrow_array, int start,
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.h b/be/src/vec/data_types/serde/data_type_number_serde.h
index 2130d78493..b3df10f062 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.h
+++ b/be/src/vec/data_types/serde/data_type_number_serde.h
@@ -54,6 +54,13 @@ class DataTypeNumberSerDe : public DataTypeSerDe {
 public:
     using ColumnType = ColumnVector<T>;
+
+    void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw,
+                                    const FormatOptions& options) const override;
+
+    Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                          const FormatOptions& options) const override;
+
     Status write_column_to_pb(const IColumn& column, PValues& result, int start,
                               int end) const override;
     Status read_column_from_pb(IColumn& column, const PValues& arg) const override;
diff --git a/be/src/vec/data_types/serde/data_type_object_serde.h b/be/src/vec/data_types/serde/data_type_object_serde.h
index ceb4a20cf2..22235a1573 100644
--- a/be/src/vec/data_types/serde/data_type_object_serde.h
+++ b/be/src/vec/data_types/serde/data_type_object_serde.h
@@ -36,6 +36,17 @@ class Arena;
 
 class DataTypeObjectSerDe : public DataTypeSerDe {
 public:
+    void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw,
+                                    const FormatOptions& options) const override {
+        LOG(FATAL) << "Not support write object column to buffer";
+    }
+
+    Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                          const FormatOptions& options) const override {
+        LOG(FATAL) << "Not support read object column from buffer";
+        return Status::NotSupported("Not support read object column from buffer");
+    }
+
     Status write_column_to_pb(const IColumn& column, PValues& result, int start,
                               int end) const override {
         LOG(FATAL) << "Not support write object column to pb";
diff --git a/be/src/vec/data_types/serde/data_type_quantilestate_serde.h b/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
index ebeccca097..08fe45699c 100644
--- a/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
+++ b/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
@@ -40,6 +40,16 @@ namespace vectorized {
 template <typename T>
 class DataTypeQuantileStateSerDe : public DataTypeSerDe {
 public:
+    void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw,
+                                    const FormatOptions& options) const override {
+        LOG(FATAL) << "Not support write QuantileState column to buffer";
+    }
+
+    Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                          const FormatOptions& options) const override {
+        LOG(FATAL) << "Not support read QuantileState column from buffer";
+        return Status::NotSupported("Not support read QuantileState column from buffer");
+    }
     Status write_column_to_pb(const IColumn& column, PValues& result, int start,
                               int end) const override;
     Status read_column_from_pb(IColumn& column, const PValues& arg) const override;
diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h
index 4df22029c9..36a2bd104a 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -29,7 +29,9 @@
 #include "vec/columns/column_nullable.h"
 #include "vec/common/pod_array.h"
 #include "vec/common/pod_array_fwd.h"
+#include "vec/common/string_buffer.hpp"
 #include "vec/core/types.h"
+#include "vec/io/reader_buffer.h"
 
 namespace arrow {
 class ArrayBuilder;
@@ -59,9 +61,26 @@ class IDataType;
 // the developer does not know how many datatypes has to deal.
 class DataTypeSerDe {
+public:
+    // Text serialization/deserialization of data types depends on some settings, which we
+    // define in FormatOptions.
+    struct FormatOptions {
+        /**
+         * If true, use the OLAP text format defined in src/olap/types.h. Using this format in
+         * OLAP is not recommended because it is slower; the option is kept for compatibility.
+         */
+        bool date_olap_format = false;
+    };
+
 public:
     DataTypeSerDe();
     virtual ~DataTypeSerDe();
+    // Text serializer and deserializer; FormatOptions selects between the supported text formats.
+    // serialize_one_cell_to_text writes the cell at `row_num` of `column` into `bw`.
+    virtual void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw,
+                                            const FormatOptions& options) const = 0;
+
+    // deserialize_one_cell_from_text parses the content of `rb` as a single cell for `column`
+    // and reports the outcome through the returned Status.
+    virtual Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                                  const FormatOptions& options) const = 0;
 
     // Protobuf serializer and deserializer
     virtual Status write_column_to_pb(const IColumn& column, PValues& result, int start,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.cpp b/be/src/vec/data_types/serde/data_type_string_serde.cpp
index bff2df4431..dda20b5b65 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_string_serde.cpp
@@ -34,6 +34,25 @@ namespace doris {
 namespace vectorized {
 class Arena;
 
+// Copies the bytes of the string at `row_num` into `bw` unchanged (no quoting or escaping
+// is applied here) and commits them.
+void DataTypeStringSerDe::serialize_one_cell_to_text(const IColumn& column, int row_num,
+                                                     BufferWritable& bw,
+                                                     const FormatOptions& options) const {
+    // The helper returns the column and row index that are actually read below.
+    auto result = check_column_const_set_readability(column, row_num);
+    ColumnPtr ptr = result.first;
+    row_num = result.second;
+
+    const auto& value = assert_cast<const ColumnString&>(*ptr).get_data_at(row_num);
+    bw.write(value.data, value.size);
+    bw.commit();
+}
+
+// Appends the rb.count() bytes starting at rb.position() to the string column as one value.
+// Always returns OK; `options` is not consulted.
+Status DataTypeStringSerDe::deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb,
+                                                           const FormatOptions& options) const {
+    auto& column_data = assert_cast<ColumnString&>(column);
+    column_data.insert_data(rb.position(), rb.count());
+    return Status::OK();
+}
+
Status DataTypeStringSerDe::write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const { result.mutable_bytes_value()->Reserve(end - start); diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index 85b60cb268..d377c345b9 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -33,6 +33,12 @@ class Arena; class DataTypeStringSerDe : public DataTypeSerDe { public: + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override; + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override; + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override; Status read_column_from_pb(IColumn& column, const PValues& arg) const override; diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.h b/be/src/vec/data_types/serde/data_type_struct_serde.h index 927c7ac9ed..9491fc1d49 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.h +++ b/be/src/vec/data_types/serde/data_type_struct_serde.h @@ -39,6 +39,17 @@ public: DataTypeStructSerDe(const DataTypeSerDeSPtrs& _elemSerDeSPtrs) : elemSerDeSPtrs(_elemSerDeSPtrs) {} + void serialize_one_cell_to_text(const IColumn& column, int row_num, BufferWritable& bw, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support serialize struct column to buffer"; + } + + Status deserialize_one_cell_from_text(IColumn& column, ReadBuffer& rb, + const FormatOptions& options) const override { + LOG(FATAL) << "Not support deserialize from buffer to struct"; + return Status::NotSupported("Not support deserialize from buffer to struct"); + } + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override { LOG(FATAL) << "Not support write 
struct column to pb"; diff --git a/be/test/vec/data_types/from_string_test.cpp b/be/test/vec/data_types/from_string_test.cpp new file mode 100644 index 0000000000..69efe394df --- /dev/null +++ b/be/test/vec/data_types/from_string_test.cpp @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gtest/gtest_pred_impl.h" +#include "olap/types.h" // for TypeInfo +#include "olap/wrapper_field.h" +#include "vec/columns/column.h" +#include "vec/core/field.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/serde_utils.h" +#include "vec/io/reader_buffer.h" + +namespace doris::vectorized { + +/** + * This test is used to check wrapperField from_string is equal to data type from_string or not + * same string feed to wrapperField and data type from_string, and check the result from + * wrapperField and data type to_string is equal or not + */ +TEST(FromStringTest, ScalaWrapperFieldVsDataType) { + // arithmetic scala field types + { + // fieldType, test_string, expect_wrapper_field_string, expect_data_type_string + typedef std::tuple<FieldType, std::vector<string>, std::vector<string>, std::vector<string>> + FieldType_RandStr; + std::vector<FieldType_RandStr> arithmetic_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_BOOL, {"0", "1", "-9"}, + {"0", "1", "1"}, {"0", "1", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_TINYINT, {"127", "-128", "-190"}, + {"127", "-128", "66"}, {"127", "-128", ""}), + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_SMALLINT, {"32767", "32768", "-32769"}, + {"32767", "-32768", "32767"}, {"32767", "", ""}), + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_INT, {"2147483647", "2147483648", "-2147483649"}, + {"2147483647", "-2147483648", "2147483647"}, {"2147483647", "", ""}), + // float ==> float32(32bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_FLOAT, {"1.123", "3.40282e+38", "3.40282e+38+1"}, + 
{"1.123", "3.40282e+38", "3.40282e+38"}, {"1.123", "3.40282e+38", ""}), + // double ==> float64(64bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DOUBLE, + {"2343.12345465746", "2.22507e-308", "2.22507e-308-1"}, + {"2343.12345465746", "2.22507e-308", "2.22507e-308"}, + {"2343.12345465746", "2.22507e-308", ""}), + // BIGINT ==> int64_t(64bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_BIGINT, + {"9223372036854775807", "-9223372036854775808", "9223372036854775808"}, + {"9223372036854775807", "-9223372036854775808", "9223372036854775807"}, + {"9223372036854775807", "-9223372036854775808", ""}), + // LARGEINT ==> int128_t(128bit) + // here if it has overflow , wrapper field will return 0, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_LARGEINT, + {"170141183460469231731687303715884105727", + "−170141183460469231731687303715884105728", + "170141183460469231731687303715884105728"}, + {"170141183460469231731687303715884105727", "0", "0"}, + {"170141183460469231731687303715884105727", "", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_CHAR, {"amory happy"}, {"amory happy"}, + {"amory happy"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_VARCHAR, {"doris be better"}, + {"doris be better"}, {"doris be better"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_STRING, {"doris be better"}, + {"doris be better"}, {"doris be better"}), + // here if non-valid string , wrapper field will return make 999999999999999999.999999999, but data type will just throw error + // decimal ==> decimalv2(decimal<128>(27,9)) (17, 9)(firstN 0 will ignore) + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_DECIMAL, + { + "012345678901234567.012345678", + // (18, 8) (automatically fill 0 for scala) + 
"123456789012345678.01234567", + // (17, 10) (wrapper_field just drop last, but data_type rounding last to make it fit) + "12345678901234567.0123456779", + // (17, 11) (wrapper_field just drop last, but data_type return error) + "12345678901234567.01234567791", + // (19, 8) (wrong) + "1234567890123456789.01234567", + }, + {"12345678901234567.012345678", "123456789012345678.012345670", + "12345678901234567.012345677", "12345678901234567.012345677", + "999999999999999999.999999999"}, + {"12345678901234567.012345678", "123456789012345678.012345670", + "12345678901234567.012345678", "", ""}), + // here decimal if non-valid value wrapper field will return make 999999999999999999.999999999, but data type will just throw error + // wrapper field to_string() will drop the scala. + // decimal32 ==> decimal32(9,2) (7,2) (6,3) (7,3) (8,1) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32, + {"1234567.12", "123456.123", "1234567.123", "12345679.1"}, + {"1234567", "123456", "999999999", "12345679"}, + {"1234567.12", "123456.12", "", ""}), + // decimal64 ==> decimal64(18,9) (9, 9) (3,2) (9, 10) (10, 9) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL64, + {"123456789.123456789", "123.12", "123456789.0123456789", + "1234567890.123456789"}, + {"123456789", "123", "999999999999999999", "999999999999999999"}, + {"123456789.123456789", "123.120000000", "", ""}), + // decimal128I ==> decimal128I(38,18) (19,18) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL128I, + {"01234567890123456789.123456789123456789", + // (20,11) (automatically fill 0 for scala) + "12345678901234567890.12345678911", + // (19,18) + "1234567890123456789.123456789123456789", + // (19,19) (rounding last to make it fit) + "1234567890123456789.1234567890123456789", + // (18, 20) (rounding to make it fit) + "123456789012345678.01234567890123456789", + // (20, 19) (wrong) + "12345678901234567890.1234567890123456789"}, + {"1234567890123456789", "12345678901234567890", + "1234567890123456789", 
"1234567890123456789", + "123456789012345678", "99999999999999999999999999999999999999"}, + {"1234567890123456789.123456789123456789", + "12345678901234567890.123456789110000000", + "1234567890123456789.123456789123456789", + "1234567890123456789.123456789012345679", + "123456789012345678.012345678901234568", ""}), + + }; + for (auto type_pair : arithmetic_scala_field_types) { + auto type = std::get<0>(type_pair); + DataTypePtr data_type_ptr; + if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL32) { + // decimal32(7, 2) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 9, 2); + } else if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL64) { + // decimal64(18, 9) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 18, 9); + } else if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL128I) { + // decimal128I(38,18) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 38, 18); + } else { + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + } + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + // wrapper_field + for (int i = 0; i < std::get<1>(type_pair).size(); ++i) { + string test_str = std::get<1>(type_pair)[i]; + std::unique_ptr<WrapperField> wf(WrapperField::create_by_type(type)); + std::cout << "the ith : " << i << " test_str: " << test_str << std::endl; + // from_string + Status st = wf->from_string(test_str); + EXPECT_EQ(st.ok(), true); + //to_string + std::string wfs = wf->to_string(); + EXPECT_EQ(wfs, std::get<2>(type_pair)[i]); + } + + auto col = data_type_ptr->create_column(); + // data_type + for (int i = 0; i < std::get<1>(type_pair).size(); ++i) { + std::cout << "the ith : " << i << std::endl; + string test_str = std::get<1>(type_pair)[i]; + // data_type from_string + ReadBuffer rb_test(test_str.data(), test_str.size()); + Status st = data_type_ptr->from_string(rb_test, col); + if (std::get<3>(type_pair)[i].empty()) { + 
EXPECT_EQ(st.ok(), false); + std::cout << "deserialize failed: " << st.to_json() << std::endl; + continue; + } + EXPECT_EQ(st.ok(), true); + // data_type to_string + string min_s_d = data_type_ptr->to_string(*col, i); + EXPECT_EQ(min_s_d, std::get<3>(type_pair)[i]); + } + } + } + + // date and datetime type + { + typedef std::pair<FieldType, string> FieldType_RandStr; + std::vector<FieldType_RandStr> date_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATE, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATEV2, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIME, "2020-01-01 12:00:00"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIMEV2, + "2020-01-01 12:00:00.666666"), + }; + for (auto pair : date_scala_field_types) { + auto type = pair.first; + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + std::unique_ptr<WrapperField> min_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> max_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> rand_wf(WrapperField::create_by_type(type)); + + min_wf->set_to_min(); + max_wf->set_to_max(); + rand_wf->from_string(pair.second, 0, 0); + + string min_s = min_wf->to_string(); + string max_s = max_wf->to_string(); + string rand_date = rand_wf->to_string(); + + ReadBuffer min_rb(min_s.data(), min_s.size()); + ReadBuffer max_rb(max_s.data(), max_s.size()); + ReadBuffer rand_rb(rand_date.data(), rand_date.size()); + + auto col = data_type_ptr->create_column(); + Status st = data_type_ptr->from_string(min_rb, col); + EXPECT_EQ(st.ok(), true); + st = data_type_ptr->from_string(max_rb, col); + EXPECT_EQ(st.ok(), true); + st = data_type_ptr->from_string(rand_rb, col); + EXPECT_EQ(st.ok(), true); + + string min_s_d = data_type_ptr->to_string(*col, 0); + string max_s_d = 
data_type_ptr->to_string(*col, 1); + string rand_s_d = data_type_ptr->to_string(*col, 2); + rtrim(min_s); + rtrim(max_s); + rtrim(rand_date); + std::cout << "min(" << min_s << ") with datat_ype_str:" << min_s_d << std::endl; + std::cout << "max(" << max_s << ") with datat_ype_str:" << max_s_d << std::endl; + std::cout << "rand(" << rand_date << ") with datat_type_str:" << rand_s_d << std::endl; + if (FieldType::OLAP_FIELD_TYPE_DATETIMEV2 == type) { + // field to_string : %Y-%m-%d %H:%i:%s.%f vs data type to_string %Y-%m-%d %H:%i:%s + min_s = min_s.substr(0, min_s.find_last_of('.')); + max_s = max_s.substr(0, max_s.find_last_of('.')); + rand_date = rand_date.substr(0, rand_date.find_last_of('.')); + } + // min wrapper field date to_string in macOS and linux system has different result + // macOs equals with data type to_string(0000-01-01), but in linux is (0-01-01) + if (FieldType::OLAP_FIELD_TYPE_DATE == type || + FieldType::OLAP_FIELD_TYPE_DATETIME == type) { + // min wrapper field date to_string in macOS and linux system has different result + // macOs equals with data type to_string(0000-01-01), but in linux is (0-01-01) + std::cout << "wrapper field (" << min_s << ") with data type to_string(" << min_s_d + << ")" << std::endl; + } else { + EXPECT_EQ(min_s, min_s_d); + } + EXPECT_EQ(max_s, max_s_d); + EXPECT_EQ(rand_date, rand_s_d); + } + } + + // null data type + { + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type( + FieldType::OLAP_FIELD_TYPE_STRING, 0, 0); + DataTypePtr nullable_ptr = std::make_shared<DataTypeNullable>(data_type_ptr); + std::unique_ptr<WrapperField> rand_wf( + WrapperField::create_by_type(FieldType::OLAP_FIELD_TYPE_STRING)); + std::string test_str = generate(128); + rand_wf->from_string(test_str, 0, 0); + Field string_field(test_str); + ColumnPtr col = nullable_ptr->create_column_const(0, string_field); + EXPECT_EQ(rand_wf->to_string(), nullable_ptr->to_string(*col, 0)); + } +} + +} // namespace doris::vectorized diff 
--git a/be/test/vec/data_types/serde/data_type_serde_text_test.cpp b/be/test/vec/data_types/serde/data_type_serde_text_test.cpp new file mode 100644 index 0000000000..4d1139003d --- /dev/null +++ b/be/test/vec/data_types/serde/data_type_serde_text_test.cpp @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest_pred_impl.h" +#include "olap/types.h" // for TypeInfo +#include "olap/wrapper_field.h" +#include "vec/columns/column.h" +#include "vec/common/string_buffer.hpp" +#include "vec/core/field.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/serde/data_type_serde.h" +#include "vec/data_types/serde_utils.h" +#include "vec/io/reader_buffer.h" + +namespace doris::vectorized { +// This test aim to make sense for text serde of data types. +// we use default formatOption and special formatOption to equal serde for wrapperField. 
+TEST(TextSerde, ScalaDataTypeSerdeTextTest) { + // arithmetic scala field types + { + // fieldType, test_string, expect_string + typedef std::tuple<FieldType, std::vector<string>, std::vector<string>> FieldType_RandStr; + std::vector<FieldType_RandStr> arithmetic_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_BOOL, {"0", "1", "-1"}, + {"0", "1", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_TINYINT, {"127", "-128", "-190"}, + {"127", "-128", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_SMALLINT, {"32767", "32768", "-32769"}, + {"32767", "", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_INT, + {"2147483647", "2147483648", "-2147483649"}, + {"2147483647", "", ""}), + // float ==> float32(32bit) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_FLOAT, + {"1.123", "3.40282e+38", "3.40282e+38+1"}, + {"1.123", "3.40282e+38", ""}), + // double ==> float64(64bit) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DOUBLE, + {"2343.12345465746", "2.22507e-308", "2.22507e-308-1"}, + {"2343.12345465746", "2.22507e-308", ""}), + // BIGINT ==> int64_t(64bit) + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_BIGINT, + {"9223372036854775807", "-9223372036854775808", "9223372036854775808"}, + {"9223372036854775807", "-9223372036854775808", ""}), + // LARGEINT ==> int128_t(128bit) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_LARGEINT, + {"170141183460469231731687303715884105727", + "−170141183460469231731687303715884105728", + "170141183460469231731687303715884105728"}, + {"170141183460469231731687303715884105727", "", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_CHAR, {"amory happy"}, + {"amory happy"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_VARCHAR, {"doris be better"}, + {"doris be better"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_STRING, {"doris be better"}, + {"doris be better"}), + // decimal ==> decimalv2(decimal<128>(27,9)) (17, 9)(first 0 will ignore) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL, + { 
+ "012345678901234567.012345678", + // (18, 8) (automatically fill 0 for scala) + "123456789012345678.01234567", + // (17, 10) (rounding last to make it fit) + "12345678901234567.0123456779", + // (17, 11) (wrong) + "12345678901234567.01234567791", + // (19, 8) (wrong) + "1234567890123456789.01234567", + }, + {"12345678901234567.012345678", "123456789012345678.012345670", + "12345678901234567.012345678", "", ""}), + // decimal32 ==> decimal32(9,2) (7,2) (6,3) (7,3) (8,1) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32, + {"1234567.12", "123456.123", "1234567.123", "12345679.1"}, + {"1234567.12", "123456.12", "", ""}), + // decimal64 ==> decimal64(18,9) (9, 9) (3,2) (9, 10) (10, 9) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL64, + {"123456789.123456789", "123.12", "123456789.0123456789", + "1234567890.123456789"}, + {"123456789.123456789", "123.120000000", "", ""}), + // decimal128I ==> decimal128I(38,18) (19,18) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL128I, + {"01234567890123456789.123456789123456789", + // (20,11) (automatically fill 0 for scala) + "12345678901234567890.12345678911", + // (19,18) + "1234567890123456789.123456789123456789", + // (19,19) (rounding last to make it fit) + "1234567890123456789.1234567890123456789", + // (18, 20) (rounding to make it fit) + "123456789012345678.01234567890123456789", + // (20, 19) (wrong) + "12345678901234567890.1234567890123456789"}, + {"1234567890123456789.123456789123456789", + "12345678901234567890.123456789110000000", + "1234567890123456789.123456789123456789", + "1234567890123456789.123456789012345679", + "123456789012345678.012345678901234568", ""}), + + }; + + for (auto type_pair : arithmetic_scala_field_types) { + auto type = std::get<0>(type_pair); + DataTypePtr data_type_ptr; + if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL32) { + // decimal32(7, 2) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 9, 2); + } else if (type == 
FieldType::OLAP_FIELD_TYPE_DECIMAL64) { + // decimal64(18, 9) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 18, 9); + } else if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL128I) { + // decimal128I(38,18) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 38, 18); + } else { + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + } + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + auto col = data_type_ptr->create_column(); + + // serde for data types with default FormatOption + DataTypeSerDe::FormatOptions default_format_option; + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + + auto ser_col = ColumnString::create(); + ser_col->reserve(std::get<1>(type_pair).size()); + VectorBufferWriter buffer_writer(*ser_col.get()); + + for (int i = 0; i < std::get<1>(type_pair).size(); ++i) { + std::cout << "the ith : " << i << std::endl; + string test_str = std::get<1>(type_pair)[i]; + ReadBuffer rb_test(test_str.data(), test_str.size()); + // deserialize + Status st = + serde->deserialize_one_cell_from_text(*col, rb_test, default_format_option); + if (std::get<2>(type_pair)[i].empty()) { + EXPECT_EQ(st.ok(), false); + std::cout << "deserialize failed: " << st.to_json() << std::endl; + continue; + } + EXPECT_EQ(st.ok(), true); + // serialize + serde->serialize_one_cell_to_text(*col, i, buffer_writer, default_format_option); + EXPECT_EQ(ser_col->get_data_at(ser_col->size() - 1).to_string(), + std::get<2>(type_pair)[i]); + } + } + } + + // date and datetime type + { + typedef std::pair<FieldType, string> FieldType_RandStr; + std::vector<FieldType_RandStr> date_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATE, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATE, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATEV2, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIME, 
"2020-01-01 12:00:00"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIMEV2, + "2020-01-01 12:00:00.666666"), + }; + for (auto pair : date_scala_field_types) { + auto type = pair.first; + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + std::unique_ptr<WrapperField> min_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> max_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> rand_wf(WrapperField::create_by_type(type)); + + min_wf->set_to_min(); + max_wf->set_to_max(); + rand_wf->from_string(pair.second, 0, 0); + + string min_s = min_wf->to_string(); + string max_s = max_wf->to_string(); + string rand_date = rand_wf->to_string(); + + ReadBuffer min_rb(min_s.data(), min_s.size()); + ReadBuffer max_rb(max_s.data(), max_s.size()); + ReadBuffer rand_rb(rand_date.data(), rand_date.size()); + + auto col = data_type_ptr->create_column(); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + // make use c++ lib equals to wrapper field from_string behavior + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.date_olap_format = true; + + Status st = serde->deserialize_one_cell_from_text(*col, min_rb, formatOptions); + EXPECT_EQ(st.ok(), true); + st = serde->deserialize_one_cell_from_text(*col, max_rb, formatOptions); + EXPECT_EQ(st.ok(), true); + st = serde->deserialize_one_cell_from_text(*col, rand_rb, formatOptions); + EXPECT_EQ(st.ok(), true); + + auto ser_col = ColumnString::create(); + ser_col->reserve(3); + VectorBufferWriter buffer_writer(*ser_col.get()); + serde->serialize_one_cell_to_text(*col, 0, buffer_writer, formatOptions); + serde->serialize_one_cell_to_text(*col, 1, buffer_writer, formatOptions); + serde->serialize_one_cell_to_text(*col, 2, buffer_writer, formatOptions); + rtrim(min_s); + rtrim(max_s); + rtrim(rand_date); + StringRef min_s_d = 
ser_col->get_data_at(0); + StringRef max_s_d = ser_col->get_data_at(1); + StringRef rand_s_d = ser_col->get_data_at(2); + + std::cout << "min(" << min_s << ") with datat_ype_str:" << min_s_d << std::endl; + std::cout << "max(" << max_s << ") with datat_ype_str:" << max_s_d << std::endl; + std::cout << "rand(" << rand_date << ") with datat_type_str:" << rand_s_d << std::endl; + EXPECT_EQ(min_s, min_s_d.to_string()); + EXPECT_EQ(max_s, max_s_d.to_string()); + EXPECT_EQ(rand_date, rand_s_d.to_string()); + } + } + + // nullable data type with const column + { + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type( + FieldType::OLAP_FIELD_TYPE_STRING, 0, 0); + DataTypePtr nullable_ptr = std::make_shared<DataTypeNullable>(data_type_ptr); + std::unique_ptr<WrapperField> rand_wf( + WrapperField::create_by_type(FieldType::OLAP_FIELD_TYPE_STRING)); + std::string test_str = generate(128); + rand_wf->from_string(test_str, 0, 0); + Field string_field(test_str); + ColumnPtr col = nullable_ptr->create_column_const(0, string_field); + DataTypeSerDe::FormatOptions default_format_option; + DataTypeSerDeSPtr serde = nullable_ptr->get_serde(); + auto ser_col = ColumnString::create(); + ser_col->reserve(1); + VectorBufferWriter buffer_writer(*ser_col.get()); + serde->serialize_one_cell_to_text(*col, 0, buffer_writer, default_format_option); + StringRef rand_s_d = ser_col->get_data_at(0); + EXPECT_EQ(rand_wf->to_string(), rand_s_d.to_string()); + } +} +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/vec/data_types/serde_utils.h b/be/test/vec/data_types/serde_utils.h new file mode 100644 index 0000000000..fabb724330 --- /dev/null +++ b/be/test/vec/data_types/serde_utils.h @@ -0,0 +1,53 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
// The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once

// The gtest headers are not used by the helpers below; they are kept for the
// translation units that include this file and are guarded so the helpers
// still build where gtest is not on the include path.
#if __has_include(<gtest/gtest-message.h>)
#include <gtest/gtest-message.h>
#endif
#if __has_include(<gtest/gtest-test-part.h>)
#include <gtest/gtest-test-part.h>
#endif

#include <algorithm>
#include <cstddef>
#include <ctime>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <type_traits>

namespace doris::vectorized {

// Padding character stripped by rtrim().
inline constexpr char END_SYMBOL = '\0';

/// Removes every trailing END_SYMBOL ('\0') character from `s` in place.
///
/// A string that is empty or consists only of END_SYMBOL characters becomes
/// empty. Embedded END_SYMBOL characters that are followed by other
/// characters are left untouched.
inline void rtrim(std::string& s) {
    // Keep the index as size_t: storing it in an int narrows the value and
    // makes the npos comparison depend on an implicit signed->unsigned cast.
    const std::size_t last_kept = s.find_last_not_of(END_SYMBOL);
    if (last_kept == std::string::npos) {
        // Nothing but padding (or nothing at all): the trimmed result is empty.
        s.clear();
        return;
    }
    s.erase(last_kept + 1);
}

// Default character pool used by generate() when the caller supplies none.
inline constexpr char alphanum[] =
        "0123456789"
        "!@#$%^&*"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz";

// Time-seeded engine, used only to seed random_generator below.
inline std::default_random_engine random {static_cast<unsigned>(std::time(nullptr))};
// Generator that drives the shuffle in generate().
inline std::mt19937 random_generator(random());

/// Builds a random string of exactly `length` characters.
///
/// @param length  number of characters in the result; 0 yields an empty string.
/// @param charset characters to draw from; when empty, `alphanum` is used.
/// @return a prefix of length `length` taken from a shuffled pool. The pool is
///         the charset doubled until it holds at least `length` characters, so
///         every character of the result comes from the charset.
inline std::string generate(std::size_t length, const std::string& charset = "") {
    // use default charset if no charset is specified
    std::string pool = charset.empty() ? std::string(alphanum) : charset;
    // double the pool until it is at least as long as the requested length
    while (pool.length() < length) {
        pool += pool;
    }
    // shuffle the pool, then keep only the requested number of characters
    std::shuffle(pool.begin(), pool.end(), random_generator);
    pool.resize(length);
    return pool;
}

} // namespace doris::vectorized
// ---------------------------------------------------------------------
// To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
// For additional commands, e-mail: commits-h...@doris.apache.org