This is an automated email from the ASF dual-hosted git repository. adonisling pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 6b97719b66 [Improve](serde)update serialize and deserialize text for data type (#21109) (#23695) 6b97719b66 is described below commit 6b97719b663a568f94c2a23235fe6715ba3d75c8 Author: Adonis Ling <adonis0...@gmail.com> AuthorDate: Thu Aug 31 11:57:52 2023 +0800 [Improve](serde)update serialize and deserialize text for data type (#21109) (#23695) Cherry pick #21109 Co-authored-by: amory <wangqian...@selectdb.com> --- be/src/vec/data_types/data_type_date_time.h | 4 +- be/src/vec/data_types/serde/data_type_jsonb.h | 62 ----- be/src/vec/data_types/serde/data_type_serde.h | 1 + be/test/vec/data_types/from_string_test.cpp | 279 +++++++++++++++++++++ .../data_types/serde/data_type_serde_text_test.cpp | 2 +- be/test/vec/data_types/serde_utils.h | 53 ++++ 6 files changed, 337 insertions(+), 64 deletions(-) diff --git a/be/src/vec/data_types/data_type_date_time.h b/be/src/vec/data_types/data_type_date_time.h index 2e63bc99b7..f096d003dd 100644 --- a/be/src/vec/data_types/data_type_date_time.h +++ b/be/src/vec/data_types/data_type_date_time.h @@ -84,7 +84,9 @@ public: std::string to_string(const IColumn& column, size_t row_num) const override; - DataTypeSerDeSPtr get_serde() const override { return std::make_shared<DataTypeDate64SerDe>(); } + DataTypeSerDeSPtr get_serde() const override { + return std::make_shared<DataTypeDateTimeSerDe>(); + } Field get_field(const TExprNode& node) const override { VecDateTimeValue value; diff --git a/be/src/vec/data_types/serde/data_type_jsonb.h b/be/src/vec/data_types/serde/data_type_jsonb.h deleted file mode 100644 index d274a34571..0000000000 --- a/be/src/vec/data_types/serde/data_type_jsonb.h +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once -#include <gen_cpp/types.pb.h> -#include <glog/logging.h> -#include <stddef.h> -#include <stdint.h> - -#include "data_type_number_serde.h" -#include "vec/core/types.h" - -namespace doris { -class JsonbOutStream; - -namespace vectorized { -class Arena; - -class DataTypeTimeSerDe : public DataTypeNumberSerDe<Float64> { - Status write_column_to_mysql(const IColumn& column, std::vector<MysqlRowBuffer<false>>& result, - int row_idx, int start, int end, bool col_const) const override { - return _write_date_time_column_to_mysql(column, result, row_idx, start, end, col_const); - } - Status write_column_to_mysql(const IColumn& column, std::vector<MysqlRowBuffer<true>>& result, - int row_idx, int start, int end, bool col_const) const override { - return _write_date_time_column_to_mysql(column, result, row_idx, start, end, col_const); - } - -private: - template <bool is_binary_format> - Status _write_date_time_column_to_mysql(const IColumn& column, - std::vector<MysqlRowBuffer<is_binary_format>>& result, - int row_idx, int start, int end, bool col_const) const { - int buf_ret = 0; - auto& data = assert_cast<const ColumnVector<Float64>&>(column).get_data(); - for (int i = start; i < end; ++i) { - if (0 != buf_ret) { - return Status::InternalError("pack mysql buffer failed."); - } - const auto col_index = index_check_const(i, col_const); - buf_ret = result[row_idx].push_time(data[col_index]); - ++row_idx; - } - return Status::OK(); - } -}; -} // namespace vectorized -} // namespace doris \ No newline at end of file diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 59ca3d0291..654b8aeb7b 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -31,6 +31,7 @@ #include "vec/common/pod_array_fwd.h" #include "vec/common/string_buffer.hpp" #include "vec/core/types.h" +#include "vec/io/reader_buffer.h" namespace arrow { class ArrayBuilder; diff --git a/be/test/vec/data_types/from_string_test.cpp b/be/test/vec/data_types/from_string_test.cpp new file mode 100644 index 0000000000..69efe394df --- /dev/null +++ b/be/test/vec/data_types/from_string_test.cpp @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest_pred_impl.h" +#include "olap/types.h" // for TypeInfo +#include "olap/wrapper_field.h" +#include "vec/columns/column.h" +#include "vec/core/field.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/serde_utils.h" +#include "vec/io/reader_buffer.h" + +namespace doris::vectorized { + +/** + * This test is used to check wrapperField from_string is equal to data type from_string or not + * same string feed to wrapperField and data type from_string, and check the result from + * wrapperField and data type to_string is equal or not + */ +TEST(FromStringTest, ScalaWrapperFieldVsDataType) { + // arithmetic scala field types + { + // fieldType, test_string, expect_wrapper_field_string, expect_data_type_string + typedef std::tuple<FieldType, std::vector<string>, std::vector<string>, std::vector<string>> + FieldType_RandStr; + std::vector<FieldType_RandStr> arithmetic_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_BOOL, {"0", "1", "-9"}, + {"0", "1", "1"}, {"0", "1", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_TINYINT, {"127", "-128", "-190"}, + {"127", "-128", "66"}, {"127", "-128", ""}), + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_SMALLINT, {"32767", "32768", "-32769"}, + {"32767", "-32768", "32767"}, {"32767", "", ""}), + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_INT, {"2147483647", "2147483648", "-2147483649"}, + {"2147483647", "-2147483648", "2147483647"}, {"2147483647", "", ""}), + // float ==> float32(32bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_FLOAT, {"1.123", "3.40282e+38", "3.40282e+38+1"}, + {"1.123", "3.40282e+38", "3.40282e+38"}, {"1.123", "3.40282e+38", ""}), + // double ==> float64(64bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DOUBLE, + {"2343.12345465746", "2.22507e-308", "2.22507e-308-1"}, + {"2343.12345465746", "2.22507e-308", "2.22507e-308"}, + {"2343.12345465746", "2.22507e-308", ""}), + // BIGINT ==> int64_t(64bit) + // here if it has overflow , wrapper field will return make max/min value, but data type will just throw error + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_BIGINT, + {"9223372036854775807", "-9223372036854775808", "9223372036854775808"}, + {"9223372036854775807", "-9223372036854775808", "9223372036854775807"}, + {"9223372036854775807", "-9223372036854775808", ""}), + // LARGEINT ==> int128_t(128bit) + // here if it has overflow , wrapper field will return 0, but data type will just throw error + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_LARGEINT, + {"170141183460469231731687303715884105727", + "−170141183460469231731687303715884105728", + "170141183460469231731687303715884105728"}, + {"170141183460469231731687303715884105727", "0", "0"}, + {"170141183460469231731687303715884105727", "", ""}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_CHAR, {"amory happy"}, {"amory happy"}, + {"amory happy"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_VARCHAR, {"doris be better"}, + {"doris be better"}, {"doris be better"}), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_STRING, {"doris be better"}, + {"doris be better"}, {"doris be better"}), + // here if non-valid string , wrapper field will return make 999999999999999999.999999999, but data type will just throw error + // decimal ==> decimalv2(decimal<128>(27,9)) (17, 9)(firstN 0 will ignore) + FieldType_RandStr( + FieldType::OLAP_FIELD_TYPE_DECIMAL, + { + "012345678901234567.012345678", + // (18, 8) (automatically fill 0 for scala) + "123456789012345678.01234567", + // (17, 10) (wrapper_field just drop last, but data_type rounding last to make it fit) + "12345678901234567.0123456779", + // (17, 11) (wrapper_field just drop last, but data_type return error) + "12345678901234567.01234567791", + // (19, 8) (wrong) + "1234567890123456789.01234567", + }, + {"12345678901234567.012345678", "123456789012345678.012345670", + "12345678901234567.012345677", "12345678901234567.012345677", + "999999999999999999.999999999"}, + {"12345678901234567.012345678", "123456789012345678.012345670", + "12345678901234567.012345678", "", ""}), + // here decimal if non-valid value wrapper field will return make 999999999999999999.999999999, but data type will just throw error + // wrapper field to_string() will drop the scala. + // decimal32 ==> decimal32(9,2) (7,2) (6,3) (7,3) (8,1) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32, + {"1234567.12", "123456.123", "1234567.123", "12345679.1"}, + {"1234567", "123456", "999999999", "12345679"}, + {"1234567.12", "123456.12", "", ""}), + // decimal64 ==> decimal64(18,9) (9, 9) (3,2) (9, 10) (10, 9) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL64, + {"123456789.123456789", "123.12", "123456789.0123456789", + "1234567890.123456789"}, + {"123456789", "123", "999999999999999999", "999999999999999999"}, + {"123456789.123456789", "123.120000000", "", ""}), + // decimal128I ==> decimal128I(38,18) (19,18) + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL128I, + {"01234567890123456789.123456789123456789", + // (20,11) (automatically fill 0 for scala) + "12345678901234567890.12345678911", + // (19,18) + "1234567890123456789.123456789123456789", + // (19,19) (rounding last to make it fit) + "1234567890123456789.1234567890123456789", + // (18, 20) (rounding to make it fit) + "123456789012345678.01234567890123456789", + // (20, 19) (wrong) + "12345678901234567890.1234567890123456789"}, + {"1234567890123456789", "12345678901234567890", + "1234567890123456789", "1234567890123456789", + "123456789012345678", "99999999999999999999999999999999999999"}, + {"1234567890123456789.123456789123456789", + "12345678901234567890.123456789110000000", + "1234567890123456789.123456789123456789", + "1234567890123456789.123456789012345679", + "123456789012345678.012345678901234568", ""}), + + }; + for (auto type_pair : arithmetic_scala_field_types) { + auto type = std::get<0>(type_pair); + DataTypePtr data_type_ptr; + if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL32) { + // decimal32(7, 2) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 9, 2); + } else if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL64) { + // decimal64(18, 9) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 18, 9); + } else if (type == FieldType::OLAP_FIELD_TYPE_DECIMAL128I) { + // decimal128I(38,18) + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 38, 18); + } else { + data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + } + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + // wrapper_field + for (int i = 0; i < std::get<1>(type_pair).size(); ++i) { + string test_str = std::get<1>(type_pair)[i]; + std::unique_ptr<WrapperField> wf(WrapperField::create_by_type(type)); + std::cout << "the ith : " << i << " test_str: " << test_str << std::endl; + // from_string + Status st = wf->from_string(test_str); + EXPECT_EQ(st.ok(), true); + //to_string + std::string wfs = wf->to_string(); + EXPECT_EQ(wfs, std::get<2>(type_pair)[i]); + } + + auto col = data_type_ptr->create_column(); + // data_type + for (int i = 0; i < std::get<1>(type_pair).size(); ++i) { + std::cout << "the ith : " << i << std::endl; + string test_str = std::get<1>(type_pair)[i]; + // data_type from_string + ReadBuffer rb_test(test_str.data(), test_str.size()); + Status st = data_type_ptr->from_string(rb_test, col); + if (std::get<3>(type_pair)[i].empty()) { + EXPECT_EQ(st.ok(), false); + std::cout << "deserialize failed: " << st.to_json() << std::endl; + continue; + } + EXPECT_EQ(st.ok(), true); + // data_type to_string + string min_s_d = data_type_ptr->to_string(*col, i); + EXPECT_EQ(min_s_d, std::get<3>(type_pair)[i]); + } + } + } + + // date and datetime type + { + typedef std::pair<FieldType, string> FieldType_RandStr; + std::vector<FieldType_RandStr> date_scala_field_types = { + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATE, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATEV2, "2020-01-01"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIME, "2020-01-01 12:00:00"), + FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DATETIMEV2, + "2020-01-01 12:00:00.666666"), + }; + for (auto pair : date_scala_field_types) { + auto type = pair.first; + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type(type, 0, 0); + std::cout << "this type is " << data_type_ptr->get_name() << ": " + << fmt::format("{}", type) << std::endl; + + std::unique_ptr<WrapperField> min_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> max_wf(WrapperField::create_by_type(type)); + std::unique_ptr<WrapperField> rand_wf(WrapperField::create_by_type(type)); + + min_wf->set_to_min(); + max_wf->set_to_max(); + rand_wf->from_string(pair.second, 0, 0); + + string min_s = min_wf->to_string(); + string max_s = max_wf->to_string(); + string rand_date = rand_wf->to_string(); + + ReadBuffer min_rb(min_s.data(), min_s.size()); + ReadBuffer max_rb(max_s.data(), max_s.size()); + ReadBuffer rand_rb(rand_date.data(), rand_date.size()); + + auto col = data_type_ptr->create_column(); + Status st = data_type_ptr->from_string(min_rb, col); + EXPECT_EQ(st.ok(), true); + st = data_type_ptr->from_string(max_rb, col); + EXPECT_EQ(st.ok(), true); + st = data_type_ptr->from_string(rand_rb, col); + EXPECT_EQ(st.ok(), true); + + string min_s_d = data_type_ptr->to_string(*col, 0); + string max_s_d = data_type_ptr->to_string(*col, 1); + string rand_s_d = data_type_ptr->to_string(*col, 2); + rtrim(min_s); + rtrim(max_s); + rtrim(rand_date); + std::cout << "min(" << min_s << ") with datat_ype_str:" << min_s_d << std::endl; + std::cout << "max(" << max_s << ") with datat_ype_str:" << max_s_d << std::endl; + std::cout << "rand(" << rand_date << ") with datat_type_str:" << rand_s_d << std::endl; + if (FieldType::OLAP_FIELD_TYPE_DATETIMEV2 == type) { + // field to_string : %Y-%m-%d %H:%i:%s.%f vs data type to_string %Y-%m-%d %H:%i:%s + min_s = min_s.substr(0, min_s.find_last_of('.')); + max_s = max_s.substr(0, max_s.find_last_of('.')); + rand_date = rand_date.substr(0, rand_date.find_last_of('.')); + } + // min wrapper field date to_string in macOS and linux system has different result + // macOs equals with data type to_string(0000-01-01), but in linux is (0-01-01) + if (FieldType::OLAP_FIELD_TYPE_DATE == type || + FieldType::OLAP_FIELD_TYPE_DATETIME == type) { + // min wrapper field date to_string in macOS and linux system has different result + // macOs equals with data type to_string(0000-01-01), but in linux is (0-01-01) + std::cout << "wrapper field (" << min_s << ") with data type to_string(" << min_s_d + << ")" << std::endl; + } else { + EXPECT_EQ(min_s, min_s_d); + } + EXPECT_EQ(max_s, max_s_d); + EXPECT_EQ(rand_date, rand_s_d); + } + } + + // null data type + { + DataTypePtr data_type_ptr = DataTypeFactory::instance().create_data_type( + FieldType::OLAP_FIELD_TYPE_STRING, 0, 0); + DataTypePtr nullable_ptr = std::make_shared<DataTypeNullable>(data_type_ptr); + std::unique_ptr<WrapperField> rand_wf( + WrapperField::create_by_type(FieldType::OLAP_FIELD_TYPE_STRING)); + std::string test_str = generate(128); + rand_wf->from_string(test_str, 0, 0); + Field string_field(test_str); + ColumnPtr col = nullable_ptr->create_column_const(0, string_field); + EXPECT_EQ(rand_wf->to_string(), nullable_ptr->to_string(*col, 0)); + } +} + +} // namespace doris::vectorized diff --git a/be/test/vec/data_types/serde/data_type_serde_text_test.cpp b/be/test/vec/data_types/serde/data_type_serde_text_test.cpp index 188d70daa9..00db4ef1b8 100644 --- a/be/test/vec/data_types/serde/data_type_serde_text_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_text_test.cpp @@ -1257,4 +1257,4 @@ TEST(TextSerde, test_slice) { std::cout << s.to_string() << std::endl; } } -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/test/vec/data_types/serde_utils.h b/be/test/vec/data_types/serde_utils.h new file mode 100644 index 0000000000..fabb724330 --- /dev/null +++ b/be/test/vec/data_types/serde_utils.h @@ -0,0 +1,53 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include <gtest/gtest-message.h> +#include <gtest/gtest-test-part.h> + +#include <limits> +#include <memory> +#include <random> +#include <type_traits> + +namespace doris::vectorized { +static constexpr const char END_SYMBOL = '\0'; + +static void rtrim(std::string& s) { + if (int pos = s.find_last_not_of(END_SYMBOL); pos != std::string::npos) { + s = s.substr(0, pos + 1); + } +} +static constexpr const char alphanum[] = + "0123456789" + "!@#$%^&*" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; +static std::default_random_engine random {static_cast<unsigned>(time(0))}; +static std::mt19937 random_generator(random()); +[[maybe_unused]] static std::string generate(size_t length, const std::string& charset = "") { + // use default charset if no charset is specified + std::string str = charset.empty() ? std::string(alphanum) : charset; + // double string length until it is at least as long as the requested length + while (length > str.length()) str += str; + // shuffle string + std::shuffle(str.begin(), str.end(), random_generator); + // return substring with specified length + return str.substr(0, length); +} +} // namespace doris::vectorized --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org