This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new de0e89d1b4 [feature](function) Modified cast as time to behave more like MySQL (#18565) de0e89d1b4 is described below commit de0e89d1b4bb2135aa3f20f8b530e85acbb71a89 Author: Mryange <59914473+mrya...@users.noreply.github.com> AuthorDate: Sat Apr 22 06:11:59 2023 +0800 [feature](function) Modified cast as time to behave more like MySQL (#18565) Previously, select cast("19:22:18" as time) returned NULL: because the underlying type of time is float64, the string was parsed as a floating-point number, which failed. Results in the following: --- be/src/vec/core/call_on_type_index.h | 4 +- be/src/vec/core/types.h | 3 + be/src/vec/data_types/data_type.cpp | 2 + be/src/vec/data_types/data_type_factory.cpp | 7 ++ be/src/vec/data_types/data_type_time.h | 1 + be/src/vec/functions/function_cast.h | 131 +++++++++++++++++++-- gensrc/proto/types.proto | 1 + .../data/correctness/test_cast_as_time.out | 9 ++ .../suites/correctness/test_cast_as_time.groovy | 48 ++++++++ 9 files changed, 197 insertions(+), 9 deletions(-) diff --git a/be/src/vec/core/call_on_type_index.h b/be/src/vec/core/call_on_type_index.h index 88e8fca5bf..bc822d9902 100644 --- a/be/src/vec/core/call_on_type_index.h +++ b/be/src/vec/core/call_on_type_index.h @@ -23,6 +23,7 @@ #include <utility> #include "vec/core/types.h" +#include "vec/data_types/data_type_time.h" namespace doris::vectorized { @@ -202,7 +203,8 @@ bool call_on_index_and_data_type(TypeIndex number, F&& f) { return f(TypePair<DataTypeNumber<Float32>, T>()); case TypeIndex::Float64: return f(TypePair<DataTypeNumber<Float64>, T>()); - + case TypeIndex::Time: + return f(TypePair<DataTypeTime, T>()); case TypeIndex::Decimal32: return f(TypePair<DataTypeDecimal<Decimal32>, T>()); case TypeIndex::Decimal64: diff --git a/be/src/vec/core/types.h b/be/src/vec/core/types.h index c88b3f3948..fc1a4332dc 100644 --- a/be/src/vec/core/types.h +++ b/be/src/vec/core/types.h @@ -91,6 +91,7 @@ enum class TypeIndex { Struct = 40, VARIANT = 41, 
QuantileState = 42, + Time = 43 }; struct Consted { @@ -626,6 +627,8 @@ inline const char* getTypeName(TypeIndex idx) { return "Struct"; case TypeIndex::QuantileState: return TypeName<QuantileState<double>>::get(); + case TypeIndex::Time: + return "Time"; } __builtin_unreachable(); diff --git a/be/src/vec/data_types/data_type.cpp b/be/src/vec/data_types/data_type.cpp index 458a204576..a5ebc78eab 100644 --- a/be/src/vec/data_types/data_type.cpp +++ b/be/src/vec/data_types/data_type.cpp @@ -171,6 +171,8 @@ PGenericType_TypeId IDataType::get_pdata_type(const IDataType* data_type) { return PGenericType::JSONB; case TypeIndex::Map: return PGenericType::MAP; + case TypeIndex::Time: + return PGenericType::TIME; default: return PGenericType::UNKNOWN; } diff --git a/be/src/vec/data_types/data_type_factory.cpp b/be/src/vec/data_types/data_type_factory.cpp index c1ca9245c7..92e1db8357 100644 --- a/be/src/vec/data_types/data_type_factory.cpp +++ b/be/src/vec/data_types/data_type_factory.cpp @@ -286,6 +286,9 @@ DataTypePtr DataTypeFactory::create_data_type(const TypeIndex& type_index, bool case TypeIndex::DateV2: nested = std::make_shared<vectorized::DataTypeDateV2>(); break; + case TypeIndex::Time: + nested = std::make_shared<DataTypeTime>(); + break; case TypeIndex::DateTimeV2: nested = std::make_shared<DataTypeDateTimeV2>(); break; @@ -522,6 +525,10 @@ DataTypePtr DataTypeFactory::create_data_type(const PColumnMeta& pcolumn) { nested = std::make_shared<DataTypeQuantileStateDouble>(); break; } + case PGenericType::TIME: { + nested = std::make_shared<DataTypeTime>(); + break; + } default: { LOG(FATAL) << fmt::format("Unknown data type: {}", pcolumn.type()); return nullptr; diff --git a/be/src/vec/data_types/data_type_time.h b/be/src/vec/data_types/data_type_time.h index 5f508f56d4..0bff06c869 100644 --- a/be/src/vec/data_types/data_type_time.h +++ b/be/src/vec/data_types/data_type_time.h @@ -76,6 +76,7 @@ public: DataTypeSerDeSPtr get_serde() const override { return 
std::make_shared<DataTypeNumberSerDe<Float64>>(); }; + TypeIndex get_type_id() const override { return TypeIndex::Time; } }; } // namespace doris::vectorized diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 7af176395a..6f79522ab8 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -75,6 +75,7 @@ #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/data_types/data_type_struct.h" +#include "vec/data_types/data_type_time.h" #include "vec/data_types/data_type_time_v2.h" #include "vec/functions/function.h" #include "vec/functions/function_helpers.h" @@ -110,7 +111,92 @@ inline UInt32 extract_to_decimal_scale(const ColumnWithTypeAndName& named_column named_column.column->get(0, field); return field.get<UInt32>(); } +/** Cast from string or number to Time. + * In Doris, the underlying storage type of the Time class is Float64. + */ +struct TimeCast { + // Cast from string + // Some examples of conversions. 
+ // '300' -> 00:03:00 '20:23' -> 20:23:00 '20:23:24' -> 20:23:24 + template <typename T> + static bool try_parse_time(char* s, size_t len, T& x) { + char* first_char = s; + char* end_char = s + len; + int hour = 0, minute = 0, second = 0; + auto parse_from_str_to_int = [](char* begin, size_t len, auto& num) { + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + auto int_value = StringParser::string_to_unsigned_int<uint64_t>( + reinterpret_cast<char*>(begin), len, &parse_result); + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { + return false; + } + num = int_value; + return true; + }; + if (char* first_colon {nullptr}; + (first_colon = (char*)memchr(first_char, ':', len)) != nullptr) { + if (char* second_colon {nullptr}; + (second_colon = (char*)memchr(first_colon + 1, ':', end_char - first_colon - 1)) != + nullptr) { + // find two colon + // parse hour + if (!parse_from_str_to_int(first_char, first_colon - first_char, hour)) { + // hour failed + return false; + } + // parse minute + if (!parse_from_str_to_int(first_colon + 1, second_colon - first_colon - 1, + minute)) { + return false; + } + // parse second + if (!parse_from_str_to_int(second_colon + 1, end_char - second_colon - 1, second)) { + return false; + } + } else { + // find one colon + // parse hour + if (!parse_from_str_to_int(first_char, first_colon - first_char, hour)) { + return false; + } + // parse minute + if (!parse_from_str_to_int(first_colon + 1, end_char - first_colon - 1, minute)) { + return false; + } + } + } else { + // no colon ,so try to parse as a number + size_t from {}; + if (!parse_from_str_to_int(first_char, len, from)) { + return false; + } + return try_parse_time(from, x); + } + // minute second must be < 60 + if (minute >= 60 || second >= 60) { + return false; + } + x = hour * 3600 + minute * 60 + second; + return true; + } + // Cast from number + template <typename T, typename S> + static bool try_parse_time(T from, S& x) { + int64 seconds = 
from / 100; + int64 hour = 0, minute = 0, second = 0; + second = from - 100 * seconds; + from /= 100; + seconds = from / 100; + minute = from - 100 * seconds; + hour = seconds; + if (minute >= 60 || second >= 60) { + return false; + } + x = hour * 3600 + minute * 60 + second; + return true; + } +}; /** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment. * (Date is represented internally as number of days from some day; DateTime - as unix timestamp) */ @@ -275,11 +361,25 @@ struct ConvertImpl { } } } else { - for (size_t i = 0; i < size; ++i) { - vec_to[i] = static_cast<ToFieldType>(vec_from[i]); + if constexpr (IsDataTypeNumber<FromDataType> && + std::is_same_v<ToDataType, DataTypeTime>) { + // 300 -> 00:03:00 360 will be parse failed , so value maybe null + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container* vec_null_map_to = nullptr; + col_null_map_to = ColumnUInt8::create(size); + vec_null_map_to = &col_null_map_to->get_data(); + for (size_t i = 0; i < size; ++i) { + (*vec_null_map_to)[i] = !TimeCast::try_parse_time(vec_from[i], vec_to[i]); + } + block.get_by_position(result).column = + ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); + return Status::OK(); + } else { + for (size_t i = 0; i < size; ++i) { + vec_to[i] = static_cast<ToFieldType>(vec_from[i]); + } } } - // TODO: support boolean cast more reasonable if constexpr (std::is_same_v<uint8_t, ToFieldType>) { for (int i = 0; i < size; ++i) { @@ -699,7 +799,7 @@ struct NameToDateTime { static constexpr auto name = "toDateTime"; }; -template <typename DataType, typename Additions = void*> +template <typename DataType, typename Additions = void*, typename FromDataType = void*> bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateLUTImpl*, Additions additions [[maybe_unused]] = Additions()) { if constexpr (IsDateTimeType<DataType>) { @@ -719,6 +819,15 @@ bool 
try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateL return try_read_datetime_v2_text(x, rb, scale); } + if constexpr (std::is_same_v<DataTypeString, FromDataType> && + std::is_same_v<DataTypeTime, DataType>) { + // cast from string to time(float64) + auto len = rb.count(); + auto s = rb.position(); + rb.position() = rb.end(); // make is_all_read = true + return TimeCast::try_parse_time(s, len, x); + } + if constexpr (std::is_floating_point_v<typename DataType::FieldType>) { return try_read_float_text(x, rb); } @@ -1002,6 +1111,8 @@ using FunctionToFloat32 = FunctionConvert<DataTypeFloat32, NameToFloat32, ToNumberMonotonicity<Float32>>; using FunctionToFloat64 = FunctionConvert<DataTypeFloat64, NameToFloat64, ToNumberMonotonicity<Float64>>; + +using FunctionToTime = FunctionConvert<DataTypeTime, NameToFloat64, ToNumberMonotonicity<Float64>>; using FunctionToString = FunctionConvert<DataTypeString, NameToString, ToStringMonotonicity>; using FunctionToDecimal32 = FunctionConvert<DataTypeDecimal<Decimal32>, NameToDecimal32, UnknownMonotonicity>; @@ -1096,7 +1207,10 @@ template <> struct FunctionTo<DataTypeDateTimeV2> { using Type = FunctionToDateTimeV2; }; - +template <> +struct FunctionTo<DataTypeTime> { + using Type = FunctionToTime; +}; class PreparedFunctionCast : public PreparedFunctionImpl { public: using WrapperType = std::function<Status(FunctionContext* context, Block&, const ColumnNumbers&, @@ -1186,7 +1300,6 @@ struct ConvertThroughParsing { } size_t current_offset = 0; - for (size_t i = 0; i < size; ++i) { size_t next_offset = std::is_same_v<FromDataType, DataTypeString> ? 
(*offsets)[i] @@ -1207,7 +1320,8 @@ struct ConvertThroughParsing { parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, local_time_zone, type->get_scale()); } else { - parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, local_time_zone); + parsed = try_parse_impl<ToDataType, void*, FromDataType>(vec_to[i], read_buffer, + local_time_zone); } (*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer); @@ -1837,7 +1951,8 @@ private: std::is_same_v<ToDataType, DataTypeDate> || std::is_same_v<ToDataType, DataTypeDateTime> || std::is_same_v<ToDataType, DataTypeDateV2> || - std::is_same_v<ToDataType, DataTypeDateTimeV2>) { + std::is_same_v<ToDataType, DataTypeDateTimeV2> || + std::is_same_v<ToDataType, DataTypeTime>) { ret = create_wrapper(from_type, check_and_get_data_type<ToDataType>(to_type.get()), requested_result_is_nullable); return true; diff --git a/gensrc/proto/types.proto b/gensrc/proto/types.proto index 12b1e34df9..171a91c4eb 100644 --- a/gensrc/proto/types.proto +++ b/gensrc/proto/types.proto @@ -107,6 +107,7 @@ message PGenericType { DECIMAL128I = 32; VARIANT = 33; QUANTILE_STATE = 34; + TIME = 35; UNKNOWN = 999; } required TypeId id = 2; diff --git a/regression-test/data/correctness/test_cast_as_time.out b/regression-test/data/correctness/test_cast_as_time.out new file mode 100644 index 0000000000..d216f2e72f --- /dev/null +++ b/regression-test/data/correctness/test_cast_as_time.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select1 -- +00:03:00 +\\N +20:20:20 +-- !select2 -- +19:18:17 +30:20:00 +00:04:00 \ No newline at end of file diff --git a/regression-test/suites/correctness/test_cast_as_time.groovy b/regression-test/suites/correctness/test_cast_as_time.groovy new file mode 100644 index 0000000000..a13af65679 --- /dev/null +++ b/regression-test/suites/correctness/test_cast_as_time.groovy @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_cast_as_time") { + sql """ DROP TABLE IF EXISTS tbl_cast_as_time """ + sql """ + CREATE TABLE tbl_cast_as_time ( + id INT DEFAULT '10', + str VARCHAR(32) DEFAULT '' + ) ENGINE=OLAP + AGGREGATE KEY(id,str) + DISTRIBUTED BY HASH(id) BUCKETS 10 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "in_memory" = "false", + "storage_format" = "V2" + ); + """ + sql """ + insert into tbl_cast_as_time values(300,'19:18:17') + """ + sql """ + insert into tbl_cast_as_time values(360,'30:20') + """ + sql """ + insert into tbl_cast_as_time values(202020,'400') + """ + qt_select1 """ + select cast(id as time) from tbl_cast_as_time order by id + """ + qt_select2 """ + select cast(str as time) from tbl_cast_as_time order by id + """ +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org