This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new f2919567df [feature](datetime) Support timezone when insert datetime value (#21898) f2919567df is described below commit f2919567dfc6e04bbb175c9477cb966f25a00231 Author: zclllyybb <zhaochan...@selectdb.com> AuthorDate: Mon Jul 31 13:08:28 2023 +0800 [feature](datetime) Support timezone when insert datetime value (#21898) --- be/src/http/action/stream_load.cpp | 3 + be/src/http/http_common.h | 1 + be/src/util/timezone_utils.cpp | 63 ++++++-- be/src/vec/common/schema_util.cpp | 17 +- be/src/vec/common/schema_util.h | 7 +- be/src/vec/data_types/data_type.h | 1 + be/src/vec/exec/format/json/new_json_reader.cpp | 36 +++-- be/src/vec/exec/format/json/new_json_reader.h | 29 ++-- be/src/vec/functions/function_cast.h | 56 ++++--- be/src/vec/functions/function_convert_tz.h | 12 +- be/src/vec/io/io_helper.h | 46 +++++- be/src/vec/runtime/vdatetime_value.cpp | 175 ++++++++++++++++++--- be/src/vec/runtime/vdatetime_value.h | 14 +- .../org/apache/doris/analysis/DateLiteral.java | 57 +++++++ .../org/apache/doris/analysis/StringLiteral.java | 2 +- .../trees/expressions/literal/DateTimeLiteral.java | 58 +++++++ .../data/datatype_p0/datetimev2/test_timezone.out | 21 +++ .../datatype_p0/datetimev2/test_tz_streamload.csv | 8 + .../datatype_p0/datetimev2/test_tz_streamload.out | 11 ++ .../datatype_p0/datetimev2/test_timezone.groovy | 60 +++++++ .../datetimev2/test_tz_streamload.groovy | 45 ++++++ .../test_partition_table_err_msg.groovy | 2 +- 22 files changed, 624 insertions(+), 100 deletions(-) diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 588d887d20..9f715ce7cd 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -408,6 +408,9 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, if (!http_req->header(HTTP_TIMEZONE).empty()) { request.__set_timezone(http_req->header(HTTP_TIMEZONE)); } + if (!http_req->header(HTTP_TIME_ZONE).empty()) { + request.__set_timezone(http_req->header(HTTP_TIME_ZONE)); + } if (!http_req->header(HTTP_EXEC_MEM_LIMIT).empty()) { try { request.__set_execMemLimit(std::stoll(http_req->header(HTTP_EXEC_MEM_LIMIT))); diff --git a/be/src/http/http_common.h b/be/src/http/http_common.h index 86df938af1..6e02765393 100644 --- a/be/src/http/http_common.h +++ b/be/src/http/http_common.h @@ -36,6 +36,7 @@ static const std::string HTTP_TEMP_PARTITIONS = "temporary_partitions"; static const std::string HTTP_NEGATIVE = "negative"; static const std::string HTTP_STRICT_MODE = "strict_mode"; static const std::string HTTP_TIMEZONE = "timezone"; +static const std::string HTTP_TIME_ZONE = "time_zone"; static const std::string HTTP_EXEC_MEM_LIMIT = "exec_mem_limit"; static const std::string HTTP_JSONPATHS = "jsonpaths"; static const std::string HTTP_JSONROOT = "json_root"; diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp index d62e01a994..077d1b3022 100644 --- a/be/src/util/timezone_utils.cpp +++ b/be/src/util/timezone_utils.cpp @@ -18,13 +18,19 @@ #include "util/timezone_utils.h" +#include <cctz/civil_time.h> #include <cctz/time_zone.h> #include <re2/stringpiece.h> #include <boost/algorithm/string.hpp> +#include <cctype> +#include <exception> #include <filesystem> #include <string> +#include "common/exception.h" +#include "common/logging.h" + namespace doris { RE2 TimezoneUtils::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$"); @@ -60,6 +66,7 @@ void TimezoneUtils::load_timezone_names() { bool 
TimezoneUtils::find_cctz_time_zone(const std::string& timezone, cctz::time_zone& ctz) { auto timezone_lower = boost::algorithm::to_lower_copy(timezone); re2::StringPiece value; + // +08:00 if (time_zone_offset_format_reg.Match(timezone, 0, timezone.size(), RE2::UNANCHORED, &value, 1)) { bool positive = value[0] != '-'; @@ -78,17 +85,55 @@ bool TimezoneUtils::find_cctz_time_zone(const std::string& timezone, cctz::time_ offset *= positive ? 1 : -1; ctz = cctz::fixed_time_zone(cctz::seconds(offset)); return true; - } else if (timezone_lower == "cst") { - // Supports offset and region timezone type, "CST" use here is compatibility purposes. - ctz = cctz::fixed_time_zone(cctz::seconds(8 * 60 * 60)); - return true; - } else { - auto it = timezone_names_map_.find(timezone_lower); - if (it == timezone_names_map_.end()) { - return false; + } else { // not only offset, GMT or GMT+8 + // split tz_name and offset + int split = timezone_lower.find('+') != std::string::npos ? timezone_lower.find('+') + : timezone_lower.find('-'); + cctz::time_zone offset; + bool have_both = split != std::string::npos && split + 1 < timezone_lower.length() && + std::isdigit(timezone_lower[split + 1]); + if (have_both) { + auto offset_str = timezone_lower.substr(split); + timezone_lower = timezone_lower.substr(0, split); + int offset_hours = 0; + try { + offset_hours = std::stoi(offset_str); + } catch ([[maybe_unused]] std::exception& e) { + VLOG_DEBUG << "Unable to cast " << timezone << " as timezone"; + return false; + } + offset = cctz::fixed_time_zone(cctz::seconds(offset_hours * 60 * 60)); + } + + bool tz_parsed = false; + if (timezone_lower == "cst") { + // Supports offset and region timezone type, "CST" use here is compatibility purposes. + ctz = cctz::fixed_time_zone(cctz::seconds(8 * 60 * 60)); + tz_parsed = true; + } else if (timezone_lower == "z") { + ctz = cctz::utc_time_zone(); + tz_parsed = true; + } else { + auto it = timezone_names_map_.find(timezone_lower); + if (it == timezone_names_map_.end()) { + return false; + } + tz_parsed = cctz::load_time_zone(it->second, &ctz); + } + if (tz_parsed) { + if (!have_both) { // GMT only + return true; + } + // GMT+8 + auto tz = (cctz::convert(cctz::civil_second {}, ctz) - + cctz::time_point<cctz::seconds>()) - + (cctz::convert(cctz::civil_second {}, offset) - + cctz::time_point<cctz::seconds>()); + ctz = cctz::fixed_time_zone(std::chrono::duration_cast<std::chrono::seconds>(tz)); + return true; } - return cctz::load_time_zone(it->second, &ctz); } + return false; } } // namespace doris diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 09fabbae0e..adabedb440 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -39,6 +39,8 @@ #include "olap/olap_common.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" +#include "runtime/types.h" +#include "udf/udf.h" #include "util/string_util.h" #include "util/thrift_rpc_helper.h" #include "vec/columns/column.h" @@ -128,7 +130,8 @@ bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs) { return true; } -Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) { +Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result, + RuntimeState* state) { ColumnsWithTypeAndName arguments; if (WhichDataType(type->get_type_id()).is_string()) { // Special handle ColumnString, since the original cast logic use ColumnString's first item @@ -145,8 +148,9 @@ Status 
cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, Co argnum.emplace_back(1); size_t result_column = tmp_block.columns(); tmp_block.insert({nullptr, type, arg.name}); - RETURN_IF_ERROR( - function->execute(nullptr, tmp_block, argnum, result_column, arg.column->size())); + auto need_state_only = FunctionContext::create_context(state, {}, {}); + RETURN_IF_ERROR(function->execute(need_state_only.get(), tmp_block, argnum, result_column, + arg.column->size())); *result = std::move(tmp_block.get_by_position(result_column).column); return Status::OK(); } @@ -289,7 +293,8 @@ Status send_add_columns_rpc(ColumnsWithTypeAndName column_type_names, return Status::OK(); } -Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type) { +Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type, + RuntimeState* state) { auto dynamic_col = block.get_by_position(dynamic_col_position).column->assume_mutable(); auto* column_object_ptr = assert_cast<ColumnObject*>(dynamic_col.get()); if (column_object_ptr->empty()) { @@ -324,8 +329,8 @@ Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_ori } if (cast_to_original_type && !dst_type->equals(*types[i])) { // Cast static columns to original slot type - RETURN_IF_ERROR( - schema_util::cast_column({subcolumns[i], types[i], ""}, dst_type, &column)); + RETURN_IF_ERROR(schema_util::cast_column({subcolumns[i], types[i], ""}, dst_type, + &column, state)); } // replace original column column_type_name->column = column; diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index fda6a69cf4..8a5b1423d6 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -28,6 +28,7 @@ #include "common/status.h" #include "olap/tablet_schema.h" +#include "udf/udf.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/core/columns_with_type_and_name.h" #include "vec/core/field.h" @@ -58,14 +59,16 @@ DataTypePtr get_base_type_of_array(const DataTypePtr& type); Array create_empty_array_field(size_t num_dimensions); // Cast column to dst type -Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result); +Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result, + RuntimeState* = nullptr); // Object column will be unfolded and if cast_to_original_type // the original column in the block will be replaced with the subcolumn // from object column and casted to the new type from slot_descs. // Also if column in block is empty, it will be filled // with num_rows of default values -Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type); +Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type, + RuntimeState* = nullptr); /// If both of types are signed/unsigned integers and size of left field type /// is less than right type, we don't need to convert field, diff --git a/be/src/vec/data_types/data_type.h b/be/src/vec/data_types/data_type.h index 42c0d27388..2aee6fdb1e 100644 --- a/be/src/vec/data_types/data_type.h +++ b/be/src/vec/data_types/data_type.h @@ -82,6 +82,7 @@ public: virtual void to_string(const IColumn& column, size_t row_num, BufferWritable& ostr) const; virtual std::string to_string(const IColumn& column, size_t row_num) const; + // only for compound type now. 
virtual Status from_string(ReadBuffer& rb, IColumn* column) const; // get specific serializer or deserializer diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 02145c3360..d10f60f5e9 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -214,7 +214,8 @@ Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof) bool is_empty_row = false; - RETURN_IF_ERROR(_read_json_column(*block, _file_slot_descs, &is_empty_row, &_reader_eof)); + RETURN_IF_ERROR( + _read_json_column(_state, *block, _file_slot_descs, &is_empty_row, &_reader_eof)); if (is_empty_row) { // Read empty row, just continue continue; @@ -440,13 +441,14 @@ Status NewJsonReader::_parse_jsonpath_and_json_root() { return Status::OK(); } -Status NewJsonReader::_read_json_column(Block& block, +Status NewJsonReader::_read_json_column(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof) { - return (this->*_vhandle_json_callback)(block, slot_descs, is_empty_row, eof); + return (this->*_vhandle_json_callback)(state, block, slot_descs, is_empty_row, eof); } -Status NewJsonReader::_parse_dynamic_json(bool* is_empty_row, bool* eof, Block& block, +Status NewJsonReader::_parse_dynamic_json(RuntimeState* state, bool* is_empty_row, bool* eof, + Block& block, const std::vector<SlotDescriptor*>& slot_descs) { size_t size = 0; // read a whole message @@ -485,8 +487,8 @@ Status NewJsonReader::_parse_dynamic_json(bool* is_empty_row, bool* eof, Block& } // Unfold object columns for the purpose of extracting static columns and // fill default values missing in static columns - RETURN_IF_ERROR(schema_util::unfold_object(block.columns() - 1, block, - true /*cast to original column type*/)); + RETURN_IF_ERROR(schema_util::unfold_object( + block.columns() - 1, block, true /*cast to original column type*/, state)); _cur_parsed_variant_rows = 0; } return Status::OK(); @@ -518,12 +520,12 @@ Status NewJsonReader::_parse_dynamic_json(bool* is_empty_row, bool* eof, Block& return Status::OK(); } -Status NewJsonReader::_vhandle_dynamic_json(Block& block, +Status NewJsonReader::_vhandle_dynamic_json(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof) { bool valid = false; do { - Status st = _parse_dynamic_json(is_empty_row, eof, block, slot_descs); + Status st = _parse_dynamic_json(state, is_empty_row, eof, block, slot_descs); if (st.is<DATA_QUALITY_ERROR>()) { continue; // continue to read next } @@ -537,7 +539,7 @@ Status NewJsonReader::_vhandle_dynamic_json(Block& block, return Status::OK(); } -Status NewJsonReader::_vhandle_simple_json(Block& block, +Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof) { do { @@ -610,8 +612,8 @@ Status NewJsonReader::_vhandle_simple_json(Block& block, } Status NewJsonReader::_vhandle_flat_array_complex_json( - Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, - bool* eof) { + RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, + bool* is_empty_row, bool* eof) { do { if (_next_row >= _total_rows) { Status st = _parse_json(is_empty_row, eof); @@ -640,7 +642,7 @@ Status NewJsonReader::_vhandle_flat_array_complex_json( return Status::OK(); } -Status 
NewJsonReader::_vhandle_nested_complex_json(Block& block, +Status NewJsonReader::_vhandle_nested_complex_json(RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof) { while (true) { @@ -1099,7 +1101,7 @@ Status NewJsonReader::_simdjson_init_reader() { return Status::OK(); } -Status NewJsonReader::_simdjson_handle_simple_json(Block& block, +Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof) { // simple json @@ -1198,8 +1200,8 @@ Status NewJsonReader::_simdjson_handle_simple_json(Block& block, } Status NewJsonReader::_simdjson_handle_flat_array_complex_json( - Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, - bool* eof) { + RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, + bool* is_empty_row, bool* eof) { // Advance one row in array list, if it is the endpoint, stop advance and break the loop #define ADVANCE_ROW() \ if (_array_iter == _array.end()) { \ @@ -1291,8 +1293,8 @@ Status NewJsonReader::_simdjson_handle_flat_array_complex_json( } Status NewJsonReader::_simdjson_handle_nested_complex_json( - Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, - bool* eof) { + RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, + bool* is_empty_row, bool* eof) { // nested complex json while (true) { size_t num_rows = block.rows(); diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h index 99106d100f..97e0f6cbc6 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -102,22 +102,25 @@ private: Status _open_line_reader(); Status _parse_jsonpath_and_json_root(); - Status _read_json_column(Block& block, const std::vector<SlotDescriptor*>& slot_descs, - bool* is_empty_row, bool* eof); + Status _read_json_column(RuntimeState* state, Block& block, + const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, + bool* eof); - Status _vhandle_simple_json(Block& block, const std::vector<SlotDescriptor*>& slot_descs, - bool* is_empty_row, bool* eof); + Status _vhandle_simple_json(RuntimeState* /*state*/, Block& block, + const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, + bool* eof); - Status _parse_dynamic_json(bool* is_empty_row, bool* eof, Block& block, + Status _parse_dynamic_json(RuntimeState* state, bool* is_empty_row, bool* eof, Block& block, const std::vector<SlotDescriptor*>& slot_descs); - Status _vhandle_dynamic_json(Block& block, const std::vector<SlotDescriptor*>& slot_descs, - bool* is_empty_row, bool* eof); + Status _vhandle_dynamic_json(RuntimeState* /*state*/, Block& block, + const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, + bool* eof); - Status _vhandle_flat_array_complex_json(Block& block, + Status _vhandle_flat_array_complex_json(RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); - Status _vhandle_nested_complex_json(Block& block, + Status _vhandle_nested_complex_json(RuntimeState* /*state*/, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); @@ -147,15 +150,15 @@ private: Status _simdjson_parse_json(bool* is_empty_row, bool* eof); Status _simdjson_parse_json_doc(size_t* size, bool* eof); - Status 
_simdjson_handle_simple_json(Block& block, + Status _simdjson_handle_simple_json(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); - Status _simdjson_handle_flat_array_complex_json(Block& block, + Status _simdjson_handle_flat_array_complex_json(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); - Status _simdjson_handle_nested_complex_json(Block& block, + Status _simdjson_handle_nested_complex_json(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); @@ -174,7 +177,7 @@ private: size_t _column_index(const StringRef& name, size_t key_index); - Status (NewJsonReader::*_vhandle_json_callback)(Block& block, + Status (NewJsonReader::*_vhandle_json_callback)(RuntimeState* state, Block& block, const std::vector<SlotDescriptor*>& slot_descs, bool* is_empty_row, bool* eof); Status _get_column_default_value( diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 7f28eae11d..1af6184e87 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -20,6 +20,7 @@ #pragma once +#include <cctz/time_zone.h> #include <fmt/format.h> #include <glog/logging.h> #include <stddef.h> @@ -80,8 +81,8 @@ #include "vec/data_types/data_type_time.h" #include "vec/data_types/data_type_time_v2.h" #include "vec/functions/function.h" +#include "vec/functions/function_convert_tz.h" #include "vec/functions/function_helpers.h" -#include "vec/io/io_helper.h" #include "vec/io/reader_buffer.h" #include "vec/runtime/vdatetime_value.h" #include "vec/utils/template_helpers.hpp" @@ -122,12 +123,14 @@ struct TimeCast { // Some examples of conversions. // '300' -> 00:03:00 '20:23' -> 20:23:00 '20:23:24' -> 20:23:24 template <typename T> - static bool try_parse_time(char* s, size_t len, T& x) { + static bool try_parse_time(char* s, size_t len, T& x, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache) { /// TODO: Maybe we can move Timecast to the io_helper. 
- if (try_as_time(s, len, x)) { + if (try_as_time(s, len, x, local_time_zone)) { return true; } else { - if (VecDateTimeValue dv {}; dv.from_date_str(s, len)) { + if (VecDateTimeValue dv {}; + dv.from_date_str(s, len, local_time_zone, time_zone_cache)) { // can be parse as a datetime x = dv.hour() * 3600 + dv.minute() * 60 + dv.second(); return true; @@ -137,7 +140,7 @@ struct TimeCast { } template <typename T> - static bool try_as_time(char* s, size_t len, T& x) { + static bool try_as_time(char* s, size_t len, T& x, const cctz::time_zone& local_time_zone) { char* first_char = s; char* end_char = s + len; int hour = 0, minute = 0, second = 0; @@ -188,7 +191,7 @@ struct TimeCast { if (!parse_from_str_to_int(first_char, len, from)) { return false; } - return try_parse_time(from, x); + return try_parse_time(from, x, local_time_zone); } // minute second must be < 60 if (minute >= 60 || second >= 60) { @@ -199,7 +202,8 @@ struct TimeCast { } // Cast from number template <typename T, typename S> - static bool try_parse_time(T from, S& x) { + //requires {std::is_arithmetic_v<T> && std::is_arithmetic_v<S>} + static bool try_parse_time(T from, S& x, const cctz::time_zone& local_time_zone) { int64 seconds = from / 100; int64 hour = 0, minute = 0, second = 0; second = from - 100 * seconds; @@ -388,7 +392,8 @@ struct ConvertImpl { col_null_map_to = ColumnUInt8::create(size); vec_null_map_to = &col_null_map_to->get_data(); for (size_t i = 0; i < size; ++i) { - (*vec_null_map_to)[i] = !TimeCast::try_parse_time(vec_from[i], vec_to[i]); + (*vec_null_map_to)[i] = !TimeCast::try_parse_time( + vec_from[i], vec_to[i], context->state()->timezone_obj()); } block.get_by_position(result).column = ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); @@ -516,7 +521,7 @@ struct ConvertImplGenericToString { return execute(block, arguments, result); } }; - +//this is for data in compound type template <typename StringColumnType> struct ConvertImplGenericFromString { static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, @@ -820,7 +825,8 @@ struct NameToDateTime { }; template <typename DataType, typename Additions = void*, typename FromDataType = void*> -bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateLUTImpl*, +bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, + const cctz::time_zone& local_time_zone, ZoneList& time_zone_cache, Additions additions [[maybe_unused]] = Additions()) { if constexpr (IsDateTimeType<DataType>) { return try_read_datetime_text(x, rb); @@ -831,12 +837,12 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateL } if constexpr (IsDateV2Type<DataType>) { - return try_read_date_v2_text(x, rb); + return try_read_date_v2_text(x, rb, local_time_zone, time_zone_cache); } if constexpr (IsDateTimeV2Type<DataType>) { UInt32 scale = additions; - return try_read_datetime_v2_text(x, rb, scale); + return try_read_datetime_v2_text(x, rb, local_time_zone, time_zone_cache, scale); } if constexpr (std::is_same_v<DataTypeString, FromDataType> && @@ -845,7 +851,7 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateL auto len = rb.count(); auto s = rb.position(); rb.position() = rb.end(); // make is_all_read = true - return TimeCast::try_parse_time(s, len, x); + return TimeCast::try_parse_time(s, len, x, local_time_zone, time_zone_cache); } if constexpr (std::is_floating_point_v<typename DataType::FieldType>) { @@ -1277,8 +1283,11 @@ struct 
ConvertThroughParsing { using ColVecTo = std::conditional_t<IsDecimalNumber<ToFieldType>, ColumnDecimal<ToFieldType>, ColumnVector<ToFieldType>>; - const DateLUTImpl* local_time_zone [[maybe_unused]] = nullptr; - const DateLUTImpl* utc_time_zone [[maybe_unused]] = nullptr; + // For datelike type, only from FunctionConvertFromString. So we can use its' context。 + auto convert_ctx = reinterpret_cast<ConvertTzCtx*>( + context->get_function_state(FunctionContext::FunctionStateScope::THREAD_LOCAL)); + ZoneList time_zone_cache_; + auto& time_zone_cache = convert_ctx ? convert_ctx->time_zone_cache : time_zone_cache_; const IColumn* col_from = block.get_by_position(arguments[0]).column.get(); const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from); @@ -1333,15 +1342,18 @@ struct ConvertThroughParsing { bool parsed; if constexpr (IsDataTypeDecimal<ToDataType>) { parsed = try_parse_impl<ToDataType>( - vec_to[i], read_buffer, local_time_zone, vec_to.get_scale()); + vec_to[i], read_buffer, context->state()->timezone_obj(), + time_zone_cache, vec_to.get_scale()); } else if constexpr (IsDataTypeDateTimeV2<ToDataType>) { auto type = check_and_get_data_type<DataTypeDateTimeV2>( block.get_by_position(result).type.get()); parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, - local_time_zone, type->get_scale()); + context->state()->timezone_obj(), + time_zone_cache, type->get_scale()); } else { parsed = try_parse_impl<ToDataType, void*, FromDataType>( - vec_to[i], read_buffer, local_time_zone); + vec_to[i], read_buffer, context->state()->timezone_obj(), + time_zone_cache); } (*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer); if constexpr (is_load_ && is_strict_insert_) { @@ -1396,6 +1408,14 @@ public: ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; } + Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { + if (scope != FunctionContext::THREAD_LOCAL) { + return Status::OK(); + } + context->set_function_state(scope, std::make_unique<ConvertTzCtx>()); + return Status::OK(); + } + // This function should not be called for get DateType Ptr // using the FunctionCast::get_return_type_impl DataTypePtr get_return_type_impl(const ColumnsWithTypeAndName& arguments) const override { diff --git a/be/src/vec/functions/function_convert_tz.h b/be/src/vec/functions/function_convert_tz.h index 829f86a66b..7c6d0be442 100644 --- a/be/src/vec/functions/function_convert_tz.h +++ b/be/src/vec/functions/function_convert_tz.h @@ -50,6 +50,7 @@ #include "vec/data_types/data_type_string.h" #include "vec/data_types/data_type_time_v2.h" #include "vec/functions/function.h" +#include "vec/io/io_helper.h" namespace doris { namespace vectorized { @@ -65,7 +66,7 @@ class DateV2Value; namespace doris::vectorized { struct ConvertTzCtx { - std::map<std::string, cctz::time_zone> time_zone_cache; + ZoneList time_zone_cache; }; template <typename DateValueType, typename ArgType> @@ -91,7 +92,7 @@ struct ConvertTZImpl { size_t input_rows_count) { auto convert_ctx = reinterpret_cast<ConvertTzCtx*>( context->get_function_state(FunctionContext::FunctionStateScope::THREAD_LOCAL)); - std::map<std::string, cctz::time_zone> time_zone_cache_; + ZoneList time_zone_cache_; auto& time_zone_cache = convert_ctx ? 
convert_ctx->time_zone_cache : time_zone_cache_; for (size_t i = 0; i < input_rows_count; i++) { if (result_null_map[i]) { @@ -111,7 +112,7 @@ struct ConvertTZImpl { NullMap& result_null_map, size_t input_rows_count) { auto convert_ctx = reinterpret_cast<ConvertTzCtx*>( context->get_function_state(FunctionContext::FunctionStateScope::THREAD_LOCAL)); - std::map<std::string, cctz::time_zone> time_zone_cache_; + ZoneList time_zone_cache_; auto& time_zone_cache = convert_ctx ? convert_ctx->time_zone_cache : time_zone_cache_; auto from_tz = from_tz_column->get_data_at(0).to_string(); @@ -126,8 +127,7 @@ struct ConvertTZImpl { } } - static void execute_inner_loop(const ColumnType* date_column, - std::map<std::string, cctz::time_zone>& time_zone_cache, + static void execute_inner_loop(const ColumnType* date_column, ZoneList& time_zone_cache, const std::string& from_tz, const std::string& to_tz, ReturnColumnType* result_column, NullMap& result_null_map, const size_t index_now) { @@ -205,7 +205,7 @@ public: if (scope != FunctionContext::THREAD_LOCAL) { return Status::OK(); } - context->set_function_state(scope, std::make_shared<ConvertTzCtx>()); + context->set_function_state(scope, std::make_unique<ConvertTzCtx>()); return Status::OK(); } diff --git a/be/src/vec/io/io_helper.h b/be/src/vec/io/io_helper.h index 7111913910..30b6278fa3 100644 --- a/be/src/vec/io/io_helper.h +++ b/be/src/vec/io/io_helper.h @@ -42,6 +42,8 @@ static constexpr size_t DEFAULT_MAX_STRING_SIZE = 1073741824; // 1GB static constexpr size_t DEFAULT_MAX_JSON_SIZE = 1073741824; // 1GB static constexpr auto WRITE_HELPERS_MAX_INT_WIDTH = 40U; +using ZoneList = std::map<std::string, cctz::time_zone>; + inline std::string int128_to_string(__int128_t value) { fmt::memory_buffer buffer; fmt::format_to(buffer, "{}", value); @@ -130,8 +132,9 @@ template <typename Type> void write_vector_binary(const std::vector<Type>& v, BufferWritable& buf) { write_var_uint(v.size(), buf); - for (typename std::vector<Type>::const_iterator it = v.begin(); it != v.end(); ++it) + for (typename std::vector<Type>::const_iterator it = v.begin(); it != v.end(); ++it) { write_binary(*it, buf); + } } inline void write_binary(const String& x, BufferWritable& buf) { @@ -215,7 +218,9 @@ void read_vector_binary(std::vector<Type>& v, BufferReadable& buf, } v.resize(size); - for (size_t i = 0; i < size; ++i) read_binary(v[i], buf); + for (size_t i = 0; i < size; ++i) { + read_binary(v[i], buf); + } } inline void read_binary(String& x, BufferReadable& buf) { @@ -302,6 +307,19 @@ bool read_date_v2_text_impl(T& x, ReadBuffer& buf) { return ans; } +template <typename T> +bool read_date_v2_text_impl(T& x, ReadBuffer& buf, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache) { + static_assert(std::is_same_v<UInt32, T>); + auto dv = binary_cast<UInt32, DateV2Value<DateV2ValueType>>(x); + auto ans = dv.from_date_str(buf.position(), buf.count(), local_time_zone, time_zone_cache); + + // only to match the is_all_read() check to prevent return null + buf.position() = buf.end(); + x = binary_cast<DateV2Value<DateV2ValueType>, UInt32>(dv); + return ans; +} + template <typename T> bool read_datetime_v2_text_impl(T& x, ReadBuffer& buf, UInt32 scale = -1) { static_assert(std::is_same_v<UInt64, T>); @@ -314,6 +332,20 @@ bool read_datetime_v2_text_impl(T& x, ReadBuffer& buf, UInt32 scale = -1) { return ans; } +template <typename T> +bool read_datetime_v2_text_impl(T& x, ReadBuffer& buf, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache, UInt32 scale = 
-1) { + static_assert(std::is_same_v<UInt64, T>); + auto dv = binary_cast<UInt64, DateV2Value<DateTimeV2ValueType>>(x); + auto ans = + dv.from_date_str(buf.position(), buf.count(), local_time_zone, time_zone_cache, scale); + + // only to match the is_all_read() check to prevent return null + buf.position() = buf.end(); + x = binary_cast<DateV2Value<DateTimeV2ValueType>, UInt64>(dv); + return ans; +} + template <typename T> bool read_decimal_text_impl(T& x, ReadBuffer& buf, UInt32 precision, UInt32 scale) { static_assert(IsDecimalNumber<T>); @@ -400,12 +432,14 @@ bool try_read_date_text(T& x, ReadBuffer& in) { } template <typename T> -bool try_read_date_v2_text(T& x, ReadBuffer& in) { - return read_date_v2_text_impl<T>(x, in); +bool try_read_date_v2_text(T& x, ReadBuffer& in, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache) { + return read_date_v2_text_impl<T>(x, in, local_time_zone, time_zone_cache); } template <typename T> -bool try_read_datetime_v2_text(T& x, ReadBuffer& in, UInt32 scale) { - return read_datetime_v2_text_impl<T>(x, in, scale); +bool try_read_datetime_v2_text(T& x, ReadBuffer& in, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache, UInt32 scale) { + return read_datetime_v2_text_impl<T>(x, in, local_time_zone, time_zone_cache, scale); } } // namespace doris::vectorized diff --git a/be/src/vec/runtime/vdatetime_value.cpp b/be/src/vec/runtime/vdatetime_value.cpp index 8aaa51a376..27926d1ee6 100644 --- a/be/src/vec/runtime/vdatetime_value.cpp +++ b/be/src/vec/runtime/vdatetime_value.cpp @@ -20,16 +20,26 @@ #include <cctz/civil_time.h> #include <cctz/time_zone.h> #include <ctype.h> +#include <glog/logging.h> #include <stdlib.h> #include <string.h> #include <time.h> // IWYU pragma: no_include <bits/chrono.h> +#include <algorithm> +#include <cctype> #include <chrono> // IWYU pragma: keep // IWYU pragma: no_include <bits/std_abs.h> #include <cmath> +#include <exception> +#include <string> +#include <string_view> +#include "common/compiler_util.h" #include "common/config.h" +#include "common/exception.h" +#include "common/status.h" #include "util/timezone_utils.h" +#include "vec/common/int_exp.h" namespace doris::vectorized { @@ -48,6 +58,11 @@ uint8_t mysql_week_mode(uint32_t mode) { return mode; } +static bool time_zone_begins(const char* ptr, const char* end) { + return *ptr == '+' || (*ptr == '-' && ptr + 3 < end && *(ptr + 3) == ':') || + (isalpha(*ptr) && *ptr != 'T'); +} + bool VecDateTimeValue::check_range(uint32_t year, uint32_t month, uint32_t day, uint32_t hour, uint32_t minute, uint32_t second, uint16_t type) { bool time = hour > (type == TIME_TIME ? 
TIME_MAX_HOUR : 23) || minute > 59 || second > 59; @@ -70,6 +85,18 @@ bool VecDateTimeValue::check_date(uint32_t year, uint32_t month, uint32_t day) { // YYYY-MM-DD HH-MM-DD.FFFFFF AM in default format // 0 1 2 3 4 5 6 7 bool VecDateTimeValue::from_date_str(const char* date_str, int len) { + return from_date_str_base(date_str, len, nullptr, nullptr); +} +//parse timezone to get offset +bool VecDateTimeValue::from_date_str(const char* date_str, int len, + const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache) { + return from_date_str_base(date_str, len, &local_time_zone, &time_zone_cache); +} + +bool VecDateTimeValue::from_date_str_base(const char* date_str, int len, + const cctz::time_zone* local_time_zone, + ZoneList* time_zone_cache) { const char* ptr = date_str; const char* end = date_str + len; // ONLY 2, 6 can follow by a space @@ -94,6 +121,7 @@ bool VecDateTimeValue::from_date_str(const char* date_str, int len) { int year_len = 4; int digits = pos - ptr; bool is_interval_format = false; + bool has_bar = false; // Compatible with MySQL. // For YYYYMMDD/YYYYMMDDHHMMSS is 4 digits years @@ -108,6 +136,7 @@ bool VecDateTimeValue::from_date_str(const char* date_str, int len) { int field_idx = 0; int field_len = year_len; + long sec_offset = 0; while (ptr < end && isdigit(*ptr) && field_idx < MAX_DATE_PARTS - 1) { const char* start = ptr; int temp_val = 0; @@ -127,6 +156,38 @@ bool VecDateTimeValue::from_date_str(const char* date_str, int len) { field_idx++; break; } + + // timezone + if (UNLIKELY((field_idx > 2 || + !has_bar) /*dont treat xxxx-xx-xx:xx:xx as xxxx-xx(-xx:xx:xx)*/ + && time_zone_begins(ptr, end))) { + if (local_time_zone == nullptr || time_zone_cache == nullptr) { + return false; + } + auto get_tz_offset = [&](const std::string& str_tz, + const cctz::time_zone* local_time_zone, + ZoneList* time_zone_cache) -> long { + // no lock needed because of the entity is of thread_local + if (time_zone_cache->find(str_tz) == time_zone_cache->end()) { // not found + if (!TimezoneUtils::find_cctz_time_zone(str_tz, (*time_zone_cache)[str_tz])) { + throw Exception {ErrorCode::INVALID_ARGUMENT, ""}; + } + } + auto given = cctz::convert(cctz::civil_second {}, (*time_zone_cache)[str_tz]); + auto local = cctz::convert(cctz::civil_second {}, *local_time_zone); + // these two values is absolute time. so they are negative. need to use (-local) - (-given) + return std::chrono::duration_cast<std::chrono::seconds>(given - local).count(); + }; + try { + sec_offset = get_tz_offset(std::string {ptr, end}, local_time_zone, + time_zone_cache); // use the whole remain string + } catch ([[maybe_unused]] Exception& e) { + return false; // invalid format + } + field_idx++; + break; + } + if (field_idx == 2 && *ptr == 'T') { // YYYYMMDDTHHMMDD, skip 'T' and continue ptr++; @@ -153,6 +214,9 @@ bool VecDateTimeValue::from_date_str(const char* date_str, int len) { return false; } } + if (*ptr == '-') { + has_bar = true; + } ptr++; } field_idx++; @@ -179,9 +243,16 @@ bool VecDateTimeValue::from_date_str(const char* date_str, int len) { } } - if (num_field < 3) return false; - return check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3], date_val[4], - date_val[5], _type); + if (num_field < 3) { + return false; + } + if (!check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3], date_val[4], + date_val[5], _type)) { + return false; + } + return sec_offset ? 
date_add_interval<TimeUnit::SECOND>( + TimeInterval {TimeUnit::SECOND, sec_offset, false}) + : true; } // [0, 101) invalid @@ -1560,7 +1631,9 @@ bool VecDateTimeValue::date_add_interval(const TimeInterval& interval) { if (!get_date_from_daynr(day_nr)) { return false; } - _type = TIME_DATETIME; + if (_second || _minute || _hour) { + _type = TIME_DATETIME; + } } else if constexpr ((unit == DAY) || (unit == WEEK)) { // This only change day information, not change second information int64_t day_nr = daynr() + interval.day * sign; @@ -1834,7 +1907,20 @@ void DateV2Value<T>::format_datetime(uint32_t* date_val, bool* carry_bits) const // YYYY-MM-DD HH-MM-DD.FFFFFF AM in default format // 0 1 2 3 4 5 6 7 template <typename T> -bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { +bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale /* = -1*/) { + return from_date_str_base(date_str, len, scale, nullptr, nullptr); +} +// when we parse +template <typename T> +bool DateV2Value<T>::from_date_str(const char* date_str, int len, + const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache, int scale /* = -1*/) { + return from_date_str_base(date_str, len, scale, &local_time_zone, &time_zone_cache); +} +template <typename T> +bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale, + const cctz::time_zone* local_time_zone, + ZoneList* time_zone_cache) { const char* ptr = date_str; const char* end = date_str + len; // ONLY 2, 6 can follow by a space @@ -1858,10 +1944,12 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { int year_len = 4; int digits = pos - ptr; bool is_interval_format = false; + bool has_bar = false; // Compatible with MySQL. // For YYYYMMDD/YYYYMMDDHHMMSS is 4 digits years - if (pos == end || *pos == '.') { + if (pos == end || *pos == '.' || + time_zone_begins(pos, end)) { // no delimeter until ./Asia/Z/GMT... 
if (digits == 4 || digits == 8 || digits >= 14) { year_len = 4; } else { @@ -1872,17 +1960,18 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { int field_idx = 0; int field_len = year_len; + long sec_offset = 0; while (ptr < end && isdigit(*ptr) && field_idx < MAX_DATE_PARTS) { const char* start = ptr; int temp_val = 0; bool scan_to_delim = (!is_interval_format) && (field_idx != 6); - while (ptr < end && isdigit(*ptr) && (scan_to_delim || field_len--)) { + while (ptr < end && isdigit(*ptr) && (scan_to_delim || field_len--)) { // field_len <= 6 temp_val = temp_val * 10 + (*ptr++ - '0'); } if (field_idx == 6) { // Microsecond - const auto ms_part = end - start; - temp_val *= std::pow(10, std::max(0L, 6 - ms_part)); + const auto ms_part = ptr - start; + temp_val *= int_exp10(std::max(0L, 6 - ms_part)); if constexpr (is_datetime) { if (scale >= 0) { if (scale == 6 && ms_part > 6) { @@ -1890,7 +1979,7 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { temp_val += 1; } } else { - const int divisor = std::pow(10, 6 - scale); + const int divisor = int_exp10(6 - scale); int remainder = temp_val % divisor; temp_val /= divisor; if (scale < 6 && std::abs(remainder) >= (divisor >> 1)) { @@ -1918,6 +2007,38 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { field_idx++; break; } + + // timezone + if (UNLIKELY((field_idx > 2 || + !has_bar) /*dont treat xxxx-xx-xx:xx:xx as xxxx-xx(-xx:xx:xx)*/ + && time_zone_begins(ptr, end))) { + if (local_time_zone == nullptr || time_zone_cache == nullptr) { + return false; + } + auto get_tz_offset = [&](const std::string& str_tz, + const cctz::time_zone* local_time_zone, + ZoneList* time_zone_cache) -> long { + // no lock needed because of the entity is of thread_local + if (time_zone_cache->find(str_tz) == time_zone_cache->end()) { // not found + if (!TimezoneUtils::find_cctz_time_zone(str_tz, (*time_zone_cache)[str_tz])) { + throw Exception {ErrorCode::INVALID_ARGUMENT, ""}; + } + } + auto given = cctz::convert(cctz::civil_second {}, (*time_zone_cache)[str_tz]); + auto local = cctz::convert(cctz::civil_second {}, *local_time_zone); + // these two values is absolute time. so they are negative. need to use (-local) - (-given) + return std::chrono::duration_cast<std::chrono::seconds>(given - local).count(); + }; + try { + sec_offset = get_tz_offset(std::string {ptr, end}, local_time_zone, + time_zone_cache); // use the whole remain string + } catch ([[maybe_unused]] Exception& e) { + return false; // invalid format + } + field_idx++; + break; + } + if (field_idx == 2 && *ptr == 'T') { // YYYYMMDDTHHMMDD, skip 'T' and continue ptr++; @@ -1944,6 +2065,9 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { return false; } } + if (*ptr == '-') { + has_bar = true; + } ptr++; } field_idx++; @@ -1964,13 +2088,20 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale) { } } - if (num_field < 3) return false; + if (num_field < 3) { + return false; + } if (is_invalid(date_val[0], date_val[1], date_val[2], 0, 0, 0, 0)) { return false; } format_datetime(date_val, carry_bits); - return check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3], date_val[4], - date_val[5], date_val[6]); + if (!check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3], date_val[4], + date_val[5], date_val[6])) { + return false; + } + return sec_offset ? 
date_add_interval<TimeUnit::SECOND>( + TimeInterval {TimeUnit::SECOND, sec_offset, false}) + : true; } template <typename T> @@ -2132,7 +2263,7 @@ bool DateV2Value<T>::from_date_format_str(const char* format, int format_len, co if (!str_to_int64(val, &tmp, &int_value)) { return false; } - microsecond = int_value * std::pow(10, 6 - min(6, val_end - val)); + microsecond = int_value * int_exp10(6 - min(6, val_end - val)); val = tmp; time_part_used = true; frac_part_used = true; @@ -2453,8 +2584,8 @@ int32_t DateV2Value<T>::to_buffer(char* buffer, int scale) const { uint32_t ms = date_v2_value_.microsecond_; int ms_width = scale == -1 ? 6 : std::min(6, scale); for (int i = 0; i < ms_width; i++) { - *buffer++ = (char)('0' + (ms / std::pow(10, 5 - i))); - ms %= (uint32_t)std::pow(10, 5 - i); + *buffer++ = (char)('0' + (ms / int_exp10(5 - i))); + ms %= (uint32_t)int_exp10(5 - i); } } else if (scale > 0) { *buffer++ = '.'; @@ -2462,8 +2593,8 @@ int32_t DateV2Value<T>::to_buffer(char* buffer, int scale) const { uint32_t ms = date_v2_value_.microsecond_; int ms_width = std::min(6, scale); for (int i = 0; i < ms_width; i++) { - *buffer++ = (char)('0' + (ms / std::pow(10, 5 - i))); - ms %= (uint32_t)std::pow(10, 5 - i); + *buffer++ = (char)('0' + (ms / int_exp10(5 - i))); + ms %= (uint32_t)int_exp10(5 - i); } } } @@ -2870,7 +3001,7 @@ bool DateV2Value<T>::from_unixtime(int64_t timestamp, int32_t nano_seconds, template <typename T> bool DateV2Value<T>::from_unixtime(int64_t timestamp, int32_t nano_seconds, - const cctz::time_zone& ctz, const int scale) { + const cctz::time_zone& ctz, int scale) { static const cctz::time_point<cctz::sys_seconds> epoch = std::chrono::time_point_cast<cctz::sys_seconds>( std::chrono::system_clock::from_time_t(0)); @@ -2878,8 +3009,12 @@ bool DateV2Value<T>::from_unixtime(int64_t timestamp, int32_t nano_seconds, const auto tp = cctz::convert(t, ctz); + if (scale > 6) [[unlikely]] { + scale = 6; + } + set_time(tp.year(), tp.month(), tp.day(), tp.hour(), tp.minute(), tp.second(), - nano_seconds / std::pow(10, 9 - scale) * std::pow(10, 6 - scale)); + nano_seconds / int_exp10(9 - scale) * int_exp10(6 - scale)); return true; } diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h index 0abc314842..0abccf5568 100644 --- a/be/src/vec/runtime/vdatetime_value.h +++ b/be/src/vec/runtime/vdatetime_value.h @@ -43,6 +43,8 @@ namespace doris { namespace vectorized { +using ZoneList = std::map<std::string, cctz::time_zone>; + enum TimeUnit { MICROSECOND, SECOND, @@ -351,6 +353,8 @@ public: // 'YY-MM-DD', 'YYYY-MM-DD', 'YY-MM-DD HH.MM.SS' // 'YYYYMMDDTHHMMSS' bool from_date_str(const char* str, int len); + bool from_date_str(const char* str, int len, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache); // Construct Date/Datetime type value from int64_t value. // Return true if convert success. Otherwise return false. 
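For reference, the offset arithmetic used by the get_tz_offset lambdas above can be reproduced with plain cctz. The sketch below is illustrative only (the function and variable names are not from the patch) and assumes the cctz library is available: converting the same civil time in two zones yields two absolute instants, and their difference is the number of seconds the parsed value must be shifted by.

#include <cctz/civil_time.h>
#include <cctz/time_zone.h>
#include <chrono>
#include <iostream>

// Illustrative sketch only: seconds to add to a wall-clock value written in
// `given_tz` so that it denotes the same instant when read in `local_tz`.
// This is the "(-local) - (-given)" reasoning from the comments in the patch.
long tz_offset_seconds(const cctz::time_zone& given_tz, const cctz::time_zone& local_tz) {
    // The same civil time (1970-01-01 00:00:00 here) converted in two zones
    // gives two absolute time points; their difference is the offset delta.
    auto given = cctz::convert(cctz::civil_second{}, given_tz);
    auto local = cctz::convert(cctz::civil_second{}, local_tz);
    return std::chrono::duration_cast<std::chrono::seconds>(given - local).count();
}

int main() {
    cctz::time_zone utc = cctz::utc_time_zone();
    // Session time_zone '+02:00', as used in the regression tests.
    cctz::time_zone session = cctz::fixed_time_zone(cctz::seconds(2 * 3600));
    // A literal suffixed with "Z"/"UTC" read under a +02:00 session must be
    // shifted forward by 7200 seconds.
    std::cout << tz_offset_seconds(utc, session) << std::endl;
    return 0;
}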
@@ -681,6 +685,9 @@ private: char* to_date_buffer(char* to) const; char* to_time_buffer(char* to) const; + bool from_date_str_base(const char* date_str, int len, const cctz::time_zone* local_time_zone, + ZoneList* time_zone_cache); + int64_t to_date_int64() const; int64_t to_time_int64() const; @@ -800,6 +807,8 @@ public: // 'YY-MM-DD', 'YYYY-MM-DD', 'YY-MM-DD HH.MM.SS' // 'YYYYMMDDTHHMMSS' bool from_date_str(const char* str, int len, int scale = -1); + bool from_date_str(const char* str, int len, const cctz::time_zone& local_time_zone, + ZoneList& time_zone_cache, int scale = -1); // Convert this value to string // this will check type to decide which format to convert @@ -940,7 +949,7 @@ public: bool from_unixtime(int64_t, const cctz::time_zone& ctz); bool from_unixtime(int64_t, int32_t, const std::string& timezone, const int scale); - bool from_unixtime(int64_t, int32_t, const cctz::time_zone& ctz, const int scale); + bool from_unixtime(int64_t, int32_t, const cctz::time_zone& ctz, int scale); bool operator==(const DateV2Value<T>& other) const { // NOTE: This is not same with MySQL. @@ -1157,6 +1166,9 @@ private: const uint8_t& day, uint8_t mode, uint16_t* to_year, bool disable_lut = false); + bool from_date_str_base(const char* date_str, int len, int scale, + const cctz::time_zone* local_time_zone, ZoneList* time_zone_cache); + // Used to construct from int value int64_t standardize_timevalue(int64_t value); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java index 770f2f9d37..11d17ec816 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java @@ -25,6 +25,7 @@ import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.InvalidFormatException; +import org.apache.doris.nereids.util.DateUtils; import org.apache.doris.thrift.TDateLiteral; import org.apache.doris.thrift.TExprNode; import org.apache.doris.thrift.TExprNodeType; @@ -175,6 +176,7 @@ public class DateLiteral extends LiteralExpr { //Regex used to determine if the TIME field exists int date_format private static final Pattern HAS_TIME_PART = Pattern.compile("^.*[HhIiklrSsTp]+.*$"); + private static final Pattern HAS_OFFSET_PART = Pattern.compile("[\\+\\-]\\d{2}:\\d{2}"); //Date Literal persist type in meta private enum DateLiteralType { @@ -358,6 +360,27 @@ public class DateLiteral extends LiteralExpr { Preconditions.checkArgument(type.isDateType()); TemporalAccessor dateTime = null; boolean parsed = false; + int offset = 0; + + // parse timezone + if (haveTimeZoneOffset(s) || haveTimeZoneName(s)) { + String tzString = new String(); + if (haveTimeZoneName(s)) { // GMT, UTC+8, Z[, CN, Asia/Shanghai] + int split = getTimeZoneSplitPos(s); + Preconditions.checkArgument(split > 0); + tzString = s.substring(split); + s = s.substring(0, split); + } else { // +04:30 + Preconditions.checkArgument(s.charAt(s.length() - 6) == '-' || s.charAt(s.length() - 6) == '+'); + tzString = s.substring(s.length() - 6); + s = s.substring(0, s.length() - 6); + } + ZoneId zone = ZoneId.of(tzString); + ZoneId dorisZone = DateUtils.getTimeZone(); + offset = dorisZone.getRules().getOffset(java.time.Instant.now()).getTotalSeconds() + - zone.getRules().getOffset(java.time.Instant.now()).getTotalSeconds(); + } + if (!s.contains("-")) { // handle format like 20210106, 
but should not handle 2021-1-6 for (DateTimeFormatter formatter : formatterList) { @@ -455,6 +478,17 @@ public class DateLiteral extends LiteralExpr { type = ScalarType.createDatetimeV2Type(scale); } this.type = type; + + if (offset != 0) { + DateLiteral result = this.plusSeconds(offset); + this.second = result.second; + this.minute = result.minute; + this.hour = result.hour; + this.day = result.day; + this.month = result.month; + this.year = result.year; + } + if (checkRange() || checkDate()) { throw new AnalysisException("Datetime value is out of range"); } @@ -1770,4 +1804,27 @@ public class DateLiteral extends LiteralExpr { return; } } + + private static boolean haveTimeZoneOffset(String arg) { + Preconditions.checkArgument(arg.length() > 6); + return HAS_OFFSET_PART.matcher(arg.substring(arg.length() - 6)).matches(); + } + + private static boolean haveTimeZoneName(String arg) { + for (char ch : arg.toCharArray()) { + if (Character.isUpperCase(ch) && ch != 'T') { + return true; + } + } + return false; + } + + private static int getTimeZoneSplitPos(String arg) { + int split = arg.length() - 1; + for (; !Character.isAlphabetic(arg.charAt(split)); split--) { + } // skip +8 of UTC+8 + for (; split >= 0 && (Character.isUpperCase(arg.charAt(split)) || arg.charAt(split) == '/'); split--) { + } + return split + 1; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/StringLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/StringLiteral.java index 4bd07be579..26665f77ac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/StringLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/StringLiteral.java @@ -261,7 +261,7 @@ public class StringLiteral extends LiteralExpr { } else if (targetType.isDateType()) { // FE only support 'yyyy-MM-dd hh:mm:ss' && 'yyyy-MM-dd' format // so if FE unchecked cast fail, we also build CastExpr for BE - // BE support other format suck as 'yyyyMMdd'... + // BE support other format such as 'yyyyMMdd'... try { return convertToDate(targetType); } catch (AnalysisException e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java index b82f0a8c94..2db3dacb49 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java @@ -26,11 +26,13 @@ import org.apache.doris.nereids.types.DateTimeType; import org.apache.doris.nereids.types.coercion.DateLikeType; import org.apache.doris.nereids.util.DateUtils; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.time.LocalDateTime; +import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; @@ -40,6 +42,7 @@ import java.time.temporal.TemporalAccessor; import java.util.Collections; import java.util.List; import java.util.Objects; +import java.util.regex.Pattern; /** * date time literal. 
@@ -58,6 +61,8 @@ public class DateTimeLiteral extends DateLiteral { private static final Logger LOG = LogManager.getLogger(DateTimeLiteral.class); + private static final Pattern HAS_OFFSET_PART = Pattern.compile("[\\+\\-]\\d{2}:\\d{2}"); + protected long hour; protected long minute; protected long second; @@ -146,6 +151,26 @@ public class DateTimeLiteral extends DateLiteral { protected void init(String s) throws AnalysisException { try { TemporalAccessor dateTime = null; + int offset = 0; + // parse timezone + if (haveTimeZoneOffset(s) || haveTimeZoneName(s)) { + String tzString = new String(); + if (haveTimeZoneName(s)) { // GMT, UTC+8, Z[, CN, Asia/Shanghai] + int split = getTimeZoneSplitPos(s); + Preconditions.checkArgument(split > 0); + tzString = s.substring(split); + s = s.substring(0, split); + } else { // +04:30 + Preconditions.checkArgument(s.charAt(s.length() - 6) == '-' || s.charAt(s.length() - 6) == '+'); + tzString = s.substring(s.length() - 6); + s = s.substring(0, s.length() - 6); + } + ZoneId zone = ZoneId.of(tzString); + ZoneId dorisZone = DateUtils.getTimeZone(); + offset = dorisZone.getRules().getOffset(java.time.Instant.now()).getTotalSeconds() + - zone.getRules().getOffset(java.time.Instant.now()).getTotalSeconds(); + } + if (!s.contains("-")) { // handle format like 20210106, but should not handle 2021-1-6 boolean parsed = false; @@ -231,6 +256,16 @@ public class DateTimeLiteral extends DateLiteral { second = DateUtils.getOrDefault(dateTime, ChronoField.SECOND_OF_MINUTE); microSecond = DateUtils.getOrDefault(dateTime, ChronoField.MICRO_OF_SECOND); + if (offset != 0) { + DateTimeLiteral result = (DateTimeLiteral) this.plusSeconds(offset); + this.second = result.second; + this.minute = result.minute; + this.hour = result.hour; + this.day = result.day; + this.month = result.month; + this.year = result.year; + } + } catch (Exception ex) { throw new AnalysisException("datetime literal [" + s + "] is invalid"); } @@ -344,4 +379,27 @@ public class DateTimeLiteral extends DateLiteral { : new DateTimeLiteral(dateTime.getYear(), dateTime.getMonthValue(), dateTime.getDayOfMonth(), dateTime.getHour(), dateTime.getMinute(), dateTime.getSecond()); } + + private static boolean haveTimeZoneOffset(String arg) { + Preconditions.checkArgument(arg.length() > 6); + return HAS_OFFSET_PART.matcher(arg.substring(arg.length() - 6)).matches(); + } + + private static boolean haveTimeZoneName(String arg) { + for (char ch : arg.toCharArray()) { + if (Character.isUpperCase(ch) && ch != 'T') { + return true; + } + } + return false; + } + + private static int getTimeZoneSplitPos(String arg) { + int split = arg.length() - 1; + for (; !Character.isAlphabetic(arg.charAt(split)); split--) { + } // skip +8 of UTC+8 + for (; split >= 0 && (Character.isUpperCase(arg.charAt(split)) || arg.charAt(split) == '/'); split--) { + } + return split + 1; + } } diff --git a/regression-test/data/datatype_p0/datetimev2/test_timezone.out b/regression-test/data/datatype_p0/datetimev2/test_timezone.out new file mode 100644 index 0000000000..5dd1b615b8 --- /dev/null +++ b/regression-test/data/datatype_p0/datetimev2/test_timezone.out @@ -0,0 +1,21 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !analysis -- +2022-01-01T01:02:55 2022-01-01 +2022-02-01T03:02:55 2022-02-01 +2022-02-28T19:02:55 2022-03-01 +2022-04-01T09:02:55 2022-03-31 +2022-05-01T00:32:55 2022-05-01 +2022-05-31T22:32:55 2022-06-01 +2022-06-30T20:02:55 2022-07-01 +2022-07-31T21:00 2022-08-01 + +-- !nereids -- +2022-01-01T01:02:55 2022-01-01 +2022-02-01T03:02:55 2022-02-01 +2022-02-28T19:02:55 2022-03-01 +2022-04-01T09:02:55 2022-03-31 +2022-05-01T00:32:55 2022-05-01 +2022-05-31T22:32:55 2022-06-01 +2022-06-30T20:02:55 2022-07-01 +2022-07-31T21:00 2022-08-01 + diff --git a/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.csv b/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.csv new file mode 100644 index 0000000000..e4e6ee3594 --- /dev/null +++ b/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.csv @@ -0,0 +1,8 @@ +2022-01-01 01:02:55,2022-01-01 01:02:55.123 +2022-02-01 01:02:55Z,2022-02-01 01:02:55.123Z +2022-03-01 01:02:55UTC+8,2022-03-01 01:02:55.123UTC +2022-04-01T01:02:55UTC-6,2022-04-01T01:02:55.123UTC+6 +2022-05-01 01:02:55+02:30,2022-05-01 01:02:55.123-02:30 +2022-06-01T01:02:55+04:30,2022-06-01 01:02:55.123-07:30 +20220701010255+07:00,20220701010255-05:00 +20220801GMT+5,20220801GMT-3 \ No newline at end of file diff --git a/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.out b/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.out new file mode 100644 index 0000000000..3fc4fb93b6 --- /dev/null +++ b/regression-test/data/datatype_p0/datetimev2/test_tz_streamload.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !all -- +2022-01-01T01:02:55 2022-01-01 +2022-02-01T03:02:55 2022-02-01 +2022-02-28T19:02:55 2022-03-01 +2022-04-01T09:02:55 2022-03-31 +2022-05-01T00:32:55 2022-05-01 +2022-05-31T22:32:55 2022-06-01 +2022-06-30T20:02:55 2022-07-01 +2022-07-31T21:00 2022-08-01 + diff --git a/regression-test/suites/datatype_p0/datetimev2/test_timezone.groovy b/regression-test/suites/datatype_p0/datetimev2/test_timezone.groovy new file mode 100644 index 0000000000..c15404bf02 --- /dev/null +++ b/regression-test/suites/datatype_p0/datetimev2/test_timezone.groovy @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
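The expected results above can be sanity-checked outside Doris with a few lines of cctz. The snippet below is only an illustration of the intended semantics, not part of the patch, and assumes cctz is installed: it re-expresses one of the test literals, '2022-02-01 01:02:55Z', in the session zone '+02:00', yielding the '2022-02-01 03:02:55' row shown in the .out files. The suite that follows exercises the same behaviour through SQL.

#include <cctz/civil_time.h>
#include <cctz/time_zone.h>
#include <iostream>
#include <string>

int main() {
    // Illustration only: the instant written as 2022-02-01 01:02:55 UTC ("Z"),
    // rendered in the +02:00 session zone used by the tests below.
    cctz::time_zone session = cctz::fixed_time_zone(cctz::seconds(2 * 3600));
    auto instant = cctz::convert(cctz::civil_second(2022, 2, 1, 1, 2, 55), cctz::utc_time_zone());
    std::cout << cctz::format("%Y-%m-%d %H:%M:%S", instant, session) << std::endl;
    // Prints 2022-02-01 03:02:55, matching the expected output above.
    return 0;
}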
+ +suite("test_timezone") { + def table = "test_timezone" + + sql "drop table if exists ${table}" + + sql """ + CREATE TABLE IF NOT EXISTS `${table}` ( + `k1` datetimev2(3) NOT NULL, + `k2` datev2 NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 3 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ set time_zone = '+02:00' """ + + sql """ set enable_nereids_planner = false """ + sql """insert into ${table} values('2022-01-01 01:02:55', '2022-01-01 01:02:55.123')""" + sql """insert into ${table} values('2022-02-01 01:02:55Z', '2022-02-01 01:02:55.123Z')""" + sql """insert into ${table} values('2022-03-01 01:02:55UTC+8', '2022-03-01 01:02:55.123UTC')""" + sql """insert into ${table} values('2022-04-01T01:02:55UTC-6', '2022-04-01T01:02:55.123UTC+6')""" + sql """insert into ${table} values('2022-05-01 01:02:55+02:30', '2022-05-01 01:02:55.123-02:30')""" + sql """insert into ${table} values('2022-06-01T01:02:55+04:30', '2022-06-01 01:02:55.123-07:30')""" + sql """insert into ${table} values('20220701010255+07:00', '20220701010255-05:00')""" + sql """insert into ${table} values('20220801GMT+5', '20220801GMT-3')""" + qt_analysis "select * from ${table} order by k1" + + sql """ truncate table ${table} """ + + sql """ set enable_nereids_planner = true """ + sql """insert into ${table} values('2022-01-01 01:02:55', '2022-01-01 01:02:55.123')""" + sql """insert into ${table} values('2022-02-01 01:02:55Z', '2022-02-01 01:02:55.123Z')""" + sql """insert into ${table} values('2022-03-01 01:02:55UTC+8', '2022-03-01 01:02:55.123UTC')""" + sql """insert into ${table} values('2022-04-01T01:02:55UTC-6', '2022-04-01T01:02:55.123UTC+6')""" + sql """insert into ${table} values('2022-05-01 01:02:55+02:30', '2022-05-01 01:02:55.123-02:30')""" + sql """insert into ${table} values('2022-06-01T01:02:55+04:30', '2022-06-01 01:02:55.123-07:30')""" + sql """insert into ${table} values('20220701010255+07:00', '20220701010255-05:00')""" + sql """insert into ${table} values('20220801GMT+5', '20220801GMT-3')""" + qt_nereids "select * from ${table} order by k1" +} diff --git a/regression-test/suites/datatype_p0/datetimev2/test_tz_streamload.groovy b/regression-test/suites/datatype_p0/datetimev2/test_tz_streamload.groovy new file mode 100644 index 0000000000..c4b0cdf051 --- /dev/null +++ b/regression-test/suites/datatype_p0/datetimev2/test_tz_streamload.groovy @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_tz_streamload") { + def dbName = "tz_streamload" + def tableName = "timezone" + + sql "drop table if exists ${tableName}" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + `k1` datetimev2(3) NULL, + `k2` datev2 NULL + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 3 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + streamLoad { + table "${tableName}" + set 'column_separator', ',' + set 'time_zone', '+02:00' + file "test_tz_streamload.csv" + time 10000 + } + sql "sync" + + qt_all "select * from ${tableName} order by k1" +} diff --git a/regression-test/suites/partition_p0/test_partition_table_err_msg.groovy b/regression-test/suites/partition_p0/test_partition_table_err_msg.groovy index e9a2e7166a..e900c24b10 100644 --- a/regression-test/suites/partition_p0/test_partition_table_err_msg.groovy +++ b/regression-test/suites/partition_p0/test_partition_table_err_msg.groovy @@ -72,7 +72,7 @@ suite("test_partition_table_err_msg", "p0") { PARTITION partition_d VALUES LESS THAN ("10000-01-01 00:00:00") ) DISTRIBUTED BY HASH(k1) BUCKETS 13 """ - exception "Datetime value is out of range" + exception "date literal [10000-01-01 00:00:00] is invalid" } test { sql """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org