This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 72c20d3ccc2 [branch-2.1](function) fix date_format and from_unixtime core when meet long format string (#35883) (#36158) 72c20d3ccc2 is described below commit 72c20d3ccc270e301108dadfeef8e36c93f6ca03 Author: zclllyybb <zhaochan...@selectdb.com> AuthorDate: Mon Jul 1 20:35:31 2024 +0800 [branch-2.1](function) fix date_format and from_unixtime core when meet long format string (#35883) (#36158) pick #35883 --- be/src/olap/types.h | 16 +++++----- .../serde/data_type_datetimev2_serde.cpp | 16 +++++----- be/src/vec/functions/date_time_transforms.h | 10 ++++--- be/src/vec/runtime/vdatetime_value.cpp | 35 ++++++++++++++-------- be/src/vec/runtime/vdatetime_value.h | 17 +++++++++-- .../data/datatype_p0/date/test_from_unixtime.out | 6 +++- .../datetime_functions/test_date_function.out | 3 ++ .../datatype_p0/date/test_from_unixtime.groovy | 1 + .../datetime_functions/test_date_function.groovy | 1 + 9 files changed, 68 insertions(+), 37 deletions(-) diff --git a/be/src/olap/types.h b/be/src/olap/types.h index edc1fdabd7a..d2de658da59 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -1236,11 +1236,11 @@ struct FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATEV2> CppType tmp = *reinterpret_cast<const CppType*>(src); DateV2Value<DateV2ValueType> value = binary_cast<CppType, DateV2Value<DateV2ValueType>>(tmp); - string format = "%Y-%m-%d"; - string res; - res.resize(12); - res.reserve(12); - value.to_format_string(format.c_str(), format.size(), res.data()); + std::string format = "%Y-%m-%d"; + std::string res; + res.resize(12 + SAFE_FORMAT_STRING_MARGIN); + value.to_format_string_conservative(format.c_str(), format.size(), res.data(), + 12 + SAFE_FORMAT_STRING_MARGIN); return res; } @@ -1277,9 +1277,9 @@ struct FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATETIMEV2> binary_cast<CppType, DateV2Value<DateTimeV2ValueType>>(tmp); string format = "%Y-%m-%d %H:%i:%s.%f"; string res; - 
res.resize(30); - res.reserve(30); - value.to_format_string(format.c_str(), format.size(), res.data()); + res.resize(30 + SAFE_FORMAT_STRING_MARGIN); + value.to_format_string_conservative(format.c_str(), format.size(), res.data(), + 30 + SAFE_FORMAT_STRING_MARGIN); return res; } diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp index 73e859f985a..ec70b127af9 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp @@ -20,7 +20,6 @@ #include <arrow/builder.h> #include <chrono> // IWYU pragma: keep -#include <type_traits> #include "vec/columns/column_const.h" #include "vec/io/io_helper.h" @@ -32,8 +31,7 @@ enum { DIVISOR_FOR_NANO = 1000000000 }; -namespace doris { -namespace vectorized { +namespace doris::vectorized { static const int64_t timestamp_threshold = -2177481943; static const int64_t timestamp_diff = 343; static const int64_t micr_to_nano_second = 1000; @@ -57,8 +55,9 @@ Status DataTypeDateTimeV2SerDe::serialize_one_cell_to_json(const IColumn& column if (options.date_olap_format) { std::string format = "%Y-%m-%d %H:%i:%s.%f"; - char buf[30]; - val.to_format_string(format.c_str(), format.size(), buf); + char buf[30 + SAFE_FORMAT_STRING_MARGIN]; + val.to_format_string_conservative(format.c_str(), format.size(), buf, + 30 + SAFE_FORMAT_STRING_MARGIN); std::string s = std::string(buf); bw.write(s.c_str(), s.length()); } else { @@ -132,7 +131,7 @@ void DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column, auto& col_data = static_cast<ColumnVector<Int64>&>(column).get_data(); int64_t divisor = 1; if (arrow_array->type()->id() == arrow::Type::TIMESTAMP) { - auto concrete_array = dynamic_cast<const arrow::TimestampArray*>(arrow_array); + const auto* concrete_array = dynamic_cast<const arrow::TimestampArray*>(arrow_array); const auto type = 
std::static_pointer_cast<arrow::TimestampType>(arrow_array->type()); switch (type->unit()) { case arrow::TimeUnit::type::SECOND: { @@ -176,7 +175,7 @@ template <bool is_binary_format> Status DataTypeDateTimeV2SerDe::_write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result, int row_idx, bool col_const) const { - auto& data = assert_cast<const ColumnVector<UInt64>&>(column).get_data(); + const auto& data = assert_cast<const ColumnVector<UInt64>&>(column).get_data(); const auto col_index = index_check_const(row_idx, col_const); DateV2Value<DateTimeV2ValueType> date_val = binary_cast<UInt64, DateV2Value<DateTimeV2ValueType>>(data[col_index]); @@ -245,5 +244,4 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const std::string& timezone, return Status::OK(); } -} // namespace vectorized -} // namespace doris +} // namespace doris::vectorized diff --git a/be/src/vec/functions/date_time_transforms.h b/be/src/vec/functions/date_time_transforms.h index a513f9f154d..266c9b5d272 100644 --- a/be/src/vec/functions/date_time_transforms.h +++ b/be/src/vec/functions/date_time_transforms.h @@ -190,8 +190,9 @@ struct DateFormatImpl { if (format.size > 128) { return std::pair {offset, true}; } - char buf[128]; - if (!dt.to_format_string(format.data, format.size, buf)) { + char buf[100 + SAFE_FORMAT_STRING_MARGIN]; + if (!dt.to_format_string_conservative(format.data, format.size, buf, + 100 + SAFE_FORMAT_STRING_MARGIN)) { return std::pair {offset, true}; } @@ -227,8 +228,9 @@ struct FromUnixTimeImpl { } dt.from_unixtime(val, time_zone); - char buf[128]; - if (!dt.to_format_string(format.data, format.size, buf)) { + char buf[100 + SAFE_FORMAT_STRING_MARGIN]; + if (!dt.to_format_string_conservative(format.data, format.size, buf, + 100 + SAFE_FORMAT_STRING_MARGIN)) { return std::pair {offset, true}; } diff --git a/be/src/vec/runtime/vdatetime_value.cpp b/be/src/vec/runtime/vdatetime_value.cpp index bd2a7044a7f..b82f706e2eb 100644 --- 
a/be/src/vec/runtime/vdatetime_value.cpp +++ b/be/src/vec/runtime/vdatetime_value.cpp @@ -543,6 +543,7 @@ bool VecDateTimeValue::from_date_daynr(uint64_t daynr) { return true; } +/// @return: tail static char* int_to_str(uint64_t val, char* to) { char buf[64]; char* ptr = buf; @@ -555,7 +556,6 @@ static char* int_to_str(uint64_t val, char* to) { while (ptr > buf) { *to++ = *--ptr; } - return to; } @@ -566,18 +566,17 @@ static char* append_string(const char* from, char* to) { return to; } -static char* append_with_prefix(const char* str, int str_len, char prefix, int full_len, char* to) { - int len = (str_len > full_len) ? str_len : full_len; - len -= str_len; - while (len-- > 0) { - // push prefix; +static char* append_with_prefix(const char* str, int str_len, char prefix, int target_len, + char* to) { + // full_len is the lower bound. if less, use prefix to pad. if greater, accept all. + int diff = target_len - str_len; + // use prefix to pad + while (diff-- > 0) { // won't be INT_MIN. 
it's ok *to++ = prefix; } - while (str_len-- > 0) { - *to++ = *str++; - } - return to; + memcpy(to, str, str_len); + return to + str_len; } int VecDateTimeValue::compute_format_len(const char* format, int len) { @@ -673,10 +672,12 @@ char* write_four_digits_to_string(int number, char* dst) { return dst + 4; } -bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) const { +bool VecDateTimeValue::to_format_string_conservative(const char* format, int len, char* to, + int max_valid_length) const { if (check_range(_year, _month, _day, _hour, _minute, _second, _type)) { return false; } + char* const begin = to; // to check written bytes char buf[64]; char* cursor = buf; char* pos = nullptr; @@ -685,6 +686,9 @@ bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) c char ch = '\0'; while (ptr < end) { + if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) [[unlikely]] { + return false; + } if (*ptr != '%' || (ptr + 1) == end) { *to++ = *ptr++; continue; @@ -932,6 +936,7 @@ bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) c break; } default: + // put it literal *to++ = ch; break; } @@ -3421,10 +3426,12 @@ void DateV2Value<T>::set_microsecond(uint32_t microsecond) { } template <typename T> -bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) const { +bool DateV2Value<T>::to_format_string_conservative(const char* format, int len, char* to, + int max_valid_length) const { if (is_invalid(year(), month(), day(), hour(), minute(), second(), microsecond())) { return false; } + char* const begin = to; // to check written bytes char buf[64]; char* pos = nullptr; char* cursor = buf; @@ -3433,6 +3440,9 @@ bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) con char ch = '\0'; while (ptr < end) { + if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) [[unlikely]] { + return false; + } if (*ptr != '%' || (ptr + 1) == end) { *to++ = 
*ptr++; continue; @@ -3666,6 +3676,7 @@ bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) con break; } default: + // put it literal *to++ = ch; break; } diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h index 0251964b809..5606fe37779 100644 --- a/be/src/vec/runtime/vdatetime_value.h +++ b/be/src/vec/runtime/vdatetime_value.h @@ -143,6 +143,8 @@ struct TimeInterval { enum TimeType { TIME_TIME = 1, TIME_DATE = 2, TIME_DATETIME = 3 }; +constexpr int SAFE_FORMAT_STRING_MARGIN = 12; + // Used to compute week const int WEEK_MONDAY_FIRST = 1; const int WEEK_YEAR = 2; @@ -394,8 +396,12 @@ public: char* to_string(char* to) const; - // Convert this datetime value to string by the format string - bool to_format_string(const char* format, int len, char* to) const; + // Convert this datetime value to string by the format string. + // for performance of checking, may return false when just APPROACH BUT NOT REACH max_valid_length. + // so need a little big buffer and its length as max_valid_length to make sure store valid data. + // to make sure of this. make the buffer size = <data_need_length> + SAFE_FORMAT_STRING_MARGIN. and pass this size as max_valid_length + bool to_format_string_conservative(const char* format, int len, char* to, + int max_valid_length) const; // compute the length of data format pattern static int compute_format_len(const char* format, int len); @@ -822,7 +828,12 @@ public: return val; } - bool to_format_string(const char* format, int len, char* to) const; + // Convert this datetime value to string by the format string. + // for performance of checking, may return false when just APPROACH BUT NOT REACH max_valid_length. + // so need a little big buffer and its length as max_valid_length to make sure store valid data. + // to make sure of this. make the buffer size = <data_need_length> + SAFE_FORMAT_STRING_MARGIN. 
and pass this size as max_valid_length + bool to_format_string_conservative(const char* format, int len, char* to, + int max_valid_length) const; bool from_date_format_str(const char* format, int format_len, const char* value, int value_len) { diff --git a/regression-test/data/datatype_p0/date/test_from_unixtime.out b/regression-test/data/datatype_p0/date/test_from_unixtime.out index a78ef107ea2..622d1971202 100644 --- a/regression-test/data/datatype_p0/date/test_from_unixtime.out +++ b/regression-test/data/datatype_p0/date/test_from_unixtime.out @@ -27,4 +27,8 @@ \N -- !sql10 -- -\N \ No newline at end of file +\N + +-- !long -- +\N + diff --git a/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out b/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out index d71b9968051..8c256e42d57 100644 --- a/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out +++ b/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out @@ -491,6 +491,9 @@ true -- !sql -- 2022 31 4 +-- !sql_date_format_long -- +\N + -- !sql -- \N diff --git a/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy b/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy index 43d4a581509..ecbc7ddc393 100644 --- a/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy +++ b/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy @@ -44,4 +44,5 @@ suite("test_from_unixtime") { qt_sql9 "select from_unixtime(-7629445119491449, \"%Y-%m-%d\");" qt_sql10 "select from_unixtime(-7629445119491449);" + qt_long "select from_unixtime(1196440219, '%f %V %f %l %V %I %S %p %w %r %j %f %l %I %D %w %j %D %e %s %V %f %D %M %s %X %U %v %c %u %x %r %j %a %h %s %m %a %v %u %b');" } diff --git a/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy 
b/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy index 2f1ef98b4ea..c83fbcad789 100644 --- a/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy @@ -474,6 +474,7 @@ suite("test_date_function") { qt_sql """ select date_format('1999-01-01', '%X %V'); """ qt_sql """ select date_format('2025-01-01', '%X %V'); """ qt_sql """ select date_format('2022-08-04', '%X %V %w'); """ + qt_sql_date_format_long """ select date_format(cast('2011-06-24' as DATETIMEV2(0)), '%f %V %f %l %V %I %S %p %w %r %j %f %l %I %D %w %j %D %e %s %V %f %D %M %s %X %U %v %c %u %x %r %j %a %h %s %m %a %v %u %b') """ qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e %H:%i:%s %Y'); """ qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e %T CST %Y'); """ qt_sql """ select STR_TO_DATE('2018-4-2 15:3:28','%Y-%m-%d %H:%i:%s'); """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org