This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 72c20d3ccc2 [branch-2.1](function) fix date_format and from_unixtime 
core when meet long format string (#35883) (#36158)
72c20d3ccc2 is described below

commit 72c20d3ccc270e301108dadfeef8e36c93f6ca03
Author: zclllyybb <zhaochan...@selectdb.com>
AuthorDate: Mon Jul 1 20:35:31 2024 +0800

    [branch-2.1](function) fix date_format and from_unixtime core when meet 
long format string (#35883) (#36158)
    
    pick #35883
---
 be/src/olap/types.h                                | 16 +++++-----
 .../serde/data_type_datetimev2_serde.cpp           | 16 +++++-----
 be/src/vec/functions/date_time_transforms.h        | 10 ++++---
 be/src/vec/runtime/vdatetime_value.cpp             | 35 ++++++++++++++--------
 be/src/vec/runtime/vdatetime_value.h               | 17 +++++++++--
 .../data/datatype_p0/date/test_from_unixtime.out   |  6 +++-
 .../datetime_functions/test_date_function.out      |  3 ++
 .../datatype_p0/date/test_from_unixtime.groovy     |  1 +
 .../datetime_functions/test_date_function.groovy   |  1 +
 9 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/be/src/olap/types.h b/be/src/olap/types.h
index edc1fdabd7a..d2de658da59 100644
--- a/be/src/olap/types.h
+++ b/be/src/olap/types.h
@@ -1236,11 +1236,11 @@ struct 
FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATEV2>
         CppType tmp = *reinterpret_cast<const CppType*>(src);
         DateV2Value<DateV2ValueType> value =
                 binary_cast<CppType, DateV2Value<DateV2ValueType>>(tmp);
-        string format = "%Y-%m-%d";
-        string res;
-        res.resize(12);
-        res.reserve(12);
-        value.to_format_string(format.c_str(), format.size(), res.data());
+        std::string format = "%Y-%m-%d";
+        std::string res;
+        res.resize(12 + SAFE_FORMAT_STRING_MARGIN);
+        value.to_format_string_conservative(format.c_str(), format.size(), 
res.data(),
+                                            12 + SAFE_FORMAT_STRING_MARGIN);
         return res;
     }
 
@@ -1277,9 +1277,9 @@ struct 
FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATETIMEV2>
                 binary_cast<CppType, DateV2Value<DateTimeV2ValueType>>(tmp);
         string format = "%Y-%m-%d %H:%i:%s.%f";
         string res;
-        res.resize(30);
-        res.reserve(30);
-        value.to_format_string(format.c_str(), format.size(), res.data());
+        res.resize(30 + SAFE_FORMAT_STRING_MARGIN);
+        value.to_format_string_conservative(format.c_str(), format.size(), 
res.data(),
+                                            30 + SAFE_FORMAT_STRING_MARGIN);
         return res;
     }
 
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp 
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
index 73e859f985a..ec70b127af9 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
@@ -20,7 +20,6 @@
 #include <arrow/builder.h>
 
 #include <chrono> // IWYU pragma: keep
-#include <type_traits>
 
 #include "vec/columns/column_const.h"
 #include "vec/io/io_helper.h"
@@ -32,8 +31,7 @@ enum {
     DIVISOR_FOR_NANO = 1000000000
 };
 
-namespace doris {
-namespace vectorized {
+namespace doris::vectorized {
 static const int64_t timestamp_threshold = -2177481943;
 static const int64_t timestamp_diff = 343;
 static const int64_t micr_to_nano_second = 1000;
@@ -57,8 +55,9 @@ Status 
DataTypeDateTimeV2SerDe::serialize_one_cell_to_json(const IColumn& column
 
     if (options.date_olap_format) {
         std::string format = "%Y-%m-%d %H:%i:%s.%f";
-        char buf[30];
-        val.to_format_string(format.c_str(), format.size(), buf);
+        char buf[30 + SAFE_FORMAT_STRING_MARGIN];
+        val.to_format_string_conservative(format.c_str(), format.size(), buf,
+                                          30 + SAFE_FORMAT_STRING_MARGIN);
         std::string s = std::string(buf);
         bw.write(s.c_str(), s.length());
     } else {
@@ -132,7 +131,7 @@ void 
DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column,
     auto& col_data = static_cast<ColumnVector<Int64>&>(column).get_data();
     int64_t divisor = 1;
     if (arrow_array->type()->id() == arrow::Type::TIMESTAMP) {
-        auto concrete_array = dynamic_cast<const 
arrow::TimestampArray*>(arrow_array);
+        const auto* concrete_array = dynamic_cast<const 
arrow::TimestampArray*>(arrow_array);
         const auto type = 
std::static_pointer_cast<arrow::TimestampType>(arrow_array->type());
         switch (type->unit()) {
         case arrow::TimeUnit::type::SECOND: {
@@ -176,7 +175,7 @@ template <bool is_binary_format>
 Status DataTypeDateTimeV2SerDe::_write_column_to_mysql(const IColumn& column,
                                                        
MysqlRowBuffer<is_binary_format>& result,
                                                        int row_idx, bool 
col_const) const {
-    auto& data = assert_cast<const ColumnVector<UInt64>&>(column).get_data();
+    const auto& data = assert_cast<const 
ColumnVector<UInt64>&>(column).get_data();
     const auto col_index = index_check_const(row_idx, col_const);
     DateV2Value<DateTimeV2ValueType> date_val =
             binary_cast<UInt64, 
DateV2Value<DateTimeV2ValueType>>(data[col_index]);
@@ -245,5 +244,4 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const 
std::string& timezone,
     return Status::OK();
 }
 
-} // namespace vectorized
-} // namespace doris
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/date_time_transforms.h 
b/be/src/vec/functions/date_time_transforms.h
index a513f9f154d..266c9b5d272 100644
--- a/be/src/vec/functions/date_time_transforms.h
+++ b/be/src/vec/functions/date_time_transforms.h
@@ -190,8 +190,9 @@ struct DateFormatImpl {
         if (format.size > 128) {
             return std::pair {offset, true};
         }
-        char buf[128];
-        if (!dt.to_format_string(format.data, format.size, buf)) {
+        char buf[100 + SAFE_FORMAT_STRING_MARGIN];
+        if (!dt.to_format_string_conservative(format.data, format.size, buf,
+                                              100 + 
SAFE_FORMAT_STRING_MARGIN)) {
             return std::pair {offset, true};
         }
 
@@ -227,8 +228,9 @@ struct FromUnixTimeImpl {
         }
         dt.from_unixtime(val, time_zone);
 
-        char buf[128];
-        if (!dt.to_format_string(format.data, format.size, buf)) {
+        char buf[100 + SAFE_FORMAT_STRING_MARGIN];
+        if (!dt.to_format_string_conservative(format.data, format.size, buf,
+                                              100 + 
SAFE_FORMAT_STRING_MARGIN)) {
             return std::pair {offset, true};
         }
 
diff --git a/be/src/vec/runtime/vdatetime_value.cpp 
b/be/src/vec/runtime/vdatetime_value.cpp
index bd2a7044a7f..b82f706e2eb 100644
--- a/be/src/vec/runtime/vdatetime_value.cpp
+++ b/be/src/vec/runtime/vdatetime_value.cpp
@@ -543,6 +543,7 @@ bool VecDateTimeValue::from_date_daynr(uint64_t daynr) {
     return true;
 }
 
+/// @return: tail
 static char* int_to_str(uint64_t val, char* to) {
     char buf[64];
     char* ptr = buf;
@@ -555,7 +556,6 @@ static char* int_to_str(uint64_t val, char* to) {
     while (ptr > buf) {
         *to++ = *--ptr;
     }
-
     return to;
 }
 
@@ -566,18 +566,17 @@ static char* append_string(const char* from, char* to) {
     return to;
 }
 
-static char* append_with_prefix(const char* str, int str_len, char prefix, int 
full_len, char* to) {
-    int len = (str_len > full_len) ? str_len : full_len;
-    len -= str_len;
-    while (len-- > 0) {
-        // push prefix;
+static char* append_with_prefix(const char* str, int str_len, char prefix, int 
target_len,
+                                char* to) {
+    // target_len is the lower bound. if str is shorter, pad with prefix. if longer, 
copy it in full.
+    int diff = target_len - str_len;
+    // use prefix to pad
+    while (diff-- > 0) { // won't be INT_MIN. it's ok
         *to++ = prefix;
     }
-    while (str_len-- > 0) {
-        *to++ = *str++;
-    }
 
-    return to;
+    memcpy(to, str, str_len);
+    return to + str_len;
 }
 
 int VecDateTimeValue::compute_format_len(const char* format, int len) {
@@ -673,10 +672,12 @@ char* write_four_digits_to_string(int number, char* dst) {
     return dst + 4;
 }
 
-bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) 
const {
+bool VecDateTimeValue::to_format_string_conservative(const char* format, int 
len, char* to,
+                                                     int max_valid_length) 
const {
     if (check_range(_year, _month, _day, _hour, _minute, _second, _type)) {
         return false;
     }
+    char* const begin = to; // to check written bytes
     char buf[64];
     char* cursor = buf;
     char* pos = nullptr;
@@ -685,6 +686,9 @@ bool VecDateTimeValue::to_format_string(const char* format, 
int len, char* to) c
     char ch = '\0';
 
     while (ptr < end) {
+        if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) 
[[unlikely]] {
+            return false;
+        }
         if (*ptr != '%' || (ptr + 1) == end) {
             *to++ = *ptr++;
             continue;
@@ -932,6 +936,7 @@ bool VecDateTimeValue::to_format_string(const char* format, 
int len, char* to) c
             break;
         }
         default:
+            // put it literal
             *to++ = ch;
             break;
         }
@@ -3421,10 +3426,12 @@ void DateV2Value<T>::set_microsecond(uint32_t 
microsecond) {
 }
 
 template <typename T>
-bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) 
const {
+bool DateV2Value<T>::to_format_string_conservative(const char* format, int 
len, char* to,
+                                                   int max_valid_length) const 
{
     if (is_invalid(year(), month(), day(), hour(), minute(), second(), 
microsecond())) {
         return false;
     }
+    char* const begin = to; // to check written bytes
     char buf[64];
     char* pos = nullptr;
     char* cursor = buf;
@@ -3433,6 +3440,9 @@ bool DateV2Value<T>::to_format_string(const char* format, 
int len, char* to) con
     char ch = '\0';
 
     while (ptr < end) {
+        if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) 
[[unlikely]] {
+            return false;
+        }
         if (*ptr != '%' || (ptr + 1) == end) {
             *to++ = *ptr++;
             continue;
@@ -3666,6 +3676,7 @@ bool DateV2Value<T>::to_format_string(const char* format, 
int len, char* to) con
             break;
         }
         default:
+            // put it literal
             *to++ = ch;
             break;
         }
diff --git a/be/src/vec/runtime/vdatetime_value.h 
b/be/src/vec/runtime/vdatetime_value.h
index 0251964b809..5606fe37779 100644
--- a/be/src/vec/runtime/vdatetime_value.h
+++ b/be/src/vec/runtime/vdatetime_value.h
@@ -143,6 +143,8 @@ struct TimeInterval {
 
 enum TimeType { TIME_TIME = 1, TIME_DATE = 2, TIME_DATETIME = 3 };
 
+constexpr int SAFE_FORMAT_STRING_MARGIN = 12;
+
 // Used to compute week
 const int WEEK_MONDAY_FIRST = 1;
 const int WEEK_YEAR = 2;
@@ -394,8 +396,12 @@ public:
 
     char* to_string(char* to) const;
 
-    // Convert this datetime value to string by the format string
-    bool to_format_string(const char* format, int len, char* to) const;
+    // Convert this datetime value to string by the format string.
+    // To keep the bounds check cheap, this may return false when the output only 
APPROACHES BUT DOES NOT REACH max_valid_length.
+    // So the buffer must be somewhat larger than the data needs, and its 
length must be passed as max_valid_length.
+    // To guarantee this, make the buffer size = <data_need_length> + 
SAFE_FORMAT_STRING_MARGIN and pass this size as max_valid_length.
+    bool to_format_string_conservative(const char* format, int len, char* to,
+                                       int max_valid_length) const;
 
     // compute the length of data format pattern
     static int compute_format_len(const char* format, int len);
@@ -822,7 +828,12 @@ public:
         return val;
     }
 
-    bool to_format_string(const char* format, int len, char* to) const;
+    // Convert this datetime value to string by the format string.
+    // To keep the bounds check cheap, this may return false when the output only 
APPROACHES BUT DOES NOT REACH max_valid_length.
+    // So the buffer must be somewhat larger than the data needs, and its 
length must be passed as max_valid_length.
+    // To guarantee this, make the buffer size = <data_need_length> + 
SAFE_FORMAT_STRING_MARGIN and pass this size as max_valid_length.
+    bool to_format_string_conservative(const char* format, int len, char* to,
+                                       int max_valid_length) const;
 
     bool from_date_format_str(const char* format, int format_len, const char* 
value,
                               int value_len) {
diff --git a/regression-test/data/datatype_p0/date/test_from_unixtime.out 
b/regression-test/data/datatype_p0/date/test_from_unixtime.out
index a78ef107ea2..622d1971202 100644
--- a/regression-test/data/datatype_p0/date/test_from_unixtime.out
+++ b/regression-test/data/datatype_p0/date/test_from_unixtime.out
@@ -27,4 +27,8 @@
 \N
 
 -- !sql10 --
-\N
\ No newline at end of file
+\N
+
+-- !long --
+\N
+
diff --git 
a/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out
 
b/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out
index d71b9968051..8c256e42d57 100644
--- 
a/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out
+++ 
b/regression-test/data/nereids_p0/sql_functions/datetime_functions/test_date_function.out
@@ -491,6 +491,9 @@ true
 -- !sql --
 2022 31 4
 
+-- !sql_date_format_long --
+\N
+
 -- !sql --
 \N
 
diff --git a/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy 
b/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy
index 43d4a581509..ecbc7ddc393 100644
--- a/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy
+++ b/regression-test/suites/datatype_p0/date/test_from_unixtime.groovy
@@ -44,4 +44,5 @@ suite("test_from_unixtime") {
     qt_sql9 "select from_unixtime(-7629445119491449, \"%Y-%m-%d\");"
     qt_sql10 "select from_unixtime(-7629445119491449);"
 
+    qt_long "select from_unixtime(1196440219, '%f %V %f %l %V %I %S %p %w %r 
%j %f %l %I %D %w %j %D %e %s %V %f %D %M %s %X %U %v %c %u %x %r %j %a %h %s 
%m %a %v %u %b');"
 }
diff --git 
a/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy
 
b/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy
index 2f1ef98b4ea..c83fbcad789 100644
--- 
a/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy
+++ 
b/regression-test/suites/nereids_p0/sql_functions/datetime_functions/test_date_function.groovy
@@ -474,6 +474,7 @@ suite("test_date_function") {
     qt_sql """ select date_format('1999-01-01', '%X %V'); """
     qt_sql """ select date_format('2025-01-01', '%X %V'); """
     qt_sql """ select date_format('2022-08-04', '%X %V %w'); """
+    qt_sql_date_format_long """ select date_format(cast('2011-06-24' as 
DATETIMEV2(0)), '%f %V %f %l %V %I %S %p %w %r %j %f %l %I %D %w %j %D %e %s %V 
%f %D %M %s %X %U %v %c %u %x %r %j %a %h %s %m %a %v %u %b') """
     qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e 
%H:%i:%s %Y'); """
     qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e %T 
CST %Y'); """
     qt_sql """ select STR_TO_DATE('2018-4-2 15:3:28','%Y-%m-%d %H:%i:%s'); """


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to