This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new ad36fe532bd branch-2.1: [bugfix](es catalog) Fix the parsing error of es catalog for special time format #54659 (#55328)
ad36fe532bd is described below

commit ad36fe532bd5a18349bf91e42006c8740d40f734
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Sep 3 19:54:11 2025 +0800

    branch-2.1: [bugfix](es catalog) Fix the parsing error of es catalog for special time format #54659 (#55328)
    
    Cherry-picked from #54659
    
    Co-authored-by: lw112 <[email protected]>
---
 be/src/exec/es/es_scroll_parser.cpp    |   2 +-
 be/test/exec/es_scroll_parser_test.cpp | 184 +++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp
index 6067203f2ba..6e72596bd4d 100644
--- a/be/src/exec/es/es_scroll_parser.cpp
+++ b/be/src/exec/es/es_scroll_parser.cpp
@@ -199,7 +199,7 @@ Status get_date_value_int(const rapidjson::Value& col, PrimitiveType type, bool
             std::chrono::system_clock::time_point tp;
             // time_zone suffix pattern
             // Z/+08:00/-04:30
-            RE2 time_zone_pattern(R"([+-]\d{2}:\d{2}|Z)");
+            RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
             bool ok = false;
             std::string fmt;
             re2::StringPiece value;
diff --git a/be/test/exec/es_scroll_parser_test.cpp b/be/test/exec/es_scroll_parser_test.cpp
new file mode 100644
index 00000000000..4f4f53618a0
--- /dev/null
+++ b/be/test/exec/es_scroll_parser_test.cpp
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <rapidjson/document.h>
+#include <re2/re2.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace doris {
+
+// Fixture shared by the ES scroll parser time-zone regex tests. It carries no
+// state; both hooks are intentionally empty.
+class EsScrollParserTest : public ::testing::Test {
+public:
+    void TearDown() override {}
+    void SetUp() override {}
+};
+
+// Checks that the time-zone regex used for ES datetime parsing finds a zone
+// suffix in each sample string, covering "Z", "+HH:MM"/"-HH:MM" and the
+// colon-less "+HHMM"/"-HHMM" spellings.
+TEST_F(EsScrollParserTest, TestTimezonePatternMatching) {
+    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
+
+    const std::vector<std::string> valid_timezone_formats = {
+            "2025-05-23T20:56:52.052+0900",  "2025-05-23T20:56:52.052-0500",
+            "2025-05-23T20:56:52.052+08:00", "2025-05-23T20:56:52.052-04:30",
+            "2025-05-23T20:56:52.052Z",      "2022-08-08T12:10:10.151Z",
+            "2022-08-08T12:10:10+0900",      "2022-08-08T12:10:10-0500"};
+
+    for (const auto& datetime_str : valid_timezone_formats) {
+        re2::StringPiece timezone_value;
+        const bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
+                                                     RE2::UNANCHORED, &timezone_value, 1);
+        EXPECT_TRUE(matched) << "Failed to match timezone in: " << datetime_str;
+        if (!matched) {
+            // No capture to inspect; the checks below would only repeat the failure.
+            continue;
+        }
+
+        const std::string timezone = timezone_value.as_string();
+        EXPECT_FALSE(timezone.empty()) << "Empty timezone captured from: " << datetime_str;
+        if (timezone.empty() || timezone == "Z") {
+            // "Z" has no sign or offset digits to validate.
+            continue;
+        }
+
+        EXPECT_TRUE(timezone[0] == '+' || timezone[0] == '-')
+                << "Invalid timezone sign in: " << timezone;
+        // Valid timezone lengths: 5 for +0900, 6 for +08:00
+        EXPECT_TRUE(timezone.length() == 5 || timezone.length() == 6)
+                << "Invalid timezone length in: " << timezone
+                << " (length: " << timezone.length() << ")";
+    }
+}
+
+// Checks that strings with no zone suffix, or with a malformed offset
+// ("+9", "+090", "+9:00"), are not matched by the time-zone regex.
+TEST_F(EsScrollParserTest, TestInvalidTimezonePatterns) {
+    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
+
+    const std::vector<std::string> invalid_formats = {
+            "2025-05-23T20:56:52.052", "2025-05-23T20:56:52.052+9", "2025-05-23T20:56:52.052+090",
+            "2025-05-23T20:56:52.052+9:00"};
+
+    for (const auto& datetime_str : invalid_formats) {
+        re2::StringPiece timezone_value;
+        const bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
+                                                     RE2::UNANCHORED, &timezone_value, 1);
+        // Every alternative of the pattern consumes at least one character, so a
+        // successful match always yields a non-empty capture. Assert the absence
+        // of a match directly instead of testing for an empty capture.
+        EXPECT_FALSE(matched) << "Should not capture timezone from: " << datetime_str;
+    }
+}
+
+// Regression check for the format named in the test as the bug scenario: a
+// millisecond timestamp followed by a colon-less "+0900" offset must yield
+// "+0900" as the captured suffix.
+TEST_F(EsScrollParserTest, TestBugScenarioTimezoneFormat) {
+    const std::string problematic_format = "2025-05-23T20:56:52.052+0900";
+    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
+
+    re2::StringPiece captured;
+    const bool found = time_zone_pattern.Match(problematic_format, 0, problematic_format.size(),
+                                               RE2::UNANCHORED, &captured, 1);
+    EXPECT_TRUE(found) << "Failed to match the bug scenario format: " << problematic_format;
+
+    const std::string zone = captured.as_string();
+    EXPECT_EQ(zone, "+0900") << "Incorrect timezone captured: " << zone;
+}
+
+// Exercises boundary-style offsets. The regex only checks the textual shape of
+// the suffix, so an out-of-range offset such as "+99:99" is still expected to
+// match, while a string with no suffix is expected not to match.
+TEST_F(EsScrollParserTest, TestEdgeCaseTimezoneFormats) {
+    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
+
+    // One row per case keeps the input and its expectations together, so they
+    // cannot drift out of step the way index-aligned vectors can.
+    struct EdgeCase {
+        std::string datetime_str;
+        std::string expected_match;
+        bool should_match;
+    };
+    const std::vector<EdgeCase> edge_cases = {
+            {"2025-05-23T20:56:52.052+00:00", "+00:00", true}, // UTC with colon
+            {"2025-05-23T20:56:52.052-00:00", "-00:00", true}, // UTC with colon
+            {"2025-05-23T20:56:52.052+23:59", "+23:59", true}, // max valid timezone
+            {"2025-05-23T20:56:52.052-23:59", "-23:59", true}, // max valid timezone
+            {"2025-05-23T20:56:52.052+99:99", "+99:99", true}, // invalid but should match pattern
+            {"2025-05-23T20:56:52.052Z", "Z", true},           // Z (UTC)
+            {"2025-05-23T20:56:52.052+0800", "+0800", true},   // no colon
+            {"2025-05-23T20:56:52.052", "", false},            // no timezone at all
+    };
+
+    for (const auto& edge_case : edge_cases) {
+        const std::string& datetime_str = edge_case.datetime_str;
+
+        re2::StringPiece timezone_value;
+        const bool matched = time_zone_pattern.Match(datetime_str, 0, datetime_str.size(),
+                                                     RE2::UNANCHORED, &timezone_value, 1);
+
+        EXPECT_EQ(matched, edge_case.should_match)
+                << "Edge case test failed for: " << datetime_str
+                << " (expected match: " << edge_case.should_match << ")";
+
+        // Only compare the capture when a match was both expected and found.
+        if (matched && edge_case.should_match) {
+            const std::string timezone = timezone_value.as_string();
+            EXPECT_EQ(timezone, edge_case.expected_match)
+                    << "Incorrect timezone captured from: " << datetime_str
+                    << " (expected: " << edge_case.expected_match << ", got: " << timezone << ")";
+        }
+    }
+}
+
+// Runs a table of additional datetime strings (second and sub-second precision,
+// offsets with and without a colon, and "Z") through the time-zone regex and
+// compares each capture with the expected suffix.
+TEST_F(EsScrollParserTest, TestSpecialTimezoneEdgeCases) {
+    using Expectation = std::pair<std::string, bool>;        // {expected_timezone, should_match}
+    using SpecialCase = std::pair<std::string, Expectation>; // {datetime_string, expectation}
+
+    const std::vector<SpecialCase> special_cases = {
+            {"2025-05-23T20:56:52+0000", {"+0000", true}},          // +0000 without colon
+            {"2025-05-23T20:56:52-0000", {"-0000", true}},          // -0000 without colon
+            {"2025-05-23T20:56:52+12:30", {"+12:30", true}},        // +12:30 with colon
+            {"2025-05-23T20:56:52-12:30", {"-12:30", true}},        // -12:30 with colon
+            {"2025-05-23T20:56:52+1200", {"+1200", true}},          // +1200 without colon
+            {"2025-05-23T20:56:52-1200", {"-1200", true}},          // -1200 without colon
+            {"2025-05-23T20:56:52.000Z", {"Z", true}},              // Z with milliseconds
+            {"2025-05-23T20:56:52.123456+05:30", {"+05:30", true}}, // microseconds with timezone
+            {"2025-05-23T20:56:52.123456-05:30", {"-05:30", true}}, // microseconds with timezone
+            {"2025-05-23T20:56:52.123456+0530", {"+0530", true}},   // microseconds without colon
+            {"2025-05-23T20:56:52.123456-0530", {"-0530", true}},   // microseconds without colon
+            {"2025-05-23T20:56:52+14:00", {"+14:00", true}},        // +14:00 (valid max timezone)
+            {"2025-05-23T20:56:52-12:00", {"-12:00", true}},        // -12:00 (valid min timezone)
+    };
+
+    RE2 time_zone_pattern(R"([+-]\d{2}:?\d{2}|Z)");
+
+    for (const SpecialCase& entry : special_cases) {
+        const std::string& input = entry.first;
+        const Expectation& want = entry.second;
+
+        re2::StringPiece capture;
+        const bool found =
+                time_zone_pattern.Match(input, 0, input.size(), RE2::UNANCHORED, &capture, 1);
+
+        EXPECT_EQ(found, want.second) << "Special case test failed for: " << input
+                                      << " (expected match: " << want.second << ")";
+
+        // The capture is only meaningful when a match was both expected and found.
+        if (!found || !want.second) {
+            continue;
+        }
+
+        const std::string got = capture.as_string();
+        EXPECT_EQ(got, want.first)
+                << "Incorrect timezone captured from: " << input
+                << " (expected: " << want.first << ", got: " << got << ")";
+    }
+}
+
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to