This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 9ecc5aeedc8 [Fix](delete) Support delete when column name is Unicode 
(#39381)
9ecc5aeedc8 is described below

commit 9ecc5aeedc8805ea94da6e6d3619b78e78f3012f
Author: zclllhhjj <zhaochan...@selectdb.com>
AuthorDate: Mon Aug 19 12:43:17 2024 +0800

    [Fix](delete) Support delete when column name is Unicode (#39381)
    
    ## Proposed changes
    
    Issue Number: close #xxx
    
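    Before this change, a delete on Unicode column names failed because the BE could not parse the condition string: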
    ```sql
    mysql> delete from table_7298276 where  中文列名1 > '2023-08-17' and 中文列名2 > 
'-68' and 中文列名3 in ("77", "0", "-35", "-8", "93", "-87", "42", "24", "57", 
"74");
    ERROR 1105 (HY000): errCode = 2, detailMessage = delete job failed, 
errmsg:10019: [(10.16.10.8)[INVALID_ARGUMENT]failed to parse condition_str, 
condtion=TCondition {
      01: column_name (string) = 
"\xe4\xb8\xad\xe6\x96\x87\xe5\x88\x97\xe5\x90\x8d1",
      02: condition_op (string) = ">",
      03: condition_values (list) = list<string>[1] {
        [0] = "2023-08-17",
      },
      04: column_unique_id (i32) = 0,
      05: marked_by_runtime_filter (bool) = false,
      1000: compound_type (i32) = 0,
    }]
    ```
    
    After this change, a delete on Unicode column names succeeds:
    ```sql
    mysql> delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 
and 中文列名3 in (1,2,3);
    Query OK, 0 rows affected (0.14 sec)
    ```
---
 be/src/olap/delete_handler.cpp                     | 49 ++++++++++------------
 be/test/olap/delete_handler_test.cpp               |  4 ++
 .../data/delete_p0/test_delete_unicode.out         |  6 +++
 .../suites/delete_p0/test_delete_unicode.groovy    | 39 +++++++++++++++++
 4 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp
index 66859a069cc..10c6f50b300 100644
--- a/be/src/olap/delete_handler.cpp
+++ b/be/src/olap/delete_handler.cpp
@@ -21,8 +21,6 @@
 #include <gen_cpp/olap_file.pb.h>
 #include <thrift/protocol/TDebugProtocol.h>
 
-#include <boost/regex.hpp>
-#include <sstream>
 #include <string>
 #include <vector>
 
@@ -40,12 +38,10 @@
 using apache::thrift::ThriftDebugString;
 using std::vector;
 using std::string;
-using std::stringstream;
 
 using ::google::protobuf::RepeatedPtrField;
 
 namespace doris {
-using namespace ErrorCode;
 
 // construct sub condition from TCondition
 std::string construct_sub_predicate(const TCondition& condition) {
@@ -314,38 +310,35 @@ Status DeleteHandler::parse_condition(const 
DeleteSubPredicatePB& sub_cond, TCon
 // value: matches "1597751948193618247  and length(source)<1;\n;\n"
 //
 // For more info, see DeleteHandler::construct_sub_predicates
-// FIXME(gavin): support unicode. And this is a tricky implementation, it 
should
-//               not be the final resolution, refactor it.
+// FIXME(gavin): This is a tricky implementation, it should not be the final 
resolution, refactor it.
 const char* const CONDITION_STR_PATTERN =
-    // .----------------- column-name ----------------.   
.----------------------- operator ------------------------.   .------------ 
value ----------.
-    
R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:
 IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))";
-    // '----------------- group 1 --------------------'   
'--------------------- group 2 ---------------------------'   | '-- group 4--'  
            |
-    //                                                         match any of: = 
!= >> << >= <= *= " IS "                 '----------- group 3 ---------'
-    //                                                                         
                                          match **ANY THING** without(4)
-    //                                                                         
                                          or with(3) single quote
-boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN);
+    // .----------------- column-name --------------------------.   
.----------------------- operator ------------------------.   .------------ 
value ----------.
+    
R"(([_a-zA-Z@0-9\s/\p{L}][.a-zA-Z0-9_+-/?@#$%^&*"\s,:\p{L}]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:
 IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))";
+    // '----------------- group 1 ------------------------------'   
'--------------------- group 2 ---------------------------'   | '-- group 4--'  
            |
+    //                                                                   match 
any of: = != >> << >= <= *= " IS "                 '----------- group 3 
---------'
+    //                                                                         
                                                    match **ANY THING** 
without(4)
+    //                                                                         
                                                    or with(3) single quote
 // clang-format on
+RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN);
 
 Status DeleteHandler::parse_condition(const std::string& condition_str, 
TCondition* condition) {
-    bool matched = false;
-    boost::smatch what;
-    try {
-        VLOG_NOTICE << "condition_str: " << condition_str;
-        matched = boost::regex_match(condition_str, what, 
DELETE_HANDLER_REGEX) &&
-                  condition_str.size() == what[0].str().size(); // exact match
-    } catch (boost::regex_error& e) {
-        VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; 
error=" << e.what()
-                    << "]";
-    }
+    std::string col_name, op, value, g4;
+
+    bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, 
&col_name, &op, &value,
+                                  &g4); // exact match
+
     if (!matched) {
-        return Status::Error<ErrorCode::INVALID_ARGUMENT>("fail to sub 
condition. condition={}",
-                                                          condition_str);
+        return Status::InvalidArgument("fail to sub condition. condition={}", 
condition_str);
     }
 
-    condition->column_name = what[1].str();
-    condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str();
+    condition->column_name = col_name;
+    condition->condition_op = op == " IS " ? "IS" : op;
     // match string with single quotes, a = b  or a = 'b'
-    condition->condition_values.push_back(what[3 + !!what[4].matched].str());
+    if (!g4.empty()) {
+        condition->condition_values.push_back(g4);
+    } else {
+        condition->condition_values.push_back(value);
+    }
     VLOG_NOTICE << "parsed condition_str: col_name={" << 
condition->column_name << "} op={"
                 << condition->condition_op << "} val={" << 
condition->condition_values.back()
                 << "}";
diff --git a/be/test/olap/delete_handler_test.cpp 
b/be/test/olap/delete_handler_test.cpp
index 335c163930d..0d45d28c284 100644
--- a/be/test/olap/delete_handler_test.cpp
+++ b/be/test/olap/delete_handler_test.cpp
@@ -1225,6 +1225,10 @@ TEST_F(TestDeleteHandler, TestParseDeleteCondition) {
         {R"(a IS b IS NOT NULL)", true,  gen_cond(R"(a IS b)", "IS", R"(NOT 
NULL)"  )}, // test " IS " in column name
         {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, 
gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // 
hellbound column name
         {R"(this is a col very 
loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon
 colum name=long)", true,  gen_cond(R"(this is a col very 
loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon
 colum name)", "=", R"(long)")}, // test " IS " in column name
+        {R"(中文列名1=b)"        , true,  gen_cond(R"(中文列名1)", "=" , R"(b)"        
)}, // Chinese case
+        {R"(错!!误!=b)"         , false,  gen_cond(R"(abc)"   , "!=", R"(b)"     
    )}, // illegal character
+        {R"(##错误<=b)"         , false,  gen_cond(R"(abc)"   , "<=", R"(b)"     
    )}, // illegal prefix
+        {R"(κάνεις지내세요>>b)"   , true,  gen_cond(R"(κάνεις지내세요)", ">>", R"(b)"  
  )}, // other languages
     };
     for (auto& i : test_input) { test(i); }
 }
diff --git a/regression-test/data/delete_p0/test_delete_unicode.out 
b/regression-test/data/delete_p0/test_delete_unicode.out
new file mode 100644
index 00000000000..c0cb04a2a1d
--- /dev/null
+++ b/regression-test/data/delete_p0/test_delete_unicode.out
@@ -0,0 +1,6 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql1 --
+2020-12-12     1       1       1
+
+-- !sql2 --
+
diff --git a/regression-test/suites/delete_p0/test_delete_unicode.groovy 
b/regression-test/suites/delete_p0/test_delete_unicode.groovy
new file mode 100644
index 00000000000..9dd5f589a07
--- /dev/null
+++ b/regression-test/suites/delete_p0/test_delete_unicode.groovy
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_delete_unicode") {
+    sql "set enable_unicode_name_support=true;"
+
+    sql """
+        CREATE TABLE `table_7298276` (
+        `中文列名1` date NOT NULL,
+        `中文列名2` int NOT NULL,
+        `中文列名3` bigint NOT NULL,
+        `中文列名4` largeint NOT NULL,
+        INDEX 中文列名2 (`中文列名2`) USING INVERTED,
+        INDEX 中文列名4 (`中文列名4`) USING INVERTED
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`中文列名1`, `中文列名2`, `中文列名3`)
+        DISTRIBUTED BY HASH(`中文列名1`, `中文列名2`, `中文列名3`) BUCKETS 4
+        properties("replication_num" = "1");
+    """
+
+    sql """ insert into table_7298276 values ('2020-12-12',1,1,1);"""
+    qt_sql1 "select * from table_7298276;"
+    sql "delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 
and 中文列名3 in (1,2,3);"
+    qt_sql2 "select * from table_7298276;"
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to