This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 9ecc5aeedc8 [Fix](delete) Support delete when column name is Unicode (#39381) 9ecc5aeedc8 is described below commit 9ecc5aeedc8805ea94da6e6d3619b78e78f3012f Author: zclllhhjj <zhaochan...@selectdb.com> AuthorDate: Mon Aug 19 12:43:17 2024 +0800 [Fix](delete) Support delete when column name is Unicode (#39381) ## Proposed changes Issue Number: close #xxx before: ```sql mysql> delete from table_7298276 where 中文列名1 > '2023-08-17' and 中文列名2 > '-68' and 中文列名3 in ("77", "0", "-35", "-8", "93", "-87", "42", "24", "57", "74"); ERROR 1105 (HY000): errCode = 2, detailMessage = delete job failed, errmsg:10019: [(10.16.10.8)[INVALID_ARGUMENT]failed to parse condition_str, condtion=TCondition { 01: column_name (string) = "\xe4\xb8\xad\xe6\x96\x87\xe5\x88\x97\xe5\x90\x8d1", 02: condition_op (string) = ">", 03: condition_values (list) = list<string>[1] { [0] = "2023-08-17", }, 04: column_unique_id (i32) = 0, 05: marked_by_runtime_filter (bool) = false, 1000: compound_type (i32) = 0, }] ``` now: ```sql mysql> delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 and 中文列名3 in (1,2,3); Query OK, 0 rows affected (0.14 sec) ``` --- be/src/olap/delete_handler.cpp | 49 ++++++++++------------ be/test/olap/delete_handler_test.cpp | 4 ++ .../data/delete_p0/test_delete_unicode.out | 6 +++ .../suites/delete_p0/test_delete_unicode.groovy | 39 +++++++++++++++++ 4 files changed, 70 insertions(+), 28 deletions(-) diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 66859a069cc..10c6f50b300 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -21,8 +21,6 @@ #include <gen_cpp/olap_file.pb.h> #include <thrift/protocol/TDebugProtocol.h> -#include <boost/regex.hpp> -#include <sstream> #include <string> #include <vector> @@ -40,12 +38,10 @@ using apache::thrift::ThriftDebugString; using std::vector; using std::string; -using std::stringstream; using 
::google::protobuf::RepeatedPtrField; namespace doris { -using namespace ErrorCode; // construct sub condition from TCondition std::string construct_sub_predicate(const TCondition& condition) { @@ -314,38 +310,35 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon // value: matches "1597751948193618247 and length(source)<1;\n;\n" // // For more info, see DeleteHandler::construct_sub_predicates -// FIXME(gavin): support unicode. And this is a tricky implementation, it should -// not be the final resolution, refactor it. +// FIXME(gavin): This is a tricky implementation, it should not be the final resolution, refactor it. const char* const CONDITION_STR_PATTERN = - // .----------------- column-name ----------------. .----------------------- operator ------------------------. .------------ value ----------. - R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; - // '----------------- group 1 --------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | - // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' - // match **ANY THING** without(4) - // or with(3) single quote -boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); + // .----------------- column-name --------------------------. .----------------------- operator ------------------------. .------------ value ----------. 
+ R"(([_a-zA-Z@0-9\s/\p{L}][.a-zA-Z0-9_+-/?@#$%^&*"\s,:\p{L}]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; + // '----------------- group 1 ------------------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | + // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' + // match **ANY THING** without(4) + // or with(3) single quote // clang-format on +RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { - bool matched = false; - boost::smatch what; - try { - VLOG_NOTICE << "condition_str: " << condition_str; - matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) && - condition_str.size() == what[0].str().size(); // exact match - } catch (boost::regex_error& e) { - VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what() - << "]"; - } + std::string col_name, op, value, g4; + + bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, &col_name, &op, &value, + &g4); // exact match + if (!matched) { - return Status::Error<ErrorCode::INVALID_ARGUMENT>("fail to sub condition. condition={}", - condition_str); + return Status::InvalidArgument("fail to sub condition. condition={}", condition_str); } - condition->column_name = what[1].str(); - condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str(); + condition->column_name = col_name; + condition->condition_op = op == " IS " ? 
"IS" : op; // match string with single quotes, a = b or a = 'b' - condition->condition_values.push_back(what[3 + !!what[4].matched].str()); + if (!g4.empty()) { + condition->condition_values.push_back(g4); + } else { + condition->condition_values.push_back(value); + } VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" << condition->condition_op << "} val={" << condition->condition_values.back() << "}"; diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index 335c163930d..0d45d28c284 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -1225,6 +1225,10 @@ TEST_F(TestDeleteHandler, TestParseDeleteCondition) { {R"(a IS b IS NOT NULL)", true, gen_cond(R"(a IS b)", "IS", R"(NOT NULL)" )}, // test " IS " in column name {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // hellbound column name {R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name=long)", true, gen_cond(R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name)", "=", R"(long)")}, // test " IS " in column name + {R"(中文列名1=b)" , true, gen_cond(R"(中文列名1)", "=" , R"(b)" )}, // Chinese case + {R"(错!!误!=b)" , false, gen_cond(R"(abc)" , "!=", R"(b)" )}, // illegal character + {R"(##错误<=b)" , false, gen_cond(R"(abc)" , "<=", R"(b)" )}, // illegal prefix + {R"(κάνεις지내세요>>b)" , true, gen_cond(R"(κάνεις지내세요)", ">>", R"(b)" )}, // other languages }; for (auto& i : test_input) { test(i); } } diff --git a/regression-test/data/delete_p0/test_delete_unicode.out b/regression-test/data/delete_p0/test_delete_unicode.out new file mode 100644 index 00000000000..c0cb04a2a1d --- /dev/null +++ b/regression-test/data/delete_p0/test_delete_unicode.out @@ -0,0 +1,6 @@ 
+-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql1 -- +2020-12-12 1 1 1 + +-- !sql2 -- + diff --git a/regression-test/suites/delete_p0/test_delete_unicode.groovy b/regression-test/suites/delete_p0/test_delete_unicode.groovy new file mode 100644 index 00000000000..9dd5f589a07 --- /dev/null +++ b/regression-test/suites/delete_p0/test_delete_unicode.groovy @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_delete_unicode") { + sql "set enable_unicode_name_support=true;" + + sql """ + CREATE TABLE `table_7298276` ( + `中文列名1` date NOT NULL, + `中文列名2` int NOT NULL, + `中文列名3` bigint NOT NULL, + `中文列名4` largeint NOT NULL, + INDEX 中文列名2 (`中文列名2`) USING INVERTED, + INDEX 中文列名4 (`中文列名4`) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(`中文列名1`, `中文列名2`, `中文列名3`) + DISTRIBUTED BY HASH(`中文列名1`, `中文列名2`, `中文列名3`) BUCKETS 4 + properties("replication_num" = "1"); + """ + + sql """ insert into table_7298276 values ('2020-12-12',1,1,1);""" + qt_sql1 "select * from table_7298276;" + sql "delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 and 中文列名3 in (1,2,3);" + qt_sql2 "select * from table_7298276;" +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org