This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 9c270e5cdf3 [fix](delete) Fix unrecognized column name delete handler (#32429) (#35742) 9c270e5cdf3 is described below commit 9c270e5cdf361b8497d568e38b3c736c8da52744 Author: Gavin Chou <gavineaglec...@gmail.com> AuthorDate: Fri May 31 20:41:22 2024 +0800 [fix](delete) Fix unrecognized column name delete handler (#32429) (#35742) pick doris-master #32429 --- be/src/olap/delete_handler.cpp | 73 +++++++++++-------- be/src/olap/delete_handler.h | 10 ++- be/test/olap/delete_handler_test.cpp | 40 +++++++++++ .../java/org/apache/doris/common/FeNameFormat.java | 4 +- .../suites/delete_p0/test_delete_handler.groovy | 84 ++++++++++++++++++++++ 5 files changed, 179 insertions(+), 32 deletions(-) diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index d2126bbdf3e..ca28d07254e 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -80,7 +80,15 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, << "condition size=" << in_pred->values().size(); } else { // write sub predicate v1 for compactbility - del_pred->add_sub_predicates(construct_sub_predicate(condition)); + std::string condition_str = construct_sub_predicate(condition); + if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { + LOG(WARNING) << "failed to parse condition_str, condtion=" + << ThriftDebugString(condition); + return Status::Error<DELETE_INVALID_CONDITION>( + "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); + } + VLOG_NOTICE << __PRETTY_FUNCTION__ << " condition_str: " << condition_str; + del_pred->add_sub_predicates(condition_str); DeleteSubPredicatePB* sub_predicate = del_pred->add_sub_predicates_v2(); if (condition.__isset.column_unique_id) { sub_predicate->set_column_unique_id(condition.column_unique_id); @@ -127,8 +135,9 @@ std::string DeleteHandler::construct_sub_predicate(const TCondition& condition) } string condition_str; if ("IS" == op) { - condition_str = condition.column_name + " " + op + " " + condition.condition_values[0]; - } else { + // ATTN: tricky! Surround IS with spaces to make it "special" + condition_str = condition.column_name + " IS " + condition.condition_values[0]; + } else { // multi-elements IN expr has been processed with InPredicatePB if (op == "*=") { op = "="; } else if (op == "!*=") { @@ -286,44 +295,50 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon return Status::OK(); } +// clang-format off +// Condition string format, the format is (column_name)(op)(value) +// eg: condition_str="c1 = 1597751948193618247 and length(source)<1;\n;\n" +// column_name: matches "c1", must include FeNameFormat.java COLUMN_NAME_REGEX +// and compactible with any the lagacy +// operator: matches "=" +// value: matches "1597751948193618247 and length(source)<1;\n;\n" +// +// For more info, see DeleteHandler::construct_sub_predicates +// FIXME(gavin): support unicode. And this is a tricky implementation, it should +// not be the final resolution, refactor it. +const char* const CONDITION_STR_PATTERN = + // .----------------- column-name ----------------. .----------------------- operator ------------------------. .------------ value ----------. + R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; + // '----------------- group 1 --------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | + // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' + // match **ANY THING** without(4) + // or with(3) single quote +boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); +// clang-format on + Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { - bool matched = true; + bool matched = false; boost::smatch what; - try { - // Condition string format, the format is (column_name)(op)(value) - // eg: condition_str="c1 = 1597751948193618247 and length(source)<1;\n;\n" - // group1: (\w+) matches "c1" - // group2: ((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:IS)) matches "=" - // group3: ((?:[\s\S]+)?) matches "1597751948193618247 and length(source)<1;\n;\n" - const char* const CONDITION_STR_PATTERN = - R"(([\w$#%]+)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:IS))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; - boost::regex ex(CONDITION_STR_PATTERN); - if (boost::regex_match(condition_str, what, ex)) { - if (condition_str.size() != what[0].str().size()) { - matched = false; - } - } else { - matched = false; - } + VLOG_NOTICE << "condition_str: " << condition_str; + matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) && + condition_str.size() == what[0].str().size(); // exact match } catch (boost::regex_error& e) { VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what() << "]"; - matched = false; } - if (!matched) { return Status::Error<DELETE_INVALID_PARAMETERS>("fail to sub condition. condition={}", condition_str); } - condition->column_name = what[1].str(); - condition->condition_op = what[2].str(); - if (what[4].matched) { // match string with single quotes, eg. a = 'b' - condition->condition_values.push_back(what[4].str()); - } else { // match string without quote, compat with old conditions, eg. a = b - condition->condition_values.push_back(what[3].str()); - } + condition->column_name = what[1].str(); + condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str(); + // match string with single quotes, a = b or a = 'b' + condition->condition_values.push_back(what[3 + !!what[4].matched].str()); + VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" + << condition->condition_op << "} val={" << condition->condition_values.back() + << "}"; return Status::OK(); } diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index 45be5d73ffe..bce18669c58 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -66,6 +66,15 @@ public: static void convert_to_sub_pred_v2(DeletePredicatePB* delete_pred, TabletSchemaSPtr schema); + /** + * Use regular expression to extract 'column_name', 'op' and 'operands' + * + * @param condition_str input predicate string in form of `X OP Y` + * @param condition output param + * @return OK if matched and extracted correctly otherwise DELETE_INVALID_PARAMETERS + */ + static Status parse_condition(const std::string& condition_str, TCondition* condition); + private: // Validate the condition on the schema. static Status check_condition_valid(const TabletSchema& tablet_schema, const TCondition& cond); @@ -87,7 +96,6 @@ private: // extract 'column_name', 'op' and 'operands' to condition static Status parse_condition(const DeleteSubPredicatePB& sub_cond, TCondition* condition); - static Status parse_condition(const std::string& condition_str, TCondition* condition); public: DeleteHandler() = default; diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index a7d39ad81b7..f36aeac84cd 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -1192,4 +1192,44 @@ TEST_F(TestDeleteHandler, FilterDataVersion) { EXPECT_EQ(Status::OK(), res); } +// clang-format off +TEST_F(TestDeleteHandler, TestParseDeleteCondition) { + auto test = [](const std::tuple<std::string, bool, TCondition>& in) { + auto& [cond_str, exp_succ, exp_cond] = in; + TCondition parsed_cond; + EXPECT_EQ(DeleteHandler::parse_condition(cond_str, &parsed_cond), exp_succ) << " unexpected result, cond_str: " << cond_str; + if (exp_succ) EXPECT_EQ(parsed_cond, exp_cond) << " unexpected result, cond_str: " << cond_str; + }; + + auto gen_cond = [](const std::string& col, const std::string& op, const std::string& val) { + TCondition cond; + cond.__set_column_name(col); + cond.__set_condition_op(op); + cond.__set_condition_values(std::vector<std::string>{val}); + return cond; + }; + + // <cond_str, parsed, expect_value>> + std::vector<std::tuple<std::string, bool, TCondition>> test_input { + {R"(abc=b)" , true, gen_cond(R"(abc)" , "=" , R"(b)" )}, // normal case + {R"(abc!=b)" , true, gen_cond(R"(abc)" , "!=", R"(b)" )}, // normal case + {R"(abc<=b)" , true, gen_cond(R"(abc)" , "<=", R"(b)" )}, // normal case + {R"(abc>=b)" , true, gen_cond(R"(abc)" , ">=", R"(b)" )}, // normal case + {R"(abc>>b)" , true, gen_cond(R"(abc)" , ">>", R"(b)" )}, // normal case + {R"(abc<<b)" , true, gen_cond(R"(abc)" , "<<", R"(b)" )}, // normal case + {R"(abc!='b')" , true, gen_cond(R"(abc)" , "!=", R"(b)" )}, // value surrounded by ' + {R"(abc=)" , true, gen_cond(R"(abc)" , "=" , R"()" )}, // missing value, it means not to be parsed succefully, how every it's a ignorable bug + {R"(@a*<<10086)" , true, gen_cond(R"(@a*)" , "<<", R"(10086)" )}, // column ends with * + {R"(*a=b)" , false, gen_cond(R"(*a)" , "=" , R"(b)" )}, // starts with * + {R"(a*a>>WTF(10086))" , true, gen_cond(R"(a*a)" , ">>", R"(WTF(10086))")}, // function + {R"(a-b IS NULL)" , true, gen_cond(R"(a-b)" , "IS", R"(NULL)" )}, // - in col name and test IS NULL + {R"(@a*-b IS NOT NULL)" , true, gen_cond(R"(@a*-b)" , "IS", R"(NOT NULL)" )}, // test IS NOT NULL + {R"(a IS b IS NOT NULL)", true, gen_cond(R"(a IS b)", "IS", R"(NOT NULL)" )}, // test " IS " in column name + {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // hellbound column name + {R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name=long)", true, gen_cond(R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name)", "=", R"(long)")}, // test " IS " in column name + }; + for (auto& i : test_input) { test(i); } +} +// clang-format on + } // namespace doris diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java b/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java index 53d2f3ca8b6..922edd99d2f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java @@ -33,7 +33,7 @@ public class FeNameFormat { private static final String UNDERSCORE_COMMON_NAME_REGEX = "^[_a-zA-Z][a-zA-Z0-9-_]{0,63}$"; private static final String TABLE_NAME_REGEX = "^[a-zA-Z][a-zA-Z0-9-_]*$"; private static final String USER_NAME_REGEX = "^[a-zA-Z][a-zA-Z0-9.-_]*$"; - private static final String COLUMN_NAME_REGEX = "^[_a-zA-Z@0-9\\s<>/][.a-zA-Z0-9_+-/><?@#$%^&*\"\\s,:]{0,255}$"; + private static final String COLUMN_NAME_REGEX = "^[_a-zA-Z@0-9\\s/][.a-zA-Z0-9_+-/?@#$%^&*\"\\s,:]{0,255}$"; private static final String UNICODE_LABEL_REGEX = "^[-_A-Za-z0-9:\\p{L}]{1,128}$"; private static final String UNICODE_COMMON_NAME_REGEX = "^[a-zA-Z\\p{L}][a-zA-Z0-9-_\\p{L}]{0,63}$"; @@ -41,7 +41,7 @@ public class FeNameFormat { private static final String UNICODE_TABLE_NAME_REGEX = "^[a-zA-Z\\p{L}][a-zA-Z0-9-_\\p{L}]*$"; private static final String UNICODE_USER_NAME_REGEX = "^[a-zA-Z\\p{L}][a-zA-Z0-9.-_\\p{L}]*$"; private static final String UNICODE_COLUMN_NAME_REGEX - = "^[_a-zA-Z@0-9\\p{L}][.a-zA-Z0-9_+-/><?@#$%^&*\\p{L}]{0,255}$"; + = "^[_a-zA-Z@0-9\\p{L}][.a-zA-Z0-9_+-/?@#$%^&*\\p{L}]{0,255}$"; public static final String FORBIDDEN_PARTITION_NAME = "placeholder_"; diff --git a/regression-test/suites/delete_p0/test_delete_handler.groovy b/regression-test/suites/delete_p0/test_delete_handler.groovy new file mode 100644 index 00000000000..8c9ac33edc9 --- /dev/null +++ b/regression-test/suites/delete_p0/test_delete_handler.groovy @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Test delete with strange name + */ +suite("test_delete_handler") { + // test condition operator + sql """drop table if exists td1;""" + sql """ + CREATE TABLE `td1` ( + `id` int(11) NULL, + `name` varchar(255) NULL, + `score` int(11) NULL + ) ENGINE=OLAP + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_num" = "1" ); + """ + + sql """insert into td1(id,name,`score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """insert into td1(id,name,`score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """insert into td1(id,name,`score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + + sql """delete from td1 where `score` = 4 and `score` = 3 and `score` = 1;""" + sql """delete from td1 where `score` is null;""" + sql """delete from td1 where `score` is not null;""" + sql """delete from td1 where `score` in (1,2);""" + sql """delete from td1 where `score` not in (3,4);""" + sql """select * from td1;""" + sql """insert into td1(id,name,`score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """select * from td1;""" + + + // test column name + sql """drop table if exists td2;""" + sql """ + CREATE TABLE `td2` ( + `id` int(11) NULL, + `name` varchar(255) NULL, + `@score` int(11) NULL, + `scoreIS` int(11) NULL, + `sc ore` int(11) NULL, + `score IS score` int(11) NULL, + `_a-zA-Z0-9_.+-/?@#\$%^&*" ,:` int(11) NULL + ) ENGINE=OLAP + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_num" = "1" ); + """ + + sql """insert into td2(id,name,`@score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """insert into td2(id,name,`@score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """insert into td2(id,name,`@score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + + sql """select * from td2;""" + sql """delete from td2 where `@score` = 88;""" + sql """delete from td2 where `scoreIS` is null;""" + sql """delete from td2 where `score IS score` is null;""" + sql """delete from td2 where `sc ore` is null;""" + sql """delete from td2 where `_a-zA-Z0-9_.+-/?@#\$%^&*" ,:` is null;""" + sql """delete from td2 where `_a-zA-Z0-9_.+-/?@#\$%^&*" ,:` is not null;""" + sql """delete from td2 where `_a-zA-Z0-9_.+-/?@#\$%^&*" ,:` in (1,2,3);""" + sql """delete from td2 where `_a-zA-Z0-9_.+-/?@#\$%^&*" ,:` not in (1,2,3);""" + sql """select * from td2;""" + sql """insert into td2(id,name,`@score`) values(1,"a",1),(2,"a",2),(3,"a",3),(4,"b",4);""" + sql """select * from td2;""" + +} + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org