This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 1c18f2c34f1 branch-4.0: [Enhancement](regexp) Support zero-width assertions in some regexp functions #57643 (#57948)
1c18f2c34f1 is described below
commit 1c18f2c34f1a5dfe05df9d011be2bdc36dc6954a
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Nov 13 10:27:04 2025 +0800
branch-4.0: [Enhancement](regexp) Support zero-width assertions in some regexp functions #57643 (#57948)
Cherry-picked from #57643
Co-authored-by: linrrarity <[email protected]>
---
be/src/runtime/runtime_state.h | 6 +
be/src/vec/functions/function_regexp.cpp | 216 ++++++++++++++++-----
be/src/vec/functions/like.cpp | 26 ++-
be/src/vec/functions/like.h | 4 +
.../java/org/apache/doris/qe/SessionVariable.java | 8 +
gensrc/thrift/PaloInternalService.thrift | 2 +
.../test_string_function_regexp.out | 56 +++++-
.../test_string_function_regexp.groovy | 52 ++++-
8 files changed, 317 insertions(+), 53 deletions(-)
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index 3d89f4aa0d4..24993123bf3 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -157,6 +157,12 @@ public:
: 1;
}
+ // Support extended regex
+ // like look-around zero-width assertions(`?=`, `?!`, `?<=`, `?<!`)
+ bool enable_extended_regex() const {
+ return _query_options.__isset.enable_extended_regex &&
_query_options.enable_extended_regex;
+ }
+
TQueryType::type query_type() const { return _query_options.query_type; }
int64_t timestamp_ms() const { return _timestamp_ms; }
int32_t nano_seconds() const { return _nano_seconds; }
diff --git a/be/src/vec/functions/function_regexp.cpp b/be/src/vec/functions/function_regexp.cpp
index 1007487e4ce..d24a0538c04 100644
--- a/be/src/vec/functions/function_regexp.cpp
+++ b/be/src/vec/functions/function_regexp.cpp
@@ -20,6 +20,7 @@
#include <re2/stringpiece.h>
#include <stddef.h>
+#include <boost/regex.hpp>
#include <memory>
#include <string>
#include <string_view>
@@ -51,6 +52,137 @@
namespace doris::vectorized {
#include "common/compile_check_begin.h"
+
+// Helper structure to hold either RE2 or Boost.Regex
+struct RegexpExtractEngine {
+ std::unique_ptr<re2::RE2> re2_regex;
+ std::unique_ptr<boost::regex> boost_regex;
+
+ bool is_boost() const { return boost_regex != nullptr; }
+ bool is_re2() const { return re2_regex != nullptr; }
+
+ // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails
+ static bool compile(const StringRef& pattern, std::string* error_str,
+ RegexpExtractEngine& engine, bool
enable_extended_regex) {
+ engine.re2_regex =
std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size));
+ if (engine.re2_regex->ok()) {
+ return true;
+ } else if (!enable_extended_regex) {
+ *error_str = fmt::format(
+ "Invalid regex pattern: {}. Error: {}. If you need
advanced regex features, "
+ "try setting enable_extended_regex=true",
+ std::string(pattern.data, pattern.size),
engine.re2_regex->error());
+ return false;
+ }
+
+ // RE2 failed, try Boost.Regex for advanced features like zero-width
assertions
+ engine.re2_regex.reset();
+ try {
+ boost::regex::flag_type flags = boost::regex::normal;
+ engine.boost_regex = std::make_unique<boost::regex>(pattern.data,
+ pattern.data +
pattern.size, flags);
+ return true;
+ } catch (const boost::regex_error& e) {
+ if (error_str) {
+ *error_str = fmt::format("Invalid regex pattern: {}. Error:
{}",
+ std::string(pattern.data,
pattern.size), e.what());
+ }
+ return false;
+ }
+ }
+
+ // Get number of capturing groups
+ int number_of_capturing_groups() const {
+ if (is_re2()) {
+ return re2_regex->NumberOfCapturingGroups();
+ } else if (is_boost()) {
+ return static_cast<int>(boost_regex->mark_count());
+ }
+ return 0;
+ }
+
+ // Match function for extraction
+ bool match_and_extract(const char* data, size_t size, int index,
std::string& result) const {
+ if (is_re2()) {
+ int max_matches = 1 + re2_regex->NumberOfCapturingGroups();
+ if (index >= max_matches) {
+ return false;
+ }
+ std::vector<re2::StringPiece> matches(max_matches);
+ bool success = re2_regex->Match(re2::StringPiece(data, size), 0,
size,
+ re2::RE2::UNANCHORED,
matches.data(), max_matches);
+ if (success && index < matches.size()) {
+ const re2::StringPiece& match = matches[index];
+ result.assign(match.data(), match.size());
+ return true;
+ }
+ return false;
+ } else if (is_boost()) {
+ boost::cmatch matches;
+ bool success = boost::regex_search(data, data + size, matches,
*boost_regex);
+ if (success && index < matches.size()) {
+ result = matches[index].str();
+ return true;
+ }
+ return false;
+ }
+ return false;
+ }
+
+ // Match all occurrences and extract the first capturing group
+ void match_all_and_extract(const char* data, size_t size,
+ std::vector<std::string>& results) const {
+ if (is_re2()) {
+ int max_matches = 1 + re2_regex->NumberOfCapturingGroups();
+ if (max_matches < 2) {
+ return; // No capturing groups
+ }
+
+ size_t pos = 0;
+ while (pos < size) {
+ const char* str_pos = data + pos;
+ size_t str_size = size - pos;
+ std::vector<re2::StringPiece> matches(max_matches);
+ bool success = re2_regex->Match(re2::StringPiece(str_pos,
str_size), 0, str_size,
+ re2::RE2::UNANCHORED,
matches.data(), max_matches);
+ if (!success) {
+ break;
+ }
+ if (matches[0].empty()) {
+ pos += 1;
+ continue;
+ }
+ // Extract first capturing group
+ if (matches.size() > 1 && !matches[1].empty()) {
+ results.emplace_back(matches[1].data(), matches[1].size());
+ }
+ // Move position forward
+ auto offset = std::string(str_pos, str_size)
+ .find(std::string(matches[0].data(),
matches[0].size()));
+ pos += offset + matches[0].size();
+ }
+ } else if (is_boost()) {
+ const char* search_start = data;
+ const char* search_end = data + size;
+ boost::match_results<const char*> matches;
+
+ while (boost::regex_search(search_start, search_end, matches,
*boost_regex)) {
+ if (matches.size() > 1 && matches[1].matched) {
+ results.emplace_back(matches[1].str());
+ }
+ if (matches[0].length() == 0) {
+ if (search_start == search_end) {
+ break;
+ }
+ search_start += 1;
+ } else {
+ search_start = matches[0].second;
+ }
+ }
+ }
+ }
+};
+
struct RegexpCountImpl {
static void execute_impl(FunctionContext* context, ColumnPtr
argument_columns[],
size_t input_rows_count, ColumnInt32::Container&
result_data) {
@@ -469,42 +601,45 @@ struct RegexpExtractImpl {
ColumnString::Chars& result_data,
ColumnString::Offsets& result_offset,
NullMap& null_map,
const size_t index_now) {
- re2::RE2* re = reinterpret_cast<re2::RE2*>(
+ auto* engine = reinterpret_cast<RegexpExtractEngine*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
- std::unique_ptr<re2::RE2> scoped_re;
- if (re == nullptr) {
+ std::unique_ptr<RegexpExtractEngine> scoped_engine;
+
+ if (engine == nullptr) {
std::string error_str;
const auto& pattern =
pattern_col->get_data_at(index_check_const(index_now, Const));
- bool st = StringFunctions::compile_regex(pattern, &error_str,
StringRef(), StringRef(),
- scoped_re);
+ scoped_engine = std::make_unique<RegexpExtractEngine>();
+ bool st = RegexpExtractEngine::compile(pattern, &error_str,
*scoped_engine,
+
context->state()->enable_extended_regex());
if (!st) {
context->add_warning(error_str.c_str());
StringOP::push_null_string(index_now, result_data,
result_offset, null_map);
return;
}
- re = scoped_re.get();
+ engine = scoped_engine.get();
}
+
const auto& str = str_col->get_data_at(index_now);
- re2::StringPiece str_sp = re2::StringPiece(str.data, str.size);
- int max_matches = 1 + re->NumberOfCapturingGroups();
+ int max_matches = 1 + engine->number_of_capturing_groups();
if (index_data >= max_matches) {
ReturnNull ? StringOP::push_null_string(index_now, result_data,
result_offset, null_map)
: StringOP::push_empty_string(index_now, result_data,
result_offset);
return;
}
- std::vector<re2::StringPiece> matches(max_matches);
- bool success =
- re->Match(str_sp, 0, str.size, re2::RE2::UNANCHORED,
&matches[0], max_matches);
+ std::string match_result;
+ bool success = engine->match_and_extract(str.data, str.size,
static_cast<int>(index_data),
+ match_result);
+
if (!success) {
ReturnNull ? StringOP::push_null_string(index_now, result_data,
result_offset, null_map)
: StringOP::push_empty_string(index_now, result_data,
result_offset);
return;
}
- const re2::StringPiece& match = matches[index_data];
- StringOP::push_value_string(std::string_view(match.data(),
match.size()), index_now,
- result_data, result_offset);
+
+ StringOP::push_value_string(std::string_view(match_result.data(),
match_result.size()),
+ index_now, result_data, result_offset);
}
};
@@ -548,49 +683,31 @@ struct RegexpExtractAllImpl {
ColumnString::Chars& result_data,
ColumnString::Offsets& result_offset,
NullMap& null_map,
const size_t index_now) {
- re2::RE2* re = reinterpret_cast<re2::RE2*>(
+ auto* engine = reinterpret_cast<RegexpExtractEngine*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
- std::unique_ptr<re2::RE2> scoped_re;
- if (re == nullptr) {
+ std::unique_ptr<RegexpExtractEngine> scoped_engine;
+
+ if (engine == nullptr) {
std::string error_str;
const auto& pattern =
pattern_col->get_data_at(index_check_const(index_now, Const));
- bool st = StringFunctions::compile_regex(pattern, &error_str,
StringRef(), StringRef(),
- scoped_re);
+ scoped_engine = std::make_unique<RegexpExtractEngine>();
+ bool st = RegexpExtractEngine::compile(pattern, &error_str,
*scoped_engine,
+
context->state()->enable_extended_regex());
if (!st) {
context->add_warning(error_str.c_str());
StringOP::push_null_string(index_now, result_data,
result_offset, null_map);
return;
}
- re = scoped_re.get();
+ engine = scoped_engine.get();
}
- if (re->NumberOfCapturingGroups() == 0) {
+
+ if (engine->number_of_capturing_groups() == 0) {
StringOP::push_empty_string(index_now, result_data, result_offset);
return;
}
const auto& str = str_col->get_data_at(index_now);
- int max_matches = 1 + re->NumberOfCapturingGroups();
- std::vector<re2::StringPiece> res_matches;
- size_t pos = 0;
- while (pos < str.size) {
- auto str_pos = str.data + pos;
- auto str_size = str.size - pos;
- re2::StringPiece str_sp = re2::StringPiece(str_pos, str_size);
- std::vector<re2::StringPiece> matches(max_matches);
- bool success =
- re->Match(str_sp, 0, str_size, re2::RE2::UNANCHORED,
&matches[0], max_matches);
- if (!success) {
- StringOP::push_empty_string(index_now, result_data,
result_offset);
- break;
- }
- if (matches[0].empty()) {
- StringOP::push_empty_string(index_now, result_data,
result_offset);
- pos += 1;
- continue;
- }
- res_matches.push_back(matches[1]);
- auto offset = std::string(str_pos,
str_size).find(std::string(matches[0].as_string()));
- pos += offset + matches[0].size();
- }
+ std::vector<std::string> res_matches;
+ engine->match_all_and_extract(str.data, str.size, res_matches);
if (res_matches.empty()) {
StringOP::push_empty_string(index_now, result_data, result_offset);
@@ -599,7 +716,7 @@ struct RegexpExtractAllImpl {
std::string res = "[";
for (int j = 0; j < res_matches.size(); ++j) {
- res += "'" + res_matches[j].as_string() + "'";
+ res += "'" + res_matches[j] + "'";
if (j < res_matches.size() - 1) {
res += ",";
}
@@ -641,15 +758,14 @@ public:
}
std::string error_str;
- std::unique_ptr<re2::RE2> scoped_re;
- bool st = StringFunctions::compile_regex(pattern, &error_str,
StringRef(),
- StringRef(),
scoped_re);
+ auto engine = std::make_shared<RegexpExtractEngine>();
+ bool st = RegexpExtractEngine::compile(pattern, &error_str,
*engine,
+
context->state()->enable_extended_regex());
if (!st) {
context->set_error(error_str.c_str());
return Status::InvalidArgument(error_str);
}
- std::shared_ptr<re2::RE2> re(scoped_re.release());
- context->set_function_state(scope, re);
+ context->set_function_state(scope, engine);
}
}
return Status::OK();
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 95703197990..b609bbd0382 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -392,6 +392,8 @@ Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state,
if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
return Status::RuntimeError(fmt::format("hyperscan error: {}",
ret));
}
+ } else if (state->boost_regex) { // use boost::regex for advanced features
+ *result = boost::regex_search(val.data, val.data + val.size,
*state->boost_regex);
} else { // fallback to re2
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size),
*state->regex);
}
@@ -429,6 +431,12 @@ Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const C
return Status::RuntimeError(fmt::format("hyperscan error: {}",
ret));
}
}
+ } else if (state->boost_regex) { // use boost::regex for advanced features
+ for (size_t i = 0; i < sz; i++) {
+ const auto& str_ref = val.get_data_at(i);
+ *(result.data() + i) = boost::regex_search(str_ref.data,
str_ref.data + str_ref.size,
+ *state->boost_regex);
+ }
} else { // fallback to re2
for (size_t i = 0; i < sz; i++) {
const auto& str_ref = val.get_data_at(i);
@@ -1009,7 +1017,23 @@ Status FunctionRegexpLike::open(FunctionContext* context,
opts.set_dot_nl(true);
state->search_state.regex = std::make_unique<RE2>(pattern_str,
opts);
if (!state->search_state.regex->ok()) {
- return Status::InternalError("Invalid regex expression:
{}", pattern_str);
+ if (!context->state()->enable_extended_regex()) {
+ return Status::InternalError(
+ "Invalid regex expression: {}. Error: {}. If
you need advanced "
+ "regex features, try setting
enable_extended_regex=true",
+ pattern_str,
state->search_state.regex->error());
+ }
+
+ // RE2 failed, fallback to Boost.Regex
+ // This handles advanced regex features like zero-width
assertions
+ state->search_state.regex.reset();
+ try {
+ state->search_state.boost_regex =
+ std::make_unique<boost::regex>(pattern_str);
+ } catch (const boost::regex_error& e) {
+ return Status::InternalError("Invalid regex
expression: {}. Error: {}",
+ pattern_str, e.what());
+ }
}
}
state->function = constant_regex_fn;
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index 1128e4f3f69..085bea5bcd2 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -25,6 +25,7 @@
#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
+#include <boost/regex.hpp>
#include <functional>
#include <memory>
#include <string>
@@ -100,6 +101,9 @@ struct LikeSearchState {
/// Used for RLIKE and REGEXP predicates if the pattern is a constant
argument.
std::unique_ptr<re2::RE2> regex;
+ /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g.,
zero-width assertions like `?=`, `?!`, `?<=`, `?<!`)
+ std::unique_ptr<boost::regex> boost_regex;
+
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter {
template <typename T>
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 0f875787fe7..00d9b965c9f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -726,6 +726,8 @@ public class SessionVariable implements Serializable, Writable {
public static final String SKIP_CHECKING_ACID_VERSION_FILE =
"skip_checking_acid_version_file";
+ public static final String ENABLE_EXTENDED_REGEX = "enable_extended_regex";
+
// NOTE: if you want to add some debug variables, please disable sql cache
in `CacheAnalyzer.commonCacheCondition`,
// and set affectQueryResult=true
public static final List<String> DEBUG_VARIABLES = ImmutableList.of(
@@ -3055,6 +3057,11 @@ public class SessionVariable implements Serializable, Writable {
)
public int defaultVariantMaxSparseColumnStatisticsSize = 10000;
+ @VariableMgr.VarAttr(name = ENABLE_EXTENDED_REGEX, needForward = true,
affectQueryResult = true,
+ description = {"是否启用扩展的正则表达式, 支持如 look-around 类的零宽断言",
+ "Enable extended regular expressions, support look-around
zero-width assertions"})
+ public boolean enableExtendedRegex = false;
+
@VariableMgr.VarAttr(
name = DEFAULT_VARIANT_SPARSE_HASH_SHARD_COUNT,
needForward = true,
@@ -4817,6 +4824,7 @@ public class SessionVariable implements Serializable, Writable {
tResult.setHnswCheckRelativeDistance(hnswCheckRelativeDistance);
tResult.setHnswBoundedQueue(hnswBoundedQueue);
tResult.setMergeReadSliceSize(mergeReadSliceSizeBytes);
+ tResult.setEnableExtendedRegex(enableExtendedRegex);
return tResult;
}
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift
index 1248c93ef35..bac2030f5fc 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -410,6 +410,8 @@ struct TQueryOptions {
175: optional bool enable_fuzzy_blockable_task = false;
+ 177: optional bool enable_extended_regex = false;
+
// For cloud, to control if the content would be written into file cache
// In write path, to control if the content would be written into file cache.
// In read path, read from file cache or remote storage when execute query.
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
index b0a07aad777..d7994943fd5 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
@@ -154,6 +154,15 @@ d
-- !sql --
+-- !regexp_extract_1 --
+123
+
+-- !regexp_extract_2 --
+EdgeCase1
+
+-- !regexp_extract_3 --
+AA-1
+
-- !sql --
b
@@ -163,6 +172,12 @@ d
-- !sql --
\N
+-- !regexp_extract_or_null_1 --
+123
+
+-- !regexp_extract_or_null_2 --
+B
+
-- !sql --
['18','17']
@@ -181,7 +196,7 @@ d
-- !sql --
['ab','c','c','c']
--- !sql_regexp_extract_all --
+-- !sql_regexp_extract_all_1 --
0
0
0
@@ -190,6 +205,18 @@ d
0
0
+-- !sql_regexp_extract_all_2 --
+['Apache/Doris']
+
+-- !sql_regexp_extract_all_3 --
+['123','456']
+
+-- !sql_regexp_extract_all_4 --
+['AA-1','BB-2','CC-3']
+
+-- !sql_regexp_extract_all_5 --
+['Case1','Case2','Case3']
+
-- !sql --
a-b-c
@@ -202,6 +229,33 @@ a-b c
-- !sql --
a <b> b
+-- !regexp_fn_1 --
+true
+
+-- !regexp_fn_2 --
+false
+
+-- !regexp_fn_3 --
+true
+
+-- !regexp_fn_4 --
+true
+
+-- !regexp_fn_5 --
+false
+
+-- !regexp_fn_6 --
+false
+
+-- !regexp_fn_7 --
+true
+
+-- !regexp_fn_8 --
+false
+
+-- !regexp_fn_9 --
+true
+
-- !sql_utf1 --
true
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
index 7b78d5865b6..7c9876d32d6 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
@@ -65,17 +65,50 @@ suite("test_string_function_regexp") {
qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)',
2);"
qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)',
3);"
+ sql "set enable_extended_regex = false;"
+ test {
+ sql 'SELECT regexp_extract(\'foo123bar456baz\',
\'(?<=foo)(\\\\d+)(?=bar)\', 1);'
+ exception "Invalid regex pattern"
+ }
+ sql "set enable_extended_regex = true;"
+ qt_regexp_extract_1 'SELECT regexp_extract(\'foo123bar456baz\',
\'(?<=foo)(\\\\d+)(?=bar)\', 1);'
+ qt_regexp_extract_2 'SELECT regexp_extract(\'EdgeCase1 EdgeCase2
EdgeCase3\', \'(EdgeCase\\\\d)(?= EdgeCase|$)\', 1);'
+ qt_regexp_extract_3 'SELECT regexp_extract(\'ID:AA-1,ID:BB-2,ID:CC-3\',
\'(?<=ID:)([A-Z]{2}-\\\\d)(?=,ID|$)\', 1);'
+ sql "set enable_extended_regex = false;"
+
qt_sql "SELECT regexp_extract_or_null('AbCdE',
'([[:lower:]]+)C([[:lower:]]+)', 1);"
qt_sql "SELECT regexp_extract_or_null('AbCdE',
'([[:lower:]]+)C([[:lower:]]+)', 2);"
qt_sql "SELECT regexp_extract_or_null('AbCdE',
'([[:lower:]]+)C([[:lower:]]+)', 3);"
+ sql "SET enable_extended_regex = false;"
+ test {
+ sql "SELECT regexp_extract_or_null('foo123bar',
'(?<=foo)(\\\\d+)(?=bar)', 1);"
+ exception "Invalid regex pattern"
+ }
+ sql "set enable_extended_regex = true;"
+ qt_regexp_extract_or_null_1 "SELECT regexp_extract_or_null('foo123bar',
'(?<=foo)(\\\\d+)(?=bar)', 1);"
+ qt_regexp_extract_or_null_2 "SELECT regexp_extract_or_null('TokenA TokenB
TokenC', '(?<=Token)([A-Z])(?= TokenC)', 1);"
+ sql "set enable_extended_regex = false;"
+
qt_sql "SELECT regexp_extract_all('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd',
'x=([0-9]+)([a-z]+)');"
qt_sql "SELECT regexp_extract_all('http://a.m.baidu.com/i41915i73660.htm',
'i([0-9]+)');"
qt_sql "SELECT regexp_extract_all('abc=111, def=222, ghi=333',
'(\"[^\"]+\"|\\\\w+)=(\"[^\"]+\"|\\\\w+)');"
qt_sql "select regexp_extract_all('xxfs','f');"
qt_sql "select regexp_extract_all('asdfg', '(z|x|c|)');"
qt_sql "select regexp_extract_all('abcdfesscca', '(ab|c|)');"
- qt_sql_regexp_extract_all "select regexp_extract_all('', '\"([^\"]+)\":'),
length(regexp_extract_all('', '\"([^\"]+)\":')) from
test_string_function_regexp;"
+
+ sql "set enable_extended_regex = false"
+ test {
+ sql 'SELECT REGEXP_EXTRACT_ALL(\'Apache/Doris\',
\'([a-zA-Z_+-]+(?:\\/[a-zA-Z_0-9+-]+)*)(?=s|$)\');'
+ exception "Invalid regex pattern"
+ }
+ sql "set enable_extended_regex = true;"
+ qt_sql_regexp_extract_all_1 "select regexp_extract_all('',
'\"([^\"]+)\":'), length(regexp_extract_all('', '\"([^\"]+)\":')) from
test_string_function_regexp;"
+ qt_sql_regexp_extract_all_2 'SELECT REGEXP_EXTRACT_ALL(\'Apache/Doris\',
\'([a-zA-Z_+-]+(?:\\/[a-zA-Z_0-9+-]+)*)(?=s|$)\');'
+ qt_sql_regexp_extract_all_3 'SELECT
REGEXP_EXTRACT_ALL(\'foo123bar456baz\', \'(\\\\d{3})(?=bar|baz)\');'
+ qt_sql_regexp_extract_all_4 'SELECT
REGEXP_EXTRACT_ALL(\'ID:AA-1,ID:BB-2,ID:CC-3\', \'(?<=ID:)([A-Z]{2}-\\\\d)\');'
+ qt_sql_regexp_extract_all_5 'SELECT
REGEXP_EXTRACT_ALL(\'EdgeCase1EdgeCase2EdgeCase3\',
\'(?<=Edge)(Case\\\\d)(?=Edge|$)\');'
+ sql "set enable_extended_regex = false;"
qt_sql "SELECT regexp_replace('a b c', \" \", \"-\");"
qt_sql "SELECT regexp_replace('a b c','(b)','<\\\\1>');"
@@ -83,6 +116,23 @@ suite("test_string_function_regexp") {
qt_sql "SELECT regexp_replace_one('a b c', \" \", \"-\");"
qt_sql "SELECT regexp_replace_one('a b b','(b)','<\\\\1>');"
+ sql "set enable_extended_regex = false"
+ test {
+ sql 'SELECT regexp(\'foobar\', \'(?<=foo)bar\');'
+ exception "Invalid regex expression"
+ }
+ sql "set enable_extended_regex = true;"
+ qt_regexp_fn_1 'SELECT regexp(\'abc123def\', \'abc[0-9]+\');'
+ qt_regexp_fn_2 'SELECT regexp(\'edge case test\', \'\\bcase\\b\');'
+ qt_regexp_fn_3 'SELECT regexp(\'foo123bar\', \'foo(?=123)\');'
+ qt_regexp_fn_4 'SELECT regexp(\'fooXYZbar\', \'foo(?!123)\');'
+ qt_regexp_fn_5 'SELECT regexp(\'123abc\', \'^\\d+\');'
+ qt_regexp_fn_6 'SELECT regexp(\'abc123\', \'^\\d+\');'
+ qt_regexp_fn_7 'SELECT regexp(\'foobar\', \'(?<=foo)bar\');'
+ qt_regexp_fn_8 'SELECT regexp(\'foobar\', \'(?<!foo)bar\');'
+ qt_regexp_fn_9 'SELECT regexp(\'Hello\', \'(?i)hello\');'
+ sql "set enable_extended_regex = false;"
+
qt_sql_utf1 """ select '皖12345' REGEXP '^[皖][0-9]{5}\$'; """
qt_sql_utf2 """ select '皖 12345' REGEXP '^[皖] [0-9]{5}\$'; """
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]