This is an automated email from the ASF dual-hosted git repository.
bcall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/master by this push:
new 8e1c1b9faf Change Regex class to use PCRE2 (#11014)
8e1c1b9faf is described below
commit 8e1c1b9fafb13df87ca431a1e855f64223b06ffb
Author: Bryan Call <[email protected]>
AuthorDate: Thu Feb 29 12:06:24 2024 -0800
Change Regex class to use PCRE2 (#11014)
---
CMakeLists.txt | 2 +-
include/proxy/http/remap/UrlRewrite.h | 2 +-
include/tsutil/Regex.h | 106 +++++---
plugins/experimental/tls_bridge/CMakeLists.txt | 2 +-
src/proxy/http/remap/UrlRewrite.cc | 14 +-
src/tsutil/CMakeLists.txt | 3 +-
src/tsutil/Regex.cc | 329 +++++++++++++++++--------
src/tsutil/unit_tests/test_Regex.cc | 138 ++++++++++-
8 files changed, 440 insertions(+), 156 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 63ec25723a..a276d827cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -253,7 +253,7 @@ if(LibLZMA_FOUND)
endif()
find_package(PCRE REQUIRED)
-find_package(PCRE2 COMPONENTS 8BIT)
+pkg_check_modules(PCRE2 REQUIRED IMPORTED_TARGET libpcre2-8)
include(CheckOpenSSLIsBoringSSL)
include(CheckOpenSSLIsQuictls)
diff --git a/include/proxy/http/remap/UrlRewrite.h
b/include/proxy/http/remap/UrlRewrite.h
index 86dcb50a07..797ad94c47 100644
--- a/include/proxy/http/remap/UrlRewrite.h
+++ b/include/proxy/http/remap/UrlRewrite.h
@@ -232,7 +232,7 @@ private:
int request_host_len);
bool _regexMappingLookup(RegexMappingList &regex_mappings, URL *request_url,
int request_port, const char *request_host,
int request_host_len, int rank_ceiling,
UrlMappingContainer &mapping_container);
- int _expandSubstitutions(int *matches_info, const RegexMapping *reg_map,
const char *matched_string, char *dest_buf,
+ int _expandSubstitutions(size_t *matches_info, const RegexMapping *reg_map,
const char *matched_string, char *dest_buf,
int dest_buf_size);
void _destroyTable(std::unique_ptr<URLTable> &h_table);
void _destroyList(RegexMappingList &regexes);
diff --git a/include/tsutil/Regex.h b/include/tsutil/Regex.h
index a1c51e3661..c4ca8feb03 100644
--- a/include/tsutil/Regex.h
+++ b/include/tsutil/Regex.h
@@ -28,24 +28,60 @@
#include <vector>
#include <memory>
-#include "swoc/MemSpan.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
-/// Match flags for regular expression evaluation.
+/// @brief Match flags for regular expression evaluation.
enum REFlags {
- RE_CASE_INSENSITIVE = 0x0001, ///< Ignore case (default: case sensitive).
- RE_UNANCHORED = 0x0002, ///< Unanchored (DFA defaults to anchored).
- RE_ANCHORED = 0x0004, ///< Anchored (Regex defaults to unanchored).
+ RE_CASE_INSENSITIVE = PCRE2_CASELESS, ///< Ignore case (default: case
sensitive).
+ RE_UNANCHORED = PCRE2_MULTILINE, ///< Unanchored (DFA defaults to
anchored).
+ RE_ANCHORED = PCRE2_ANCHORED, ///< Anchored (Regex defaults to
unanchored).
};
-/** Wrapper for PCRE evaluation.
- *
- */
-class Regex
+/// @brief Wrapper for PCRE2 match data.
+class RegexMatches
{
+ friend class Regex;
+
public:
- /// Default number of capture groups.
- static constexpr size_t DEFAULT_GROUP_COUNT = 10;
+ /** Construct a new RegexMatches object.
+ *
+ * @param size The number of matches to allocate space for.
+ */
+ RegexMatches(uint32_t size = DEFAULT_MATCHES);
+ ~RegexMatches();
+
+ /** Get the match at the given index.
+ *
+ * @return The match at the given index.
+ */
+ std::string_view operator[](size_t index) const;
+ /** Get the ovector pointer for the capture groups. Don't use this unless
you know what you are doing.
+ *
+ * @return ovector pointer.
+ */
+ size_t *get_ovector_pointer();
+ int32_t size() const;
+
+protected:
+ pcre2_match_data *get_match_data();
+ void set_subject(std::string_view subject);
+ void set_size(int32_t size);
+
+private:
+ constexpr static uint32_t DEFAULT_MATCHES = 10;
+ static void *malloc(size_t size, void *caller);
+ pcre2_match_data *_match_data = nullptr;
+ std::string_view _subject;
+ char _buffer[24 + 96 + 16 * DEFAULT_MATCHES]; // 24 bytes for the general
context, 96 bytes overhead, 16 bytes per match.
+ size_t _buffer_bytes_used = 0;
+ int32_t _size = 0;
+};
+/// @brief Wrapper for PCRE2 regular expression.
+class Regex
+{
+public:
Regex() = default;
Regex(Regex const &) = delete; // No copying.
Regex(Regex &&that) noexcept;
@@ -59,46 +95,43 @@ public:
*
* @a flags should be the bitwise @c or of @c REFlags values.
*/
- bool compile(const char *pattern, unsigned flags = 0);
+ bool compile(std::string_view pattern, uint32_t flags = 0);
- /** Execute the regular expression.
+ /** Compile the @a pattern into a regular expression.
*
- * @param str String to match against.
- * @return @c true if the pattern matched, @a false if not.
+ * @param pattern Source pattern for regular expression (null terminated).
+ * @param error String to receive error message.
+ * @param erroffset Pointer to integer to receive error offset.
+ * @param flags Compilation flags.
+ * @return @a true if compiled successfully, @a false otherwise.
*
- * It is safe to call this method concurrently on the same instance of @a
this.
+ * @a flags should be the bitwise @c or of @c REFlags values.
*/
- bool exec(std::string_view const &str) const;
+ bool compile(std::string_view pattern, std::string &error, int &erroffset,
unsigned flags = 0);
/** Execute the regular expression.
*
- * @param str String to match against.
- * @param ovector Capture results.
- * @param ovecsize Number of elements in @a ovector.
+ * @param subject String to match against.
* @return @c true if the pattern matched, @a false if not.
*
* It is safe to call this method concurrently on the same instance of @a
this.
- *
- * Each capture group takes 3 elements of @a ovector, therefore @a ovecsize
must
- * be a multiple of 3 and at least three times the number of desired capture
groups.
*/
- bool exec(std::string_view const &str, int *ovector, int ovecsize) const;
+ bool exec(std::string_view subject) const;
/** Execute the regular expression.
*
- * @param str String to match against.
- * @param ovector Capture results.
- * @param ovecsize Number of elements in @a ovector.
- * @return @c true if the pattern matched, @a false if not.
+ * @param subject String to match against.
+ * @param matches Place to store the capture groups.
+ * @return @c The number of capture groups. < 0 if an error occurred. 0 if
the number of Matches is too small.
*
* It is safe to call this method concurrently on the same instance of @a
this.
*
* Each capture group takes 3 elements of @a ovector, therefore @a ovecsize
must
* be a multiple of 3 and at least three times the number of desired capture
groups.
*/
- bool exec(std::string_view str, swoc::MemSpan<int> groups) const;
+ int exec(std::string_view subject, RegexMatches &matches) const;
- /// @return The number of groups captured in the last call to @c exec.
+ /// @return The number of capture groups in the compiled pattern.
int get_capture_count();
private:
@@ -106,8 +139,7 @@ private:
// enough to use as pointers. For some reason the header defines in name
only a struct and
// then aliases it to the standard name, rather than simply declare the
latter in name only.
// The goal is completely wrap PCRE and not include that header in client
code.
- void *regex = nullptr; ///< Compiled expression.
- void *regex_extra = nullptr; ///< Extra information about the expression.
+ pcre2_code *_code = nullptr;
};
/** Deterministic Finite state Automata container.
@@ -122,18 +154,18 @@ public:
~DFA();
/// @return The number of patterns successfully compiled.
- int compile(std::string_view const &pattern, unsigned flags = 0);
+ int32_t compile(std::string_view pattern, unsigned flags = 0);
/// @return The number of patterns successfully compiled.
- int compile(std::string_view *patterns, int npatterns, unsigned flags = 0);
+ int32_t compile(std::string_view *patterns, int npatterns, unsigned flags =
0);
/// @return The number of patterns successfully compiled.
- int compile(const char **patterns, int npatterns, unsigned flags = 0);
+ int32_t compile(const char **patterns, int npatterns, unsigned flags = 0);
/** Match @a str against the internal patterns.
*
* @param str String to match.
* @return Index of the matched pattern, -1 if no match.
*/
- int match(std::string_view const &str) const;
+ int32_t match(std::string_view str) const;
private:
struct Pattern {
@@ -148,7 +180,7 @@ private:
* @param flags Regular expression compilation flags.
* @return @c true if @a pattern was successfully compiled, @c false if not.
*/
- bool build(std::string_view const &pattern, unsigned flags = 0);
+ bool build(std::string_view pattern, unsigned flags = 0);
std::vector<Pattern> _patterns;
};
diff --git a/plugins/experimental/tls_bridge/CMakeLists.txt
b/plugins/experimental/tls_bridge/CMakeLists.txt
index e3de8b9249..5430505431 100644
--- a/plugins/experimental/tls_bridge/CMakeLists.txt
+++ b/plugins/experimental/tls_bridge/CMakeLists.txt
@@ -17,5 +17,5 @@
add_atsplugin(tls_bridge tls_bridge.cc)
-target_link_libraries(tls_bridge PRIVATE libswoc::libswoc)
+target_link_libraries(tls_bridge PRIVATE ts::tsutil libswoc::libswoc)
verify_global_plugin(tls_bridge)
diff --git a/src/proxy/http/remap/UrlRewrite.cc
b/src/proxy/http/remap/UrlRewrite.cc
index 1f8176e97c..e377451921 100644
--- a/src/proxy/http/remap/UrlRewrite.cc
+++ b/src/proxy/http/remap/UrlRewrite.cc
@@ -857,7 +857,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL
*request_url, int reques
// does not null terminate return string
int
-UrlRewrite::_expandSubstitutions(int *matches_info, const RegexMapping
*reg_map, const char *matched_string, char *dest_buf,
+UrlRewrite::_expandSubstitutions(size_t *matches_info, const RegexMapping
*reg_map, const char *matched_string, char *dest_buf,
int dest_buf_size)
{
int cur_buf_size = 0;
@@ -908,6 +908,7 @@ UrlRewrite::_regexMappingLookup(RegexMappingList
&regex_mappings, URL *request_u
int request_host_len, int rank_ceiling,
UrlMappingContainer &mapping_container)
{
bool retval = false;
+ RegexMatches matches;
if (rank_ceiling == -1) { // we will now look at all regex mappings
rank_ceiling = INT_MAX;
@@ -959,11 +960,9 @@ UrlRewrite::_regexMappingLookup(RegexMappingList
&regex_mappings, URL *request_u
continue;
}
- int matches_info[MAX_REGEX_SUBS * 3];
- bool match_result =
- list_iter->regular_expression.exec(std::string_view(request_host,
request_host_len), matches_info, countof(matches_info));
+ int match_result =
list_iter->regular_expression.exec(std::string_view(request_host,
request_host_len), matches);
- if (match_result == true) {
+ if (match_result > 0) {
Debug("url_rewrite_regex",
"Request URL host [%.*s] matched regex in mapping of rank %d "
"with %d possible substitutions",
@@ -975,8 +974,9 @@ UrlRewrite::_regexMappingLookup(RegexMappingList
&regex_mappings, URL *request_u
int buf_len;
// Expand substitutions in the host field from the stored template
- buf_len = _expandSubstitutions(matches_info, list_iter,
request_host, buf, sizeof(buf));
- URL *expanded_url = mapping_container.createNewToURL();
+ size_t *matches_info = matches.get_ovector_pointer();
+ buf_len = _expandSubstitutions(matches_info, list_iter,
request_host, buf, sizeof(buf));
+ URL *expanded_url = mapping_container.createNewToURL();
expanded_url->copy(&((list_iter->url_map)->toURL));
expanded_url->host_set(buf, buf_len);
diff --git a/src/tsutil/CMakeLists.txt b/src/tsutil/CMakeLists.txt
index 44b83448fd..a747431daa 100644
--- a/src/tsutil/CMakeLists.txt
+++ b/src/tsutil/CMakeLists.txt
@@ -50,9 +50,10 @@ add_library(
ts_unit_parser.cc
Regex.cc
)
+
add_library(ts::tsutil ALIAS tsutil)
set_target_properties(tsutil PROPERTIES POSITION_INDEPENDENT_CODE TRUE
PUBLIC_HEADER "${TSUTIL_PUBLIC_HEADERS}")
-target_link_libraries(tsutil PUBLIC libswoc::libswoc yaml-cpp::yaml-cpp
PCRE::PCRE)
+target_link_libraries(tsutil PUBLIC libswoc::libswoc yaml-cpp::yaml-cpp
PkgConfig::PCRE2)
install(
TARGETS tsutil
diff --git a/src/tsutil/Regex.cc b/src/tsutil/Regex.cc
index 42d9d27c28..faea3b8546 100644
--- a/src/tsutil/Regex.cc
+++ b/src/tsutil/Regex.cc
@@ -26,160 +26,281 @@
#include <array>
#include <assert.h>
-#if __has_include(<pcre/pcre.h>)
-#include <pcre/pcre.h>
-#else
-#include <pcre.h>
-#endif
-
+//----------------------------------------------------------------------------
namespace
{
-inline pcre *
-as_pcre(void *p)
+void *
+my_malloc(size_t size, void * /*caller*/)
{
- return static_cast<pcre *>(p);
+ void *ptr = malloc(size);
+ return ptr;
}
-inline pcre_extra *
-as_extra(void *p)
+
+void
+my_free(void *ptr, void * /*caller*/)
{
- return static_cast<pcre_extra *>(p);
+ free(ptr);
}
} // namespace
-#ifdef PCRE_CONFIG_JIT
-/*
-Using two thread locals avoids the deadlock because without the thread local
object access, get_jit_stack doesn't call
-the TLS init function which ends up calling __cxx_thread_atexit(which locks
the dl_whatever mutex). Since the raw
-pointer doesn't have a destructor to call, it doesn't need to call this.
Interestingly, get_jit_stack was calling the
-TLS init function to setup the destructor call at thread exit whether or not
the class was declared in the function
-body.
-*/
-namespace
+//----------------------------------------------------------------------------
+class RegexContext
{
-thread_local pcre_jit_stack *jit_stack;
-
-struct JitStackCleanup {
- ~JitStackCleanup()
+public:
+ static RegexContext *
+ get_instance()
+ {
+ if (!_regex_context) {
+ _regex_context = new RegexContext();
+ }
+ return _regex_context;
+ }
+ ~RegexContext()
{
- if (jit_stack) {
- pcre_jit_stack_free(jit_stack);
+ if (_general_context != nullptr) {
+ pcre2_general_context_free(_general_context);
+ }
+ if (_compile_context != nullptr) {
+ pcre2_compile_context_free(_compile_context);
+ }
+ if (_match_context != nullptr) {
+ pcre2_match_context_free(_match_context);
+ }
+ if (_jit_stack != nullptr) {
+ pcre2_jit_stack_free(_jit_stack);
}
}
+ pcre2_general_context *
+ get_general_context()
+ {
+ return _general_context;
+ }
+ pcre2_compile_context *
+ get_compile_context()
+ {
+ return _compile_context;
+ }
+ pcre2_match_context *
+ get_match_context()
+ {
+ return _match_context;
+ }
+
+private:
+ RegexContext()
+ {
+ _general_context = pcre2_general_context_create(my_malloc, my_free,
nullptr);
+ _compile_context = pcre2_compile_context_create(_general_context);
+ _match_context = pcre2_match_context_create(_general_context);
+ _jit_stack = pcre2_jit_stack_create(4096, 1024 * 1024, nullptr); //
1 page min and 1MB max
+ pcre2_jit_stack_assign(_match_context, nullptr, _jit_stack);
+ }
+ pcre2_general_context *_general_context = nullptr;
+ pcre2_compile_context *_compile_context = nullptr;
+ pcre2_match_context *_match_context = nullptr;
+ pcre2_jit_stack *_jit_stack = nullptr;
+ thread_local static RegexContext *_regex_context;
};
-thread_local JitStackCleanup jsc;
+thread_local RegexContext *RegexContext::_regex_context = nullptr;
-pcre_jit_stack *
-get_jit_stack(void *)
+//----------------------------------------------------------------------------
+namespace
{
- if (!jit_stack) {
- jit_stack = pcre_jit_stack_alloc(4096, 1024 * 1024); // 1 page min and 1MB
max
- }
- return jit_stack;
-}
-
-} // end anonymous namespace
-#endif // def PCRE_CONFIG_JIT
+struct RegexContextCleanup {
+ ~RegexContextCleanup() { delete RegexContext::get_instance(); }
+};
+thread_local RegexContextCleanup cleanup;
+} // namespace
-Regex::Regex(Regex &&that) noexcept : regex(that.regex),
regex_extra(that.regex_extra)
+//----------------------------------------------------------------------------
+// Build the match-data block for up to @a size offset pairs. The PCRE2
+// general context is created with RegexMatches::malloc as its allocator, which
+// serves requests from _buffer and falls back to ::malloc once _buffer is
+// exhausted.
+RegexMatches::RegexMatches(uint32_t size)
{
- that.regex = nullptr;
- that.regex_extra = nullptr;
+ // Memory carved out of _buffer must never reach ::free, but any block that
+ // RegexMatches::malloc obtained from ::malloc has to be released or it leaks.
+ auto release = [](void *ptr, void *caller) -> void {
+ auto *self = static_cast<RegexMatches *>(caller);
+ const auto addr = reinterpret_cast<uintptr_t>(ptr);
+ const auto begin = reinterpret_cast<uintptr_t>(self->_buffer);
+ if (ptr != nullptr && (addr < begin || addr >= begin + sizeof(self->_buffer))) {
+ ::free(ptr);
+ }
+ };
+ pcre2_general_context *ctx = pcre2_general_context_create(&RegexMatches::malloc, release, static_cast<void *>(this));
+
+ _match_data = pcre2_match_data_create(size, ctx);
}
-bool
-Regex::compile(const char *pattern, const unsigned flags)
+//----------------------------------------------------------------------------
+void *
+RegexMatches::malloc(size_t size, void *caller)
{
- const char *error = nullptr;
- int erroffset = 0;
- int options = 0;
- int study_opts = 0;
+ auto *matches = static_cast<RegexMatches *>(caller);
- if (regex) {
- return false;
+ // allocate from the buffer if possible
+ if (size <= sizeof(matches->_buffer) - matches->_buffer_bytes_used) {
+ void *ptr = matches->_buffer +
matches->_buffer_bytes_used;
+ matches->_buffer_bytes_used += size;
+ return ptr;
}
- if (flags & RE_CASE_INSENSITIVE) {
- options |= PCRE_CASELESS;
- }
+ // otherwise use system malloc if the buffer is too small
+ void *ptr = ::malloc(size);
+ return ptr;
+}
- if (flags & RE_ANCHORED) {
- options |= PCRE_ANCHORED;
+//----------------------------------------------------------------------------
+RegexMatches::~RegexMatches()
+{
+ if (_match_data != nullptr) {
+ pcre2_match_data_free(_match_data);
}
+}
- regex = pcre_compile(pattern, options, &error, &erroffset, nullptr);
- if (error) {
- regex = nullptr;
- return false;
- }
+//----------------------------------------------------------------------------
+size_t *
+RegexMatches::get_ovector_pointer()
+{
+ return pcre2_get_ovector_pointer(_match_data);
+}
+
+//----------------------------------------------------------------------------
+int32_t
+RegexMatches::size() const
+{
+ return _size;
+}
+
+//----------------------------------------------------------------------------
+pcre2_match_data *
+RegexMatches::get_match_data()
+{
+ return _match_data;
+}
-#ifdef PCRE_CONFIG_JIT
- study_opts |= PCRE_STUDY_JIT_COMPILE;
-#endif
+//----------------------------------------------------------------------------
+void
+RegexMatches::set_size(int32_t size)
+{
+ _size = size;
+}
- regex_extra = pcre_study(as_pcre(regex), study_opts, &error);
+//----------------------------------------------------------------------------
+void
+RegexMatches::set_subject(std::string_view subject)
+{
+ _subject = subject;
+}
-#ifdef PCRE_CONFIG_JIT
- if (regex_extra) {
- pcre_assign_jit_stack(as_extra(regex_extra), &get_jit_stack, nullptr);
+//----------------------------------------------------------------------------
+// Return capture group @a index of the most recent match as a view into the
+// subject recorded by Regex::exec(). An empty view is returned when @a index
+// is beyond the ovector or when the group did not take part in the match.
+std::string_view
+RegexMatches::operator[](size_t index) const
+{
+ // check if the index is valid
+ if (index >= pcre2_get_ovector_count(_match_data)) {
+ return std::string_view();
}
-#endif
- return true;
+ PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(_match_data);
+ const PCRE2_SIZE start = ovector[2 * index];
+ const PCRE2_SIZE end = ovector[2 * index + 1];
+ // PCRE2 marks a group that did not participate with PCRE2_UNSET offsets;
+ // using them as-is would form a view far outside the subject.
+ if (start == PCRE2_UNSET || end == PCRE2_UNSET) {
+ return std::string_view();
+ }
+ return std::string_view(_subject.data() + start, end - start);
}
-int
-Regex::get_capture_count()
+//----------------------------------------------------------------------------
+Regex::Regex(Regex &&that) noexcept
{
- int captures = -1;
- if (pcre_fullinfo(as_pcre(regex), as_extra(regex_extra),
PCRE_INFO_CAPTURECOUNT, &captures) != 0) {
- return -1;
- }
+ _code = that._code;
+ that._code = nullptr;
+}
- return captures;
+//----------------------------------------------------------------------------
+Regex::~Regex()
+{
+ if (_code != nullptr) {
+ pcre2_code_free(_code);
+ }
}
+//----------------------------------------------------------------------------
bool
-Regex::exec(std::string_view const &str) const
+Regex::compile(std::string_view pattern, uint32_t flags)
{
- std::array<int, DEFAULT_GROUP_COUNT * 3> ovector = {{0}};
- return this->exec(str, ovector);
+ std::string error;
+ int erroroffset;
+
+ return this->compile(pattern, error, erroroffset, flags);
}
+//----------------------------------------------------------------------------
bool
-Regex::exec(std::string_view const &str, int *ovector, int ovecsize) const
+Regex::compile(std::string_view pattern, std::string &error, int &erroroffset,
uint32_t flags)
{
- int rv;
+ if (_code != nullptr) {
+ pcre2_code_free(_code);
+ }
+ PCRE2_SIZE error_offset;
+ int error_code;
+ _code = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.data()),
pattern.size(), flags, &error_code, &error_offset,
+ RegexContext::get_instance()->get_compile_context());
+ if (!_code) {
+ erroroffset = error_offset;
+
+ // get pcre2 error message
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(error_code, buffer, sizeof(buffer));
+ error.assign((char *)buffer);
+ return false;
+ }
+
+ // support for JIT
+ pcre2_jit_compile(_code, PCRE2_JIT_COMPLETE);
- rv = pcre_exec(as_pcre(regex), as_extra(regex_extra), str.data(),
static_cast<int>(str.size()), 0, 0, ovector, ovecsize);
- return rv > 0;
+ return true;
}
+//----------------------------------------------------------------------------
bool
-Regex::exec(std::string_view str, swoc::MemSpan<int> groups) const
+Regex::exec(std::string_view subject) const
{
- return 0 <
- pcre_exec(as_pcre(regex), as_extra(regex_extra), str.data(),
int(str.size()), 0, 0, groups.data(), int(groups.count()));
+ if (_code == nullptr) {
+ return false;
+ }
+ RegexMatches matches;
+
+ int count = this->exec(subject, matches);
+ return count > 0;
}
-Regex::~Regex()
+//----------------------------------------------------------------------------
+int32_t
+Regex::exec(std::string_view subject, RegexMatches &matches) const
{
- if (regex_extra) {
-#ifdef PCRE_CONFIG_JIT
- pcre_free_study(as_extra(regex_extra));
-#else
- pcre_free(regex_extra);
-#endif
+ if (_code == nullptr) {
+ return 0;
+ }
+ int count = pcre2_match(_code, reinterpret_cast<PCRE2_SPTR>(subject.data()),
subject.size(), 0, 0, matches.get_match_data(),
+ RegexContext::get_instance()->get_match_context());
+
+ matches.set_size(count);
+
+ if (count < 0) {
+ return count;
}
- if (regex) {
- pcre_free(regex);
+
+ if (count > 0) {
+ matches.set_subject(subject);
+ }
+
+ return count;
+}
+
+//----------------------------------------------------------------------------
+int32_t
+Regex::get_capture_count()
+{
+ int captures = -1;
+ if (pcre2_pattern_info(_code, PCRE2_INFO_CAPTURECOUNT, &captures) != 0) {
+ return -1;
}
+ return captures;
}
+//----------------------------------------------------------------------------
DFA::~DFA() {}
+//----------------------------------------------------------------------------
bool
-DFA::build(std::string_view const &pattern, unsigned flags)
+DFA::build(const std::string_view pattern, unsigned flags)
{
Regex rxp;
std::string string{pattern};
@@ -188,22 +309,24 @@ DFA::build(std::string_view const &pattern, unsigned
flags)
flags |= RE_ANCHORED;
}
- if (!rxp.compile(string.c_str(), flags)) {
+ if (!rxp.compile(pattern, flags)) {
return false;
}
_patterns.emplace_back(std::move(rxp), std::move(string));
return true;
}
-int
-DFA::compile(std::string_view const &pattern, unsigned flags)
+//----------------------------------------------------------------------------
+int32_t
+DFA::compile(std::string_view pattern, unsigned flags)
{
assert(_patterns.empty());
this->build(pattern, flags);
return _patterns.size();
}
-int
+//----------------------------------------------------------------------------
+int32_t
DFA::compile(std::string_view *patterns, int npatterns, unsigned flags)
{
_patterns.reserve(npatterns); // try to pre-allocate.
@@ -213,7 +336,8 @@ DFA::compile(std::string_view *patterns, int npatterns,
unsigned flags)
return _patterns.size();
}
-int
+//----------------------------------------------------------------------------
+int32_t
DFA::compile(const char **patterns, int npatterns, unsigned flags)
{
_patterns.reserve(npatterns); // try to pre-allocate.
@@ -223,8 +347,9 @@ DFA::compile(const char **patterns, int npatterns, unsigned
flags)
return _patterns.size();
}
-int
-DFA::match(std::string_view const &str) const
+//----------------------------------------------------------------------------
+int32_t
+DFA::match(std::string_view str) const
{
for (auto spot = _patterns.begin(), limit = _patterns.end(); spot != limit;
++spot) {
if (spot->_re.exec(str)) {
diff --git a/src/tsutil/unit_tests/test_Regex.cc
b/src/tsutil/unit_tests/test_Regex.cc
index 16f327dff6..f17d2b17c8 100644
--- a/src/tsutil/unit_tests/test_Regex.cc
+++ b/src/tsutil/unit_tests/test_Regex.cc
@@ -20,8 +20,8 @@
limitations under the License.
*/
-#include <array>
#include <string_view>
+#include <vector>
#include "tscore/ink_assert.h"
#include "tscore/ink_defs.h"
@@ -35,22 +35,148 @@ struct subject_match_t {
struct test_t {
std::string_view regex;
- std::array<subject_match_t, 4> tests;
+ std::vector<subject_match_t> tests;
};
-std::array<test_t, 2> test_data{
- {{{"^foo"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true},
{{"foobarbaz"}, true}}}},
- {{"foo$"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, false},
{{"foobarbaz"}, false}}}}}
+std::vector<test_t> test_data{
+ {
+ {{R"(^foo)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true},
{{"foobarbaz"}, true}}}},
+ {{R"(foo$)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, false},
{{"foobarbaz"}, false}}}},
+ // url regular expression
+ {{R"(^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?$)"},
+ {{{{"http://www.example.com"}, true},
+ {{"https://www.example.com"}, true},
+ {{"http://~example.com"}, false},
+ {{"http://www.example.com/foo/bar"}, true}}}},
+ // ip address regular expression
+
{R"(^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$)",
+ {{{{"1.2.3.4"}, true}, {{"127.0.0.1"}, true}, {{"256.256.256.256"},
false}, {{".1.1.1.1"}, false}}}},
+ }
+};
+
+// test case insensitive test data
+std::vector<test_t> test_data_case_insensitive{
+ {
+ {{R"(^foo)"}, {{{{"FoO"}, true}, {{"bar"}, false}, {{"foObar"}, true},
{{"foobaRbaz"}, true}}}},
+ {{R"(foo$)"}, {{{{"foO"}, true}, {{"bar"}, false}, {{"foobar"}, false},
{{"foobarbaz"}, false}}}},
+ }
+};
+
+// test case for anchored flag
+std::vector<test_t> test_data_anchored{
+ {
+ {{R"(foo)"}, {{{{"foo"}, true}, {{"bar"}, false}, {{"foobar"}, true},
{{"foobarbaz"}, true}}}},
+ {{R"(bar)"}, {{{{"foo"}, false}, {{"bar"}, true}, {{"foobar"}, false},
{{"foobarbaz"}, false}}}},
+ }
+};
+
+struct submatch_t {
+ std::string_view subject;
+ int32_t count;
+ std::vector<std::string_view> submatches;
+};
+
+struct submatch_test_t {
+ std::string_view regex;
+ int capture_count;
+ std::vector<submatch_t> tests;
+};
+
+std::vector<submatch_test_t> submatch_test_data{
+ {
+ {{R"(^foo)"}, 0, {{{{"foo"}, 1, {{"foo"}}}, {{"bar"}, -1, {}}, {{"foobar"},
1, {{"foo"}}}, {{"foobarbaz"}, 1, {{"foo"}}}}}},
+ {{R"(foo$)"}, 0, {{{{"foo"}, 1, {{"foo"}}}, {{"bar"}, -1, {}}, {{"foobar"},
-1, {}}, {{"foobarbaz"}, -1, {}}}}},
+ {{R"(^(foo)(bar))"}, 2, {{{{"foobar"}, 3, {{"foobar", "foo", "bar"}}},
{{"barfoo"}, -1, {}}, {{"foo"}, -1, {}}}}},
+ }
};
TEST_CASE("Regex", "[libts][Regex]")
{
+ // case sensitive test
for (auto &item : test_data) {
Regex r;
- r.compile(item.regex.data());
+ REQUIRE(r.compile(item.regex.data()) == true);
for (auto &test : item.tests) {
REQUIRE(r.exec(test.subject.data()) == test.match);
}
}
+
+ // case insensitive test
+ for (auto &item : test_data_case_insensitive) {
+ Regex r;
+ REQUIRE(r.compile(item.regex.data(), RE_CASE_INSENSITIVE) == true);
+
+ for (auto &test : item.tests) {
+ REQUIRE(r.exec(test.subject.data()) == test.match);
+ }
+ }
+
+ // case anchored test
+ for (auto &item : test_data_anchored) {
+ Regex r;
+ REQUIRE(r.compile(item.regex.data(), RE_ANCHORED) == true);
+
+ for (auto &test : item.tests) {
+ REQUIRE(r.exec(test.subject.data()) == test.match);
+ }
+ }
+
+ // test getting submatches with operator[]
+ for (auto &item : submatch_test_data) {
+ Regex r;
+ REQUIRE(r.compile(item.regex.data()) == true);
+ REQUIRE(r.get_capture_count() == item.capture_count);
+
+ for (auto &test : item.tests) {
+ RegexMatches matches;
+ REQUIRE(r.exec(test.subject.data(), matches) == test.count);
+ REQUIRE(matches.size() == test.count);
+
+ for (int32_t i = 0; i < test.count; i++) {
+ REQUIRE(matches[i] == test.submatches[i]);
+ }
+ }
+ }
+
+ // test getting submatches with ovector pointer
+ for (auto &item : submatch_test_data) {
+ Regex r;
+ REQUIRE(r.compile(item.regex.data()) == true);
+ REQUIRE(r.get_capture_count() == item.capture_count);
+
+ for (auto &test : item.tests) {
+ RegexMatches matches;
+ REQUIRE(r.exec(test.subject.data(), matches) == test.count);
+ REQUIRE(matches.size() == test.count);
+
+ size_t *ovector = matches.get_ovector_pointer();
+ for (int32_t i = 0; i < test.count; i++) {
+ REQUIRE(test.submatches[i] == std::string_view{test.subject.data() +
ovector[i * 2], ovector[i * 2 + 1] - ovector[i * 2]});
+ }
+ }
+ }
+
+ // test for invalid regular expression
+ {
+ Regex r;
+ REQUIRE(r.compile(R"((\d+)", RE_CASE_INSENSITIVE) == false);
+ }
+
+ // test for not compiling regular expression
+ {
+ Regex r;
+ RegexMatches matches;
+ REQUIRE(r.exec("foo") == false);
+ REQUIRE(r.exec("foo", matches) == 0);
+ }
+
+ // test for recompiling the regular expression
+ {
+ Regex r;
+ REQUIRE(r.compile(R"(foo)") == true);
+ REQUIRE(r.exec("foo") == true);
+ REQUIRE(r.compile(R"(bar)") == true);
+ REQUIRE(r.exec("bar") == true);
+ }
}