I'd prefer to propose another patch that should be commited with this one, which fix bugs (say _M_flags & regex_constants::icase, not "&&"), and do more "matcher optimization".
It is now more DRY (_RegexTranslator<>) and efficient, but <regex> takes longer to compile, mainly because now we have 4 times more _Compiler<> versions :) Booted and tested with -m32 and -m64 respectively. Thank you! -- Regards, Tim Shen
commit 9dcc87dd782c7a09b5e075e746ceb3d0c6f1f4db Author: tim <timshe...@gmail.com> Date: Mon Jan 6 00:03:41 2014 -0500 2014-01-07 Tim Shen <timshe...@gmail.com> * bits/regex_automaton.tcc: Indentation fix. * bits/regex_compiler.h (__compile_nfa<>(), _Compiler<>, _RegexTranslator<> _AnyMatcher<>, _CharMatcher<>, _BracketMatcher<>): Add bool option template parameter and specialization to make matching more efficient and space saving. * bits/regex_compiler.tcc: Likewise. diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 7edc67f..e222803 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -134,9 +134,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _NFA<_TraitsT>::_M_dot(std::ostream& __ostr) const { __ostr << "digraph _Nfa {\n" - " rankdir=LR;\n"; + " rankdir=LR;\n"; for (size_t __i = 0; __i < this->size(); ++__i) - (*this)[__i]._M_dot(__ostr, __i); + (*this)[__i]._M_dot(__ostr, __i); __ostr << "}\n"; return __ostr; } diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 4ac67df..4873166 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -39,11 +39,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * @{ */ - template<typename _TraitsT> + template<typename, bool, bool> struct _BracketMatcher; /// Builds an NFA from an input iterator interval. - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> class _Compiler { public: @@ -63,7 +63,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef typename _ScannerT::_TokenT _TokenT; typedef _StateSeq<_TraitsT> _StateSeqT; typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT; - typedef _BracketMatcher<_TraitsT> _BMatcherT; + typedef _BracketMatcher<_TraitsT, __icase, __collate> _BMatcherT; typedef std::ctype<typename _TraitsT::char_type> _CtypeT; // accepts a specific token or returns false. @@ -95,12 +95,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_expression_term(_BMatcherT& __matcher); bool - _M_range_expression(_BMatcherT& __matcher); - - bool - _M_collating_symbol(_BMatcherT& __matcher); - - bool _M_equivalence_class(_BMatcherT& __matcher); bool @@ -134,8 +128,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __compile_nfa(_FwdIter __first, _FwdIter __last, const _TraitsT& __traits, regex_constants::syntax_option_type __flags) { - using _Cmplr = _Compiler<_FwdIter, _TraitsT>; - return _Cmplr(__first, __last, __traits, __flags)._M_get_nfa(); + if (!(__flags & regex_constants::icase)) + if (!(__flags & regex_constants::collate)) + return _Compiler<_FwdIter, _TraitsT, false, false> + (__first, __last, __traits, __flags)._M_get_nfa(); + else + return _Compiler<_FwdIter, _TraitsT, false, true> + (__first, __last, __traits, __flags)._M_get_nfa(); + else + if (!(__flags & regex_constants::collate)) + return _Compiler<_FwdIter, _TraitsT, true, false> + (__first, __last, __traits, __flags)._M_get_nfa(); + else + return _Compiler<_FwdIter, _TraitsT, true, true> + (__first, __last, __traits, __flags)._M_get_nfa(); } template<typename _Iter, typename _TraitsT> @@ -148,16 +154,110 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __compile_nfa(__cfirst, __cfirst + __len, __traits, __flags); } - template<typename _TraitsT, bool __is_ecma> - struct _AnyMatcher + // [28.13.14] + template<typename _TraitsT, bool __icase, bool __collate> + class _RegexTranslator { - typedef typename _TraitsT::char_type _CharT; + public: + typedef typename _TraitsT::char_type _CharT; + typedef typename _TraitsT::string_type _StringT; + typedef typename std::conditional<__collate, + _StringT, + _CharT>::type _StrTransT; explicit - _AnyMatcher(const _TraitsT& __traits) + _RegexTranslator(const _TraitsT& __traits) : _M_traits(__traits) { } + _CharT + _M_translate(_CharT __ch) const + { + if (__icase) + return _M_traits.translate_nocase(__ch); + else if (__collate) + return _M_traits.translate(__ch); + else + return __ch; + } + + _StrTransT + _M_transform(_CharT __ch) const + { + return _M_transform_impl(__ch, typename integral_constant<bool, + __collate>::type()); + } + + private: + _StrTransT + _M_transform_impl(_CharT __ch, false_type) const + { return __ch; } + + _StrTransT + _M_transform_impl(_CharT __ch, true_type) const + { + _StrTransT __str = _StrTransT(1, _M_translate(__ch)); + return _M_traits.transform(__str.begin(), __str.end()); + } + + const _TraitsT& _M_traits; + }; + + template<typename _TraitsT> + class _RegexTranslator<_TraitsT, false, false> + { + public: + typedef typename _TraitsT::char_type _CharT; + typedef _CharT _StrTransT; + + explicit + _RegexTranslator(const _TraitsT& __traits) + { } + + _CharT + _M_translate(_CharT __ch) const + { return __ch; } + + _StrTransT + _M_transform(_CharT __ch) const + { return __ch; } + }; + + template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate> + struct _AnyMatcher; + + template<typename _TraitsT, bool __icase, bool __collate> + struct _AnyMatcher<_TraitsT, false, __icase, __collate> + { + typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; + typedef typename _TransT::_CharT _CharT; + + explicit + _AnyMatcher(const _TraitsT& __traits) + : _M_translator(__traits) + { } + + bool + operator()(_CharT __ch) const + { + static auto __nul = _M_translator._M_translate('\0'); + return _M_translator._M_translate(__ch) != __nul; + } + + _TransT _M_translator; + }; + + template<typename _TraitsT, bool __icase, bool __collate> + struct _AnyMatcher<_TraitsT, true, __icase, __collate> + { + typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; + typedef typename _TransT::_CharT _CharT; + + explicit + _AnyMatcher(const _TraitsT& __traits) + : _M_translator(__traits) + { } + bool operator()(_CharT __ch) const { return _M_apply(__ch, typename is_same<_CharT, char>::type()); } @@ -165,92 +265,66 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_apply(_CharT __ch, true_type) const { - auto __c = _M_traits.translate(__ch); - if (__is_ecma) - { - static auto __n = _M_traits.translate('\n'); - static auto __r = _M_traits.translate('\r'); - return __c != __n && __c != __r; - } - else - { - static auto __nul = _M_traits.translate('\0'); - return __c != __nul; - } + auto __c = _M_translator._M_translate(__ch); + auto __n = _M_translator._M_translate('\n'); + auto __r = _M_translator._M_translate('\r'); + return __c != __n && __c != __r; } bool _M_apply(_CharT __ch, false_type) const { - auto __c = _M_traits.translate(__ch); - if (__is_ecma) - { - static auto __n = _M_traits.translate('\n'); - static auto __r = _M_traits.translate('\r'); - static auto __u2028 = _M_traits.translate(u'\u2028'); - static auto __u2029 = _M_traits.translate(u'\u2029'); - return __c != __n && __c != __r && __c != __u2028 - && __c != __u2029; - } - else - { - static auto __nul = _M_traits.translate('\0'); - return __c != __nul; - } + auto __c = _M_translator._M_translate(__ch); + auto __n = _M_translator._M_translate('\n'); + auto __r = _M_translator._M_translate('\r'); + auto __u2028 = _M_translator._M_translate(u'\u2028'); + auto __u2029 = _M_translator._M_translate(u'\u2029'); + return __c != __n && __c != __r && __c != __u2028 && __c != __u2029; } - const _TraitsT& _M_traits; + _TransT _M_translator; }; - template<typename _TraitsT, bool __icase> + template<typename _TraitsT, bool __icase, bool __collate> struct _CharMatcher { - typedef typename _TraitsT::char_type _CharT; + typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; + typedef typename _TransT::_CharT _CharT; _CharMatcher(_CharT __ch, const _TraitsT& __traits) - : _M_traits(__traits), _M_ch(_M_translate(__ch)) + : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch)) { } bool operator()(_CharT __ch) const - { return _M_ch == _M_translate(__ch); } + { return _M_ch == _M_translator._M_translate(__ch); } - _CharT - _M_translate(_CharT __ch) const - { - if (__icase) - return _M_traits.translate_nocase(__ch); - else - return _M_traits.translate(__ch); - } - - const _TraitsT& _M_traits; - _CharT _M_ch; + _TransT _M_translator; + _CharT _M_ch; }; /// Matches a character range (bracket expression) // TODO: Convert used _M_flags fields to template parameters, including // collate and icase. Avoid using std::set, could use flat_set // (sorted vector and binary search) instead. - template<typename _TraitsT> + template<typename _TraitsT, bool __icase, bool __collate> struct _BracketMatcher { public: - typedef typename _TraitsT::char_type _CharT; - typedef typename _TraitsT::char_class_type _CharClassT; - typedef typename _TraitsT::string_type _StringT; - typedef regex_constants::syntax_option_type _FlagT; + typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT; + typedef typename _TransT::_CharT _CharT; + typedef typename _TransT::_StrTransT _StrTransT; + typedef typename _TraitsT::string_type _StringT; + typedef typename _TraitsT::char_class_type _CharClassT; public: _BracketMatcher(bool __is_non_matching, - const _TraitsT& __traits, - _FlagT __flags) - : + const _TraitsT& __traits) + : _M_class_set(0), _M_translator(__traits), _M_traits(__traits), + _M_is_non_matching(__is_non_matching) #ifdef _GLIBCXX_DEBUG - _M_is_ready(false), + , _M_is_ready(false) #endif - _M_traits(__traits), _M_class_set(0), _M_flags(__flags), - _M_is_non_matching(__is_non_matching) { } bool @@ -263,7 +337,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_add_char(_CharT __c) { - _M_char_set.insert(_M_translate(__c)); + _M_char_set.insert(_M_translator._M_translate(__c)); #ifdef _GLIBCXX_DEBUG _M_is_ready = false; #endif @@ -276,7 +350,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __s.data() + __s.size()); if (__st.empty()) __throw_regex_error(regex_constants::error_collate); - _M_char_set.insert(_M_translate(__st[0])); + _M_char_set.insert(_M_translator._M_translate(__st[0])); #ifdef _GLIBCXX_DEBUG _M_is_ready = false; #endif @@ -302,7 +376,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __mask = _M_traits.lookup_classname(__s.data(), __s.data() + __s.size(), - _M_is_icase()); + __icase); if (__mask == 0) __throw_regex_error(regex_constants::error_ctype); _M_class_set |= __mask; @@ -314,12 +388,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_make_range(_CharT __l, _CharT __r) { - if (_M_flags & regex_constants::collate) - _M_range_set.insert( - make_pair(_M_get_str(_M_translate(__l)), - _M_get_str(_M_translate(__r)))); - else - _M_range_set.insert(make_pair(_M_get_str(__l), _M_get_str(__r))); + _M_range_set.insert(make_pair(_M_translator._M_transform(__l), + _M_translator._M_transform(__r))); #ifdef _GLIBCXX_DEBUG _M_is_ready = false; #endif @@ -350,26 +420,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_apply(_CharT __ch, true_type) const { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; } - _CharT - _M_translate(_CharT __c) const - { - if (_M_is_icase()) - return _M_traits.translate_nocase(__c); - else - return _M_traits.translate(__c); - } - - bool - _M_is_icase() const - { return _M_flags & regex_constants::icase; } - - _StringT - _M_get_str(_CharT __c) const - { - _StringT __s(1, __c); - return _M_traits.transform(__s.begin(), __s.end()); - } - void _M_make_cache(true_type) { @@ -383,16 +433,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } private: - _CacheT _M_cache; - std::set<_CharT> _M_char_set; - std::set<_StringT> _M_equiv_set; - std::set<pair<_StringT, _StringT>> _M_range_set; - const _TraitsT& _M_traits; - _CharClassT _M_class_set; - _FlagT _M_flags; - bool _M_is_non_matching; + _CacheT _M_cache; + std::set<_CharT> _M_char_set; + std::set<_StringT> _M_equiv_set; + std::set<pair<_StrTransT, _StrTransT>> _M_range_set; + _CharClassT _M_class_set; + _TransT _M_translator; + const _TraitsT& _M_traits; + bool _M_is_non_matching; #ifdef _GLIBCXX_DEBUG - bool _M_is_ready; + bool _M_is_ready; #endif }; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 4da653f..90a39f7 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -59,8 +59,8 @@ namespace __detail { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<typename _FwdIter, typename _TraitsT> - _Compiler<_FwdIter, _TraitsT>:: + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _Compiler(_FwdIter __b, _FwdIter __e, const _TraitsT& __traits, _FlagT __flags) : _M_flags((__flags @@ -89,9 +89,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_nfa._M_eliminate_dummy(); } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> void - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_disjunction() { this->_M_alternative(); @@ -110,9 +110,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> void - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_alternative() { if (this->_M_term()) @@ -126,9 +126,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_term() { if (this->_M_assertion()) @@ -141,9 +141,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_assertion() { if (_M_match_token(_ScannerT::_S_token_line_begin)) @@ -172,9 +172,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return true; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> void - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_quantifier() { bool __neg = (_M_flags & regex_constants::ECMAScript); @@ -278,38 +278,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_atom() { if (_M_match_token(_ScannerT::_S_token_anychar)) { - if (_M_flags & regex_constants::ECMAScript) + if (!(_M_flags & regex_constants::ECMAScript)) _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_AnyMatcher<_TraitsT, - true>(_M_traits)))); + _M_nfa._M_insert_matcher + (_AnyMatcher<_TraitsT, false, __icase, __collate> + (_M_traits)))); else _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_AnyMatcher<_TraitsT, - false>(_M_traits)))); + _M_nfa._M_insert_matcher + (_AnyMatcher<_TraitsT, true, __icase, __collate> + (_M_traits)))); } else if (_M_try_char()) { - if (_M_flags & regex_constants::icase) - _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_CharMatcher<_TraitsT, - true>(_M_value[0], - _M_traits)))); - else - _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_CharMatcher<_TraitsT, - false>(_M_value[0], - _M_traits)))); + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_CharMatcher<_TraitsT, __icase, __collate> + (_M_value[0], _M_traits)))); } else if (_M_match_token(_ScannerT::_S_token_backref)) _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. @@ -318,11 +310,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]), - _M_traits, _M_flags); + _M_traits); __matcher._M_add_character_class(_M_value); __matcher._M_ready(); _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher(std::move(__matcher)))); + _M_nfa._M_insert_matcher(std::move(__matcher)))); } else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) { @@ -348,16 +340,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return true; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_bracket_expression() { bool __neg = _M_match_token(_ScannerT::_S_token_bracket_neg_begin); if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) return false; - _BMatcherT __matcher(__neg, _M_traits, _M_flags); + _BMatcherT __matcher(__neg, _M_traits); while (!_M_match_token(_ScannerT::_S_token_bracket_end)) _M_expression_term(__matcher); __matcher._M_ready(); @@ -366,9 +358,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return true; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> void - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_expression_term(_BMatcherT& __matcher) { if (_M_match_token(_ScannerT::_S_token_collsymbol)) @@ -403,9 +395,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __throw_regex_error(regex_constants::error_brack); } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_try_char() { bool __is_char = false; @@ -424,9 +416,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __is_char; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> bool - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_match_token(_TokenT token) { if (token == _M_scanner._M_get_token()) @@ -438,9 +430,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - template<typename _FwdIter, typename _TraitsT> + template<typename _FwdIter, typename _TraitsT, bool __icase, bool __collate> int - _Compiler<_FwdIter, _TraitsT>:: + _Compiler<_FwdIter, _TraitsT, __icase, __collate>:: _M_cur_int_value(int __radix) { long __v = 0; @@ -450,25 +442,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __v; } - template<typename _TraitsT> + template<typename _TraitsT, bool __icase, bool __collate> bool - _BracketMatcher<_TraitsT>::_M_apply(_CharT __ch, false_type) const + _BracketMatcher<_TraitsT, __icase, __collate>:: + _M_apply(_CharT __ch, false_type) const { bool __ret = false; - if (_M_traits.isctype(__ch, _M_class_set) - || _M_char_set.count(_M_translate(__ch)) - || _M_equiv_set.count(_M_traits.transform_primary(&__ch, &__ch+1))) + if (_M_char_set.count(_M_translator._M_translate(__ch))) __ret = true; else { - _StringT __s = _M_get_str(_M_flags & regex_constants::collate - ? _M_translate(__ch) : __ch); + auto __s = _M_translator._M_transform(__ch); for (auto& __it : _M_range_set) if (__it.first <= __s && __s <= __it.second) { __ret = true; break; } + if (_M_traits.isctype(__ch, _M_class_set)) + __ret = true; + else if (_M_equiv_set.count(_M_traits. + transform_primary(&__ch, &__ch+1))) + __ret = true; } if (_M_is_non_matching) return !__ret;