libstdc++

regex_compiler.tcc

Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // FIXME make comments doxygen format.
00032 
00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
00034 // (http://swtch.com/~rsc/regexp/regexp1.html),
00035 // but doesn't strictly follow it.
00036 //
00037 // When compiling, states are *chained* instead of tree- or graph-constructed.
00038 // It's more like structured programs: there's if statement and loop statement.
00039 //
00040 // For alternative structure (say "a|b"), aka "if statement", two branches
00041 // should be constructed. However, these two shall merge to an "end_tag" at
00042 // the end of this operator:
00043 //
00044 //                branch1
00045 //              /        \
00046 // => begin_tag            end_tag =>
00047 //              \        /
00048 //                branch2
00049 //
00050 // This is the difference between this implementation and that in Russ's
00051 // article.
00052 //
00053 // That's why we introduce a dummy node here: "end_tag" is a dummy node.
00054 // All dummy nodes will be eliminated at the end of the compiling process.
00055 
00056 namespace std _GLIBCXX_VISIBILITY(default)
00057 {
00058 namespace __detail
00059 {
00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00061 
00062   template<typename _TraitsT>
00063     _Compiler<_TraitsT>::
00064     _Compiler(_IterT __b, _IterT __e,
00065               const typename _TraitsT::locale_type& __loc, _FlagT __flags)
00066     : _M_flags((__flags
00067                 & (regex_constants::ECMAScript
00068                    | regex_constants::basic
00069                    | regex_constants::extended
00070                    | regex_constants::grep
00071                    | regex_constants::egrep
00072                    | regex_constants::awk))
00073                ? __flags
00074                : __flags | regex_constants::ECMAScript),
00075       _M_scanner(__b, __e, _M_flags, __loc),
00076       _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
00077       _M_traits(_M_nfa->_M_traits),
00078       _M_ctype(std::use_facet<_CtypeT>(__loc))
00079     {
00080       _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
00081       __r._M_append(_M_nfa->_M_insert_subexpr_begin());
00082       this->_M_disjunction();
00083       if (!_M_match_token(_ScannerT::_S_token_eof))
00084         __throw_regex_error(regex_constants::error_paren);
00085       __r._M_append(_M_pop());
00086       _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
00087       __r._M_append(_M_nfa->_M_insert_subexpr_end());
00088       __r._M_append(_M_nfa->_M_insert_accept());
00089       _M_nfa->_M_eliminate_dummy();
00090     }
00091 
00092   template<typename _TraitsT>
00093     void
00094     _Compiler<_TraitsT>::
00095     _M_disjunction()
00096     {
00097       this->_M_alternative();
00098       while (_M_match_token(_ScannerT::_S_token_or))
00099         {
00100           _StateSeqT __alt1 = _M_pop();
00101           this->_M_alternative();
00102           _StateSeqT __alt2 = _M_pop();
00103           auto __end = _M_nfa->_M_insert_dummy();
00104           __alt1._M_append(__end);
00105           __alt2._M_append(__end);
00106           // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
00107           // executes _M_alt before _M_next, as well as executing left
00108           // alternative before right one.
00109           _M_stack.push(_StateSeqT(*_M_nfa,
00110                                    _M_nfa->_M_insert_alt(
00111                                      __alt2._M_start, __alt1._M_start, false),
00112                                    __end));
00113         }
00114     }
00115 
00116   template<typename _TraitsT>
00117     void
00118     _Compiler<_TraitsT>::
00119     _M_alternative()
00120     {
00121       if (this->_M_term())
00122         {
00123           _StateSeqT __re = _M_pop();
00124           this->_M_alternative();
00125           __re._M_append(_M_pop());
00126           _M_stack.push(__re);
00127         }
00128       else
00129         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
00130     }
00131 
00132   template<typename _TraitsT>
00133     bool
00134     _Compiler<_TraitsT>::
00135     _M_term()
00136     {
00137       if (this->_M_assertion())
00138         return true;
00139       if (this->_M_atom())
00140         {
00141           while (this->_M_quantifier());
00142           return true;
00143         }
00144       return false;
00145     }
00146 
00147   template<typename _TraitsT>
00148     bool
00149     _Compiler<_TraitsT>::
00150     _M_assertion()
00151     {
00152       if (_M_match_token(_ScannerT::_S_token_line_begin))
00153         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
00154       else if (_M_match_token(_ScannerT::_S_token_line_end))
00155         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
00156       else if (_M_match_token(_ScannerT::_S_token_word_bound))
00157         // _M_value[0] == 'n' means it's negative, say "not word boundary".
00158         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00159               _M_insert_word_bound(_M_value[0] == 'n')));
00160       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
00161         {
00162           auto __neg = _M_value[0] == 'n';
00163           this->_M_disjunction();
00164           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00165             __throw_regex_error(regex_constants::error_paren);
00166           auto __tmp = _M_pop();
00167           __tmp._M_append(_M_nfa->_M_insert_accept());
00168           _M_stack.push(
00169               _StateSeqT(
00170                 *_M_nfa,
00171                 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
00172         }
00173       else
00174         return false;
00175       return true;
00176     }
00177 
00178   template<typename _TraitsT>
00179     bool
00180     _Compiler<_TraitsT>::
00181     _M_quantifier()
00182     {
00183       bool __neg = (_M_flags & regex_constants::ECMAScript);
00184       auto __init = [this, &__neg]()
00185         {
00186           if (_M_stack.empty())
00187             __throw_regex_error(regex_constants::error_badrepeat);
00188           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00189         };
00190       if (_M_match_token(_ScannerT::_S_token_closure0))
00191         {
00192           __init();
00193           auto __e = _M_pop();
00194           _StateSeqT __r(*_M_nfa,
00195                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00196                                                   __e._M_start, __neg));
00197           __e._M_append(__r);
00198           _M_stack.push(__r);
00199         }
00200       else if (_M_match_token(_ScannerT::_S_token_closure1))
00201         {
00202           __init();
00203           auto __e = _M_pop();
00204           __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
00205                                                  __e._M_start, __neg));
00206           _M_stack.push(__e);
00207         }
00208       else if (_M_match_token(_ScannerT::_S_token_opt))
00209         {
00210           __init();
00211           auto __e = _M_pop();
00212           auto __end = _M_nfa->_M_insert_dummy();
00213           _StateSeqT __r(*_M_nfa,
00214                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00215                                                   __e._M_start, __neg));
00216           __e._M_append(__end);
00217           __r._M_append(__end);
00218           _M_stack.push(__r);
00219         }
00220       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
00221         {
00222           if (_M_stack.empty())
00223             __throw_regex_error(regex_constants::error_badrepeat);
00224           if (!_M_match_token(_ScannerT::_S_token_dup_count))
00225             __throw_regex_error(regex_constants::error_badbrace);
00226           _StateSeqT __r(_M_pop());
00227           _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
00228           long __min_rep = _M_cur_int_value(10);
00229           bool __infi = false;
00230           long __n;
00231 
00232           // {3
00233           if (_M_match_token(_ScannerT::_S_token_comma))
00234             if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
00235               __n = _M_cur_int_value(10) - __min_rep;
00236             else
00237               __infi = true;
00238           else
00239             __n = 0;
00240           if (!_M_match_token(_ScannerT::_S_token_interval_end))
00241             __throw_regex_error(regex_constants::error_brace);
00242 
00243           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00244 
00245           for (long __i = 0; __i < __min_rep; ++__i)
00246             __e._M_append(__r._M_clone());
00247 
00248           if (__infi)
00249             {
00250               auto __tmp = __r._M_clone();
00251               _StateSeqT __s(*_M_nfa,
00252                              _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00253                                                       __tmp._M_start, __neg));
00254               __tmp._M_append(__s);
00255               __e._M_append(__s);
00256             }
00257           else
00258             {
00259               if (__n < 0)
00260                 __throw_regex_error(regex_constants::error_badbrace);
00261               auto __end = _M_nfa->_M_insert_dummy();
00262               // _M_alt is the "match more" branch, and _M_next is the
00263               // "match less" one. Switch _M_alt and _M_next of all created
00264               // nodes. This is a hack but IMO works well.
00265               std::stack<_StateIdT> __stack;
00266               for (long __i = 0; __i < __n; ++__i)
00267                 {
00268                   auto __tmp = __r._M_clone();
00269                   auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
00270                                                         __end, __neg);
00271                   __stack.push(__alt);
00272                   __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
00273                 }
00274               __e._M_append(__end);
00275               while (!__stack.empty())
00276                 {
00277                   auto& __tmp = (*_M_nfa)[__stack.top()];
00278                   __stack.pop();
00279                   std::swap(__tmp._M_next, __tmp._M_alt);
00280                 }
00281             }
00282           _M_stack.push(__e);
00283         }
00284       else
00285         return false;
00286       return true;
00287     }
00288 
00289 #define __INSERT_REGEX_MATCHER(__func, args...)\
00290         do\
00291           if (!(_M_flags & regex_constants::icase))\
00292             if (!(_M_flags & regex_constants::collate))\
00293               __func<false, false>(args);\
00294             else\
00295               __func<false, true>(args);\
00296           else\
00297             if (!(_M_flags & regex_constants::collate))\
00298               __func<true, false>(args);\
00299             else\
00300               __func<true, true>(args);\
00301         while (false)
00302 
00303   template<typename _TraitsT>
00304     bool
00305     _Compiler<_TraitsT>::
00306     _M_atom()
00307     {
00308       if (_M_match_token(_ScannerT::_S_token_anychar))
00309         {
00310           if (!(_M_flags & regex_constants::ECMAScript))
00311             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
00312           else
00313             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
00314         }
00315       else if (_M_try_char())
00316         __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
00317       else if (_M_match_token(_ScannerT::_S_token_backref))
00318         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00319                                  _M_insert_backref(_M_cur_int_value(10))));
00320       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00321         __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
00322       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
00323         {
00324           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
00325           this->_M_disjunction();
00326           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00327             __throw_regex_error(regex_constants::error_paren);
00328           __r._M_append(_M_pop());
00329           _M_stack.push(__r);
00330         }
00331       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
00332         {
00333           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
00334           this->_M_disjunction();
00335           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00336             __throw_regex_error(regex_constants::error_paren);
00337           __r._M_append(_M_pop());
00338           __r._M_append(_M_nfa->_M_insert_subexpr_end());
00339           _M_stack.push(__r);
00340         }
00341       else if (!_M_bracket_expression())
00342         return false;
00343       return true;
00344     }
00345 
00346   template<typename _TraitsT>
00347     bool
00348     _Compiler<_TraitsT>::
00349     _M_bracket_expression()
00350     {
00351       bool __neg =
00352         _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
00353       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
00354         return false;
00355       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
00356       return true;
00357     }
00358 #undef __INSERT_REGEX_MATCHER
00359 
00360   template<typename _TraitsT>
00361   template<bool __icase, bool __collate>
00362     void
00363     _Compiler<_TraitsT>::
00364     _M_insert_any_matcher_ecma()
00365     {
00366       _M_stack.push(_StateSeqT(*_M_nfa,
00367         _M_nfa->_M_insert_matcher
00368           (_AnyMatcher<_TraitsT, true, __icase, __collate>
00369             (_M_traits))));
00370     }
00371 
00372   template<typename _TraitsT>
00373   template<bool __icase, bool __collate>
00374     void
00375     _Compiler<_TraitsT>::
00376     _M_insert_any_matcher_posix()
00377     {
00378       _M_stack.push(_StateSeqT(*_M_nfa,
00379         _M_nfa->_M_insert_matcher
00380           (_AnyMatcher<_TraitsT, false, __icase, __collate>
00381             (_M_traits))));
00382     }
00383 
00384   template<typename _TraitsT>
00385   template<bool __icase, bool __collate>
00386     void
00387     _Compiler<_TraitsT>::
00388     _M_insert_char_matcher()
00389     {
00390       _M_stack.push(_StateSeqT(*_M_nfa,
00391         _M_nfa->_M_insert_matcher
00392           (_CharMatcher<_TraitsT, __icase, __collate>
00393             (_M_value[0], _M_traits))));
00394     }
00395 
00396   template<typename _TraitsT>
00397   template<bool __icase, bool __collate>
00398     void
00399     _Compiler<_TraitsT>::
00400     _M_insert_character_class_matcher()
00401     {
00402       _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
00403       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
00404         (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
00405       __matcher._M_add_character_class(_M_value, false);
00406       __matcher._M_ready();
00407       _M_stack.push(_StateSeqT(*_M_nfa,
00408         _M_nfa->_M_insert_matcher(std::move(__matcher))));
00409     }
00410 
00411   template<typename _TraitsT>
00412   template<bool __icase, bool __collate>
00413     void
00414     _Compiler<_TraitsT>::
00415     _M_insert_bracket_matcher(bool __neg)
00416     {
00417       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
00418       pair<bool, _CharT> __last_char; // Optional<_CharT>
00419       __last_char.first = false;
00420       if (!(_M_flags & regex_constants::ECMAScript))
00421         if (_M_try_char())
00422           {
00423             __matcher._M_add_char(_M_value[0]);
00424             __last_char.first = true;
00425             __last_char.second = _M_value[0];
00426           }
00427       while (!_M_match_token(_ScannerT::_S_token_bracket_end))
00428         _M_expression_term(__last_char, __matcher);
00429       __matcher._M_ready();
00430       _M_stack.push(_StateSeqT(
00431                       *_M_nfa,
00432                       _M_nfa->_M_insert_matcher(std::move(__matcher))));
00433     }
00434 
00435   template<typename _TraitsT>
00436   template<bool __icase, bool __collate>
00437     void
00438     _Compiler<_TraitsT>::
00439     _M_expression_term(pair<bool, _CharT>& __last_char,
00440                        _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
00441     {
00442       if (_M_match_token(_ScannerT::_S_token_collsymbol))
00443         __matcher._M_add_collating_element(_M_value);
00444       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
00445         __matcher._M_add_equivalence_class(_M_value);
00446       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
00447         __matcher._M_add_character_class(_M_value, false);
00448       // POSIX doesn't permit '-' as a start-range char (say [a-z--0]),
00449       // except when the '-' is the first character in the bracket expression
00450       // ([--0]). ECMAScript treats all '-' after a range as a normal character.
00451       // Also see above, where _M_expression_term gets called.
00452       //
00453       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
00454       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
00455       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
00456       //
00457       // It turns out that no one reads BNFs ;)
00458       else if (_M_try_char())
00459         {
00460           if (!__last_char.first)
00461             {
00462               if (_M_value[0] == '-'
00463                   && !(_M_flags & regex_constants::ECMAScript))
00464                 __throw_regex_error(regex_constants::error_range);
00465               __matcher._M_add_char(_M_value[0]);
00466               __last_char.first = true;
00467               __last_char.second = _M_value[0];
00468             }
00469           else
00470             {
00471               if (_M_value[0] == '-')
00472                 {
00473                   if (_M_try_char())
00474                     {
00475                       __matcher._M_make_range(__last_char.second , _M_value[0]);
00476                       __last_char.first = false;
00477                     }
00478                   else
00479                     {
00480                       if (_M_scanner._M_get_token()
00481                           != _ScannerT::_S_token_bracket_end)
00482                         __throw_regex_error(regex_constants::error_range);
00483                       __matcher._M_add_char(_M_value[0]);
00484                     }
00485                 }
00486               else
00487                 {
00488                   __matcher._M_add_char(_M_value[0]);
00489                   __last_char.second = _M_value[0];
00490                 }
00491             }
00492         }
00493       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00494         __matcher._M_add_character_class(_M_value,
00495                                          _M_ctype.is(_CtypeT::upper,
00496                                                      _M_value[0]));
00497       else
00498         __throw_regex_error(regex_constants::error_brack);
00499     }
00500 
00501   template<typename _TraitsT>
00502     bool
00503     _Compiler<_TraitsT>::
00504     _M_try_char()
00505     {
00506       bool __is_char = false;
00507       if (_M_match_token(_ScannerT::_S_token_oct_num))
00508         {
00509           __is_char = true;
00510           _M_value.assign(1, _M_cur_int_value(8));
00511         }
00512       else if (_M_match_token(_ScannerT::_S_token_hex_num))
00513         {
00514           __is_char = true;
00515           _M_value.assign(1, _M_cur_int_value(16));
00516         }
00517       else if (_M_match_token(_ScannerT::_S_token_ord_char))
00518         __is_char = true;
00519       return __is_char;
00520     }
00521 
00522   template<typename _TraitsT>
00523     bool
00524     _Compiler<_TraitsT>::
00525     _M_match_token(_TokenT token)
00526     {
00527       if (token == _M_scanner._M_get_token())
00528         {
00529           _M_value = _M_scanner._M_get_value();
00530           _M_scanner._M_advance();
00531           return true;
00532         }
00533       return false;
00534     }
00535 
00536   template<typename _TraitsT>
00537     int
00538     _Compiler<_TraitsT>::
00539     _M_cur_int_value(int __radix)
00540     {
00541       long __v = 0;
00542       for (typename _StringT::size_type __i = 0;
00543            __i < _M_value.length(); ++__i)
00544         __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
00545       return __v;
00546     }
00547 
00548   template<typename _TraitsT, bool __icase, bool __collate>
00549     bool
00550     _BracketMatcher<_TraitsT, __icase, __collate>::
00551     _M_apply(_CharT __ch, false_type) const
00552     {
00553       bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(),
00554                                       _M_translator._M_translate(__ch));
00555       if (!__ret)
00556         {
00557           auto __s = _M_translator._M_transform(__ch);
00558           for (auto& __it : _M_range_set)
00559             if (__it.first <= __s && __s <= __it.second)
00560               {
00561                 __ret = true;
00562                 break;
00563               }
00564           if (_M_traits.isctype(__ch, _M_class_set))
00565             __ret = true;
00566           else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
00567                              _M_traits.transform_primary(&__ch, &__ch+1))
00568                    != _M_equiv_set.end())
00569             __ret = true;
00570           else
00571             {
00572               for (auto& __it : _M_neg_class_set)
00573                 if (!_M_traits.isctype(__ch, __it))
00574                   {
00575                     __ret = true;
00576                     break;
00577                   }
00578             }
00579         }
00580       if (_M_is_non_matching)
00581         return !__ret;
00582       else
00583         return __ret;
00584     }
00585 
00586 _GLIBCXX_END_NAMESPACE_VERSION
00587 } // namespace __detail
00588 } // namespace