libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_compiler.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00034 00035 namespace __regex 00036 { 00037 struct _Scanner_base 00038 { 00039 typedef unsigned int _StateT; 00040 00041 static constexpr _StateT _S_state_at_start = 1 << 0; 00042 static constexpr _StateT _S_state_in_brace = 1 << 2; 00043 static constexpr _StateT _S_state_in_bracket = 1 << 3; 00044 }; 00045 00046 // 00047 // @brief Scans an input range for regex tokens. 00048 // 00049 // The %_Scanner class interprets the regular expression pattern in the input 00050 // range passed to its constructor as a sequence of parse tokens passed to 00051 // the regular expression compiler. The sequence of tokens provided depends 00052 // on the flag settings passed to the constructor: different regular 00053 // expression grammars will interpret the same input pattern in 00054 // syntactically different ways. 00055 // 00056 template<typename _InputIterator> 00057 class _Scanner: public _Scanner_base 00058 { 00059 public: 00060 typedef _InputIterator _IteratorT; 00061 typedef typename std::iterator_traits<_IteratorT>::value_type _CharT; 00062 typedef std::basic_string<_CharT> _StringT; 00063 typedef regex_constants::syntax_option_type _FlagT; 00064 typedef const std::ctype<_CharT> _CtypeT; 00065 00066 // Token types returned from the scanner. 00067 enum _TokenT 00068 { 00069 _S_token_anychar, 00070 _S_token_backref, 00071 _S_token_bracket_begin, 00072 _S_token_bracket_end, 00073 _S_token_inverse_class, 00074 _S_token_char_class_name, 00075 _S_token_closure0, 00076 _S_token_closure1, 00077 _S_token_collelem_multi, 00078 _S_token_collelem_single, 00079 _S_token_collsymbol, 00080 _S_token_comma, 00081 _S_token_dash, 00082 _S_token_dup_count, 00083 _S_token_eof, 00084 _S_token_equiv_class_name, 00085 _S_token_interval_begin, 00086 _S_token_interval_end, 00087 _S_token_line_begin, 00088 _S_token_line_end, 00089 _S_token_opt, 00090 _S_token_or, 00091 _S_token_ord_char, 00092 _S_token_quoted_char, 00093 _S_token_subexpr_begin, 00094 _S_token_subexpr_end, 00095 _S_token_word_begin, 00096 _S_token_word_end, 00097 _S_token_unknown 00098 }; 00099 00100 public: 00101 _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags, 00102 std::locale __loc) 00103 : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), 00104 _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start) 00105 { _M_advance(); } 00106 00107 void 00108 _M_advance(); 00109 00110 _TokenT 00111 _M_token() const 00112 { return _M_curToken; } 00113 00114 const _StringT& 00115 _M_value() const 00116 { return _M_curValue; } 00117 00118 #ifdef _GLIBCXX_DEBUG 00119 std::ostream& 00120 _M_print(std::ostream&); 00121 #endif 00122 00123 private: 00124 void 00125 _M_eat_escape(); 00126 00127 void 00128 _M_scan_in_brace(); 00129 00130 void 00131 _M_scan_in_bracket(); 00132 00133 void 00134 _M_eat_charclass(); 00135 00136 void 00137 _M_eat_equivclass(); 00138 00139 void 00140 _M_eat_collsymbol(); 00141 00142 private: 00143 _IteratorT _M_current; 00144 _IteratorT _M_end; 00145 _FlagT _M_flags; 00146 _CtypeT& _M_ctype; 00147 _TokenT _M_curToken; 00148 _StringT _M_curValue; 00149 _StateT _M_state; 00150 }; 00151 00152 template<typename _InputIterator> 00153 void 00154 _Scanner<_InputIterator>:: 00155 _M_advance() 00156 { 00157 if (_M_current == _M_end) 00158 { 00159 _M_curToken = _S_token_eof; 00160 return; 00161 } 00162 00163 _CharT __c = *_M_current; 00164 if (_M_state & _S_state_in_bracket) 00165 { 00166 _M_scan_in_bracket(); 00167 return; 00168 } 00169 if (_M_state & _S_state_in_brace) 00170 { 00171 _M_scan_in_brace(); 00172 return; 00173 } 00174 #if 0 00175 // TODO: re-enable line anchors when _M_assertion is implemented. 00176 // See PR libstdc++/47724 00177 else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) 00178 { 00179 _M_curToken = _S_token_line_begin; 00180 ++_M_current; 00181 return; 00182 } 00183 else if (__c == _M_ctype.widen('$')) 00184 { 00185 _M_curToken = _S_token_line_end; 00186 ++_M_current; 00187 return; 00188 } 00189 #endif 00190 else if (__c == _M_ctype.widen('.')) 00191 { 00192 _M_curToken = _S_token_anychar; 00193 ++_M_current; 00194 return; 00195 } 00196 else if (__c == _M_ctype.widen('*')) 00197 { 00198 _M_curToken = _S_token_closure0; 00199 ++_M_current; 00200 return; 00201 } 00202 else if (__c == _M_ctype.widen('+')) 00203 { 00204 _M_curToken = _S_token_closure1; 00205 ++_M_current; 00206 return; 00207 } 00208 else if (__c == _M_ctype.widen('|')) 00209 { 00210 _M_curToken = _S_token_or; 00211 ++_M_current; 00212 return; 00213 } 00214 else if (__c == _M_ctype.widen('[')) 00215 { 00216 _M_curToken = _S_token_bracket_begin; 00217 _M_state |= (_S_state_in_bracket | _S_state_at_start); 00218 ++_M_current; 00219 return; 00220 } 00221 else if (__c == _M_ctype.widen('\\')) 00222 { 00223 _M_eat_escape(); 00224 return; 00225 } 00226 else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00227 { 00228 if (__c == _M_ctype.widen('(')) 00229 { 00230 _M_curToken = _S_token_subexpr_begin; 00231 ++_M_current; 00232 return; 00233 } 00234 else if (__c == _M_ctype.widen(')')) 00235 { 00236 _M_curToken = _S_token_subexpr_end; 00237 ++_M_current; 00238 return; 00239 } 00240 else if (__c == _M_ctype.widen('{')) 00241 { 00242 _M_curToken = _S_token_interval_begin; 00243 _M_state |= _S_state_in_brace; 00244 ++_M_current; 00245 return; 00246 } 00247 } 00248 00249 _M_curToken = _S_token_ord_char; 00250 _M_curValue.assign(1, __c); 00251 ++_M_current; 00252 } 00253 00254 00255 template<typename _InputIterator> 00256 void 00257 _Scanner<_InputIterator>:: 00258 _M_scan_in_brace() 00259 { 00260 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00261 { 00262 _M_curToken = _S_token_dup_count; 00263 _M_curValue.assign(1, *_M_current); 00264 ++_M_current; 00265 while (_M_current != _M_end 00266 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00267 { 00268 _M_curValue += *_M_current; 00269 ++_M_current; 00270 } 00271 return; 00272 } 00273 else if (*_M_current == _M_ctype.widen(',')) 00274 { 00275 _M_curToken = _S_token_comma; 00276 ++_M_current; 00277 return; 00278 } 00279 if (_M_flags & (regex_constants::basic | regex_constants::grep)) 00280 { 00281 if (*_M_current == _M_ctype.widen('\\')) 00282 _M_eat_escape(); 00283 } 00284 else 00285 { 00286 if (*_M_current == _M_ctype.widen('}')) 00287 { 00288 _M_curToken = _S_token_interval_end; 00289 _M_state &= ~_S_state_in_brace; 00290 ++_M_current; 00291 return; 00292 } 00293 } 00294 } 00295 00296 template<typename _InputIterator> 00297 void 00298 _Scanner<_InputIterator>:: 00299 _M_scan_in_bracket() 00300 { 00301 if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^')) 00302 { 00303 _M_curToken = _S_token_inverse_class; 00304 _M_state &= ~_S_state_at_start; 00305 ++_M_current; 00306 return; 00307 } 00308 else if (*_M_current == _M_ctype.widen('[')) 00309 { 00310 ++_M_current; 00311 if (_M_current == _M_end) 00312 { 00313 _M_curToken = _S_token_eof; 00314 return; 00315 } 00316 00317 if (*_M_current == _M_ctype.widen('.')) 00318 { 00319 _M_curToken = _S_token_collsymbol; 00320 _M_eat_collsymbol(); 00321 return; 00322 } 00323 else if (*_M_current == _M_ctype.widen(':')) 00324 { 00325 _M_curToken = _S_token_char_class_name; 00326 _M_eat_charclass(); 00327 return; 00328 } 00329 else if (*_M_current == _M_ctype.widen('=')) 00330 { 00331 _M_curToken = _S_token_equiv_class_name; 00332 _M_eat_equivclass(); 00333 return; 00334 } 00335 } 00336 else if (*_M_current == _M_ctype.widen('-')) 00337 { 00338 _M_curToken = _S_token_dash; 00339 ++_M_current; 00340 return; 00341 } 00342 else if (*_M_current == _M_ctype.widen(']')) 00343 { 00344 if (!(_M_flags & regex_constants::ECMAScript) 00345 || !(_M_state & _S_state_at_start)) 00346 { 00347 // special case: only if _not_ chr first after 00348 // '[' or '[^' and if not ECMAscript 00349 _M_curToken = _S_token_bracket_end; 00350 ++_M_current; 00351 return; 00352 } 00353 } 00354 _M_curToken = _S_token_collelem_single; 00355 _M_curValue.assign(1, *_M_current); 00356 ++_M_current; 00357 } 00358 00359 template<typename _InputIterator> 00360 void 00361 _Scanner<_InputIterator>:: 00362 _M_eat_escape() 00363 { 00364 ++_M_current; 00365 if (_M_current == _M_end) 00366 { 00367 _M_curToken = _S_token_eof; 00368 return; 00369 } 00370 _CharT __c = *_M_current; 00371 ++_M_current; 00372 00373 if (__c == _M_ctype.widen('(')) 00374 { 00375 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00376 { 00377 _M_curToken = _S_token_ord_char; 00378 _M_curValue.assign(1, __c); 00379 } 00380 else 00381 _M_curToken = _S_token_subexpr_begin; 00382 } 00383 else if (__c == _M_ctype.widen(')')) 00384 { 00385 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00386 { 00387 _M_curToken = _S_token_ord_char; 00388 _M_curValue.assign(1, __c); 00389 } 00390 else 00391 _M_curToken = _S_token_subexpr_end; 00392 } 00393 else if (__c == _M_ctype.widen('{')) 00394 { 00395 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00396 { 00397 _M_curToken = _S_token_ord_char; 00398 _M_curValue.assign(1, __c); 00399 } 00400 else 00401 { 00402 _M_curToken = _S_token_interval_begin; 00403 _M_state |= _S_state_in_brace; 00404 } 00405 } 00406 else if (__c == _M_ctype.widen('}')) 00407 { 00408 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00409 { 00410 _M_curToken = _S_token_ord_char; 00411 _M_curValue.assign(1, __c); 00412 } 00413 else 00414 { 00415 if (!(_M_state && _S_state_in_brace)) 00416 __throw_regex_error(regex_constants::error_badbrace); 00417 _M_state &= ~_S_state_in_brace; 00418 _M_curToken = _S_token_interval_end; 00419 } 00420 } 00421 else if (__c == _M_ctype.widen('x')) 00422 { 00423 ++_M_current; 00424 if (_M_current == _M_end) 00425 { 00426 _M_curToken = _S_token_eof; 00427 return; 00428 } 00429 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00430 { 00431 _M_curValue.assign(1, *_M_current); 00432 ++_M_current; 00433 if (_M_current == _M_end) 00434 { 00435 _M_curToken = _S_token_eof; 00436 return; 00437 } 00438 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00439 { 00440 _M_curValue += *_M_current; 00441 ++_M_current; 00442 return; 00443 } 00444 } 00445 } 00446 else if (__c == _M_ctype.widen('^') 00447 || __c == _M_ctype.widen('.') 00448 || __c == _M_ctype.widen('*') 00449 || __c == _M_ctype.widen('$') 00450 || __c == _M_ctype.widen('\\')) 00451 { 00452 _M_curToken = _S_token_ord_char; 00453 _M_curValue.assign(1, __c); 00454 } 00455 else if (_M_ctype.is(_CtypeT::digit, __c)) 00456 { 00457 _M_curToken = _S_token_backref; 00458 _M_curValue.assign(1, __c); 00459 } 00460 else 00461 __throw_regex_error(regex_constants::error_escape); 00462 } 00463 00464 00465 // Eats a character class or throwns an exception. 00466 // current point to ':' delimiter on entry, char after ']' on return 00467 template<typename _InputIterator> 00468 void 00469 _Scanner<_InputIterator>:: 00470 _M_eat_charclass() 00471 { 00472 ++_M_current; // skip ':' 00473 if (_M_current == _M_end) 00474 __throw_regex_error(regex_constants::error_ctype); 00475 for (_M_curValue.clear(); 00476 _M_current != _M_end && *_M_current != _M_ctype.widen(':'); 00477 ++_M_current) 00478 _M_curValue += *_M_current; 00479 if (_M_current == _M_end) 00480 __throw_regex_error(regex_constants::error_ctype); 00481 ++_M_current; // skip ':' 00482 if (*_M_current != _M_ctype.widen(']')) 00483 __throw_regex_error(regex_constants::error_ctype); 00484 ++_M_current; // skip ']' 00485 } 00486 00487 00488 template<typename _InputIterator> 00489 void 00490 _Scanner<_InputIterator>:: 00491 _M_eat_equivclass() 00492 { 00493 ++_M_current; // skip '=' 00494 if (_M_current == _M_end) 00495 __throw_regex_error(regex_constants::error_collate); 00496 for (_M_curValue.clear(); 00497 _M_current != _M_end && *_M_current != _M_ctype.widen('='); 00498 ++_M_current) 00499 _M_curValue += *_M_current; 00500 if (_M_current == _M_end) 00501 __throw_regex_error(regex_constants::error_collate); 00502 ++_M_current; // skip '=' 00503 if (*_M_current != _M_ctype.widen(']')) 00504 __throw_regex_error(regex_constants::error_collate); 00505 ++_M_current; // skip ']' 00506 } 00507 00508 00509 template<typename _InputIterator> 00510 void 00511 _Scanner<_InputIterator>:: 00512 _M_eat_collsymbol() 00513 { 00514 ++_M_current; // skip '.' 00515 if (_M_current == _M_end) 00516 __throw_regex_error(regex_constants::error_collate); 00517 for (_M_curValue.clear(); 00518 _M_current != _M_end && *_M_current != _M_ctype.widen('.'); 00519 ++_M_current) 00520 _M_curValue += *_M_current; 00521 if (_M_current == _M_end) 00522 __throw_regex_error(regex_constants::error_collate); 00523 ++_M_current; // skip '.' 00524 if (*_M_current != _M_ctype.widen(']')) 00525 __throw_regex_error(regex_constants::error_collate); 00526 ++_M_current; // skip ']' 00527 } 00528 00529 #ifdef _GLIBCXX_DEBUG 00530 template<typename _InputIterator> 00531 std::ostream& 00532 _Scanner<_InputIterator>:: 00533 _M_print(std::ostream& ostr) 00534 { 00535 switch (_M_curToken) 00536 { 00537 case _S_token_anychar: 00538 ostr << "any-character\n"; 00539 break; 00540 case _S_token_backref: 00541 ostr << "backref\n"; 00542 break; 00543 case _S_token_bracket_begin: 00544 ostr << "bracket-begin\n"; 00545 break; 00546 case _S_token_bracket_end: 00547 ostr << "bracket-end\n"; 00548 break; 00549 case _S_token_char_class_name: 00550 ostr << "char-class-name \"" << _M_curValue << "\"\n"; 00551 break; 00552 case _S_token_closure0: 00553 ostr << "closure0\n"; 00554 break; 00555 case _S_token_closure1: 00556 ostr << "closure1\n"; 00557 break; 00558 case _S_token_collelem_multi: 00559 ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; 00560 break; 00561 case _S_token_collelem_single: 00562 ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; 00563 break; 00564 case _S_token_collsymbol: 00565 ostr << "collsymbol \"" << _M_curValue << "\"\n"; 00566 break; 00567 case _S_token_comma: 00568 ostr << "comma\n"; 00569 break; 00570 case _S_token_dash: 00571 ostr << "dash\n"; 00572 break; 00573 case _S_token_dup_count: 00574 ostr << "dup count: " << _M_curValue << "\n"; 00575 break; 00576 case _S_token_eof: 00577 ostr << "EOF\n"; 00578 break; 00579 case _S_token_equiv_class_name: 00580 ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; 00581 break; 00582 case _S_token_interval_begin: 00583 ostr << "interval begin\n"; 00584 break; 00585 case _S_token_interval_end: 00586 ostr << "interval end\n"; 00587 break; 00588 case _S_token_line_begin: 00589 ostr << "line begin\n"; 00590 break; 00591 case _S_token_line_end: 00592 ostr << "line end\n"; 00593 break; 00594 case _S_token_opt: 00595 ostr << "opt\n"; 00596 break; 00597 case _S_token_or: 00598 ostr << "or\n"; 00599 break; 00600 case _S_token_ord_char: 00601 ostr << "ordinary character: \"" << _M_value() << "\"\n"; 00602 break; 00603 case _S_token_quoted_char: 00604 ostr << "quoted char\n"; 00605 break; 00606 case _S_token_subexpr_begin: 00607 ostr << "subexpr begin\n"; 00608 break; 00609 case _S_token_subexpr_end: 00610 ostr << "subexpr end\n"; 00611 break; 00612 case _S_token_word_begin: 00613 ostr << "word begin\n"; 00614 break; 00615 case _S_token_word_end: 00616 ostr << "word end\n"; 00617 break; 00618 case _S_token_unknown: 00619 ostr << "-- unknown token --\n"; 00620 break; 00621 } 00622 return ostr; 00623 } 00624 #endif 00625 00626 // Builds an NFA from an input iterator interval. 00627 template<typename _InIter, typename _TraitsT> 00628 class _Compiler 00629 { 00630 public: 00631 typedef _InIter _IterT; 00632 typedef typename std::iterator_traits<_InIter>::value_type _CharT; 00633 typedef std::basic_string<_CharT> _StringT; 00634 typedef regex_constants::syntax_option_type _FlagT; 00635 00636 public: 00637 _Compiler(const _InIter& __b, const _InIter& __e, 00638 _TraitsT& __traits, _FlagT __flags); 00639 00640 const _Nfa& 00641 _M_nfa() const 00642 { return _M_state_store; } 00643 00644 private: 00645 typedef _Scanner<_InIter> _ScannerT; 00646 typedef typename _ScannerT::_TokenT _TokenT; 00647 typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT; 00648 typedef _RangeMatcher<_InIter, _TraitsT> _RMatcherT; 00649 00650 // accepts a specific token or returns false. 00651 bool 00652 _M_match_token(_TokenT __token); 00653 00654 void 00655 _M_disjunction(); 00656 00657 bool 00658 _M_alternative(); 00659 00660 bool 00661 _M_term(); 00662 00663 bool 00664 _M_assertion(); 00665 00666 bool 00667 _M_quantifier(); 00668 00669 bool 00670 _M_atom(); 00671 00672 bool 00673 _M_bracket_expression(); 00674 00675 bool 00676 _M_bracket_list(_RMatcherT& __matcher); 00677 00678 bool 00679 _M_follow_list(_RMatcherT& __matcher); 00680 00681 bool 00682 _M_follow_list2(_RMatcherT& __matcher); 00683 00684 bool 00685 _M_expression_term(_RMatcherT& __matcher); 00686 00687 bool 00688 _M_range_expression(_RMatcherT& __matcher); 00689 00690 bool 00691 _M_start_range(_RMatcherT& __matcher); 00692 00693 bool 00694 _M_collating_symbol(_RMatcherT& __matcher); 00695 00696 bool 00697 _M_equivalence_class(_RMatcherT& __matcher); 00698 00699 bool 00700 _M_character_class(_RMatcherT& __matcher); 00701 00702 int 00703 _M_cur_int_value(int __radix); 00704 00705 private: 00706 _TraitsT& _M_traits; 00707 _ScannerT _M_scanner; 00708 _StringT _M_cur_value; 00709 _Nfa _M_state_store; 00710 _StackT _M_stack; 00711 }; 00712 00713 template<typename _InIter, typename _TraitsT> 00714 _Compiler<_InIter, _TraitsT>:: 00715 _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits, 00716 _Compiler<_InIter, _TraitsT>::_FlagT __flags) 00717 : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), 00718 _M_state_store(__flags) 00719 { 00720 typedef _StartTagger<_InIter, _TraitsT> _Start; 00721 typedef _EndTagger<_InIter, _TraitsT> _End; 00722 00723 _StateSeq __r(_M_state_store, 00724 _M_state_store._M_insert_subexpr_begin(_Start(0))); 00725 _M_disjunction(); 00726 if (!_M_stack.empty()) 00727 { 00728 __r._M_append(_M_stack.top()); 00729 _M_stack.pop(); 00730 } 00731 __r._M_append(_M_state_store._M_insert_subexpr_end(0, _End(0))); 00732 __r._M_append(_M_state_store._M_insert_accept()); 00733 } 00734 00735 template<typename _InIter, typename _TraitsT> 00736 bool 00737 _Compiler<_InIter, _TraitsT>:: 00738 _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token) 00739 { 00740 if (token == _M_scanner._M_token()) 00741 { 00742 _M_cur_value = _M_scanner._M_value(); 00743 _M_scanner._M_advance(); 00744 return true; 00745 } 00746 return false; 00747 } 00748 00749 template<typename _InIter, typename _TraitsT> 00750 void 00751 _Compiler<_InIter, _TraitsT>:: 00752 _M_disjunction() 00753 { 00754 this->_M_alternative(); 00755 if (_M_match_token(_ScannerT::_S_token_or)) 00756 { 00757 _StateSeq __alt1 = _M_stack.top(); _M_stack.pop(); 00758 this->_M_disjunction(); 00759 _StateSeq __alt2 = _M_stack.top(); _M_stack.pop(); 00760 _M_stack.push(_StateSeq(__alt1, __alt2)); 00761 } 00762 } 00763 00764 template<typename _InIter, typename _TraitsT> 00765 bool 00766 _Compiler<_InIter, _TraitsT>:: 00767 _M_alternative() 00768 { 00769 if (this->_M_term()) 00770 { 00771 _StateSeq __re = _M_stack.top(); _M_stack.pop(); 00772 this->_M_alternative(); 00773 if (!_M_stack.empty()) 00774 { 00775 __re._M_append(_M_stack.top()); 00776 _M_stack.pop(); 00777 } 00778 _M_stack.push(__re); 00779 return true; 00780 } 00781 return false; 00782 } 00783 00784 template<typename _InIter, typename _TraitsT> 00785 bool 00786 _Compiler<_InIter, _TraitsT>:: 00787 _M_term() 00788 { 00789 if (this->_M_assertion()) 00790 return true; 00791 if (this->_M_atom()) 00792 { 00793 this->_M_quantifier(); 00794 return true; 00795 } 00796 return false; 00797 } 00798 00799 template<typename _InIter, typename _TraitsT> 00800 bool 00801 _Compiler<_InIter, _TraitsT>:: 00802 _M_assertion() 00803 { 00804 if (_M_match_token(_ScannerT::_S_token_line_begin)) 00805 { 00806 // __m.push(_Matcher::_S_opcode_line_begin); 00807 return true; 00808 } 00809 if (_M_match_token(_ScannerT::_S_token_line_end)) 00810 { 00811 // __m.push(_Matcher::_S_opcode_line_end); 00812 return true; 00813 } 00814 if (_M_match_token(_ScannerT::_S_token_word_begin)) 00815 { 00816 // __m.push(_Matcher::_S_opcode_word_begin); 00817 return true; 00818 } 00819 if (_M_match_token(_ScannerT::_S_token_word_end)) 00820 { 00821 // __m.push(_Matcher::_S_opcode_word_end); 00822 return true; 00823 } 00824 return false; 00825 } 00826 00827 template<typename _InIter, typename _TraitsT> 00828 bool 00829 _Compiler<_InIter, _TraitsT>:: 00830 _M_quantifier() 00831 { 00832 if (_M_match_token(_ScannerT::_S_token_closure0)) 00833 { 00834 if (_M_stack.empty()) 00835 __throw_regex_error(regex_constants::error_badrepeat); 00836 _StateSeq __r(_M_stack.top(), -1); 00837 __r._M_append(__r._M_front()); 00838 _M_stack.pop(); 00839 _M_stack.push(__r); 00840 return true; 00841 } 00842 if (_M_match_token(_ScannerT::_S_token_closure1)) 00843 { 00844 if (_M_stack.empty()) 00845 __throw_regex_error(regex_constants::error_badrepeat); 00846 _StateSeq __r(_M_state_store, 00847 _M_state_store. 00848 _M_insert_alt(_S_invalid_state_id, 00849 _M_stack.top()._M_front())); 00850 _M_stack.top()._M_append(__r); 00851 return true; 00852 } 00853 if (_M_match_token(_ScannerT::_S_token_opt)) 00854 { 00855 if (_M_stack.empty()) 00856 __throw_regex_error(regex_constants::error_badrepeat); 00857 _StateSeq __r(_M_stack.top(), -1); 00858 _M_stack.pop(); 00859 _M_stack.push(__r); 00860 return true; 00861 } 00862 if (_M_match_token(_ScannerT::_S_token_interval_begin)) 00863 { 00864 if (_M_stack.empty()) 00865 __throw_regex_error(regex_constants::error_badrepeat); 00866 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 00867 __throw_regex_error(regex_constants::error_badbrace); 00868 _StateSeq __r(_M_stack.top()); 00869 int __min_rep = _M_cur_int_value(10); 00870 for (int __i = 1; __i < __min_rep; ++__i) 00871 _M_stack.top()._M_append(__r._M_clone()); 00872 if (_M_match_token(_ScannerT::_S_token_comma)) 00873 if (_M_match_token(_ScannerT::_S_token_dup_count)) 00874 { 00875 int __n = _M_cur_int_value(10) - __min_rep; 00876 if (__n < 0) 00877 __throw_regex_error(regex_constants::error_badbrace); 00878 for (int __i = 0; __i < __n; ++__i) 00879 { 00880 _StateSeq __r(_M_state_store, 00881 _M_state_store. 00882 _M_insert_alt(_S_invalid_state_id, 00883 _M_stack.top()._M_front())); 00884 _M_stack.top()._M_append(__r); 00885 } 00886 } 00887 else 00888 { 00889 _StateSeq __r(_M_stack.top(), -1); 00890 __r._M_push_back(__r._M_front()); 00891 _M_stack.pop(); 00892 _M_stack.push(__r); 00893 } 00894 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 00895 __throw_regex_error(regex_constants::error_brace); 00896 return true; 00897 } 00898 return false; 00899 } 00900 00901 template<typename _InIter, typename _TraitsT> 00902 bool 00903 _Compiler<_InIter, _TraitsT>:: 00904 _M_atom() 00905 { 00906 typedef _CharMatcher<_InIter, _TraitsT> _CMatcher; 00907 typedef _StartTagger<_InIter, _TraitsT> _Start; 00908 typedef _EndTagger<_InIter, _TraitsT> _End; 00909 00910 if (_M_match_token(_ScannerT::_S_token_anychar)) 00911 { 00912 _M_stack.push(_StateSeq(_M_state_store, 00913 _M_state_store._M_insert_matcher 00914 (_AnyMatcher))); 00915 return true; 00916 } 00917 if (_M_match_token(_ScannerT::_S_token_ord_char)) 00918 { 00919 _M_stack.push(_StateSeq(_M_state_store, 00920 _M_state_store._M_insert_matcher 00921 (_CMatcher(_M_cur_value[0], _M_traits)))); 00922 return true; 00923 } 00924 if (_M_match_token(_ScannerT::_S_token_quoted_char)) 00925 { 00926 // note that in the ECMA grammar, this case covers backrefs. 00927 _M_stack.push(_StateSeq(_M_state_store, 00928 _M_state_store._M_insert_matcher 00929 (_CMatcher(_M_cur_value[0], _M_traits)))); 00930 return true; 00931 } 00932 if (_M_match_token(_ScannerT::_S_token_backref)) 00933 { 00934 // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); 00935 return true; 00936 } 00937 if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 00938 { 00939 int __mark = _M_state_store._M_sub_count(); 00940 _StateSeq __r(_M_state_store, 00941 _M_state_store. 00942 _M_insert_subexpr_begin(_Start(__mark))); 00943 this->_M_disjunction(); 00944 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00945 __throw_regex_error(regex_constants::error_paren); 00946 if (!_M_stack.empty()) 00947 { 00948 __r._M_append(_M_stack.top()); 00949 _M_stack.pop(); 00950 } 00951 __r._M_append(_M_state_store._M_insert_subexpr_end 00952 (__mark, _End(__mark))); 00953 _M_stack.push(__r); 00954 return true; 00955 } 00956 return _M_bracket_expression(); 00957 } 00958 00959 template<typename _InIter, typename _TraitsT> 00960 bool 00961 _Compiler<_InIter, _TraitsT>:: 00962 _M_bracket_expression() 00963 { 00964 if (_M_match_token(_ScannerT::_S_token_bracket_begin)) 00965 { 00966 _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin), 00967 _M_traits); 00968 if (!_M_bracket_list(__matcher) 00969 || !_M_match_token(_ScannerT::_S_token_bracket_end)) 00970 __throw_regex_error(regex_constants::error_brack); 00971 _M_stack.push(_StateSeq(_M_state_store, 00972 _M_state_store._M_insert_matcher(__matcher))); 00973 return true; 00974 } 00975 return false; 00976 } 00977 00978 // If the dash is the last character in the bracket expression, it is not 00979 // special. 00980 template<typename _InIter, typename _TraitsT> 00981 bool 00982 _Compiler<_InIter, _TraitsT>:: 00983 _M_bracket_list(_RMatcherT& __matcher) 00984 { 00985 if (_M_follow_list(__matcher)) 00986 { 00987 if (_M_match_token(_ScannerT::_S_token_dash)) 00988 __matcher._M_add_char(_M_cur_value[0]); 00989 return true; 00990 } 00991 return false; 00992 } 00993 00994 template<typename _InIter, typename _TraitsT> 00995 bool 00996 _Compiler<_InIter, _TraitsT>:: 00997 _M_follow_list(_RMatcherT& __matcher) 00998 { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); } 00999 01000 template<typename _InIter, typename _TraitsT> 01001 bool 01002 _Compiler<_InIter, _TraitsT>:: 01003 _M_follow_list2(_RMatcherT& __matcher) 01004 { 01005 if (_M_expression_term(__matcher)) 01006 return _M_follow_list2(__matcher); 01007 return true; 01008 } 01009 01010 template<typename _InIter, typename _TraitsT> 01011 bool 01012 _Compiler<_InIter, _TraitsT>:: 01013 _M_expression_term(_RMatcherT& __matcher) 01014 { 01015 return (_M_collating_symbol(__matcher) 01016 || _M_character_class(__matcher) 01017 || _M_equivalence_class(__matcher) 01018 || (_M_start_range(__matcher) 01019 && _M_range_expression(__matcher))); 01020 } 01021 01022 template<typename _InIter, typename _TraitsT> 01023 bool 01024 _Compiler<_InIter, _TraitsT>:: 01025 _M_range_expression(_RMatcherT& __matcher) 01026 { 01027 if (!_M_collating_symbol(__matcher)) 01028 if (!_M_match_token(_ScannerT::_S_token_dash)) 01029 __throw_regex_error(regex_constants::error_range); 01030 __matcher._M_make_range(); 01031 return true; 01032 } 01033 01034 template<typename _InIter, typename _TraitsT> 01035 bool 01036 _Compiler<_InIter, _TraitsT>:: 01037 _M_start_range(_RMatcherT& __matcher) 01038 { return _M_match_token(_ScannerT::_S_token_dash); } 01039 01040 template<typename _InIter, typename _TraitsT> 01041 bool 01042 _Compiler<_InIter, _TraitsT>:: 01043 _M_collating_symbol(_RMatcherT& __matcher) 01044 { 01045 if (_M_match_token(_ScannerT::_S_token_collelem_single)) 01046 { 01047 __matcher._M_add_char(_M_cur_value[0]); 01048 return true; 01049 } 01050 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 01051 { 01052 __matcher._M_add_collating_element(_M_cur_value); 01053 return true; 01054 } 01055 return false; 01056 } 01057 01058 template<typename _InIter, typename _TraitsT> 01059 bool 01060 _Compiler<_InIter, _TraitsT>:: 01061 _M_equivalence_class(_RMatcherT& __matcher) 01062 { 01063 if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 01064 { 01065 __matcher._M_add_equivalence_class(_M_cur_value); 01066 return true; 01067 } 01068 return false; 01069 } 01070 01071 template<typename _InIter, typename _TraitsT> 01072 bool 01073 _Compiler<_InIter, _TraitsT>:: 01074 _M_character_class(_RMatcherT& __matcher) 01075 { 01076 if (_M_match_token(_ScannerT::_S_token_char_class_name)) 01077 { 01078 __matcher._M_add_character_class(_M_cur_value); 01079 return true; 01080 } 01081 return false; 01082 } 01083 01084 template<typename _InIter, typename _TraitsT> 01085 int 01086 _Compiler<_InIter, _TraitsT>:: 01087 _M_cur_int_value(int __radix) 01088 { 01089 int __v = 0; 01090 for (typename _StringT::size_type __i = 0; 01091 __i < _M_cur_value.length(); ++__i) 01092 __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); 01093 return __v; 01094 } 01095 01096 template<typename _InIter, typename _TraitsT> 01097 _AutomatonPtr 01098 __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t, 01099 regex_constants::syntax_option_type __f) 01100 { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t, 01101 __f)._M_nfa())); } 01102 01103 } // namespace __regex 01104 01105 _GLIBCXX_END_NAMESPACE_VERSION 01106 } // namespace 01107 01108 /* vim: set ts=8 sw=2 sts=2: */