libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_nfa.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00034 00035 namespace __regex 00036 { 00037 00038 // Base class for, um, automata. Could be an NFA or a DFA. Your choice. 00039 class _Automaton 00040 { 00041 public: 00042 typedef unsigned int _SizeT; 00043 00044 public: 00045 virtual 00046 ~_Automaton() 00047 { } 00048 00049 virtual _SizeT 00050 _M_sub_count() const = 0; 00051 00052 #ifdef _GLIBCXX_DEBUG 00053 virtual std::ostream& 00054 _M_dot(std::ostream& __ostr) const = 0; 00055 #endif 00056 }; 00057 00058 // Generic shared pointer to an automaton. 00059 typedef std::shared_ptr<_Automaton> _AutomatonPtr; 00060 00061 // Operation codes that define the type of transitions within the base NFA 00062 // that represents the regular expression. 00063 enum _Opcode 00064 { 00065 _S_opcode_unknown = 0, 00066 _S_opcode_alternative = 1, 00067 _S_opcode_subexpr_begin = 4, 00068 _S_opcode_subexpr_end = 5, 00069 _S_opcode_match = 100, 00070 _S_opcode_accept = 255 00071 }; 00072 00073 // Provides a generic facade for a templated match_results. 00074 struct _Results 00075 { 00076 virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0; 00077 virtual void _M_set_matched(int __i, bool __is_matched) = 0; 00078 }; 00079 00080 // Tags current state (for subexpr begin/end). 00081 typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger; 00082 00083 template<typename _FwdIterT, typename _TraitsT> 00084 struct _StartTagger 00085 { 00086 explicit 00087 _StartTagger(int __i) 00088 : _M_index(__i) 00089 { } 00090 00091 void 00092 operator()(const _PatternCursor& __pc, _Results& __r) 00093 { __r._M_set_pos(_M_index, 0, __pc); } 00094 00095 int _M_index; 00096 }; 00097 00098 template<typename _FwdIterT, typename _TraitsT> 00099 struct _EndTagger 00100 { 00101 explicit 00102 _EndTagger(int __i) 00103 : _M_index(__i) 00104 { } 00105 00106 void 00107 operator()(const _PatternCursor& __pc, _Results& __r) 00108 { __r._M_set_pos(_M_index, 1, __pc); } 00109 00110 int _M_index; 00111 _FwdIterT _M_pos; 00112 }; 00113 // Indicates if current state matches cursor current. 00114 typedef std::function<bool (const _PatternCursor&)> _Matcher; 00115 00116 // Matches any character 00117 inline bool 00118 _AnyMatcher(const _PatternCursor&) 00119 { return true; } 00120 00121 // Matches a single character 00122 template<typename _InIterT, typename _TraitsT> 00123 struct _CharMatcher 00124 { 00125 typedef typename _TraitsT::char_type char_type; 00126 00127 explicit 00128 _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT()) 00129 : _M_traits(__t), _M_c(_M_traits.translate(__c)) 00130 { } 00131 00132 bool 00133 operator()(const _PatternCursor& __pc) const 00134 { 00135 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00136 _CursorT __c = static_cast<_CursorT>(__pc); 00137 return _M_traits.translate(__c._M_current()) == _M_c; 00138 } 00139 00140 const _TraitsT& _M_traits; 00141 char_type _M_c; 00142 }; 00143 00144 // Matches a character range (bracket expression) 00145 template<typename _InIterT, typename _TraitsT> 00146 struct _RangeMatcher 00147 { 00148 typedef typename _TraitsT::char_type _CharT; 00149 typedef std::basic_string<_CharT> _StringT; 00150 00151 explicit 00152 _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT()) 00153 : _M_traits(__t), _M_is_non_matching(__is_non_matching) 00154 { } 00155 00156 bool 00157 operator()(const _PatternCursor& __pc) const 00158 { 00159 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00160 _CursorT __c = static_cast<_CursorT>(__pc); 00161 return true; 00162 } 00163 00164 void 00165 _M_add_char(_CharT __c) 00166 { } 00167 00168 void 00169 _M_add_collating_element(const _StringT& __s) 00170 { } 00171 00172 void 00173 _M_add_equivalence_class(const _StringT& __s) 00174 { } 00175 00176 void 00177 _M_add_character_class(const _StringT& __s) 00178 { } 00179 00180 void 00181 _M_make_range() 00182 { } 00183 00184 const _TraitsT& _M_traits; 00185 bool _M_is_non_matching; 00186 }; 00187 00188 // Identifies a state in the NFA. 00189 typedef int _StateIdT; 00190 00191 // The special case in which a state identifier is not an index. 00192 static const _StateIdT _S_invalid_state_id = -1; 00193 00194 00195 // An individual state in an NFA 00196 // 00197 // In this case a "state" is an entry in the NFA definition coupled with its 00198 // outgoing transition(s). All states have a single outgoing transition, 00199 // except for accepting states (which have no outgoing transitions) and alt 00200 // states, which have two outgoing transitions. 00201 // 00202 struct _State 00203 { 00204 typedef int _OpcodeT; 00205 00206 _OpcodeT _M_opcode; // type of outgoing transition 00207 _StateIdT _M_next; // outgoing transition 00208 _StateIdT _M_alt; // for _S_opcode_alternative 00209 unsigned int _M_subexpr; // for _S_opcode_subexpr_* 00210 _Tagger _M_tagger; // for _S_opcode_subexpr_* 00211 _Matcher _M_matches; // for _S_opcode_match 00212 00213 explicit _State(_OpcodeT __opcode) 00214 : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 00215 { } 00216 00217 _State(const _Matcher& __m) 00218 : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m) 00219 { } 00220 00221 _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t) 00222 : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s), 00223 _M_tagger(__t) 00224 { } 00225 00226 _State(_StateIdT __next, _StateIdT __alt) 00227 : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) 00228 { } 00229 00230 #ifdef _GLIBCXX_DEBUG 00231 std::ostream& 00232 _M_print(std::ostream& ostr) const; 00233 00234 // Prints graphviz dot commands for state. 00235 std::ostream& 00236 _M_dot(std::ostream& __ostr, _StateIdT __id) const; 00237 #endif 00238 }; 00239 00240 00241 // The Grep Matcher works on sets of states. Here are sets of states. 00242 typedef std::set<_StateIdT> _StateSet; 00243 00244 // A collection of all states making up an NFA 00245 // 00246 // An NFA is a 4-tuple M = (K, S, s, F), where 00247 // K is a finite set of states, 00248 // S is the alphabet of the NFA, 00249 // s is the initial state, 00250 // F is a set of final (accepting) states. 00251 // 00252 // This NFA class is templated on S, a type that will hold values of the 00253 // underlying alphabet (without regard to semantics of that alphabet). The 00254 // other elements of the tuple are generated during construction of the NFA 00255 // and are available through accessor member functions. 00256 // 00257 class _Nfa 00258 : public _Automaton, public std::vector<_State> 00259 { 00260 public: 00261 typedef _State _StateT; 00262 typedef unsigned int _SizeT; 00263 typedef regex_constants::syntax_option_type _FlagT; 00264 00265 public: 00266 _Nfa(_FlagT __f) 00267 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0) 00268 { } 00269 00270 ~_Nfa() 00271 { } 00272 00273 _FlagT 00274 _M_options() const 00275 { return _M_flags; } 00276 00277 _StateIdT 00278 _M_start() const 00279 { return _M_start_state; } 00280 00281 const _StateSet& 00282 _M_final_states() const 00283 { return _M_accepting_states; } 00284 00285 _SizeT 00286 _M_sub_count() const 00287 { return _M_subexpr_count; } 00288 00289 _StateIdT 00290 _M_insert_accept() 00291 { 00292 this->push_back(_StateT(_S_opcode_accept)); 00293 _M_accepting_states.insert(this->size()-1); 00294 return this->size()-1; 00295 } 00296 00297 _StateIdT 00298 _M_insert_alt(_StateIdT __next, _StateIdT __alt) 00299 { 00300 this->push_back(_StateT(__next, __alt)); 00301 return this->size()-1; 00302 } 00303 00304 _StateIdT 00305 _M_insert_matcher(_Matcher __m) 00306 { 00307 this->push_back(_StateT(__m)); 00308 return this->size()-1; 00309 } 00310 00311 _StateIdT 00312 _M_insert_subexpr_begin(const _Tagger& __t) 00313 { 00314 this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t)); 00315 return this->size()-1; 00316 } 00317 00318 _StateIdT 00319 _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t) 00320 { 00321 this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t)); 00322 return this->size()-1; 00323 } 00324 00325 #ifdef _GLIBCXX_DEBUG 00326 std::ostream& 00327 _M_dot(std::ostream& __ostr) const; 00328 #endif 00329 00330 private: 00331 _FlagT _M_flags; 00332 _StateIdT _M_start_state; 00333 _StateSet _M_accepting_states; 00334 _SizeT _M_subexpr_count; 00335 }; 00336 00337 // Describes a sequence of one or more %_State, its current start and end(s). 00338 // 00339 // This structure contains fragments of an NFA during construction. 00340 class _StateSeq 00341 { 00342 public: 00343 // Constructs a single-node sequence 00344 _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id) 00345 : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e) 00346 { } 00347 // Constructs a split sequence from two other sequencces 00348 _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2) 00349 : _M_nfa(__e1._M_nfa), 00350 _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)), 00351 _M_end1(__e1._M_end1), _M_end2(__e2._M_end1) 00352 { } 00353 00354 // Constructs a split sequence from a single sequence 00355 _StateSeq(const _StateSeq& __e, _StateIdT __id) 00356 : _M_nfa(__e._M_nfa), 00357 _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)), 00358 _M_end1(__id), _M_end2(__e._M_end1) 00359 { } 00360 00361 // Constructs a copy of a %_StateSeq 00362 _StateSeq(const _StateSeq& __rhs) 00363 : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start), 00364 _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2) 00365 { } 00366 00367 00368 _StateSeq& operator=(const _StateSeq& __rhs); 00369 00370 _StateIdT 00371 _M_front() const 00372 { return _M_start; } 00373 00374 // Extends a sequence by one. 00375 void 00376 _M_push_back(_StateIdT __id); 00377 00378 // Extends and maybe joins a sequence. 00379 void 00380 _M_append(_StateIdT __id); 00381 00382 void 00383 _M_append(_StateSeq& __rhs); 00384 00385 // Clones an entire sequence. 00386 _StateIdT 00387 _M_clone(); 00388 00389 private: 00390 _Nfa& _M_nfa; 00391 _StateIdT _M_start; 00392 _StateIdT _M_end1; 00393 _StateIdT _M_end2; 00394 00395 }; 00396 00397 } // namespace __regex 00398 00399 _GLIBCXX_END_NAMESPACE_VERSION 00400 } // namespace 00401 00402 #include <bits/regex_nfa.tcc> 00403