libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2010 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_nfa.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00034 00035 namespace __regex 00036 { 00037 00038 // Base class for, um, automata. Could be an NFA or a DFA. Your choice. 00039 class _Automaton 00040 { 00041 public: 00042 typedef unsigned int _SizeT; 00043 00044 public: 00045 virtual 00046 ~_Automaton() 00047 { } 00048 00049 virtual _SizeT 00050 _M_sub_count() const = 0; 00051 00052 #ifdef _GLIBCXX_DEBUG 00053 virtual std::ostream& 00054 _M_dot(std::ostream& __ostr) const = 0; 00055 #endif 00056 }; 00057 00058 // Generic shred pointer to an automaton. 00059 typedef std::shared_ptr<_Automaton> _AutomatonPtr; 00060 00061 // Operation codes that define the type of transitions within the base NFA 00062 // that represents the regular expression. 00063 enum _Opcode 00064 { 00065 _S_opcode_unknown = 0, 00066 _S_opcode_alternative = 1, 00067 _S_opcode_subexpr_begin = 4, 00068 _S_opcode_subexpr_end = 5, 00069 _S_opcode_match = 100, 00070 _S_opcode_accept = 255 00071 }; 00072 00073 // Provides a generic facade for a templated match_results. 00074 struct _Results 00075 { 00076 virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0; 00077 virtual void _M_set_matched(int __i, bool __is_matched) = 0; 00078 }; 00079 00080 // Tags current state (for subexpr begin/end). 00081 typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger; 00082 00083 template<typename _FwdIterT, typename _TraitsT> 00084 struct _StartTagger 00085 : public _Tagger 00086 { 00087 explicit 00088 _StartTagger(int __i) 00089 : _M_index(__i) 00090 { } 00091 00092 void 00093 operator()(const _PatternCursor& __pc, _Results& __r) 00094 { __r._M_set_pos(_M_index, 0, __pc); } 00095 00096 int _M_index; 00097 }; 00098 00099 template<typename _FwdIterT, typename _TraitsT> 00100 struct _EndTagger 00101 : public _Tagger 00102 { 00103 explicit 00104 _EndTagger(int __i) 00105 : _M_index(__i) 00106 { } 00107 00108 void 00109 operator()(const _PatternCursor& __pc, _Results& __r) 00110 { __r._M_set_pos(_M_index, 1, __pc); } 00111 00112 int _M_index; 00113 _FwdIterT _M_pos; 00114 }; 00115 // Indicates if current state matches cursor current. 00116 typedef std::function<bool (const _PatternCursor&)> _Matcher; 00117 00118 // Matches any character 00119 inline bool 00120 _AnyMatcher(const _PatternCursor&) 00121 { return true; } 00122 00123 // Matches a single character 00124 template<typename _InIterT, typename _TraitsT> 00125 struct _CharMatcher 00126 : public _Matcher 00127 { 00128 typedef typename _TraitsT::char_type char_type; 00129 00130 explicit 00131 _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT()) 00132 : _M_traits(__t), _M_c(_M_traits.translate(__c)) 00133 { } 00134 00135 bool 00136 operator()(const _PatternCursor& __pc) const 00137 { 00138 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00139 _CursorT __c = static_cast<_CursorT>(__pc); 00140 return _M_traits.translate(__c._M_current()) == _M_c; 00141 } 00142 00143 const _TraitsT& _M_traits; 00144 char_type _M_c; 00145 }; 00146 00147 // Matches a character range (bracket expression) 00148 template<typename _InIterT, typename _TraitsT> 00149 struct _RangeMatcher 00150 : public _Matcher 00151 { 00152 typedef typename _TraitsT::char_type _CharT; 00153 typedef std::basic_string<_CharT> _StringT; 00154 00155 explicit 00156 _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT()) 00157 : _M_traits(__t), _M_is_non_matching(__is_non_matching) 00158 { } 00159 00160 bool 00161 operator()(const _PatternCursor& __pc) const 00162 { 00163 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00164 _CursorT __c = static_cast<_CursorT>(__pc); 00165 return true; 00166 } 00167 00168 void 00169 _M_add_char(_CharT __c) 00170 { } 00171 00172 void 00173 _M_add_collating_element(const _StringT& __s) 00174 { } 00175 00176 void 00177 _M_add_equivalence_class(const _StringT& __s) 00178 { } 00179 00180 void 00181 _M_add_character_class(const _StringT& __s) 00182 { } 00183 00184 void 00185 _M_make_range() 00186 { } 00187 00188 const _TraitsT& _M_traits; 00189 bool _M_is_non_matching; 00190 }; 00191 00192 // Identifies a state in the NFA. 00193 typedef int _StateIdT; 00194 00195 // The special case in which a state identifier is not an index. 00196 static const _StateIdT _S_invalid_state_id = -1; 00197 00198 00199 // An individual state in an NFA 00200 // 00201 // In this case a "state" is an entry in the NFA definition coupled with its 00202 // outgoing transition(s). All states have a single outgoing transition, 00203 // except for accepting states (which have no outgoing transitions) and alt 00204 // states, which have two outgoing transitions. 00205 // 00206 struct _State 00207 { 00208 typedef int _OpcodeT; 00209 00210 _OpcodeT _M_opcode; // type of outgoing transition 00211 _StateIdT _M_next; // outgoing tranition 00212 _StateIdT _M_alt; // for _S_opcode_alternative 00213 unsigned int _M_subexpr; // for _S_opcode_subexpr_* 00214 _Tagger _M_tagger; // for _S_opcode_subexpr_* 00215 _Matcher _M_matches; // for _S_opcode_match 00216 00217 explicit _State(_OpcodeT __opcode) 00218 : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 00219 { } 00220 00221 _State(const _Matcher& __m) 00222 : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m) 00223 { } 00224 00225 _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t) 00226 : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s), 00227 _M_tagger(__t) 00228 { } 00229 00230 _State(_StateIdT __next, _StateIdT __alt) 00231 : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) 00232 { } 00233 00234 #ifdef _GLIBCXX_DEBUG 00235 std::ostream& 00236 _M_print(std::ostream& ostr) const; 00237 00238 // Prints graphviz dot commands for state. 00239 std::ostream& 00240 _M_dot(std::ostream& __ostr, _StateIdT __id) const; 00241 #endif 00242 }; 00243 00244 00245 // The Grep Matcher works on sets of states. Here are sets of states. 00246 typedef std::set<_StateIdT> _StateSet; 00247 00248 // A collection of all states making up an NFA 00249 // 00250 // An NFA is a 4-tuple M = (K, S, s, F), where 00251 // K is a finite set of states, 00252 // S is the alphabet of the NFA, 00253 // s is the initial state, 00254 // F is a set of final (accepting) states. 00255 // 00256 // This NFA class is templated on S, a type that will hold values of the 00257 // underlying alphabet (without regard to semantics of that alphabet). The 00258 // other elements of the tuple are generated during construction of the NFA 00259 // and are available through accessor member functions. 00260 // 00261 class _Nfa 00262 : public _Automaton, public std::vector<_State> 00263 { 00264 public: 00265 typedef _State _StateT; 00266 typedef unsigned int _SizeT; 00267 typedef regex_constants::syntax_option_type _FlagT; 00268 00269 public: 00270 _Nfa(_FlagT __f) 00271 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0) 00272 { } 00273 00274 ~_Nfa() 00275 { } 00276 00277 _FlagT 00278 _M_options() const 00279 { return _M_flags; } 00280 00281 _StateIdT 00282 _M_start() const 00283 { return _M_start_state; } 00284 00285 const _StateSet& 00286 _M_final_states() const 00287 { return _M_accepting_states; } 00288 00289 _SizeT 00290 _M_sub_count() const 00291 { return _M_subexpr_count; } 00292 00293 _StateIdT 00294 _M_insert_accept() 00295 { 00296 this->push_back(_StateT(_S_opcode_accept)); 00297 _M_accepting_states.insert(this->size()-1); 00298 return this->size()-1; 00299 } 00300 00301 _StateIdT 00302 _M_insert_alt(_StateIdT __next, _StateIdT __alt) 00303 { 00304 this->push_back(_StateT(__next, __alt)); 00305 return this->size()-1; 00306 } 00307 00308 _StateIdT 00309 _M_insert_matcher(_Matcher __m) 00310 { 00311 this->push_back(_StateT(__m)); 00312 return this->size()-1; 00313 } 00314 00315 _StateIdT 00316 _M_insert_subexpr_begin(const _Tagger& __t) 00317 { 00318 this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t)); 00319 return this->size()-1; 00320 } 00321 00322 _StateIdT 00323 _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t) 00324 { 00325 this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t)); 00326 return this->size()-1; 00327 } 00328 00329 #ifdef _GLIBCXX_DEBUG 00330 std::ostream& 00331 _M_dot(std::ostream& __ostr) const; 00332 #endif 00333 00334 private: 00335 _FlagT _M_flags; 00336 _StateIdT _M_start_state; 00337 _StateSet _M_accepting_states; 00338 _SizeT _M_subexpr_count; 00339 }; 00340 00341 // Describes a sequence of one or more %_State, its current start and end(s). 00342 // 00343 // This structure contains fragments of an NFA during construction. 00344 class _StateSeq 00345 { 00346 public: 00347 // Constructs a single-node sequence 00348 _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id) 00349 : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e) 00350 { } 00351 // Constructs a split sequence from two other sequencces 00352 _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2) 00353 : _M_nfa(__e1._M_nfa), 00354 _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)), 00355 _M_end1(__e1._M_end1), _M_end2(__e2._M_end1) 00356 { } 00357 00358 // Constructs a split sequence from a single sequence 00359 _StateSeq(const _StateSeq& __e, _StateIdT __id) 00360 : _M_nfa(__e._M_nfa), 00361 _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)), 00362 _M_end1(__id), _M_end2(__e._M_end1) 00363 { } 00364 00365 // Constructs a copy of a %_StateSeq 00366 _StateSeq(const _StateSeq& __rhs) 00367 : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start), 00368 _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2) 00369 { } 00370 00371 00372 _StateSeq& operator=(const _StateSeq& __rhs); 00373 00374 _StateIdT 00375 _M_front() const 00376 { return _M_start; } 00377 00378 // Extends a sequence by one. 00379 void 00380 _M_push_back(_StateIdT __id); 00381 00382 // Extends and maybe joins a sequence. 00383 void 00384 _M_append(_StateIdT __id); 00385 00386 void 00387 _M_append(_StateSeq& __rhs); 00388 00389 // Clones an entire sequence. 00390 _StateIdT 00391 _M_clone(); 00392 00393 private: 00394 _Nfa& _M_nfa; 00395 _StateIdT _M_start; 00396 _StateIdT _M_end1; 00397 _StateIdT _M_end2; 00398 00399 }; 00400 00401 } // namespace __regex 00402 00403 _GLIBCXX_END_NAMESPACE_VERSION 00404 } // namespace 00405 00406 #include <bits/regex_nfa.tcc> 00407