libstdc++

regex_nfa.h

Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_nfa.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00034 
00035 namespace __regex
00036 {
00037 
00038   // Base class for, um, automata.  Could be an NFA or a DFA.  Your choice.
00039   class _Automaton
00040   {
00041   public:
00042     typedef unsigned int _SizeT;
00043 
00044   public:
00045     virtual
00046     ~_Automaton()
00047     { }
00048 
00049     virtual _SizeT
00050     _M_sub_count() const = 0;
00051 
00052 #ifdef _GLIBCXX_DEBUG
00053     virtual std::ostream&
00054     _M_dot(std::ostream& __ostr) const = 0;
00055 #endif
00056   };
00057 
00058   // Generic shared pointer to an automaton.  
00059   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
00060 
00061   // Operation codes that define the type of transitions within the base NFA
00062   // that represents the regular expression.
00063   enum _Opcode
00064   {
00065       _S_opcode_unknown       =   0,
00066       _S_opcode_alternative   =   1,
00067       _S_opcode_subexpr_begin =   4,
00068       _S_opcode_subexpr_end   =   5,
00069       _S_opcode_match         = 100,
00070       _S_opcode_accept        = 255
00071   };
00072 
00073   // Provides a generic facade for a templated match_results.
00074   struct _Results
00075   {
00076     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
00077     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
00078   };
00079 
00080   // Tags current state (for subexpr begin/end).
00081   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
00082 
00083   template<typename _FwdIterT, typename _TraitsT>
00084     struct _StartTagger
00085     {
00086       explicit
00087       _StartTagger(int __i)
00088       : _M_index(__i)
00089       { }
00090 
00091       void
00092       operator()(const _PatternCursor& __pc, _Results& __r)
00093       { __r._M_set_pos(_M_index, 0, __pc); }
00094 
00095       int       _M_index;
00096     };
00097 
00098   template<typename _FwdIterT, typename _TraitsT>
00099     struct _EndTagger
00100     {
00101       explicit
00102       _EndTagger(int __i)
00103       : _M_index(__i)
00104       { }
00105 
00106       void
00107       operator()(const _PatternCursor& __pc, _Results& __r)
00108       { __r._M_set_pos(_M_index, 1, __pc); }
00109 
00110       int       _M_index;
00111       _FwdIterT _M_pos;
00112     };
00113   // Indicates if current state matches cursor current.
00114   typedef std::function<bool (const _PatternCursor&)> _Matcher;
00115 
00116   // Matches any character
00117   inline bool
00118   _AnyMatcher(const _PatternCursor&)
00119   { return true; }
00120 
00121   // Matches a single character
00122   template<typename _InIterT, typename _TraitsT>
00123     struct _CharMatcher
00124     {
00125       typedef typename _TraitsT::char_type char_type;
00126 
00127       explicit
00128       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
00129       : _M_traits(__t), _M_c(_M_traits.translate(__c))
00130       { }
00131 
00132       bool
00133       operator()(const _PatternCursor& __pc) const
00134       {
00135     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00136     _CursorT __c = static_cast<_CursorT>(__pc);
00137     return _M_traits.translate(__c._M_current()) == _M_c;
00138       }
00139 
00140       const _TraitsT& _M_traits;
00141       char_type       _M_c;
00142     };
00143 
00144   // Matches a character range (bracket expression)
00145   template<typename _InIterT, typename _TraitsT>
00146     struct _RangeMatcher
00147     {
00148       typedef typename _TraitsT::char_type _CharT;
00149       typedef std::basic_string<_CharT>    _StringT;
00150 
00151       explicit
00152       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
00153       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
00154       { }
00155 
00156       bool
00157       operator()(const _PatternCursor& __pc) const
00158       {
00159     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00160     _CursorT __c = static_cast<_CursorT>(__pc);
00161     return true;
00162       }
00163 
00164       void
00165       _M_add_char(_CharT __c)
00166       { }
00167 
00168       void
00169       _M_add_collating_element(const _StringT& __s)
00170       { }
00171 
00172       void
00173       _M_add_equivalence_class(const _StringT& __s)
00174       { }
00175 
00176       void
00177       _M_add_character_class(const _StringT& __s)
00178       { }
00179 
00180       void
00181       _M_make_range()
00182       { }
00183 
00184       const _TraitsT& _M_traits;
00185       bool            _M_is_non_matching;
00186     };
00187 
00188   // Identifies a state in the NFA.
00189   typedef int _StateIdT;
00190 
00191   // The special case in which a state identifier is not an index.
00192   static const _StateIdT _S_invalid_state_id  = -1;
00193 
00194 
00195   // An individual state in an NFA
00196   //
00197   // In this case a "state" is an entry in the NFA definition coupled with its
00198   // outgoing transition(s).  All states have a single outgoing transition,
00199   // except for accepting states (which have no outgoing transitions) and alt
00200   // states, which have two outgoing transitions.
00201   //
00202   struct _State
00203   {
00204     typedef int  _OpcodeT;
00205 
00206     _OpcodeT     _M_opcode;    // type of outgoing transition
00207     _StateIdT    _M_next;      // outgoing transition
00208     _StateIdT    _M_alt;       // for _S_opcode_alternative
00209     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
00210     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
00211     _Matcher     _M_matches;   // for _S_opcode_match
00212 
00213     explicit _State(_OpcodeT __opcode)
00214     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00215     { }
00216 
00217     _State(const _Matcher& __m)
00218     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
00219     { }
00220 
00221     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
00222     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
00223       _M_tagger(__t)
00224     { }
00225 
00226     _State(_StateIdT __next, _StateIdT __alt)
00227     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
00228     { }
00229 
00230 #ifdef _GLIBCXX_DEBUG
00231     std::ostream&
00232     _M_print(std::ostream& ostr) const;
00233 
00234     // Prints graphviz dot commands for state.
00235     std::ostream&
00236     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00237 #endif
00238   };
00239 
00240   
00241   // The Grep Matcher works on sets of states.  Here are sets of states.
00242   typedef std::set<_StateIdT> _StateSet;
00243 
00244  // A collection of all states making up an NFA
00245   //
00246   // An NFA is a 4-tuple M = (K, S, s, F), where
00247   //    K is a finite set of states,
00248   //    S is the alphabet of the NFA,
00249   //    s is the initial state,
00250   //    F is a set of final (accepting) states.
00251   //
00252   // This NFA class is templated on S, a type that will hold values of the
00253   // underlying alphabet (without regard to semantics of that alphabet).  The
00254   // other elements of the tuple are generated during construction of the NFA
00255   // and are available through accessor member functions.
00256   //
00257   class _Nfa
00258   : public _Automaton, public std::vector<_State>
00259   {
00260   public:
00261     typedef _State                              _StateT;
00262     typedef unsigned int                        _SizeT;
00263     typedef regex_constants::syntax_option_type _FlagT;
00264 
00265   public:
00266     _Nfa(_FlagT __f)
00267     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
00268     { }
00269 
00270     ~_Nfa()
00271     { }
00272 
00273     _FlagT
00274     _M_options() const
00275     { return _M_flags; }
00276 
00277     _StateIdT
00278     _M_start() const
00279     { return _M_start_state; }
00280 
00281     const _StateSet&
00282     _M_final_states() const
00283     { return _M_accepting_states; }
00284 
00285     _SizeT
00286     _M_sub_count() const
00287     { return _M_subexpr_count; }
00288 
00289     _StateIdT
00290     _M_insert_accept()
00291     {
00292       this->push_back(_StateT(_S_opcode_accept));
00293       _M_accepting_states.insert(this->size()-1);
00294       return this->size()-1;
00295     }
00296 
00297     _StateIdT
00298     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
00299     {
00300       this->push_back(_StateT(__next, __alt));
00301       return this->size()-1;
00302     }
00303 
00304     _StateIdT
00305     _M_insert_matcher(_Matcher __m)
00306     {
00307       this->push_back(_StateT(__m));
00308       return this->size()-1;
00309     }
00310 
00311     _StateIdT
00312     _M_insert_subexpr_begin(const _Tagger& __t)
00313     {
00314       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
00315       return this->size()-1;
00316     }
00317 
00318     _StateIdT 
00319     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
00320     {
00321       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
00322       return this->size()-1;
00323     }
00324 
00325 #ifdef _GLIBCXX_DEBUG
00326     std::ostream&
00327     _M_dot(std::ostream& __ostr) const;
00328 #endif
00329 
00330   private:
00331     _FlagT     _M_flags;
00332     _StateIdT  _M_start_state;
00333     _StateSet  _M_accepting_states;
00334     _SizeT     _M_subexpr_count;
00335   };
00336 
00337   // Describes a sequence of one or more %_State, its current start and end(s).
00338   //
00339   // This structure contains fragments of an NFA during construction.
00340   class _StateSeq
00341   {
00342   public:
00343     // Constructs a single-node sequence
00344     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
00345     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
00346     { }
00347     // Constructs a split sequence from two other sequencces
00348     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
00349     : _M_nfa(__e1._M_nfa),
00350       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
00351       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
00352     { }
00353 
00354     // Constructs a split sequence from a single sequence
00355     _StateSeq(const _StateSeq& __e, _StateIdT __id)
00356     : _M_nfa(__e._M_nfa),
00357       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
00358       _M_end1(__id), _M_end2(__e._M_end1)
00359     { }
00360 
00361     // Constructs a copy of a %_StateSeq
00362     _StateSeq(const _StateSeq& __rhs)
00363     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
00364       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
00365     { }
00366 
00367 
00368     _StateSeq& operator=(const _StateSeq& __rhs);
00369 
00370     _StateIdT
00371     _M_front() const
00372     { return _M_start; }
00373 
00374     // Extends a sequence by one.
00375     void
00376     _M_push_back(_StateIdT __id);
00377 
00378     // Extends and maybe joins a sequence.
00379     void
00380     _M_append(_StateIdT __id);
00381 
00382     void
00383     _M_append(_StateSeq& __rhs);
00384 
00385     // Clones an entire sequence.
00386     _StateIdT
00387     _M_clone();
00388 
00389   private:
00390     _Nfa&     _M_nfa;
00391     _StateIdT _M_start;
00392     _StateIdT _M_end1;
00393     _StateIdT _M_end2;
00394 
00395   };
00396 
00397 } // namespace __regex
00398 
00399 _GLIBCXX_END_NAMESPACE_VERSION
00400 } // namespace
00401 
00402 #include <bits/regex_nfa.tcc>
00403