libstdc++

regex_nfa.h

Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_nfa.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00034 
00035 namespace __regex
00036 {
00037 
00038   // Base class for, um, automata.  Could be an NFA or a DFA.  Your choice.
00039   class _Automaton
00040   {
00041   public:
00042     typedef unsigned int _SizeT;
00043 
00044   public:
00045     virtual
00046     ~_Automaton()
00047     { }
00048 
00049     virtual _SizeT
00050     _M_sub_count() const = 0;
00051 
00052 #ifdef _GLIBCXX_DEBUG
00053     virtual std::ostream&
00054     _M_dot(std::ostream& __ostr) const = 0;
00055 #endif
00056   };
00057 
00058   // Generic shred pointer to an automaton.  
00059   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
00060 
00061   // Operation codes that define the type of transitions within the base NFA
00062   // that represents the regular expression.
00063   enum _Opcode
00064   {
00065       _S_opcode_unknown       =   0,
00066       _S_opcode_alternative   =   1,
00067       _S_opcode_subexpr_begin =   4,
00068       _S_opcode_subexpr_end   =   5,
00069       _S_opcode_match         = 100,
00070       _S_opcode_accept        = 255
00071   };
00072 
00073   // Provides a generic facade for a templated match_results.
00074   struct _Results
00075   {
00076     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
00077     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
00078   };
00079 
00080   // Tags current state (for subexpr begin/end).
00081   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
00082 
00083   template<typename _FwdIterT, typename _TraitsT>
00084     struct _StartTagger
00085     : public _Tagger
00086     {
00087       explicit
00088       _StartTagger(int __i)
00089       : _M_index(__i)
00090       { }
00091 
00092       void
00093       operator()(const _PatternCursor& __pc, _Results& __r)
00094       { __r._M_set_pos(_M_index, 0, __pc); }
00095 
00096       int       _M_index;
00097     };
00098 
00099   template<typename _FwdIterT, typename _TraitsT>
00100     struct _EndTagger
00101     : public _Tagger
00102     {
00103       explicit
00104       _EndTagger(int __i)
00105       : _M_index(__i)
00106       { }
00107 
00108       void
00109       operator()(const _PatternCursor& __pc, _Results& __r)
00110       { __r._M_set_pos(_M_index, 1, __pc); }
00111 
00112       int       _M_index;
00113       _FwdIterT _M_pos;
00114     };
00115   // Indicates if current state matches cursor current.
00116   typedef std::function<bool (const _PatternCursor&)> _Matcher;
00117 
00118   // Matches any character
00119   inline bool
00120   _AnyMatcher(const _PatternCursor&)
00121   { return true; }
00122 
00123   // Matches a single character
00124   template<typename _InIterT, typename _TraitsT>
00125     struct _CharMatcher
00126     : public _Matcher
00127     {
00128       typedef typename _TraitsT::char_type char_type;
00129 
00130       explicit
00131       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
00132       : _M_traits(__t), _M_c(_M_traits.translate(__c))
00133       { }
00134 
00135       bool
00136       operator()(const _PatternCursor& __pc) const
00137       {
00138     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00139     _CursorT __c = static_cast<_CursorT>(__pc);
00140     return _M_traits.translate(__c._M_current()) == _M_c;
00141       }
00142 
00143       const _TraitsT& _M_traits;
00144       char_type       _M_c;
00145     };
00146 
00147   // Matches a character range (bracket expression)
00148   template<typename _InIterT, typename _TraitsT>
00149     struct _RangeMatcher
00150     : public _Matcher
00151     {
00152       typedef typename _TraitsT::char_type _CharT;
00153       typedef std::basic_string<_CharT>    _StringT;
00154 
00155       explicit
00156       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
00157       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
00158       { }
00159 
00160       bool
00161       operator()(const _PatternCursor& __pc) const
00162       {
00163     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00164     _CursorT __c = static_cast<_CursorT>(__pc);
00165     return true;
00166       }
00167 
00168       void
00169       _M_add_char(_CharT __c)
00170       { }
00171 
00172       void
00173       _M_add_collating_element(const _StringT& __s)
00174       { }
00175 
00176       void
00177       _M_add_equivalence_class(const _StringT& __s)
00178       { }
00179 
00180       void
00181       _M_add_character_class(const _StringT& __s)
00182       { }
00183 
00184       void
00185       _M_make_range()
00186       { }
00187 
00188       const _TraitsT& _M_traits;
00189       bool            _M_is_non_matching;
00190     };
00191 
00192   // Identifies a state in the NFA.
00193   typedef int _StateIdT;
00194 
00195   // The special case in which a state identifier is not an index.
00196   static const _StateIdT _S_invalid_state_id  = -1;
00197 
00198 
00199   // An individual state in an NFA
00200   //
00201   // In this case a "state" is an entry in the NFA definition coupled with its
00202   // outgoing transition(s).  All states have a single outgoing transition,
00203   // except for accepting states (which have no outgoing transitions) and alt
00204   // states, which have two outgoing transitions.
00205   //
00206   struct _State
00207   {
00208     typedef int  _OpcodeT;
00209 
00210     _OpcodeT     _M_opcode;    // type of outgoing transition
00211     _StateIdT    _M_next;      // outgoing tranition
00212     _StateIdT    _M_alt;       // for _S_opcode_alternative
00213     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
00214     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
00215     _Matcher     _M_matches;   // for _S_opcode_match
00216 
00217     explicit _State(_OpcodeT __opcode)
00218     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00219     { }
00220 
00221     _State(const _Matcher& __m)
00222     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
00223     { }
00224 
00225     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
00226     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
00227       _M_tagger(__t)
00228     { }
00229 
00230     _State(_StateIdT __next, _StateIdT __alt)
00231     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
00232     { }
00233 
00234 #ifdef _GLIBCXX_DEBUG
00235     std::ostream&
00236     _M_print(std::ostream& ostr) const;
00237 
00238     // Prints graphviz dot commands for state.
00239     std::ostream&
00240     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00241 #endif
00242   };
00243 
00244   
00245   // The Grep Matcher works on sets of states.  Here are sets of states.
00246   typedef std::set<_StateIdT> _StateSet;
00247 
00248  // A collection of all states making up an NFA
00249   //
00250   // An NFA is a 4-tuple M = (K, S, s, F), where
00251   //    K is a finite set of states,
00252   //    S is the alphabet of the NFA,
00253   //    s is the initial state,
00254   //    F is a set of final (accepting) states.
00255   //
00256   // This NFA class is templated on S, a type that will hold values of the
00257   // underlying alphabet (without regard to semantics of that alphabet).  The
00258   // other elements of the tuple are generated during construction of the NFA
00259   // and are available through accessor member functions.
00260   //
00261   class _Nfa
00262   : public _Automaton, public std::vector<_State>
00263   {
00264   public:
00265     typedef _State                              _StateT;
00266     typedef unsigned int                        _SizeT;
00267     typedef regex_constants::syntax_option_type _FlagT;
00268 
00269   public:
00270     _Nfa(_FlagT __f)
00271     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
00272     { }
00273 
00274     ~_Nfa()
00275     { }
00276 
00277     _FlagT
00278     _M_options() const
00279     { return _M_flags; }
00280 
00281     _StateIdT
00282     _M_start() const
00283     { return _M_start_state; }
00284 
00285     const _StateSet&
00286     _M_final_states() const
00287     { return _M_accepting_states; }
00288 
00289     _SizeT
00290     _M_sub_count() const
00291     { return _M_subexpr_count; }
00292 
00293     _StateIdT
00294     _M_insert_accept()
00295     {
00296       this->push_back(_StateT(_S_opcode_accept));
00297       _M_accepting_states.insert(this->size()-1);
00298       return this->size()-1;
00299     }
00300 
00301     _StateIdT
00302     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
00303     {
00304       this->push_back(_StateT(__next, __alt));
00305       return this->size()-1;
00306     }
00307 
00308     _StateIdT
00309     _M_insert_matcher(_Matcher __m)
00310     {
00311       this->push_back(_StateT(__m));
00312       return this->size()-1;
00313     }
00314 
00315     _StateIdT
00316     _M_insert_subexpr_begin(const _Tagger& __t)
00317     {
00318       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
00319       return this->size()-1;
00320     }
00321 
00322     _StateIdT 
00323     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
00324     {
00325       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
00326       return this->size()-1;
00327     }
00328 
00329 #ifdef _GLIBCXX_DEBUG
00330     std::ostream&
00331     _M_dot(std::ostream& __ostr) const;
00332 #endif
00333 
00334   private:
00335     _FlagT     _M_flags;
00336     _StateIdT  _M_start_state;
00337     _StateSet  _M_accepting_states;
00338     _SizeT     _M_subexpr_count;
00339   };
00340 
00341   // Describes a sequence of one or more %_State, its current start and end(s).
00342   //
00343   // This structure contains fragments of an NFA during construction.
00344   class _StateSeq
00345   {
00346   public:
00347     // Constructs a single-node sequence
00348     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
00349     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
00350     { }
00351     // Constructs a split sequence from two other sequencces
00352     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
00353     : _M_nfa(__e1._M_nfa),
00354       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
00355       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
00356     { }
00357 
00358     // Constructs a split sequence from a single sequence
00359     _StateSeq(const _StateSeq& __e, _StateIdT __id)
00360     : _M_nfa(__e._M_nfa),
00361       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
00362       _M_end1(__id), _M_end2(__e._M_end1)
00363     { }
00364 
00365     // Constructs a copy of a %_StateSeq
00366     _StateSeq(const _StateSeq& __rhs)
00367     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
00368       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
00369     { }
00370 
00371 
00372     _StateSeq& operator=(const _StateSeq& __rhs);
00373 
00374     _StateIdT
00375     _M_front() const
00376     { return _M_start; }
00377 
00378     // Extends a sequence by one.
00379     void
00380     _M_push_back(_StateIdT __id);
00381 
00382     // Extends and maybe joins a sequence.
00383     void
00384     _M_append(_StateIdT __id);
00385 
00386     void
00387     _M_append(_StateSeq& __rhs);
00388 
00389     // Clones an entire sequence.
00390     _StateIdT
00391     _M_clone();
00392 
00393   private:
00394     _Nfa&     _M_nfa;
00395     _StateIdT _M_start;
00396     _StateIdT _M_end1;
00397     _StateIdT _M_end2;
00398 
00399   };
00400 
00401 } // namespace __regex
00402 
00403 _GLIBCXX_END_NAMESPACE_VERSION
00404 } // namespace
00405 
00406 #include <bits/regex_nfa.tcc>
00407