libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2017 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_executor.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 namespace __detail 00034 { 00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00036 00037 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00038 bool __dfs_mode> 00039 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00040 _M_search() 00041 { 00042 if (_M_search_from_first()) 00043 return true; 00044 if (_M_flags & regex_constants::match_continuous) 00045 return false; 00046 _M_flags |= regex_constants::match_prev_avail; 00047 while (_M_begin != _M_end) 00048 { 00049 ++_M_begin; 00050 if (_M_search_from_first()) 00051 return true; 00052 } 00053 return false; 00054 } 00055 00056 // The _M_main function operates in different modes, DFS mode or BFS mode, 00057 // indicated by template parameter __dfs_mode, and dispatches to one of the 00058 // _M_main_dispatch overloads. 00059 // 00060 // ------------------------------------------------------------ 00061 // 00062 // DFS mode: 00063 // 00064 // It applies a Depth-First-Search (aka backtracking) on given NFA and input 00065 // string. 00066 // At the very beginning the executor stands in the start state, then it 00067 // tries every possible state transition in current state recursively. Some 00068 // state transitions consume input string, say, a single-char-matcher or a 00069 // back-reference matcher; some don't, like assertion or other anchor nodes. 00070 // When the input is exhausted and/or the current state is an accepting 00071 // state, the whole executor returns true. 00072 // 00073 // TODO: This approach is exponentially slow for certain input. 00074 // Try to compile the NFA to a DFA. 00075 // 00076 // Time complexity: \Omega(match_length), O(2^(_M_nfa.size())) 00077 // Space complexity: \theta(match_results.size() + match_length) 00078 // 00079 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00080 bool __dfs_mode> 00081 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00082 _M_main_dispatch(_Match_mode __match_mode, __dfs) 00083 { 00084 _M_has_sol = false; 00085 *_M_states._M_get_sol_pos() = _BiIter(); 00086 _M_cur_results = _M_results; 00087 _M_dfs(__match_mode, _M_states._M_start); 00088 return _M_has_sol; 00089 } 00090 00091 // ------------------------------------------------------------ 00092 // 00093 // BFS mode: 00094 // 00095 // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) 00096 // explained this algorithm clearly. 00097 // 00098 // It first computes epsilon closure (states that can be achieved without 00099 // consuming characters) for every state that's still matching, 00100 // using the same DFS algorithm, but doesn't re-enter states (using 00101 // _M_states._M_visited to check), nor follow _S_opcode_match. 00102 // 00103 // Then apply DFS using every _S_opcode_match (in _M_states._M_match_queue) 00104 // as the start state. 00105 // 00106 // It significantly reduces potential duplicate states, so has a better 00107 // upper bound; but it requires more overhead. 00108 // 00109 // Time complexity: \Omega(match_length * match_results.size()) 00110 // O(match_length * _M_nfa.size() * match_results.size()) 00111 // Space complexity: \Omega(_M_nfa.size() + match_results.size()) 00112 // O(_M_nfa.size() * match_results.size()) 00113 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00114 bool __dfs_mode> 00115 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00116 _M_main_dispatch(_Match_mode __match_mode, __bfs) 00117 { 00118 _M_states._M_queue(_M_states._M_start, _M_results); 00119 bool __ret = false; 00120 while (1) 00121 { 00122 _M_has_sol = false; 00123 if (_M_states._M_match_queue.empty()) 00124 break; 00125 std::fill_n(_M_states._M_visited_states.get(), _M_nfa.size(), false); 00126 auto __old_queue = std::move(_M_states._M_match_queue); 00127 for (auto& __task : __old_queue) 00128 { 00129 _M_cur_results = std::move(__task.second); 00130 _M_dfs(__match_mode, __task.first); 00131 } 00132 if (__match_mode == _Match_mode::_Prefix) 00133 __ret |= _M_has_sol; 00134 if (_M_current == _M_end) 00135 break; 00136 ++_M_current; 00137 } 00138 if (__match_mode == _Match_mode::_Exact) 00139 __ret = _M_has_sol; 00140 _M_states._M_match_queue.clear(); 00141 return __ret; 00142 } 00143 00144 // Return whether now match the given sub-NFA. 00145 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00146 bool __dfs_mode> 00147 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00148 _M_lookahead(_StateIdT __next) 00149 { 00150 // Backreferences may refer to captured content. 00151 // We may want to make this faster by not copying, 00152 // but let's not be clever prematurely. 00153 _ResultsVec __what(_M_cur_results); 00154 _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags); 00155 __sub._M_states._M_start = __next; 00156 if (__sub._M_search_from_first()) 00157 { 00158 for (size_t __i = 0; __i < __what.size(); __i++) 00159 if (__what[__i].matched) 00160 _M_cur_results[__i] = __what[__i]; 00161 return true; 00162 } 00163 return false; 00164 } 00165 00166 // __rep_count records how many times (__rep_count.second) 00167 // this node is visited under certain input iterator 00168 // (__rep_count.first). This prevent the executor from entering 00169 // infinite loop by refusing to continue when it's already been 00170 // visited more than twice. It's `twice` instead of `once` because 00171 // we need to spare one more time for potential group capture. 00172 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00173 bool __dfs_mode> 00174 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00175 _M_rep_once_more(_Match_mode __match_mode, _StateIdT __i) 00176 { 00177 const auto& __state = _M_nfa[__i]; 00178 auto& __rep_count = _M_rep_count[__i]; 00179 if (__rep_count.second == 0 || __rep_count.first != _M_current) 00180 { 00181 auto __back = __rep_count; 00182 __rep_count.first = _M_current; 00183 __rep_count.second = 1; 00184 _M_dfs(__match_mode, __state._M_alt); 00185 __rep_count = __back; 00186 } 00187 else 00188 { 00189 if (__rep_count.second < 2) 00190 { 00191 __rep_count.second++; 00192 _M_dfs(__match_mode, __state._M_alt); 00193 __rep_count.second--; 00194 } 00195 } 00196 }; 00197 00198 // _M_alt branch is "match once more", while _M_next is "get me out 00199 // of this quantifier". Executing _M_next first or _M_alt first don't 00200 // mean the same thing, and we need to choose the correct order under 00201 // given greedy mode. 00202 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00203 bool __dfs_mode> 00204 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00205 _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i) 00206 { 00207 const auto& __state = _M_nfa[__i]; 00208 00209 // Greedy. 00210 if (!__state._M_neg) 00211 { 00212 _M_rep_once_more(__match_mode, __i); 00213 // If it's DFS executor and already accepted, we're done. 00214 if (!__dfs_mode || !_M_has_sol) 00215 _M_dfs(__match_mode, __state._M_next); 00216 } 00217 else // Non-greedy mode 00218 { 00219 if (__dfs_mode) 00220 { 00221 // vice-versa. 00222 _M_dfs(__match_mode, __state._M_next); 00223 if (!_M_has_sol) 00224 _M_rep_once_more(__match_mode, __i); 00225 } 00226 else 00227 { 00228 // DON'T attempt anything, because there's already another 00229 // state with higher priority accepted. This state cannot 00230 // be better by attempting its next node. 00231 if (!_M_has_sol) 00232 { 00233 _M_dfs(__match_mode, __state._M_next); 00234 // DON'T attempt anything if it's already accepted. An 00235 // accepted state *must* be better than a solution that 00236 // matches a non-greedy quantifier one more time. 00237 if (!_M_has_sol) 00238 _M_rep_once_more(__match_mode, __i); 00239 } 00240 } 00241 } 00242 } 00243 00244 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00245 bool __dfs_mode> 00246 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00247 _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i) 00248 { 00249 const auto& __state = _M_nfa[__i]; 00250 00251 auto& __res = _M_cur_results[__state._M_subexpr]; 00252 auto __back = __res.first; 00253 __res.first = _M_current; 00254 _M_dfs(__match_mode, __state._M_next); 00255 __res.first = __back; 00256 } 00257 00258 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00259 bool __dfs_mode> 00260 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00261 _M_handle_subexpr_end(_Match_mode __match_mode, _StateIdT __i) 00262 { 00263 const auto& __state = _M_nfa[__i]; 00264 00265 auto& __res = _M_cur_results[__state._M_subexpr]; 00266 auto __back = __res; 00267 __res.second = _M_current; 00268 __res.matched = true; 00269 _M_dfs(__match_mode, __state._M_next); 00270 __res = __back; 00271 } 00272 00273 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00274 bool __dfs_mode> 00275 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00276 _M_handle_line_begin_assertion(_Match_mode __match_mode, _StateIdT __i) 00277 { 00278 const auto& __state = _M_nfa[__i]; 00279 if (_M_at_begin()) 00280 _M_dfs(__match_mode, __state._M_next); 00281 } 00282 00283 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00284 bool __dfs_mode> 00285 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00286 _M_handle_line_end_assertion(_Match_mode __match_mode, _StateIdT __i) 00287 { 00288 const auto& __state = _M_nfa[__i]; 00289 if (_M_at_end()) 00290 _M_dfs(__match_mode, __state._M_next); 00291 } 00292 00293 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00294 bool __dfs_mode> 00295 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00296 _M_handle_word_boundary(_Match_mode __match_mode, _StateIdT __i) 00297 { 00298 const auto& __state = _M_nfa[__i]; 00299 if (_M_word_boundary() == !__state._M_neg) 00300 _M_dfs(__match_mode, __state._M_next); 00301 } 00302 00303 // Here __state._M_alt offers a single start node for a sub-NFA. 00304 // We recursively invoke our algorithm to match the sub-NFA. 00305 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00306 bool __dfs_mode> 00307 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00308 _M_handle_subexpr_lookahead(_Match_mode __match_mode, _StateIdT __i) 00309 { 00310 const auto& __state = _M_nfa[__i]; 00311 if (_M_lookahead(__state._M_alt) == !__state._M_neg) 00312 _M_dfs(__match_mode, __state._M_next); 00313 } 00314 00315 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00316 bool __dfs_mode> 00317 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00318 _M_handle_match(_Match_mode __match_mode, _StateIdT __i) 00319 { 00320 const auto& __state = _M_nfa[__i]; 00321 00322 if (_M_current == _M_end) 00323 return; 00324 if (__dfs_mode) 00325 { 00326 if (__state._M_matches(*_M_current)) 00327 { 00328 ++_M_current; 00329 _M_dfs(__match_mode, __state._M_next); 00330 --_M_current; 00331 } 00332 } 00333 else 00334 if (__state._M_matches(*_M_current)) 00335 _M_states._M_queue(__state._M_next, _M_cur_results); 00336 } 00337 00338 // First fetch the matched result from _M_cur_results as __submatch; 00339 // then compare it with 00340 // (_M_current, _M_current + (__submatch.second - __submatch.first)). 00341 // If matched, keep going; else just return and try another state. 00342 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00343 bool __dfs_mode> 00344 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00345 _M_handle_backref(_Match_mode __match_mode, _StateIdT __i) 00346 { 00347 __glibcxx_assert(__dfs_mode); 00348 00349 const auto& __state = _M_nfa[__i]; 00350 auto& __submatch = _M_cur_results[__state._M_backref_index]; 00351 if (!__submatch.matched) 00352 return; 00353 auto __last = _M_current; 00354 for (auto __tmp = __submatch.first; 00355 __last != _M_end && __tmp != __submatch.second; 00356 ++__tmp) 00357 ++__last; 00358 if (_M_re._M_automaton->_M_traits.transform(__submatch.first, 00359 __submatch.second) 00360 == _M_re._M_automaton->_M_traits.transform(_M_current, __last)) 00361 { 00362 if (__last != _M_current) 00363 { 00364 auto __backup = _M_current; 00365 _M_current = __last; 00366 _M_dfs(__match_mode, __state._M_next); 00367 _M_current = __backup; 00368 } 00369 else 00370 _M_dfs(__match_mode, __state._M_next); 00371 } 00372 } 00373 00374 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00375 bool __dfs_mode> 00376 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00377 _M_handle_accept(_Match_mode __match_mode, _StateIdT __i) 00378 { 00379 if (__dfs_mode) 00380 { 00381 __glibcxx_assert(!_M_has_sol); 00382 if (__match_mode == _Match_mode::_Exact) 00383 _M_has_sol = _M_current == _M_end; 00384 else 00385 _M_has_sol = true; 00386 if (_M_current == _M_begin 00387 && (_M_flags & regex_constants::match_not_null)) 00388 _M_has_sol = false; 00389 if (_M_has_sol) 00390 { 00391 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00392 _M_results = _M_cur_results; 00393 else // POSIX 00394 { 00395 __glibcxx_assert(_M_states._M_get_sol_pos()); 00396 // Here's POSIX's logic: match the longest one. However 00397 // we never know which one (lhs or rhs of "|") is longer 00398 // unless we try both of them and compare the results. 00399 // The member variable _M_sol_pos records the end 00400 // position of the last successful match. It's better 00401 // to be larger, because POSIX regex is always greedy. 00402 // TODO: This could be slow. 00403 if (*_M_states._M_get_sol_pos() == _BiIter() 00404 || std::distance(_M_begin, 00405 *_M_states._M_get_sol_pos()) 00406 < std::distance(_M_begin, _M_current)) 00407 { 00408 *_M_states._M_get_sol_pos() = _M_current; 00409 _M_results = _M_cur_results; 00410 } 00411 } 00412 } 00413 } 00414 else 00415 { 00416 if (_M_current == _M_begin 00417 && (_M_flags & regex_constants::match_not_null)) 00418 return; 00419 if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) 00420 if (!_M_has_sol) 00421 { 00422 _M_has_sol = true; 00423 _M_results = _M_cur_results; 00424 } 00425 } 00426 } 00427 00428 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00429 bool __dfs_mode> 00430 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00431 _M_handle_alternative(_Match_mode __match_mode, _StateIdT __i) 00432 { 00433 const auto& __state = _M_nfa[__i]; 00434 00435 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00436 { 00437 // TODO: Fix BFS support. It is wrong. 00438 _M_dfs(__match_mode, __state._M_alt); 00439 // Pick lhs if it matches. Only try rhs if it doesn't. 00440 if (!_M_has_sol) 00441 _M_dfs(__match_mode, __state._M_next); 00442 } 00443 else 00444 { 00445 // Try both and compare the result. 00446 // See "case _S_opcode_accept:" handling above. 00447 _M_dfs(__match_mode, __state._M_alt); 00448 auto __has_sol = _M_has_sol; 00449 _M_has_sol = false; 00450 _M_dfs(__match_mode, __state._M_next); 00451 _M_has_sol |= __has_sol; 00452 } 00453 } 00454 00455 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00456 bool __dfs_mode> 00457 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00458 _M_dfs(_Match_mode __match_mode, _StateIdT __i) 00459 { 00460 if (_M_states._M_visited(__i)) 00461 return; 00462 00463 switch (_M_nfa[__i]._M_opcode()) 00464 { 00465 case _S_opcode_repeat: 00466 _M_handle_repeat(__match_mode, __i); break; 00467 case _S_opcode_subexpr_begin: 00468 _M_handle_subexpr_begin(__match_mode, __i); break; 00469 case _S_opcode_subexpr_end: 00470 _M_handle_subexpr_end(__match_mode, __i); break; 00471 case _S_opcode_line_begin_assertion: 00472 _M_handle_line_begin_assertion(__match_mode, __i); break; 00473 case _S_opcode_line_end_assertion: 00474 _M_handle_line_end_assertion(__match_mode, __i); break; 00475 case _S_opcode_word_boundary: 00476 _M_handle_word_boundary(__match_mode, __i); break; 00477 case _S_opcode_subexpr_lookahead: 00478 _M_handle_subexpr_lookahead(__match_mode, __i); break; 00479 case _S_opcode_match: 00480 _M_handle_match(__match_mode, __i); break; 00481 case _S_opcode_backref: 00482 _M_handle_backref(__match_mode, __i); break; 00483 case _S_opcode_accept: 00484 _M_handle_accept(__match_mode, __i); break; 00485 case _S_opcode_alternative: 00486 _M_handle_alternative(__match_mode, __i); break; 00487 default: 00488 __glibcxx_assert(false); 00489 } 00490 } 00491 00492 // Return whether now is at some word boundary. 00493 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00494 bool __dfs_mode> 00495 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00496 _M_word_boundary() const 00497 { 00498 if (_M_current == _M_begin && (_M_flags & regex_constants::match_not_bow)) 00499 return false; 00500 if (_M_current == _M_end && (_M_flags & regex_constants::match_not_eow)) 00501 return false; 00502 00503 bool __left_is_word = false; 00504 if (_M_current != _M_begin 00505 || (_M_flags & regex_constants::match_prev_avail)) 00506 { 00507 auto __prev = _M_current; 00508 if (_M_is_word(*std::prev(__prev))) 00509 __left_is_word = true; 00510 } 00511 bool __right_is_word = 00512 _M_current != _M_end && _M_is_word(*_M_current); 00513 00514 return __left_is_word != __right_is_word; 00515 } 00516 00517 _GLIBCXX_END_NAMESPACE_VERSION 00518 } // namespace __detail 00519 } // namespace