libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_scanner.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 00034 // and awk 00035 // 1) grep is basic except '\n' is treated as '|' 00036 // 2) egrep is extended except '\n' is treated as '|' 00037 // 3) awk is extended except special escaping rules, and there's no 00038 // back-reference. 00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 namespace __detail 00052 { 00053 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00054 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 { 00087 _GLIBCXX_DEBUG_ASSERT(false); 00088 } 00089 } 00090 00091 // Differences between styles: 00092 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00093 // 2) "(?:", "(?=", "(?!" in ECMAScript. 00094 template<typename _CharT> 00095 void 00096 _Scanner<_CharT>:: 00097 _M_scan_normal() 00098 { 00099 auto __c = *_M_current++; 00100 const char* __pos; 00101 00102 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr) 00103 { 00104 _M_token = _S_token_ord_char; 00105 _M_value.assign(1, __c); 00106 return; 00107 } 00108 if (__c == '\\') 00109 { 00110 if (_M_current == _M_end) 00111 __throw_regex_error(regex_constants::error_escape); 00112 00113 if (!_M_is_basic() 00114 || (*_M_current != '(' 00115 && *_M_current != ')' 00116 && *_M_current != '{')) 00117 { 00118 (this->*_M_eat_escape)(); 00119 return; 00120 } 00121 __c = *_M_current++; 00122 } 00123 if (__c == '(') 00124 { 00125 if (_M_is_ecma() && *_M_current == '?') 00126 { 00127 if (++_M_current == _M_end) 00128 __throw_regex_error(regex_constants::error_paren); 00129 00130 if (*_M_current == ':') 00131 { 00132 ++_M_current; 00133 _M_token = _S_token_subexpr_no_group_begin; 00134 } 00135 else if (*_M_current == '=') 00136 { 00137 ++_M_current; 00138 _M_token = _S_token_subexpr_lookahead_begin; 00139 _M_value.assign(1, 'p'); 00140 } 00141 else if (*_M_current == '!') 00142 { 00143 ++_M_current; 00144 _M_token = _S_token_subexpr_lookahead_begin; 00145 _M_value.assign(1, 'n'); 00146 } 00147 else 00148 __throw_regex_error(regex_constants::error_paren); 00149 } 00150 else if (_M_flags & regex_constants::nosubs) 00151 _M_token = _S_token_subexpr_no_group_begin; 00152 else 00153 _M_token = _S_token_subexpr_begin; 00154 } 00155 else if (__c == ')') 00156 _M_token = _S_token_subexpr_end; 00157 else if (__c == '[') 00158 { 00159 _M_state = _S_state_in_bracket; 00160 _M_at_bracket_start = true; 00161 if (_M_current != _M_end && *_M_current == '^') 00162 { 00163 _M_token = _S_token_bracket_neg_begin; 00164 ++_M_current; 00165 } 00166 else 00167 _M_token = _S_token_bracket_begin; 00168 } 00169 else if (__c == '{') 00170 { 00171 _M_state = _S_state_in_brace; 00172 _M_token = _S_token_interval_begin; 00173 } 00174 else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'))) 00175 != nullptr 00176 && *__pos != '\0' 00177 && __c != ']' 00178 && __c != '}') 00179 || (_M_is_grep() && __c == '\n')) 00180 { 00181 auto __it = _M_token_tbl; 00182 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00183 for (; __it->first != '\0'; ++__it) 00184 if (__it->first == __narrowc) 00185 { 00186 _M_token = __it->second; 00187 return; 00188 } 00189 _GLIBCXX_DEBUG_ASSERT(false); 00190 } 00191 else 00192 { 00193 _M_token = _S_token_ord_char; 00194 _M_value.assign(1, __c); 00195 } 00196 } 00197 00198 // Differences between styles: 00199 // 1) different semantics of "[]" and "[^]". 00200 // 2) Escaping in bracket expr. 00201 template<typename _CharT> 00202 void 00203 _Scanner<_CharT>:: 00204 _M_scan_in_bracket() 00205 { 00206 if (_M_current == _M_end) 00207 __throw_regex_error(regex_constants::error_brack); 00208 00209 auto __c = *_M_current++; 00210 00211 if (__c == '[') 00212 { 00213 if (_M_current == _M_end) 00214 __throw_regex_error(regex_constants::error_brack); 00215 00216 if (*_M_current == '.') 00217 { 00218 _M_token = _S_token_collsymbol; 00219 _M_eat_class(*_M_current++); 00220 } 00221 else if (*_M_current == ':') 00222 { 00223 _M_token = _S_token_char_class_name; 00224 _M_eat_class(*_M_current++); 00225 } 00226 else if (*_M_current == '=') 00227 { 00228 _M_token = _S_token_equiv_class_name; 00229 _M_eat_class(*_M_current++); 00230 } 00231 else 00232 { 00233 _M_token = _S_token_ord_char; 00234 _M_value.assign(1, __c); 00235 } 00236 } 00237 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00238 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 00239 // `*/empty_range.cc`. 00240 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00241 { 00242 _M_token = _S_token_bracket_end; 00243 _M_state = _S_state_normal; 00244 } 00245 // ECMAScript and awk permits escaping in bracket. 00246 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00247 (this->*_M_eat_escape)(); 00248 else 00249 { 00250 _M_token = _S_token_ord_char; 00251 _M_value.assign(1, __c); 00252 } 00253 _M_at_bracket_start = false; 00254 } 00255 00256 // Differences between styles: 00257 // 1) "\}" in basic style. 00258 template<typename _CharT> 00259 void 00260 _Scanner<_CharT>:: 00261 _M_scan_in_brace() 00262 { 00263 if (_M_current == _M_end) 00264 __throw_regex_error(regex_constants::error_brace); 00265 00266 auto __c = *_M_current++; 00267 00268 if (_M_ctype.is(_CtypeT::digit, __c)) 00269 { 00270 _M_token = _S_token_dup_count; 00271 _M_value.assign(1, __c); 00272 while (_M_current != _M_end 00273 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00274 _M_value += *_M_current++; 00275 } 00276 else if (__c == ',') 00277 _M_token = _S_token_comma; 00278 // basic use \}. 00279 else if (_M_is_basic()) 00280 { 00281 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00282 { 00283 _M_state = _S_state_normal; 00284 _M_token = _S_token_interval_end; 00285 ++_M_current; 00286 } 00287 else 00288 __throw_regex_error(regex_constants::error_badbrace); 00289 } 00290 else if (__c == '}') 00291 { 00292 _M_state = _S_state_normal; 00293 _M_token = _S_token_interval_end; 00294 } 00295 else 00296 __throw_regex_error(regex_constants::error_badbrace); 00297 } 00298 00299 template<typename _CharT> 00300 void 00301 _Scanner<_CharT>:: 00302 _M_eat_escape_ecma() 00303 { 00304 if (_M_current == _M_end) 00305 __throw_regex_error(regex_constants::error_escape); 00306 00307 auto __c = *_M_current++; 00308 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00309 00310 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00311 { 00312 _M_token = _S_token_ord_char; 00313 _M_value.assign(1, *__pos); 00314 } 00315 else if (__c == 'b') 00316 { 00317 _M_token = _S_token_word_bound; 00318 _M_value.assign(1, 'p'); 00319 } 00320 else if (__c == 'B') 00321 { 00322 _M_token = _S_token_word_bound; 00323 _M_value.assign(1, 'n'); 00324 } 00325 // N3376 28.13 00326 else if (__c == 'd' 00327 || __c == 'D' 00328 || __c == 's' 00329 || __c == 'S' 00330 || __c == 'w' 00331 || __c == 'W') 00332 { 00333 _M_token = _S_token_quoted_class; 00334 _M_value.assign(1, __c); 00335 } 00336 else if (__c == 'c') 00337 { 00338 if (_M_current == _M_end) 00339 __throw_regex_error(regex_constants::error_escape); 00340 _M_token = _S_token_ord_char; 00341 _M_value.assign(1, *_M_current++); 00342 } 00343 else if (__c == 'x' || __c == 'u') 00344 { 00345 _M_value.erase(); 00346 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 00347 { 00348 if (_M_current == _M_end 00349 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00350 __throw_regex_error(regex_constants::error_escape); 00351 _M_value += *_M_current++; 00352 } 00353 _M_token = _S_token_hex_num; 00354 } 00355 // ECMAScript recognizes multi-digit back-references. 00356 else if (_M_ctype.is(_CtypeT::digit, __c)) 00357 { 00358 _M_value.assign(1, __c); 00359 while (_M_current != _M_end 00360 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00361 _M_value += *_M_current++; 00362 _M_token = _S_token_backref; 00363 } 00364 else 00365 { 00366 _M_token = _S_token_ord_char; 00367 _M_value.assign(1, __c); 00368 } 00369 } 00370 00371 // Differences between styles: 00372 // 1) Extended doesn't support backref, but basic does. 00373 template<typename _CharT> 00374 void 00375 _Scanner<_CharT>:: 00376 _M_eat_escape_posix() 00377 { 00378 if (_M_current == _M_end) 00379 __throw_regex_error(regex_constants::error_escape); 00380 00381 auto __c = *_M_current; 00382 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00383 00384 if (__pos != nullptr && *__pos != '\0') 00385 { 00386 _M_token = _S_token_ord_char; 00387 _M_value.assign(1, __c); 00388 } 00389 // We MUST judge awk before handling backrefs. There's no backref in awk. 00390 else if (_M_is_awk()) 00391 { 00392 _M_eat_escape_awk(); 00393 return; 00394 } 00395 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00396 { 00397 _M_token = _S_token_backref; 00398 _M_value.assign(1, __c); 00399 } 00400 else 00401 { 00402 #ifdef __STRICT_ANSI__ 00403 // POSIX says it is undefined to escape ordinary characters 00404 __throw_regex_error(regex_constants::error_escape); 00405 #else 00406 _M_token = _S_token_ord_char; 00407 _M_value.assign(1, __c); 00408 #endif 00409 } 00410 ++_M_current; 00411 } 00412 00413 template<typename _CharT> 00414 void 00415 _Scanner<_CharT>:: 00416 _M_eat_escape_awk() 00417 { 00418 auto __c = *_M_current++; 00419 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00420 00421 if (__pos != nullptr) 00422 { 00423 _M_token = _S_token_ord_char; 00424 _M_value.assign(1, *__pos); 00425 } 00426 // \ddd for oct representation 00427 else if (_M_ctype.is(_CtypeT::digit, __c) 00428 && __c != '8' 00429 && __c != '9') 00430 { 00431 _M_value.assign(1, __c); 00432 for (int __i = 0; 00433 __i < 2 00434 && _M_current != _M_end 00435 && _M_ctype.is(_CtypeT::digit, *_M_current) 00436 && *_M_current != '8' 00437 && *_M_current != '9'; 00438 __i++) 00439 _M_value += *_M_current++; 00440 _M_token = _S_token_oct_num; 00441 return; 00442 } 00443 else 00444 __throw_regex_error(regex_constants::error_escape); 00445 } 00446 00447 // Eats a character class or throws an exception. 00448 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 00449 // returning. 00450 template<typename _CharT> 00451 void 00452 _Scanner<_CharT>:: 00453 _M_eat_class(char __ch) 00454 { 00455 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00456 _M_value += *_M_current++; 00457 if (_M_current == _M_end 00458 || *_M_current++ != __ch 00459 || _M_current == _M_end // skip __ch 00460 || *_M_current++ != ']') // skip ']' 00461 { 00462 if (__ch == ':') 00463 __throw_regex_error(regex_constants::error_ctype); 00464 else 00465 __throw_regex_error(regex_constants::error_collate); 00466 } 00467 } 00468 00469 #ifdef _GLIBCXX_DEBUG 00470 template<typename _CharT> 00471 std::ostream& 00472 _Scanner<_CharT>:: 00473 _M_print(std::ostream& ostr) 00474 { 00475 switch (_M_token) 00476 { 00477 case _S_token_anychar: 00478 ostr << "any-character\n"; 00479 break; 00480 case _S_token_backref: 00481 ostr << "backref\n"; 00482 break; 00483 case _S_token_bracket_begin: 00484 ostr << "bracket-begin\n"; 00485 break; 00486 case _S_token_bracket_neg_begin: 00487 ostr << "bracket-neg-begin\n"; 00488 break; 00489 case _S_token_bracket_end: 00490 ostr << "bracket-end\n"; 00491 break; 00492 case _S_token_char_class_name: 00493 ostr << "char-class-name \"" << _M_value << "\"\n"; 00494 break; 00495 case _S_token_closure0: 00496 ostr << "closure0\n"; 00497 break; 00498 case _S_token_closure1: 00499 ostr << "closure1\n"; 00500 break; 00501 case _S_token_collsymbol: 00502 ostr << "collsymbol \"" << _M_value << "\"\n"; 00503 break; 00504 case _S_token_comma: 00505 ostr << "comma\n"; 00506 break; 00507 case _S_token_dup_count: 00508 ostr << "dup count: " << _M_value << "\n"; 00509 break; 00510 case _S_token_eof: 00511 ostr << "EOF\n"; 00512 break; 00513 case _S_token_equiv_class_name: 00514 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00515 break; 00516 case _S_token_interval_begin: 00517 ostr << "interval begin\n"; 00518 break; 00519 case _S_token_interval_end: 00520 ostr << "interval end\n"; 00521 break; 00522 case _S_token_line_begin: 00523 ostr << "line begin\n"; 00524 break; 00525 case _S_token_line_end: 00526 ostr << "line end\n"; 00527 break; 00528 case _S_token_opt: 00529 ostr << "opt\n"; 00530 break; 00531 case _S_token_or: 00532 ostr << "or\n"; 00533 break; 00534 case _S_token_ord_char: 00535 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00536 break; 00537 case _S_token_subexpr_begin: 00538 ostr << "subexpr begin\n"; 00539 break; 00540 case _S_token_subexpr_no_group_begin: 00541 ostr << "no grouping subexpr begin\n"; 00542 break; 00543 case _S_token_subexpr_lookahead_begin: 00544 ostr << "lookahead subexpr begin\n"; 00545 break; 00546 case _S_token_subexpr_end: 00547 ostr << "subexpr end\n"; 00548 break; 00549 case _S_token_unknown: 00550 ostr << "-- unknown token --\n"; 00551 break; 00552 case _S_token_oct_num: 00553 ostr << "oct number " << _M_value << "\n"; 00554 break; 00555 case _S_token_hex_num: 00556 ostr << "hex number " << _M_value << "\n"; 00557 break; 00558 case _S_token_quoted_class: 00559 ostr << "quoted class " << "\\" << _M_value << "\n"; 00560 break; 00561 default: 00562 _GLIBCXX_DEBUG_ASSERT(false); 00563 } 00564 return ostr; 00565 } 00566 #endif 00567 00568 _GLIBCXX_END_NAMESPACE_VERSION 00569 } // namespace __detail 00570 } // namespace