Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1 | // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "src/regexp/regexp-parser.h" |
| 6 | |
| 7 | #include "src/char-predicates-inl.h" |
| 8 | #include "src/factory.h" |
| 9 | #include "src/isolate.h" |
| 10 | #include "src/objects-inl.h" |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 11 | #include "src/ostreams.h" |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 12 | #include "src/regexp/jsregexp.h" |
| 13 | #include "src/utils.h" |
| 14 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 15 | #ifdef V8_I18N_SUPPORT |
| 16 | #include "unicode/uset.h" |
| 17 | #endif // V8_I18N_SUPPORT |
| 18 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 19 | namespace v8 { |
| 20 | namespace internal { |
| 21 | |
| 22 | RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 23 | JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 24 | : isolate_(isolate), |
| 25 | zone_(zone), |
| 26 | error_(error), |
| 27 | captures_(NULL), |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 28 | named_captures_(NULL), |
| 29 | named_back_references_(NULL), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 30 | in_(in), |
| 31 | current_(kEndMarker), |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 32 | ignore_case_(flags & JSRegExp::kIgnoreCase), |
| 33 | multiline_(flags & JSRegExp::kMultiline), |
| 34 | unicode_(flags & JSRegExp::kUnicode), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 35 | next_pos_(0), |
| 36 | captures_started_(0), |
| 37 | capture_count_(0), |
| 38 | has_more_(true), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 39 | simple_(false), |
| 40 | contains_anchor_(false), |
| 41 | is_scanned_for_captures_(false), |
| 42 | failed_(false) { |
| 43 | Advance(); |
| 44 | } |
| 45 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 46 | template <bool update_position> |
| 47 | inline uc32 RegExpParser::ReadNext() { |
| 48 | int position = next_pos_; |
| 49 | uc32 c0 = in()->Get(position); |
| 50 | position++; |
| 51 | // Read the whole surrogate pair in case of unicode flag, if possible. |
| 52 | if (unicode() && position < in()->length() && |
| 53 | unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
| 54 | uc16 c1 = in()->Get(position); |
| 55 | if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| 56 | c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
| 57 | position++; |
| 58 | } |
| 59 | } |
| 60 | if (update_position) next_pos_ = position; |
| 61 | return c0; |
| 62 | } |
| 63 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 64 | |
| 65 | uc32 RegExpParser::Next() { |
| 66 | if (has_next()) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 67 | return ReadNext<false>(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 68 | } else { |
| 69 | return kEndMarker; |
| 70 | } |
| 71 | } |
| 72 | |
| 73 | |
| 74 | void RegExpParser::Advance() { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 75 | if (has_next()) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 76 | StackLimitCheck check(isolate()); |
| 77 | if (check.HasOverflowed()) { |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 78 | ReportError(CStrVector( |
| 79 | MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 80 | } else if (zone()->excess_allocation()) { |
| 81 | ReportError(CStrVector("Regular expression too large")); |
| 82 | } else { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 83 | current_ = ReadNext<true>(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 84 | } |
| 85 | } else { |
| 86 | current_ = kEndMarker; |
| 87 | // Advance so that position() points to 1-after-the-last-character. This is |
| 88 | // important so that Reset() to this position works correctly. |
| 89 | next_pos_ = in()->length() + 1; |
| 90 | has_more_ = false; |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | |
| 95 | void RegExpParser::Reset(int pos) { |
| 96 | next_pos_ = pos; |
| 97 | has_more_ = (pos < in()->length()); |
| 98 | Advance(); |
| 99 | } |
| 100 | |
| 101 | |
| 102 | void RegExpParser::Advance(int dist) { |
| 103 | next_pos_ += dist - 1; |
| 104 | Advance(); |
| 105 | } |
| 106 | |
| 107 | |
| 108 | bool RegExpParser::simple() { return simple_; } |
| 109 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 110 | bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
| 111 | switch (c) { |
| 112 | case '^': |
| 113 | case '$': |
| 114 | case '\\': |
| 115 | case '.': |
| 116 | case '*': |
| 117 | case '+': |
| 118 | case '?': |
| 119 | case '(': |
| 120 | case ')': |
| 121 | case '[': |
| 122 | case ']': |
| 123 | case '{': |
| 124 | case '}': |
| 125 | case '|': |
| 126 | case '/': |
| 127 | return true; |
| 128 | default: |
| 129 | break; |
| 130 | } |
| 131 | return false; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 132 | } |
| 133 | |
| 134 | |
| 135 | RegExpTree* RegExpParser::ReportError(Vector<const char> message) { |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 136 | if (failed_) return NULL; // Do not overwrite any existing error. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 137 | failed_ = true; |
| 138 | *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); |
| 139 | // Zip to the end to make sure the no more input is read. |
| 140 | current_ = kEndMarker; |
| 141 | next_pos_ = in()->length(); |
| 142 | return NULL; |
| 143 | } |
| 144 | |
| 145 | |
| 146 | #define CHECK_FAILED /**/); \ |
| 147 | if (failed_) return NULL; \ |
| 148 | ((void)0 |
| 149 | |
| 150 | |
| 151 | // Pattern :: |
| 152 | // Disjunction |
| 153 | RegExpTree* RegExpParser::ParsePattern() { |
| 154 | RegExpTree* result = ParseDisjunction(CHECK_FAILED); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 155 | PatchNamedBackReferences(CHECK_FAILED); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 156 | DCHECK(!has_more()); |
| 157 | // If the result of parsing is a literal string atom, and it has the |
| 158 | // same length as the input, then the atom is identical to the input. |
| 159 | if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { |
| 160 | simple_ = true; |
| 161 | } |
| 162 | return result; |
| 163 | } |
| 164 | |
| 165 | |
| 166 | // Disjunction :: |
| 167 | // Alternative |
| 168 | // Alternative | Disjunction |
| 169 | // Alternative :: |
| 170 | // [empty] |
| 171 | // Term Alternative |
| 172 | // Term :: |
| 173 | // Assertion |
| 174 | // Atom |
| 175 | // Atom Quantifier |
| 176 | RegExpTree* RegExpParser::ParseDisjunction() { |
| 177 | // Used to store current state while parsing subexpressions. |
| 178 | RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 179 | nullptr, ignore_case(), unicode(), zone()); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 180 | RegExpParserState* state = &initial_state; |
| 181 | // Cache the builder in a local variable for quick access. |
| 182 | RegExpBuilder* builder = initial_state.builder(); |
| 183 | while (true) { |
| 184 | switch (current()) { |
| 185 | case kEndMarker: |
| 186 | if (state->IsSubexpression()) { |
| 187 | // Inside a parenthesized group when hitting end of input. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 188 | return ReportError(CStrVector("Unterminated group")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 189 | } |
| 190 | DCHECK_EQ(INITIAL, state->group_type()); |
| 191 | // Parsing completed successfully. |
| 192 | return builder->ToRegExp(); |
| 193 | case ')': { |
| 194 | if (!state->IsSubexpression()) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 195 | return ReportError(CStrVector("Unmatched ')'")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 196 | } |
| 197 | DCHECK_NE(INITIAL, state->group_type()); |
| 198 | |
| 199 | Advance(); |
| 200 | // End disjunction parsing and convert builder content to new single |
| 201 | // regexp atom. |
| 202 | RegExpTree* body = builder->ToRegExp(); |
| 203 | |
| 204 | int end_capture_index = captures_started(); |
| 205 | |
| 206 | int capture_index = state->capture_index(); |
| 207 | SubexpressionType group_type = state->group_type(); |
| 208 | |
| 209 | // Build result of subexpression. |
| 210 | if (group_type == CAPTURE) { |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 211 | if (state->IsNamedCapture()) { |
| 212 | CreateNamedCaptureAtIndex(state->capture_name(), |
| 213 | capture_index CHECK_FAILED); |
| 214 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 215 | RegExpCapture* capture = GetCapture(capture_index); |
| 216 | capture->set_body(body); |
| 217 | body = capture; |
| 218 | } else if (group_type != GROUPING) { |
| 219 | DCHECK(group_type == POSITIVE_LOOKAROUND || |
| 220 | group_type == NEGATIVE_LOOKAROUND); |
| 221 | bool is_positive = (group_type == POSITIVE_LOOKAROUND); |
| 222 | body = new (zone()) RegExpLookaround( |
| 223 | body, is_positive, end_capture_index - capture_index, |
| 224 | capture_index, state->lookaround_type()); |
| 225 | } |
| 226 | |
| 227 | // Restore previous state. |
| 228 | state = state->previous_state(); |
| 229 | builder = state->builder(); |
| 230 | |
| 231 | builder->AddAtom(body); |
| 232 | // For compatability with JSC and ES3, we allow quantifiers after |
| 233 | // lookaheads, and break in all cases. |
| 234 | break; |
| 235 | } |
| 236 | case '|': { |
| 237 | Advance(); |
| 238 | builder->NewAlternative(); |
| 239 | continue; |
| 240 | } |
| 241 | case '*': |
| 242 | case '+': |
| 243 | case '?': |
| 244 | return ReportError(CStrVector("Nothing to repeat")); |
| 245 | case '^': { |
| 246 | Advance(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 247 | if (multiline()) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 248 | builder->AddAssertion( |
| 249 | new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
| 250 | } else { |
| 251 | builder->AddAssertion( |
| 252 | new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
| 253 | set_contains_anchor(); |
| 254 | } |
| 255 | continue; |
| 256 | } |
| 257 | case '$': { |
| 258 | Advance(); |
| 259 | RegExpAssertion::AssertionType assertion_type = |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 260 | multiline() ? RegExpAssertion::END_OF_LINE |
| 261 | : RegExpAssertion::END_OF_INPUT; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 262 | builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
| 263 | continue; |
| 264 | } |
| 265 | case '.': { |
| 266 | Advance(); |
| 267 | // everything except \x0a, \x0d, \u2028 and \u2029 |
| 268 | ZoneList<CharacterRange>* ranges = |
| 269 | new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 270 | CharacterRange::AddClassEscape('.', ranges, zone()); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 271 | RegExpCharacterClass* cc = |
| 272 | new (zone()) RegExpCharacterClass(ranges, false); |
| 273 | builder->AddCharacterClass(cc); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 274 | break; |
| 275 | } |
| 276 | case '(': { |
| 277 | SubexpressionType subexpr_type = CAPTURE; |
| 278 | RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 279 | bool is_named_capture = false; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 280 | Advance(); |
| 281 | if (current() == '?') { |
| 282 | switch (Next()) { |
| 283 | case ':': |
| 284 | subexpr_type = GROUPING; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 285 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 286 | break; |
| 287 | case '=': |
| 288 | lookaround_type = RegExpLookaround::LOOKAHEAD; |
| 289 | subexpr_type = POSITIVE_LOOKAROUND; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 290 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 291 | break; |
| 292 | case '!': |
| 293 | lookaround_type = RegExpLookaround::LOOKAHEAD; |
| 294 | subexpr_type = NEGATIVE_LOOKAROUND; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 295 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 296 | break; |
| 297 | case '<': |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 298 | Advance(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 299 | if (FLAG_harmony_regexp_lookbehind) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 300 | if (Next() == '=') { |
| 301 | subexpr_type = POSITIVE_LOOKAROUND; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 302 | lookaround_type = RegExpLookaround::LOOKBEHIND; |
| 303 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 304 | break; |
| 305 | } else if (Next() == '!') { |
| 306 | subexpr_type = NEGATIVE_LOOKAROUND; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 307 | lookaround_type = RegExpLookaround::LOOKBEHIND; |
| 308 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 309 | break; |
| 310 | } |
| 311 | } |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 312 | if (FLAG_harmony_regexp_named_captures && unicode()) { |
| 313 | is_named_capture = true; |
| 314 | Advance(); |
| 315 | break; |
| 316 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 317 | // Fall through. |
| 318 | default: |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 319 | return ReportError(CStrVector("Invalid group")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 320 | } |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 321 | } |
| 322 | |
| 323 | const ZoneVector<uc16>* capture_name = nullptr; |
| 324 | if (subexpr_type == CAPTURE) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 325 | if (captures_started_ >= kMaxCaptures) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 326 | return ReportError(CStrVector("Too many captures")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 327 | } |
| 328 | captures_started_++; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 329 | |
| 330 | if (is_named_capture) { |
| 331 | capture_name = ParseCaptureGroupName(CHECK_FAILED); |
| 332 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 333 | } |
| 334 | // Store current state and begin new disjunction parsing. |
| 335 | state = new (zone()) RegExpParserState( |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 336 | state, subexpr_type, lookaround_type, captures_started_, |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 337 | capture_name, ignore_case(), unicode(), zone()); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 338 | builder = state->builder(); |
| 339 | continue; |
| 340 | } |
| 341 | case '[': { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 342 | RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); |
| 343 | builder->AddCharacterClass(cc->AsCharacterClass()); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 344 | break; |
| 345 | } |
| 346 | // Atom :: |
| 347 | // \ AtomEscape |
| 348 | case '\\': |
| 349 | switch (Next()) { |
| 350 | case kEndMarker: |
| 351 | return ReportError(CStrVector("\\ at end of pattern")); |
| 352 | case 'b': |
| 353 | Advance(2); |
| 354 | builder->AddAssertion( |
| 355 | new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); |
| 356 | continue; |
| 357 | case 'B': |
| 358 | Advance(2); |
| 359 | builder->AddAssertion( |
| 360 | new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); |
| 361 | continue; |
| 362 | // AtomEscape :: |
| 363 | // CharacterClassEscape |
| 364 | // |
| 365 | // CharacterClassEscape :: one of |
| 366 | // d D s S w W |
| 367 | case 'd': |
| 368 | case 'D': |
| 369 | case 's': |
| 370 | case 'S': |
| 371 | case 'w': |
| 372 | case 'W': { |
| 373 | uc32 c = Next(); |
| 374 | Advance(2); |
| 375 | ZoneList<CharacterRange>* ranges = |
| 376 | new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 377 | CharacterRange::AddClassEscape(c, ranges, zone()); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 378 | RegExpCharacterClass* cc = |
| 379 | new (zone()) RegExpCharacterClass(ranges, false); |
| 380 | builder->AddCharacterClass(cc); |
| 381 | break; |
| 382 | } |
| 383 | case 'p': |
| 384 | case 'P': { |
| 385 | uc32 p = Next(); |
| 386 | Advance(2); |
| 387 | if (unicode()) { |
| 388 | if (FLAG_harmony_regexp_property) { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 389 | ZoneList<CharacterRange>* ranges = |
| 390 | new (zone()) ZoneList<CharacterRange>(2, zone()); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 391 | if (!ParsePropertyClass(ranges, p == 'P')) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 392 | return ReportError(CStrVector("Invalid property name")); |
| 393 | } |
| 394 | RegExpCharacterClass* cc = |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 395 | new (zone()) RegExpCharacterClass(ranges, false); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 396 | builder->AddCharacterClass(cc); |
| 397 | } else { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 398 | // With /u, no identity escapes except for syntax characters |
| 399 | // are allowed. Otherwise, all identity escapes are allowed. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 400 | return ReportError(CStrVector("Invalid escape")); |
| 401 | } |
| 402 | } else { |
| 403 | builder->AddCharacter(p); |
| 404 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 405 | break; |
| 406 | } |
| 407 | case '1': |
| 408 | case '2': |
| 409 | case '3': |
| 410 | case '4': |
| 411 | case '5': |
| 412 | case '6': |
| 413 | case '7': |
| 414 | case '8': |
| 415 | case '9': { |
| 416 | int index = 0; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 417 | bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); |
| 418 | if (is_backref) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 419 | if (state->IsInsideCaptureGroup(index)) { |
| 420 | // The back reference is inside the capture group it refers to. |
| 421 | // Nothing can possibly have been captured yet, so we use empty |
| 422 | // instead. This ensures that, when checking a back reference, |
| 423 | // the capture registers of the referenced capture are either |
| 424 | // both set or both cleared. |
| 425 | builder->AddEmpty(); |
| 426 | } else { |
| 427 | RegExpCapture* capture = GetCapture(index); |
| 428 | RegExpTree* atom = new (zone()) RegExpBackReference(capture); |
| 429 | builder->AddAtom(atom); |
| 430 | } |
| 431 | break; |
| 432 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 433 | // With /u, no identity escapes except for syntax characters |
| 434 | // are allowed. Otherwise, all identity escapes are allowed. |
| 435 | if (unicode()) { |
| 436 | return ReportError(CStrVector("Invalid escape")); |
| 437 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 438 | uc32 first_digit = Next(); |
| 439 | if (first_digit == '8' || first_digit == '9') { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 440 | builder->AddCharacter(first_digit); |
| 441 | Advance(2); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 442 | break; |
| 443 | } |
| 444 | } |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 445 | // Fall through. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 446 | case '0': { |
| 447 | Advance(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 448 | if (unicode() && Next() >= '0' && Next() <= '9') { |
| 449 | // With /u, decimal escape with leading 0 are not parsed as octal. |
| 450 | return ReportError(CStrVector("Invalid decimal escape")); |
| 451 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 452 | uc32 octal = ParseOctalLiteral(); |
| 453 | builder->AddCharacter(octal); |
| 454 | break; |
| 455 | } |
| 456 | // ControlEscape :: one of |
| 457 | // f n r t v |
| 458 | case 'f': |
| 459 | Advance(2); |
| 460 | builder->AddCharacter('\f'); |
| 461 | break; |
| 462 | case 'n': |
| 463 | Advance(2); |
| 464 | builder->AddCharacter('\n'); |
| 465 | break; |
| 466 | case 'r': |
| 467 | Advance(2); |
| 468 | builder->AddCharacter('\r'); |
| 469 | break; |
| 470 | case 't': |
| 471 | Advance(2); |
| 472 | builder->AddCharacter('\t'); |
| 473 | break; |
| 474 | case 'v': |
| 475 | Advance(2); |
| 476 | builder->AddCharacter('\v'); |
| 477 | break; |
| 478 | case 'c': { |
| 479 | Advance(); |
| 480 | uc32 controlLetter = Next(); |
| 481 | // Special case if it is an ASCII letter. |
| 482 | // Convert lower case letters to uppercase. |
| 483 | uc32 letter = controlLetter & ~('a' ^ 'A'); |
| 484 | if (letter < 'A' || 'Z' < letter) { |
| 485 | // controlLetter is not in range 'A'-'Z' or 'a'-'z'. |
| 486 | // This is outside the specification. We match JSC in |
| 487 | // reading the backslash as a literal character instead |
| 488 | // of as starting an escape. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 489 | if (unicode()) { |
| 490 | // With /u, invalid escapes are not treated as identity escapes. |
| 491 | return ReportError(CStrVector("Invalid unicode escape")); |
| 492 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 493 | builder->AddCharacter('\\'); |
| 494 | } else { |
| 495 | Advance(2); |
| 496 | builder->AddCharacter(controlLetter & 0x1f); |
| 497 | } |
| 498 | break; |
| 499 | } |
| 500 | case 'x': { |
| 501 | Advance(2); |
| 502 | uc32 value; |
| 503 | if (ParseHexEscape(2, &value)) { |
| 504 | builder->AddCharacter(value); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 505 | } else if (!unicode()) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 506 | builder->AddCharacter('x'); |
| 507 | } else { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 508 | // With /u, invalid escapes are not treated as identity escapes. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 509 | return ReportError(CStrVector("Invalid escape")); |
| 510 | } |
| 511 | break; |
| 512 | } |
| 513 | case 'u': { |
| 514 | Advance(2); |
| 515 | uc32 value; |
| 516 | if (ParseUnicodeEscape(&value)) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 517 | builder->AddEscapedUnicodeCharacter(value); |
| 518 | } else if (!unicode()) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 519 | builder->AddCharacter('u'); |
| 520 | } else { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 521 | // With /u, invalid escapes are not treated as identity escapes. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 522 | return ReportError(CStrVector("Invalid unicode escape")); |
| 523 | } |
| 524 | break; |
| 525 | } |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 526 | case 'k': |
| 527 | if (FLAG_harmony_regexp_named_captures && unicode()) { |
| 528 | Advance(2); |
| 529 | ParseNamedBackReference(builder, state CHECK_FAILED); |
| 530 | break; |
| 531 | } |
| 532 | // Fall through. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 533 | default: |
| 534 | Advance(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 535 | // With /u, no identity escapes except for syntax characters |
| 536 | // are allowed. Otherwise, all identity escapes are allowed. |
| 537 | if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 538 | builder->AddCharacter(current()); |
| 539 | Advance(); |
| 540 | } else { |
| 541 | return ReportError(CStrVector("Invalid escape")); |
| 542 | } |
| 543 | break; |
| 544 | } |
| 545 | break; |
| 546 | case '{': { |
| 547 | int dummy; |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 548 | bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); |
| 549 | if (parsed) return ReportError(CStrVector("Nothing to repeat")); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 550 | // Fall through. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 551 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 552 | case '}': |
| 553 | case ']': |
| 554 | if (unicode()) { |
| 555 | return ReportError(CStrVector("Lone quantifier brackets")); |
| 556 | } |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 557 | // Fall through. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 558 | default: |
| 559 | builder->AddUnicodeCharacter(current()); |
| 560 | Advance(); |
| 561 | break; |
| 562 | } // end switch(current()) |
| 563 | |
| 564 | int min; |
| 565 | int max; |
| 566 | switch (current()) { |
| 567 | // QuantifierPrefix :: |
| 568 | // * |
| 569 | // + |
| 570 | // ? |
| 571 | // { |
| 572 | case '*': |
| 573 | min = 0; |
| 574 | max = RegExpTree::kInfinity; |
| 575 | Advance(); |
| 576 | break; |
| 577 | case '+': |
| 578 | min = 1; |
| 579 | max = RegExpTree::kInfinity; |
| 580 | Advance(); |
| 581 | break; |
| 582 | case '?': |
| 583 | min = 0; |
| 584 | max = 1; |
| 585 | Advance(); |
| 586 | break; |
| 587 | case '{': |
| 588 | if (ParseIntervalQuantifier(&min, &max)) { |
| 589 | if (max < min) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 590 | return ReportError( |
| 591 | CStrVector("numbers out of order in {} quantifier")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 592 | } |
| 593 | break; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 594 | } else if (unicode()) { |
| 595 | // With /u, incomplete quantifiers are not allowed. |
| 596 | return ReportError(CStrVector("Incomplete quantifier")); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 597 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 598 | continue; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 599 | default: |
| 600 | continue; |
| 601 | } |
| 602 | RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
| 603 | if (current() == '?') { |
| 604 | quantifier_type = RegExpQuantifier::NON_GREEDY; |
| 605 | Advance(); |
| 606 | } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
| 607 | // FLAG_regexp_possessive_quantifier is a debug-only flag. |
| 608 | quantifier_type = RegExpQuantifier::POSSESSIVE; |
| 609 | Advance(); |
| 610 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 611 | if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
| 612 | return ReportError(CStrVector("Invalid quantifier")); |
| 613 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 614 | } |
| 615 | } |
| 616 | |
| 617 | |
| 618 | #ifdef DEBUG |
| 619 | // Currently only used in an DCHECK. |
| 620 | static bool IsSpecialClassEscape(uc32 c) { |
| 621 | switch (c) { |
| 622 | case 'd': |
| 623 | case 'D': |
| 624 | case 's': |
| 625 | case 'S': |
| 626 | case 'w': |
| 627 | case 'W': |
| 628 | return true; |
| 629 | default: |
| 630 | return false; |
| 631 | } |
| 632 | } |
| 633 | #endif |
| 634 | |
| 635 | |
| 636 | // In order to know whether an escape is a backreference or not we have to scan |
| 637 | // the entire regexp and find the number of capturing parentheses. However we |
| 638 | // don't want to scan the regexp twice unless it is necessary. This mini-parser |
| 639 | // is called when needed. It can see the difference between capturing and |
| 640 | // noncapturing parentheses and can skip character classes and backslash-escaped |
| 641 | // characters. |
| 642 | void RegExpParser::ScanForCaptures() { |
| 643 | // Start with captures started previous to current position |
| 644 | int capture_count = captures_started(); |
| 645 | // Add count of captures after this position. |
| 646 | int n; |
| 647 | while ((n = current()) != kEndMarker) { |
| 648 | Advance(); |
| 649 | switch (n) { |
| 650 | case '\\': |
| 651 | Advance(); |
| 652 | break; |
| 653 | case '[': { |
| 654 | int c; |
| 655 | while ((c = current()) != kEndMarker) { |
| 656 | Advance(); |
| 657 | if (c == '\\') { |
| 658 | Advance(); |
| 659 | } else { |
| 660 | if (c == ']') break; |
| 661 | } |
| 662 | } |
| 663 | break; |
| 664 | } |
| 665 | case '(': |
| 666 | if (current() != '?') capture_count++; |
| 667 | break; |
| 668 | } |
| 669 | } |
| 670 | capture_count_ = capture_count; |
| 671 | is_scanned_for_captures_ = true; |
| 672 | } |
| 673 | |
| 674 | |
| 675 | bool RegExpParser::ParseBackReferenceIndex(int* index_out) { |
| 676 | DCHECK_EQ('\\', current()); |
| 677 | DCHECK('1' <= Next() && Next() <= '9'); |
| 678 | // Try to parse a decimal literal that is no greater than the total number |
| 679 | // of left capturing parentheses in the input. |
| 680 | int start = position(); |
| 681 | int value = Next() - '0'; |
| 682 | Advance(2); |
| 683 | while (true) { |
| 684 | uc32 c = current(); |
| 685 | if (IsDecimalDigit(c)) { |
| 686 | value = 10 * value + (c - '0'); |
| 687 | if (value > kMaxCaptures) { |
| 688 | Reset(start); |
| 689 | return false; |
| 690 | } |
| 691 | Advance(); |
| 692 | } else { |
| 693 | break; |
| 694 | } |
| 695 | } |
| 696 | if (value > captures_started()) { |
| 697 | if (!is_scanned_for_captures_) { |
| 698 | int saved_position = position(); |
| 699 | ScanForCaptures(); |
| 700 | Reset(saved_position); |
| 701 | } |
| 702 | if (value > capture_count_) { |
| 703 | Reset(start); |
| 704 | return false; |
| 705 | } |
| 706 | } |
| 707 | *index_out = value; |
| 708 | return true; |
| 709 | } |
| 710 | |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 711 | static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { |
| 712 | if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| 713 | v->push_back(code_unit); |
| 714 | } else { |
| 715 | v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| 716 | v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| 717 | } |
| 718 | } |
| 719 | |
| 720 | const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
| 721 | DCHECK(FLAG_harmony_regexp_named_captures); |
| 722 | DCHECK(unicode()); |
| 723 | |
| 724 | ZoneVector<uc16>* name = |
| 725 | new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
| 726 | |
| 727 | bool at_start = true; |
| 728 | while (true) { |
| 729 | uc32 c = current(); |
| 730 | Advance(); |
| 731 | |
| 732 | // Convert unicode escapes. |
| 733 | if (c == '\\' && current() == 'u') { |
| 734 | Advance(); |
| 735 | if (!ParseUnicodeEscape(&c)) { |
| 736 | ReportError(CStrVector("Invalid Unicode escape sequence")); |
| 737 | return nullptr; |
| 738 | } |
| 739 | } |
| 740 | |
| 741 | if (at_start) { |
| 742 | if (!IdentifierStart::Is(c)) { |
| 743 | ReportError(CStrVector("Invalid capture group name")); |
| 744 | return nullptr; |
| 745 | } |
| 746 | push_code_unit(name, c); |
| 747 | at_start = false; |
| 748 | } else { |
| 749 | if (c == '>') { |
| 750 | break; |
| 751 | } else if (IdentifierPart::Is(c)) { |
| 752 | push_code_unit(name, c); |
| 753 | } else { |
| 754 | ReportError(CStrVector("Invalid capture group name")); |
| 755 | return nullptr; |
| 756 | } |
| 757 | } |
| 758 | } |
| 759 | |
| 760 | return name; |
| 761 | } |
| 762 | |
| 763 | bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, |
| 764 | int index) { |
| 765 | DCHECK(FLAG_harmony_regexp_named_captures); |
| 766 | DCHECK(unicode()); |
| 767 | DCHECK(0 < index && index <= captures_started_); |
| 768 | DCHECK_NOT_NULL(name); |
| 769 | |
| 770 | if (named_captures_ == nullptr) { |
| 771 | named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); |
| 772 | } else { |
| 773 | // Check for duplicates and bail if we find any. |
| 774 | for (const auto& named_capture : *named_captures_) { |
| 775 | if (*named_capture->name() == *name) { |
| 776 | ReportError(CStrVector("Duplicate capture group name")); |
| 777 | return false; |
| 778 | } |
| 779 | } |
| 780 | } |
| 781 | |
| 782 | RegExpCapture* capture = GetCapture(index); |
| 783 | DCHECK(capture->name() == nullptr); |
| 784 | |
| 785 | capture->set_name(name); |
| 786 | named_captures_->Add(capture, zone()); |
| 787 | |
| 788 | return true; |
| 789 | } |
| 790 | |
| 791 | bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
| 792 | RegExpParserState* state) { |
| 793 | // The parser is assumed to be on the '<' in \k<name>. |
| 794 | if (current() != '<') { |
| 795 | ReportError(CStrVector("Invalid named reference")); |
| 796 | return false; |
| 797 | } |
| 798 | |
| 799 | Advance(); |
| 800 | const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
| 801 | if (name == nullptr) { |
| 802 | return false; |
| 803 | } |
| 804 | |
| 805 | if (state->IsInsideCaptureGroup(name)) { |
| 806 | builder->AddEmpty(); |
| 807 | } else { |
| 808 | RegExpBackReference* atom = new (zone()) RegExpBackReference(); |
| 809 | atom->set_name(name); |
| 810 | |
| 811 | builder->AddAtom(atom); |
| 812 | |
| 813 | if (named_back_references_ == nullptr) { |
| 814 | named_back_references_ = |
| 815 | new (zone()) ZoneList<RegExpBackReference*>(1, zone()); |
| 816 | } |
| 817 | named_back_references_->Add(atom, zone()); |
| 818 | } |
| 819 | |
| 820 | return true; |
| 821 | } |
| 822 | |
| 823 | void RegExpParser::PatchNamedBackReferences() { |
| 824 | if (named_back_references_ == nullptr) return; |
| 825 | |
| 826 | if (named_captures_ == nullptr) { |
| 827 | ReportError(CStrVector("Invalid named capture referenced")); |
| 828 | return; |
| 829 | } |
| 830 | |
| 831 | // Look up and patch the actual capture for each named back reference. |
| 832 | // TODO(jgruber): O(n^2), optimize if necessary. |
| 833 | |
| 834 | for (int i = 0; i < named_back_references_->length(); i++) { |
| 835 | RegExpBackReference* ref = named_back_references_->at(i); |
| 836 | |
| 837 | int index = -1; |
| 838 | for (const auto& capture : *named_captures_) { |
| 839 | if (*capture->name() == *ref->name()) { |
| 840 | index = capture->index(); |
| 841 | break; |
| 842 | } |
| 843 | } |
| 844 | |
| 845 | if (index == -1) { |
| 846 | ReportError(CStrVector("Invalid named capture referenced")); |
| 847 | return; |
| 848 | } |
| 849 | |
| 850 | ref->set_capture(GetCapture(index)); |
| 851 | } |
| 852 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 853 | |
| 854 | RegExpCapture* RegExpParser::GetCapture(int index) { |
| 855 | // The index for the capture groups are one-based. Its index in the list is |
| 856 | // zero-based. |
| 857 | int know_captures = |
| 858 | is_scanned_for_captures_ ? capture_count_ : captures_started_; |
| 859 | DCHECK(index <= know_captures); |
| 860 | if (captures_ == NULL) { |
| 861 | captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); |
| 862 | } |
| 863 | while (captures_->length() < know_captures) { |
| 864 | captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); |
| 865 | } |
| 866 | return captures_->at(index - 1); |
| 867 | } |
| 868 | |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 869 | Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { |
| 870 | if (named_captures_ == nullptr || named_captures_->is_empty()) |
| 871 | return Handle<FixedArray>(); |
| 872 | |
| 873 | Factory* factory = isolate()->factory(); |
| 874 | |
| 875 | int len = named_captures_->length() * 2; |
| 876 | Handle<FixedArray> array = factory->NewFixedArray(len); |
| 877 | |
| 878 | for (int i = 0; i < named_captures_->length(); i++) { |
| 879 | RegExpCapture* capture = named_captures_->at(i); |
| 880 | MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); |
| 881 | array->set(i * 2, *name.ToHandleChecked()); |
| 882 | array->set(i * 2 + 1, Smi::FromInt(capture->index())); |
| 883 | } |
| 884 | |
| 885 | return array; |
| 886 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 887 | |
| 888 | bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { |
| 889 | for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { |
| 890 | if (s->group_type() != CAPTURE) continue; |
| 891 | // Return true if we found the matching capture index. |
| 892 | if (index == s->capture_index()) return true; |
| 893 | // Abort if index is larger than what has been parsed up till this state. |
| 894 | if (index > s->capture_index()) return false; |
| 895 | } |
| 896 | return false; |
| 897 | } |
| 898 | |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 899 | bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( |
| 900 | const ZoneVector<uc16>* name) { |
| 901 | DCHECK_NOT_NULL(name); |
| 902 | for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { |
| 903 | if (s->capture_name() == nullptr) continue; |
| 904 | if (*s->capture_name() == *name) return true; |
| 905 | } |
| 906 | return false; |
| 907 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 908 | |
| 909 | // QuantifierPrefix :: |
| 910 | // { DecimalDigits } |
| 911 | // { DecimalDigits , } |
| 912 | // { DecimalDigits , DecimalDigits } |
| 913 | // |
| 914 | // Returns true if parsing succeeds, and set the min_out and max_out |
| 915 | // values. Values are truncated to RegExpTree::kInfinity if they overflow. |
| 916 | bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { |
| 917 | DCHECK_EQ(current(), '{'); |
| 918 | int start = position(); |
| 919 | Advance(); |
| 920 | int min = 0; |
| 921 | if (!IsDecimalDigit(current())) { |
| 922 | Reset(start); |
| 923 | return false; |
| 924 | } |
| 925 | while (IsDecimalDigit(current())) { |
| 926 | int next = current() - '0'; |
| 927 | if (min > (RegExpTree::kInfinity - next) / 10) { |
| 928 | // Overflow. Skip past remaining decimal digits and return -1. |
| 929 | do { |
| 930 | Advance(); |
| 931 | } while (IsDecimalDigit(current())); |
| 932 | min = RegExpTree::kInfinity; |
| 933 | break; |
| 934 | } |
| 935 | min = 10 * min + next; |
| 936 | Advance(); |
| 937 | } |
| 938 | int max = 0; |
| 939 | if (current() == '}') { |
| 940 | max = min; |
| 941 | Advance(); |
| 942 | } else if (current() == ',') { |
| 943 | Advance(); |
| 944 | if (current() == '}') { |
| 945 | max = RegExpTree::kInfinity; |
| 946 | Advance(); |
| 947 | } else { |
| 948 | while (IsDecimalDigit(current())) { |
| 949 | int next = current() - '0'; |
| 950 | if (max > (RegExpTree::kInfinity - next) / 10) { |
| 951 | do { |
| 952 | Advance(); |
| 953 | } while (IsDecimalDigit(current())); |
| 954 | max = RegExpTree::kInfinity; |
| 955 | break; |
| 956 | } |
| 957 | max = 10 * max + next; |
| 958 | Advance(); |
| 959 | } |
| 960 | if (current() != '}') { |
| 961 | Reset(start); |
| 962 | return false; |
| 963 | } |
| 964 | Advance(); |
| 965 | } |
| 966 | } else { |
| 967 | Reset(start); |
| 968 | return false; |
| 969 | } |
| 970 | *min_out = min; |
| 971 | *max_out = max; |
| 972 | return true; |
| 973 | } |
| 974 | |
| 975 | |
| 976 | uc32 RegExpParser::ParseOctalLiteral() { |
| 977 | DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); |
| 978 | // For compatibility with some other browsers (not all), we parse |
| 979 | // up to three octal digits with a value below 256. |
| 980 | uc32 value = current() - '0'; |
| 981 | Advance(); |
| 982 | if ('0' <= current() && current() <= '7') { |
| 983 | value = value * 8 + current() - '0'; |
| 984 | Advance(); |
| 985 | if (value < 32 && '0' <= current() && current() <= '7') { |
| 986 | value = value * 8 + current() - '0'; |
| 987 | Advance(); |
| 988 | } |
| 989 | } |
| 990 | return value; |
| 991 | } |
| 992 | |
| 993 | |
| 994 | bool RegExpParser::ParseHexEscape(int length, uc32* value) { |
| 995 | int start = position(); |
| 996 | uc32 val = 0; |
| 997 | for (int i = 0; i < length; ++i) { |
| 998 | uc32 c = current(); |
| 999 | int d = HexValue(c); |
| 1000 | if (d < 0) { |
| 1001 | Reset(start); |
| 1002 | return false; |
| 1003 | } |
| 1004 | val = val * 16 + d; |
| 1005 | Advance(); |
| 1006 | } |
| 1007 | *value = val; |
| 1008 | return true; |
| 1009 | } |
| 1010 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1011 | // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1012 | bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 1013 | // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 1014 | // allowed). In the latter case, the number of hex digits between { } is |
| 1015 | // arbitrary. \ and u have already been read. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1016 | if (current() == '{' && unicode()) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1017 | int start = position(); |
| 1018 | Advance(); |
| 1019 | if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 1020 | if (current() == '}') { |
| 1021 | Advance(); |
| 1022 | return true; |
| 1023 | } |
| 1024 | } |
| 1025 | Reset(start); |
| 1026 | return false; |
| 1027 | } |
| 1028 | // \u but no {, or \u{...} escapes not allowed. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1029 | bool result = ParseHexEscape(4, value); |
| 1030 | if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| 1031 | current() == '\\') { |
| 1032 | // Attempt to read trail surrogate. |
| 1033 | int start = position(); |
| 1034 | if (Next() == 'u') { |
| 1035 | Advance(2); |
| 1036 | uc32 trail; |
| 1037 | if (ParseHexEscape(4, &trail) && |
| 1038 | unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 1039 | *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| 1040 | static_cast<uc16>(trail)); |
| 1041 | return true; |
| 1042 | } |
| 1043 | } |
| 1044 | Reset(start); |
| 1045 | } |
| 1046 | return result; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1047 | } |
| 1048 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1049 | #ifdef V8_I18N_SUPPORT |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1050 | |
| 1051 | namespace { |
| 1052 | |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1053 | bool IsExactPropertyAlias(const char* property_name, UProperty property) { |
| 1054 | const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1055 | if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; |
| 1056 | for (int i = 0;; i++) { |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1057 | const char* long_name = u_getPropertyName( |
| 1058 | property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1059 | if (long_name == NULL) break; |
| 1060 | if (strcmp(property_name, long_name) == 0) return true; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1061 | } |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1062 | return false; |
| 1063 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1064 | |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1065 | bool IsExactPropertyValueAlias(const char* property_value_name, |
| 1066 | UProperty property, int32_t property_value) { |
| 1067 | const char* short_name = |
| 1068 | u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); |
| 1069 | if (short_name != NULL && strcmp(property_value_name, short_name) == 0) { |
| 1070 | return true; |
| 1071 | } |
| 1072 | for (int i = 0;; i++) { |
| 1073 | const char* long_name = u_getPropertyValueName( |
| 1074 | property, property_value, |
| 1075 | static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| 1076 | if (long_name == NULL) break; |
| 1077 | if (strcmp(property_value_name, long_name) == 0) return true; |
| 1078 | } |
| 1079 | return false; |
| 1080 | } |
| 1081 | |
| 1082 | bool LookupPropertyValueName(UProperty property, |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1083 | const char* property_value_name, bool negate, |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1084 | ZoneList<CharacterRange>* result, Zone* zone) { |
| 1085 | int32_t property_value = |
| 1086 | u_getPropertyValueEnum(property, property_value_name); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1087 | if (property_value == UCHAR_INVALID_CODE) return false; |
| 1088 | |
| 1089 | // We require the property name to match exactly to one of the property value |
| 1090 | // aliases. However, u_getPropertyValueEnum uses loose matching. |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1091 | if (!IsExactPropertyValueAlias(property_value_name, property, |
| 1092 | property_value)) { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1093 | return false; |
| 1094 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1095 | |
| 1096 | USet* set = uset_openEmpty(); |
| 1097 | UErrorCode ec = U_ZERO_ERROR; |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1098 | uset_applyIntPropertyValue(set, property, property_value, &ec); |
| 1099 | bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set); |
| 1100 | |
| 1101 | if (success) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1102 | uset_removeAllStrings(set); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1103 | if (negate) uset_complement(set); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1104 | int item_count = uset_getItemCount(set); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1105 | int item_result = 0; |
| 1106 | for (int i = 0; i < item_count; i++) { |
| 1107 | uc32 start = 0; |
| 1108 | uc32 end = 0; |
| 1109 | item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1110 | result->Add(CharacterRange::Range(start, end), zone); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1111 | } |
| 1112 | DCHECK_EQ(U_ZERO_ERROR, ec); |
| 1113 | DCHECK_EQ(0, item_result); |
| 1114 | } |
| 1115 | uset_close(set); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1116 | return success; |
| 1117 | } |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1118 | |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1119 | template <size_t N> |
| 1120 | inline bool NameEquals(const char* name, const char (&literal)[N]) { |
| 1121 | return strncmp(name, literal, N + 1) == 0; |
| 1122 | } |
| 1123 | |
| 1124 | bool LookupSpecialPropertyValueName(const char* name, |
| 1125 | ZoneList<CharacterRange>* result, |
| 1126 | bool negate, Zone* zone) { |
| 1127 | if (NameEquals(name, "Any")) { |
| 1128 | if (!negate) result->Add(CharacterRange::Everything(), zone); |
| 1129 | } else if (NameEquals(name, "ASCII")) { |
| 1130 | result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) |
| 1131 | : CharacterRange::Range(0x0, 0x7f), |
| 1132 | zone); |
| 1133 | } else if (NameEquals(name, "Assigned")) { |
| 1134 | return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", |
| 1135 | !negate, result, zone); |
| 1136 | } else { |
| 1137 | return false; |
| 1138 | } |
| 1139 | return true; |
| 1140 | } |
| 1141 | |
| 1142 | } // anonymous namespace |
| 1143 | |
| 1144 | bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, |
| 1145 | bool negate) { |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1146 | // Parse the property class as follows: |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1147 | // - In \p{name}, 'name' is interpreted |
| 1148 | // - either as a general category property value name. |
| 1149 | // - or as a binary property name. |
| 1150 | // - In \p{name=value}, 'name' is interpreted as an enumerated property name, |
| 1151 | // and 'value' is interpreted as one of the available property value names. |
| 1152 | // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. |
| 1153 | // - Loose matching is not applied. |
| 1154 | List<char> first_part; |
| 1155 | List<char> second_part; |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1156 | if (current() == '{') { |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1157 | // Parse \p{[PropertyName=]PropertyNameValue} |
| 1158 | for (Advance(); current() != '}' && current() != '='; Advance()) { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1159 | if (!has_next()) return false; |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1160 | first_part.Add(static_cast<char>(current())); |
| 1161 | } |
| 1162 | if (current() == '=') { |
| 1163 | for (Advance(); current() != '}'; Advance()) { |
| 1164 | if (!has_next()) return false; |
| 1165 | second_part.Add(static_cast<char>(current())); |
| 1166 | } |
| 1167 | second_part.Add(0); // null-terminate string. |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1168 | } |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1169 | } else { |
| 1170 | return false; |
| 1171 | } |
| 1172 | Advance(); |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1173 | first_part.Add(0); // null-terminate string. |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1174 | |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1175 | if (second_part.is_empty()) { |
| 1176 | // First attempt to interpret as general category property value name. |
| 1177 | const char* name = first_part.ToConstVector().start(); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1178 | if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, |
| 1179 | result, zone())) { |
| 1180 | return true; |
| 1181 | } |
| 1182 | // Interpret "Any", "ASCII", and "Assigned". |
| 1183 | if (LookupSpecialPropertyValueName(name, result, negate, zone())) { |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1184 | return true; |
| 1185 | } |
| 1186 | // Then attempt to interpret as binary property name with value name 'Y'. |
| 1187 | UProperty property = u_getPropertyEnum(name); |
| 1188 | if (property < UCHAR_BINARY_START) return false; |
| 1189 | if (property >= UCHAR_BINARY_LIMIT) return false; |
| 1190 | if (!IsExactPropertyAlias(name, property)) return false; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1191 | return LookupPropertyValueName(property, negate ? "N" : "Y", false, result, |
| 1192 | zone()); |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1193 | } else { |
| 1194 | // Both property name and value name are specified. Attempt to interpret |
| 1195 | // the property name as enumerated property. |
| 1196 | const char* property_name = first_part.ToConstVector().start(); |
| 1197 | const char* value_name = second_part.ToConstVector().start(); |
| 1198 | UProperty property = u_getPropertyEnum(property_name); |
| 1199 | if (property < UCHAR_INT_START) return false; |
| 1200 | if (property >= UCHAR_INT_LIMIT) return false; |
| 1201 | if (!IsExactPropertyAlias(property_name, property)) return false; |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1202 | return LookupPropertyValueName(property, value_name, negate, result, |
| 1203 | zone()); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1204 | } |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1205 | } |
| 1206 | |
| 1207 | #else // V8_I18N_SUPPORT |
| 1208 | |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1209 | bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, |
| 1210 | bool negate) { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1211 | return false; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1212 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1213 | |
Ben Murdoch | c561043 | 2016-08-08 18:44:38 +0100 | [diff] [blame] | 1214 | #endif // V8_I18N_SUPPORT |
| 1215 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1216 | bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
| 1217 | uc32 x = 0; |
| 1218 | int d = HexValue(current()); |
| 1219 | if (d < 0) { |
| 1220 | return false; |
| 1221 | } |
| 1222 | while (d >= 0) { |
| 1223 | x = x * 16 + d; |
| 1224 | if (x > max_value) { |
| 1225 | return false; |
| 1226 | } |
| 1227 | Advance(); |
| 1228 | d = HexValue(current()); |
| 1229 | } |
| 1230 | *value = x; |
| 1231 | return true; |
| 1232 | } |
| 1233 | |
| 1234 | |
| 1235 | uc32 RegExpParser::ParseClassCharacterEscape() { |
| 1236 | DCHECK(current() == '\\'); |
| 1237 | DCHECK(has_next() && !IsSpecialClassEscape(Next())); |
| 1238 | Advance(); |
| 1239 | switch (current()) { |
| 1240 | case 'b': |
| 1241 | Advance(); |
| 1242 | return '\b'; |
| 1243 | // ControlEscape :: one of |
| 1244 | // f n r t v |
| 1245 | case 'f': |
| 1246 | Advance(); |
| 1247 | return '\f'; |
| 1248 | case 'n': |
| 1249 | Advance(); |
| 1250 | return '\n'; |
| 1251 | case 'r': |
| 1252 | Advance(); |
| 1253 | return '\r'; |
| 1254 | case 't': |
| 1255 | Advance(); |
| 1256 | return '\t'; |
| 1257 | case 'v': |
| 1258 | Advance(); |
| 1259 | return '\v'; |
| 1260 | case 'c': { |
| 1261 | uc32 controlLetter = Next(); |
| 1262 | uc32 letter = controlLetter & ~('A' ^ 'a'); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1263 | // For compatibility with JSC, inside a character class. We also accept |
| 1264 | // digits and underscore as control characters, unless with /u. |
| 1265 | if (letter >= 'A' && letter <= 'Z') { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1266 | Advance(2); |
| 1267 | // Control letters mapped to ASCII control characters in the range |
| 1268 | // 0x00-0x1f. |
| 1269 | return controlLetter & 0x1f; |
| 1270 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1271 | if (unicode()) { |
| 1272 | // With /u, invalid escapes are not treated as identity escapes. |
| 1273 | ReportError(CStrVector("Invalid class escape")); |
| 1274 | return 0; |
| 1275 | } |
| 1276 | if ((controlLetter >= '0' && controlLetter <= '9') || |
| 1277 | controlLetter == '_') { |
| 1278 | Advance(2); |
| 1279 | return controlLetter & 0x1f; |
| 1280 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1281 | // We match JSC in reading the backslash as a literal |
| 1282 | // character instead of as starting an escape. |
| 1283 | return '\\'; |
| 1284 | } |
| 1285 | case '0': |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1286 | // With /u, \0 is interpreted as NUL if not followed by another digit. |
| 1287 | if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| 1288 | Advance(); |
| 1289 | return 0; |
| 1290 | } |
| 1291 | // Fall through. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1292 | case '1': |
| 1293 | case '2': |
| 1294 | case '3': |
| 1295 | case '4': |
| 1296 | case '5': |
| 1297 | case '6': |
| 1298 | case '7': |
| 1299 | // For compatibility, we interpret a decimal escape that isn't |
| 1300 | // a back reference (and therefore either \0 or not valid according |
| 1301 | // to the specification) as a 1..3 digit octal character code. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1302 | if (unicode()) { |
| 1303 | // With /u, decimal escape is not interpreted as octal character code. |
| 1304 | ReportError(CStrVector("Invalid class escape")); |
| 1305 | return 0; |
| 1306 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1307 | return ParseOctalLiteral(); |
| 1308 | case 'x': { |
| 1309 | Advance(); |
| 1310 | uc32 value; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1311 | if (ParseHexEscape(2, &value)) return value; |
| 1312 | if (unicode()) { |
| 1313 | // With /u, invalid escapes are not treated as identity escapes. |
| 1314 | ReportError(CStrVector("Invalid escape")); |
| 1315 | return 0; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1316 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1317 | // If \x is not followed by a two-digit hexadecimal, treat it |
| 1318 | // as an identity escape. |
| 1319 | return 'x'; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1320 | } |
| 1321 | case 'u': { |
| 1322 | Advance(); |
| 1323 | uc32 value; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1324 | if (ParseUnicodeEscape(&value)) return value; |
| 1325 | if (unicode()) { |
| 1326 | // With /u, invalid escapes are not treated as identity escapes. |
| 1327 | ReportError(CStrVector("Invalid unicode escape")); |
| 1328 | return 0; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1329 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1330 | // If \u is not followed by a two-digit hexadecimal, treat it |
| 1331 | // as an identity escape. |
| 1332 | return 'u'; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1333 | } |
| 1334 | default: { |
| 1335 | uc32 result = current(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1336 | // With /u, no identity escapes except for syntax characters and '-' are |
| 1337 | // allowed. Otherwise, all identity escapes are allowed. |
| 1338 | if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1339 | Advance(); |
| 1340 | return result; |
| 1341 | } |
| 1342 | ReportError(CStrVector("Invalid escape")); |
| 1343 | return 0; |
| 1344 | } |
| 1345 | } |
| 1346 | return 0; |
| 1347 | } |
| 1348 | |
| 1349 | |
| 1350 | CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
| 1351 | DCHECK_EQ(0, *char_class); |
| 1352 | uc32 first = current(); |
| 1353 | if (first == '\\') { |
| 1354 | switch (Next()) { |
| 1355 | case 'w': |
| 1356 | case 'W': |
| 1357 | case 'd': |
| 1358 | case 'D': |
| 1359 | case 's': |
| 1360 | case 'S': { |
| 1361 | *char_class = Next(); |
| 1362 | Advance(2); |
| 1363 | return CharacterRange::Singleton(0); // Return dummy value. |
| 1364 | } |
| 1365 | case kEndMarker: |
| 1366 | return ReportError(CStrVector("\\ at end of pattern")); |
| 1367 | default: |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1368 | first = ParseClassCharacterEscape(CHECK_FAILED); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1369 | } |
| 1370 | } else { |
| 1371 | Advance(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1372 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1373 | |
| 1374 | return CharacterRange::Singleton(first); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1375 | } |
| 1376 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1377 | static const uc16 kNoCharClass = 0; |
| 1378 | |
| 1379 | // Adds range or pre-defined character class to character ranges. |
| 1380 | // If char_class is not kInvalidClass, it's interpreted as a class |
| 1381 | // escape (i.e., 's' means whitespace, from '\s'). |
| 1382 | static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| 1383 | uc16 char_class, CharacterRange range, |
| 1384 | Zone* zone) { |
| 1385 | if (char_class != kNoCharClass) { |
| 1386 | CharacterRange::AddClassEscape(char_class, ranges, zone); |
| 1387 | } else { |
| 1388 | ranges->Add(range, zone); |
| 1389 | } |
| 1390 | } |
| 1391 | |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1392 | bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) { |
| 1393 | if (!FLAG_harmony_regexp_property) return false; |
| 1394 | if (!unicode()) return false; |
| 1395 | if (current() != '\\') return false; |
| 1396 | uc32 next = Next(); |
| 1397 | bool parse_success = false; |
| 1398 | if (next == 'p') { |
| 1399 | Advance(2); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1400 | parse_success = ParsePropertyClass(ranges, false); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1401 | } else if (next == 'P') { |
| 1402 | Advance(2); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1403 | parse_success = ParsePropertyClass(ranges, true); |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1404 | } else { |
| 1405 | return false; |
| 1406 | } |
| 1407 | if (!parse_success) |
| 1408 | ReportError(CStrVector("Invalid property name in character class")); |
| 1409 | return parse_success; |
| 1410 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1411 | |
| 1412 | RegExpTree* RegExpParser::ParseCharacterClass() { |
| 1413 | static const char* kUnterminated = "Unterminated character class"; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1414 | static const char* kRangeInvalid = "Invalid character class"; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1415 | static const char* kRangeOutOfOrder = "Range out of order in character class"; |
| 1416 | |
| 1417 | DCHECK_EQ(current(), '['); |
| 1418 | Advance(); |
| 1419 | bool is_negated = false; |
| 1420 | if (current() == '^') { |
| 1421 | is_negated = true; |
| 1422 | Advance(); |
| 1423 | } |
| 1424 | ZoneList<CharacterRange>* ranges = |
| 1425 | new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 1426 | while (has_more() && current() != ']') { |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1427 | bool parsed_property = ParseClassProperty(ranges CHECK_FAILED); |
| 1428 | if (parsed_property) continue; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1429 | uc16 char_class = kNoCharClass; |
| 1430 | CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); |
| 1431 | if (current() == '-') { |
| 1432 | Advance(); |
| 1433 | if (current() == kEndMarker) { |
| 1434 | // If we reach the end we break out of the loop and let the |
| 1435 | // following code report an error. |
| 1436 | break; |
| 1437 | } else if (current() == ']') { |
| 1438 | AddRangeOrEscape(ranges, char_class, first, zone()); |
| 1439 | ranges->Add(CharacterRange::Singleton('-'), zone()); |
| 1440 | break; |
| 1441 | } |
| 1442 | uc16 char_class_2 = kNoCharClass; |
| 1443 | CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
| 1444 | if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
| 1445 | // Either end is an escaped character class. Treat the '-' verbatim. |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1446 | if (unicode()) { |
| 1447 | // ES2015 21.2.2.15.1 step 1. |
| 1448 | return ReportError(CStrVector(kRangeInvalid)); |
| 1449 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1450 | AddRangeOrEscape(ranges, char_class, first, zone()); |
| 1451 | ranges->Add(CharacterRange::Singleton('-'), zone()); |
| 1452 | AddRangeOrEscape(ranges, char_class_2, next, zone()); |
| 1453 | continue; |
| 1454 | } |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1455 | // ES2015 21.2.2.15.1 step 6. |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1456 | if (first.from() > next.to()) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1457 | return ReportError(CStrVector(kRangeOutOfOrder)); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1458 | } |
| 1459 | ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); |
| 1460 | } else { |
| 1461 | AddRangeOrEscape(ranges, char_class, first, zone()); |
| 1462 | } |
| 1463 | } |
| 1464 | if (!has_more()) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1465 | return ReportError(CStrVector(kUnterminated)); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1466 | } |
| 1467 | Advance(); |
| 1468 | if (ranges->length() == 0) { |
| 1469 | ranges->Add(CharacterRange::Everything(), zone()); |
| 1470 | is_negated = !is_negated; |
| 1471 | } |
| 1472 | return new (zone()) RegExpCharacterClass(ranges, is_negated); |
| 1473 | } |
| 1474 | |
| 1475 | |
| 1476 | #undef CHECK_FAILED |
| 1477 | |
| 1478 | |
| 1479 | bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1480 | FlatStringReader* input, JSRegExp::Flags flags, |
| 1481 | RegExpCompileData* result) { |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1482 | DCHECK(result != NULL); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1483 | RegExpParser parser(input, &result->error, flags, isolate, zone); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1484 | RegExpTree* tree = parser.ParsePattern(); |
| 1485 | if (parser.failed()) { |
| 1486 | DCHECK(tree == NULL); |
| 1487 | DCHECK(!result->error.is_null()); |
| 1488 | } else { |
| 1489 | DCHECK(tree != NULL); |
| 1490 | DCHECK(result->error.is_null()); |
| 1491 | if (FLAG_trace_regexp_parser) { |
| 1492 | OFStream os(stdout); |
| 1493 | tree->Print(os, zone); |
| 1494 | os << "\n"; |
| 1495 | } |
| 1496 | result->tree = tree; |
| 1497 | int capture_count = parser.captures_started(); |
| 1498 | result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
| 1499 | result->contains_anchor = parser.contains_anchor(); |
Ben Murdoch | 61f157c | 2016-09-16 13:49:30 +0100 | [diff] [blame] | 1500 | result->capture_name_map = parser.CreateCaptureNameMap(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1501 | result->capture_count = capture_count; |
| 1502 | } |
| 1503 | return !parser.failed(); |
| 1504 | } |
| 1505 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1506 | RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1507 | : zone_(zone), |
| 1508 | pending_empty_(false), |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1509 | ignore_case_(ignore_case), |
| 1510 | unicode_(unicode), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1511 | characters_(NULL), |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1512 | pending_surrogate_(kNoPendingSurrogate), |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1513 | terms_(), |
| 1514 | alternatives_() |
| 1515 | #ifdef DEBUG |
| 1516 | , |
| 1517 | last_added_(ADD_NONE) |
| 1518 | #endif |
| 1519 | { |
| 1520 | } |
| 1521 | |
| 1522 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1523 | void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
| 1524 | DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1525 | FlushPendingSurrogate(); |
| 1526 | // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| 1527 | pending_surrogate_ = lead_surrogate; |
| 1528 | } |
| 1529 | |
| 1530 | |
| 1531 | void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| 1532 | DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| 1533 | if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1534 | uc16 lead_surrogate = pending_surrogate_; |
| 1535 | pending_surrogate_ = kNoPendingSurrogate; |
| 1536 | DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1537 | uc32 combined = |
| 1538 | unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
| 1539 | if (NeedsDesugaringForIgnoreCase(combined)) { |
| 1540 | AddCharacterClassForDesugaring(combined); |
| 1541 | } else { |
| 1542 | ZoneList<uc16> surrogate_pair(2, zone()); |
| 1543 | surrogate_pair.Add(lead_surrogate, zone()); |
| 1544 | surrogate_pair.Add(trail_surrogate, zone()); |
| 1545 | RegExpAtom* atom = |
| 1546 | new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| 1547 | AddAtom(atom); |
| 1548 | } |
| 1549 | } else { |
| 1550 | pending_surrogate_ = trail_surrogate; |
| 1551 | FlushPendingSurrogate(); |
| 1552 | } |
| 1553 | } |
| 1554 | |
| 1555 | |
| 1556 | void RegExpBuilder::FlushPendingSurrogate() { |
| 1557 | if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1558 | DCHECK(unicode()); |
| 1559 | uc32 c = pending_surrogate_; |
| 1560 | pending_surrogate_ = kNoPendingSurrogate; |
| 1561 | AddCharacterClassForDesugaring(c); |
| 1562 | } |
| 1563 | } |
| 1564 | |
| 1565 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1566 | void RegExpBuilder::FlushCharacters() { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1567 | FlushPendingSurrogate(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1568 | pending_empty_ = false; |
| 1569 | if (characters_ != NULL) { |
| 1570 | RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
| 1571 | characters_ = NULL; |
| 1572 | text_.Add(atom, zone()); |
| 1573 | LAST(ADD_ATOM); |
| 1574 | } |
| 1575 | } |
| 1576 | |
| 1577 | |
| 1578 | void RegExpBuilder::FlushText() { |
| 1579 | FlushCharacters(); |
| 1580 | int num_text = text_.length(); |
| 1581 | if (num_text == 0) { |
| 1582 | return; |
| 1583 | } else if (num_text == 1) { |
| 1584 | terms_.Add(text_.last(), zone()); |
| 1585 | } else { |
| 1586 | RegExpText* text = new (zone()) RegExpText(zone()); |
| 1587 | for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
| 1588 | terms_.Add(text, zone()); |
| 1589 | } |
| 1590 | text_.Clear(); |
| 1591 | } |
| 1592 | |
| 1593 | |
| 1594 | void RegExpBuilder::AddCharacter(uc16 c) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1595 | FlushPendingSurrogate(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1596 | pending_empty_ = false; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1597 | if (NeedsDesugaringForIgnoreCase(c)) { |
| 1598 | AddCharacterClassForDesugaring(c); |
| 1599 | } else { |
| 1600 | if (characters_ == NULL) { |
| 1601 | characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| 1602 | } |
| 1603 | characters_->Add(c, zone()); |
| 1604 | LAST(ADD_CHAR); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1605 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1606 | } |
| 1607 | |
| 1608 | |
| 1609 | void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
| 1610 | if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1611 | DCHECK(unicode()); |
| 1612 | AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
| 1613 | AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| 1614 | } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| 1615 | AddLeadSurrogate(c); |
| 1616 | } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1617 | AddTrailSurrogate(c); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1618 | } else { |
| 1619 | AddCharacter(static_cast<uc16>(c)); |
| 1620 | } |
| 1621 | } |
| 1622 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1623 | void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| 1624 | // A lead or trail surrogate parsed via escape sequence will not |
| 1625 | // pair up with any preceding lead or following trail surrogate. |
| 1626 | FlushPendingSurrogate(); |
| 1627 | AddUnicodeCharacter(character); |
| 1628 | FlushPendingSurrogate(); |
| 1629 | } |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1630 | |
| 1631 | void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1632 | |
| 1633 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1634 | void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1635 | if (NeedsDesugaringForUnicode(cc)) { |
| 1636 | // With /u, character class needs to be desugared, so it |
| 1637 | // must be a standalone term instead of being part of a RegExpText. |
| 1638 | AddTerm(cc); |
| 1639 | } else { |
| 1640 | AddAtom(cc); |
| 1641 | } |
| 1642 | } |
| 1643 | |
| 1644 | void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { |
| 1645 | AddTerm(new (zone()) RegExpCharacterClass( |
| 1646 | CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
| 1647 | } |
| 1648 | |
| 1649 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1650 | void RegExpBuilder::AddAtom(RegExpTree* term) { |
| 1651 | if (term->IsEmpty()) { |
| 1652 | AddEmpty(); |
| 1653 | return; |
| 1654 | } |
| 1655 | if (term->IsTextElement()) { |
| 1656 | FlushCharacters(); |
| 1657 | text_.Add(term, zone()); |
| 1658 | } else { |
| 1659 | FlushText(); |
| 1660 | terms_.Add(term, zone()); |
| 1661 | } |
| 1662 | LAST(ADD_ATOM); |
| 1663 | } |
| 1664 | |
| 1665 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1666 | void RegExpBuilder::AddTerm(RegExpTree* term) { |
| 1667 | FlushText(); |
| 1668 | terms_.Add(term, zone()); |
| 1669 | LAST(ADD_ATOM); |
| 1670 | } |
| 1671 | |
| 1672 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1673 | void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| 1674 | FlushText(); |
| 1675 | terms_.Add(assert, zone()); |
| 1676 | LAST(ADD_ASSERT); |
| 1677 | } |
| 1678 | |
| 1679 | |
| 1680 | void RegExpBuilder::NewAlternative() { FlushTerms(); } |
| 1681 | |
| 1682 | |
| 1683 | void RegExpBuilder::FlushTerms() { |
| 1684 | FlushText(); |
| 1685 | int num_terms = terms_.length(); |
| 1686 | RegExpTree* alternative; |
| 1687 | if (num_terms == 0) { |
| 1688 | alternative = new (zone()) RegExpEmpty(); |
| 1689 | } else if (num_terms == 1) { |
| 1690 | alternative = terms_.last(); |
| 1691 | } else { |
| 1692 | alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); |
| 1693 | } |
| 1694 | alternatives_.Add(alternative, zone()); |
| 1695 | terms_.Clear(); |
| 1696 | LAST(ADD_NONE); |
| 1697 | } |
| 1698 | |
| 1699 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1700 | bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
| 1701 | if (!unicode()) return false; |
Ben Murdoch | da12d29 | 2016-06-02 14:46:10 +0100 | [diff] [blame] | 1702 | // TODO(yangguo): we could be smarter than this. Case-insensitivity does not |
| 1703 | // necessarily mean that we need to desugar. It's probably nicer to have a |
| 1704 | // separate pass to figure out unicode desugarings. |
| 1705 | if (ignore_case()) return true; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1706 | ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| 1707 | CharacterRange::Canonicalize(ranges); |
| 1708 | for (int i = ranges->length() - 1; i >= 0; i--) { |
| 1709 | uc32 from = ranges->at(i).from(); |
| 1710 | uc32 to = ranges->at(i).to(); |
| 1711 | // Check for non-BMP characters. |
| 1712 | if (to >= kNonBmpStart) return true; |
| 1713 | // Check for lone surrogates. |
| 1714 | if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| 1715 | } |
| 1716 | return false; |
| 1717 | } |
| 1718 | |
| 1719 | |
| 1720 | bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { |
| 1721 | #ifdef V8_I18N_SUPPORT |
| 1722 | if (unicode() && ignore_case()) { |
| 1723 | USet* set = uset_open(c, c); |
| 1724 | uset_closeOver(set, USET_CASE_INSENSITIVE); |
| 1725 | uset_removeAllStrings(set); |
| 1726 | bool result = uset_size(set) > 1; |
| 1727 | uset_close(set); |
| 1728 | return result; |
| 1729 | } |
| 1730 | // In the case where ICU is not included, we act as if the unicode flag is |
| 1731 | // not set, and do not desugar. |
| 1732 | #endif // V8_I18N_SUPPORT |
| 1733 | return false; |
| 1734 | } |
| 1735 | |
| 1736 | |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1737 | RegExpTree* RegExpBuilder::ToRegExp() { |
| 1738 | FlushTerms(); |
| 1739 | int num_alternatives = alternatives_.length(); |
| 1740 | if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
| 1741 | if (num_alternatives == 1) return alternatives_.last(); |
| 1742 | return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
| 1743 | } |
| 1744 | |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1745 | bool RegExpBuilder::AddQuantifierToAtom( |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1746 | int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1747 | FlushPendingSurrogate(); |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1748 | if (pending_empty_) { |
| 1749 | pending_empty_ = false; |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1750 | return true; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1751 | } |
| 1752 | RegExpTree* atom; |
| 1753 | if (characters_ != NULL) { |
| 1754 | DCHECK(last_added_ == ADD_CHAR); |
| 1755 | // Last atom was character. |
| 1756 | Vector<const uc16> char_vector = characters_->ToConstVector(); |
| 1757 | int num_chars = char_vector.length(); |
| 1758 | if (num_chars > 1) { |
| 1759 | Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); |
| 1760 | text_.Add(new (zone()) RegExpAtom(prefix), zone()); |
| 1761 | char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
| 1762 | } |
| 1763 | characters_ = NULL; |
| 1764 | atom = new (zone()) RegExpAtom(char_vector); |
| 1765 | FlushText(); |
| 1766 | } else if (text_.length() > 0) { |
| 1767 | DCHECK(last_added_ == ADD_ATOM); |
| 1768 | atom = text_.RemoveLast(); |
| 1769 | FlushText(); |
| 1770 | } else if (terms_.length() > 0) { |
| 1771 | DCHECK(last_added_ == ADD_ATOM); |
| 1772 | atom = terms_.RemoveLast(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1773 | // With /u, lookarounds are not quantifiable. |
| 1774 | if (unicode() && atom->IsLookaround()) return false; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1775 | if (atom->max_match() == 0) { |
| 1776 | // Guaranteed to only match an empty string. |
| 1777 | LAST(ADD_TERM); |
| 1778 | if (min == 0) { |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1779 | return true; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1780 | } |
| 1781 | terms_.Add(atom, zone()); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1782 | return true; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1783 | } |
| 1784 | } else { |
| 1785 | // Only call immediately after adding an atom or character! |
| 1786 | UNREACHABLE(); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1787 | return false; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1788 | } |
| 1789 | terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1790 | zone()); |
| 1791 | LAST(ADD_TERM); |
Ben Murdoch | 097c5b2 | 2016-05-18 11:27:45 +0100 | [diff] [blame] | 1792 | return true; |
Ben Murdoch | 4a90d5f | 2016-03-22 12:00:34 +0000 | [diff] [blame] | 1793 | } |
| 1794 | |
| 1795 | } // namespace internal |
| 1796 | } // namespace v8 |