blob: abb644ac3f8057c91c5648177b6b8bdc4d4e3f14 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
Ben Murdoch097c5b22016-05-18 11:27:45 +010011#include "src/ostreams.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000012#include "src/regexp/jsregexp.h"
13#include "src/utils.h"
14
Ben Murdoch097c5b22016-05-18 11:27:45 +010015#ifdef V8_I18N_SUPPORT
16#include "unicode/uset.h"
17#endif // V8_I18N_SUPPORT
18
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000019namespace v8 {
20namespace internal {
21
22RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Ben Murdoch097c5b22016-05-18 11:27:45 +010023 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000024 : isolate_(isolate),
25 zone_(zone),
26 error_(error),
27 captures_(NULL),
28 in_(in),
29 current_(kEndMarker),
Ben Murdoch097c5b22016-05-18 11:27:45 +010030 ignore_case_(flags & JSRegExp::kIgnoreCase),
31 multiline_(flags & JSRegExp::kMultiline),
32 unicode_(flags & JSRegExp::kUnicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033 next_pos_(0),
34 captures_started_(0),
35 capture_count_(0),
36 has_more_(true),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000037 simple_(false),
38 contains_anchor_(false),
39 is_scanned_for_captures_(false),
40 failed_(false) {
41 Advance();
42}
43
Ben Murdoch097c5b22016-05-18 11:27:45 +010044template <bool update_position>
45inline uc32 RegExpParser::ReadNext() {
46 int position = next_pos_;
47 uc32 c0 = in()->Get(position);
48 position++;
49 // Read the whole surrogate pair in case of unicode flag, if possible.
50 if (unicode() && position < in()->length() &&
51 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
52 uc16 c1 = in()->Get(position);
53 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
54 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
55 position++;
56 }
57 }
58 if (update_position) next_pos_ = position;
59 return c0;
60}
61
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000062
63uc32 RegExpParser::Next() {
64 if (has_next()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +010065 return ReadNext<false>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000066 } else {
67 return kEndMarker;
68 }
69}
70
71
72void RegExpParser::Advance() {
Ben Murdoch097c5b22016-05-18 11:27:45 +010073 if (has_next()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000074 StackLimitCheck check(isolate());
75 if (check.HasOverflowed()) {
76 ReportError(CStrVector(Isolate::kStackOverflowMessage));
77 } else if (zone()->excess_allocation()) {
78 ReportError(CStrVector("Regular expression too large"));
79 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +010080 current_ = ReadNext<true>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000081 }
82 } else {
83 current_ = kEndMarker;
84 // Advance so that position() points to 1-after-the-last-character. This is
85 // important so that Reset() to this position works correctly.
86 next_pos_ = in()->length() + 1;
87 has_more_ = false;
88 }
89}
90
91
92void RegExpParser::Reset(int pos) {
93 next_pos_ = pos;
94 has_more_ = (pos < in()->length());
95 Advance();
96}
97
98
99void RegExpParser::Advance(int dist) {
100 next_pos_ += dist - 1;
101 Advance();
102}
103
104
105bool RegExpParser::simple() { return simple_; }
106
Ben Murdoch097c5b22016-05-18 11:27:45 +0100107bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
108 switch (c) {
109 case '^':
110 case '$':
111 case '\\':
112 case '.':
113 case '*':
114 case '+':
115 case '?':
116 case '(':
117 case ')':
118 case '[':
119 case ']':
120 case '{':
121 case '}':
122 case '|':
123 case '/':
124 return true;
125 default:
126 break;
127 }
128 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000129}
130
131
132RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
Ben Murdochc5610432016-08-08 18:44:38 +0100133 if (failed_) return NULL; // Do not overwrite any existing error.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000134 failed_ = true;
135 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
136 // Zip to the end to make sure the no more input is read.
137 current_ = kEndMarker;
138 next_pos_ = in()->length();
139 return NULL;
140}
141
142
143#define CHECK_FAILED /**/); \
144 if (failed_) return NULL; \
145 ((void)0
146
147
148// Pattern ::
149// Disjunction
150RegExpTree* RegExpParser::ParsePattern() {
151 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
152 DCHECK(!has_more());
153 // If the result of parsing is a literal string atom, and it has the
154 // same length as the input, then the atom is identical to the input.
155 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
156 simple_ = true;
157 }
158 return result;
159}
160
161
162// Disjunction ::
163// Alternative
164// Alternative | Disjunction
165// Alternative ::
166// [empty]
167// Term Alternative
168// Term ::
169// Assertion
170// Atom
171// Atom Quantifier
172RegExpTree* RegExpParser::ParseDisjunction() {
173 // Used to store current state while parsing subexpressions.
174 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100175 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000176 RegExpParserState* state = &initial_state;
177 // Cache the builder in a local variable for quick access.
178 RegExpBuilder* builder = initial_state.builder();
179 while (true) {
180 switch (current()) {
181 case kEndMarker:
182 if (state->IsSubexpression()) {
183 // Inside a parenthesized group when hitting end of input.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100184 return ReportError(CStrVector("Unterminated group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000185 }
186 DCHECK_EQ(INITIAL, state->group_type());
187 // Parsing completed successfully.
188 return builder->ToRegExp();
189 case ')': {
190 if (!state->IsSubexpression()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100191 return ReportError(CStrVector("Unmatched ')'"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000192 }
193 DCHECK_NE(INITIAL, state->group_type());
194
195 Advance();
196 // End disjunction parsing and convert builder content to new single
197 // regexp atom.
198 RegExpTree* body = builder->ToRegExp();
199
200 int end_capture_index = captures_started();
201
202 int capture_index = state->capture_index();
203 SubexpressionType group_type = state->group_type();
204
205 // Build result of subexpression.
206 if (group_type == CAPTURE) {
207 RegExpCapture* capture = GetCapture(capture_index);
208 capture->set_body(body);
209 body = capture;
210 } else if (group_type != GROUPING) {
211 DCHECK(group_type == POSITIVE_LOOKAROUND ||
212 group_type == NEGATIVE_LOOKAROUND);
213 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
214 body = new (zone()) RegExpLookaround(
215 body, is_positive, end_capture_index - capture_index,
216 capture_index, state->lookaround_type());
217 }
218
219 // Restore previous state.
220 state = state->previous_state();
221 builder = state->builder();
222
223 builder->AddAtom(body);
224 // For compatability with JSC and ES3, we allow quantifiers after
225 // lookaheads, and break in all cases.
226 break;
227 }
228 case '|': {
229 Advance();
230 builder->NewAlternative();
231 continue;
232 }
233 case '*':
234 case '+':
235 case '?':
236 return ReportError(CStrVector("Nothing to repeat"));
237 case '^': {
238 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100239 if (multiline()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000240 builder->AddAssertion(
241 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
242 } else {
243 builder->AddAssertion(
244 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
245 set_contains_anchor();
246 }
247 continue;
248 }
249 case '$': {
250 Advance();
251 RegExpAssertion::AssertionType assertion_type =
Ben Murdoch097c5b22016-05-18 11:27:45 +0100252 multiline() ? RegExpAssertion::END_OF_LINE
253 : RegExpAssertion::END_OF_INPUT;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000254 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
255 continue;
256 }
257 case '.': {
258 Advance();
259 // everything except \x0a, \x0d, \u2028 and \u2029
260 ZoneList<CharacterRange>* ranges =
261 new (zone()) ZoneList<CharacterRange>(2, zone());
262 CharacterRange::AddClassEscape('.', ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100263 RegExpCharacterClass* cc =
264 new (zone()) RegExpCharacterClass(ranges, false);
265 builder->AddCharacterClass(cc);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000266 break;
267 }
268 case '(': {
269 SubexpressionType subexpr_type = CAPTURE;
270 RegExpLookaround::Type lookaround_type = state->lookaround_type();
271 Advance();
272 if (current() == '?') {
273 switch (Next()) {
274 case ':':
275 subexpr_type = GROUPING;
276 break;
277 case '=':
278 lookaround_type = RegExpLookaround::LOOKAHEAD;
279 subexpr_type = POSITIVE_LOOKAROUND;
280 break;
281 case '!':
282 lookaround_type = RegExpLookaround::LOOKAHEAD;
283 subexpr_type = NEGATIVE_LOOKAROUND;
284 break;
285 case '<':
286 if (FLAG_harmony_regexp_lookbehind) {
287 Advance();
288 lookaround_type = RegExpLookaround::LOOKBEHIND;
289 if (Next() == '=') {
290 subexpr_type = POSITIVE_LOOKAROUND;
291 break;
292 } else if (Next() == '!') {
293 subexpr_type = NEGATIVE_LOOKAROUND;
294 break;
295 }
296 }
297 // Fall through.
298 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100299 return ReportError(CStrVector("Invalid group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000300 }
301 Advance(2);
302 } else {
303 if (captures_started_ >= kMaxCaptures) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100304 return ReportError(CStrVector("Too many captures"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000305 }
306 captures_started_++;
307 }
308 // Store current state and begin new disjunction parsing.
309 state = new (zone()) RegExpParserState(
Ben Murdoch097c5b22016-05-18 11:27:45 +0100310 state, subexpr_type, lookaround_type, captures_started_,
311 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000312 builder = state->builder();
313 continue;
314 }
315 case '[': {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100316 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
317 builder->AddCharacterClass(cc->AsCharacterClass());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000318 break;
319 }
320 // Atom ::
321 // \ AtomEscape
322 case '\\':
323 switch (Next()) {
324 case kEndMarker:
325 return ReportError(CStrVector("\\ at end of pattern"));
326 case 'b':
327 Advance(2);
328 builder->AddAssertion(
329 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
330 continue;
331 case 'B':
332 Advance(2);
333 builder->AddAssertion(
334 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
335 continue;
336 // AtomEscape ::
337 // CharacterClassEscape
338 //
339 // CharacterClassEscape :: one of
340 // d D s S w W
341 case 'd':
342 case 'D':
343 case 's':
344 case 'S':
345 case 'w':
346 case 'W': {
347 uc32 c = Next();
348 Advance(2);
349 ZoneList<CharacterRange>* ranges =
350 new (zone()) ZoneList<CharacterRange>(2, zone());
351 CharacterRange::AddClassEscape(c, ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100352 RegExpCharacterClass* cc =
353 new (zone()) RegExpCharacterClass(ranges, false);
354 builder->AddCharacterClass(cc);
355 break;
356 }
357 case 'p':
358 case 'P': {
359 uc32 p = Next();
360 Advance(2);
361 if (unicode()) {
362 if (FLAG_harmony_regexp_property) {
Ben Murdochda12d292016-06-02 14:46:10 +0100363 ZoneList<CharacterRange>* ranges =
364 new (zone()) ZoneList<CharacterRange>(2, zone());
365 if (!ParsePropertyClass(ranges)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100366 return ReportError(CStrVector("Invalid property name"));
367 }
368 RegExpCharacterClass* cc =
369 new (zone()) RegExpCharacterClass(ranges, p == 'P');
370 builder->AddCharacterClass(cc);
371 } else {
Ben Murdochda12d292016-06-02 14:46:10 +0100372 // With /u, no identity escapes except for syntax characters
373 // are allowed. Otherwise, all identity escapes are allowed.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100374 return ReportError(CStrVector("Invalid escape"));
375 }
376 } else {
377 builder->AddCharacter(p);
378 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000379 break;
380 }
381 case '1':
382 case '2':
383 case '3':
384 case '4':
385 case '5':
386 case '6':
387 case '7':
388 case '8':
389 case '9': {
390 int index = 0;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100391 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
392 if (is_backref) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000393 if (state->IsInsideCaptureGroup(index)) {
394 // The back reference is inside the capture group it refers to.
395 // Nothing can possibly have been captured yet, so we use empty
396 // instead. This ensures that, when checking a back reference,
397 // the capture registers of the referenced capture are either
398 // both set or both cleared.
399 builder->AddEmpty();
400 } else {
401 RegExpCapture* capture = GetCapture(index);
402 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
403 builder->AddAtom(atom);
404 }
405 break;
406 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100407 // With /u, no identity escapes except for syntax characters
408 // are allowed. Otherwise, all identity escapes are allowed.
409 if (unicode()) {
410 return ReportError(CStrVector("Invalid escape"));
411 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000412 uc32 first_digit = Next();
413 if (first_digit == '8' || first_digit == '9') {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100414 builder->AddCharacter(first_digit);
415 Advance(2);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000416 break;
417 }
418 }
419 // FALLTHROUGH
420 case '0': {
421 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100422 if (unicode() && Next() >= '0' && Next() <= '9') {
423 // With /u, decimal escape with leading 0 are not parsed as octal.
424 return ReportError(CStrVector("Invalid decimal escape"));
425 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000426 uc32 octal = ParseOctalLiteral();
427 builder->AddCharacter(octal);
428 break;
429 }
430 // ControlEscape :: one of
431 // f n r t v
432 case 'f':
433 Advance(2);
434 builder->AddCharacter('\f');
435 break;
436 case 'n':
437 Advance(2);
438 builder->AddCharacter('\n');
439 break;
440 case 'r':
441 Advance(2);
442 builder->AddCharacter('\r');
443 break;
444 case 't':
445 Advance(2);
446 builder->AddCharacter('\t');
447 break;
448 case 'v':
449 Advance(2);
450 builder->AddCharacter('\v');
451 break;
452 case 'c': {
453 Advance();
454 uc32 controlLetter = Next();
455 // Special case if it is an ASCII letter.
456 // Convert lower case letters to uppercase.
457 uc32 letter = controlLetter & ~('a' ^ 'A');
458 if (letter < 'A' || 'Z' < letter) {
459 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
460 // This is outside the specification. We match JSC in
461 // reading the backslash as a literal character instead
462 // of as starting an escape.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100463 if (unicode()) {
464 // With /u, invalid escapes are not treated as identity escapes.
465 return ReportError(CStrVector("Invalid unicode escape"));
466 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000467 builder->AddCharacter('\\');
468 } else {
469 Advance(2);
470 builder->AddCharacter(controlLetter & 0x1f);
471 }
472 break;
473 }
474 case 'x': {
475 Advance(2);
476 uc32 value;
477 if (ParseHexEscape(2, &value)) {
478 builder->AddCharacter(value);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100479 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000480 builder->AddCharacter('x');
481 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100482 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000483 return ReportError(CStrVector("Invalid escape"));
484 }
485 break;
486 }
487 case 'u': {
488 Advance(2);
489 uc32 value;
490 if (ParseUnicodeEscape(&value)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100491 builder->AddEscapedUnicodeCharacter(value);
492 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000493 builder->AddCharacter('u');
494 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100495 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000496 return ReportError(CStrVector("Invalid unicode escape"));
497 }
498 break;
499 }
500 default:
501 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100502 // With /u, no identity escapes except for syntax characters
503 // are allowed. Otherwise, all identity escapes are allowed.
504 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000505 builder->AddCharacter(current());
506 Advance();
507 } else {
508 return ReportError(CStrVector("Invalid escape"));
509 }
510 break;
511 }
512 break;
513 case '{': {
514 int dummy;
Ben Murdochc5610432016-08-08 18:44:38 +0100515 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
516 if (parsed) return ReportError(CStrVector("Nothing to repeat"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000517 // fallthrough
518 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100519 case '}':
520 case ']':
521 if (unicode()) {
522 return ReportError(CStrVector("Lone quantifier brackets"));
523 }
524 // fallthrough
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000525 default:
526 builder->AddUnicodeCharacter(current());
527 Advance();
528 break;
529 } // end switch(current())
530
531 int min;
532 int max;
533 switch (current()) {
534 // QuantifierPrefix ::
535 // *
536 // +
537 // ?
538 // {
539 case '*':
540 min = 0;
541 max = RegExpTree::kInfinity;
542 Advance();
543 break;
544 case '+':
545 min = 1;
546 max = RegExpTree::kInfinity;
547 Advance();
548 break;
549 case '?':
550 min = 0;
551 max = 1;
552 Advance();
553 break;
554 case '{':
555 if (ParseIntervalQuantifier(&min, &max)) {
556 if (max < min) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100557 return ReportError(
558 CStrVector("numbers out of order in {} quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000559 }
560 break;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100561 } else if (unicode()) {
562 // With /u, incomplete quantifiers are not allowed.
563 return ReportError(CStrVector("Incomplete quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000564 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100565 continue;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000566 default:
567 continue;
568 }
569 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
570 if (current() == '?') {
571 quantifier_type = RegExpQuantifier::NON_GREEDY;
572 Advance();
573 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
574 // FLAG_regexp_possessive_quantifier is a debug-only flag.
575 quantifier_type = RegExpQuantifier::POSSESSIVE;
576 Advance();
577 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100578 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
579 return ReportError(CStrVector("Invalid quantifier"));
580 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000581 }
582}
583
584
585#ifdef DEBUG
586// Currently only used in an DCHECK.
587static bool IsSpecialClassEscape(uc32 c) {
588 switch (c) {
589 case 'd':
590 case 'D':
591 case 's':
592 case 'S':
593 case 'w':
594 case 'W':
595 return true;
596 default:
597 return false;
598 }
599}
600#endif
601
602
603// In order to know whether an escape is a backreference or not we have to scan
604// the entire regexp and find the number of capturing parentheses. However we
605// don't want to scan the regexp twice unless it is necessary. This mini-parser
606// is called when needed. It can see the difference between capturing and
607// noncapturing parentheses and can skip character classes and backslash-escaped
608// characters.
609void RegExpParser::ScanForCaptures() {
610 // Start with captures started previous to current position
611 int capture_count = captures_started();
612 // Add count of captures after this position.
613 int n;
614 while ((n = current()) != kEndMarker) {
615 Advance();
616 switch (n) {
617 case '\\':
618 Advance();
619 break;
620 case '[': {
621 int c;
622 while ((c = current()) != kEndMarker) {
623 Advance();
624 if (c == '\\') {
625 Advance();
626 } else {
627 if (c == ']') break;
628 }
629 }
630 break;
631 }
632 case '(':
633 if (current() != '?') capture_count++;
634 break;
635 }
636 }
637 capture_count_ = capture_count;
638 is_scanned_for_captures_ = true;
639}
640
641
642bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
643 DCHECK_EQ('\\', current());
644 DCHECK('1' <= Next() && Next() <= '9');
645 // Try to parse a decimal literal that is no greater than the total number
646 // of left capturing parentheses in the input.
647 int start = position();
648 int value = Next() - '0';
649 Advance(2);
650 while (true) {
651 uc32 c = current();
652 if (IsDecimalDigit(c)) {
653 value = 10 * value + (c - '0');
654 if (value > kMaxCaptures) {
655 Reset(start);
656 return false;
657 }
658 Advance();
659 } else {
660 break;
661 }
662 }
663 if (value > captures_started()) {
664 if (!is_scanned_for_captures_) {
665 int saved_position = position();
666 ScanForCaptures();
667 Reset(saved_position);
668 }
669 if (value > capture_count_) {
670 Reset(start);
671 return false;
672 }
673 }
674 *index_out = value;
675 return true;
676}
677
678
679RegExpCapture* RegExpParser::GetCapture(int index) {
680 // The index for the capture groups are one-based. Its index in the list is
681 // zero-based.
682 int know_captures =
683 is_scanned_for_captures_ ? capture_count_ : captures_started_;
684 DCHECK(index <= know_captures);
685 if (captures_ == NULL) {
686 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
687 }
688 while (captures_->length() < know_captures) {
689 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
690 }
691 return captures_->at(index - 1);
692}
693
694
695bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
696 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
697 if (s->group_type() != CAPTURE) continue;
698 // Return true if we found the matching capture index.
699 if (index == s->capture_index()) return true;
700 // Abort if index is larger than what has been parsed up till this state.
701 if (index > s->capture_index()) return false;
702 }
703 return false;
704}
705
706
707// QuantifierPrefix ::
708// { DecimalDigits }
709// { DecimalDigits , }
710// { DecimalDigits , DecimalDigits }
711//
712// Returns true if parsing succeeds, and set the min_out and max_out
713// values. Values are truncated to RegExpTree::kInfinity if they overflow.
714bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
715 DCHECK_EQ(current(), '{');
716 int start = position();
717 Advance();
718 int min = 0;
719 if (!IsDecimalDigit(current())) {
720 Reset(start);
721 return false;
722 }
723 while (IsDecimalDigit(current())) {
724 int next = current() - '0';
725 if (min > (RegExpTree::kInfinity - next) / 10) {
726 // Overflow. Skip past remaining decimal digits and return -1.
727 do {
728 Advance();
729 } while (IsDecimalDigit(current()));
730 min = RegExpTree::kInfinity;
731 break;
732 }
733 min = 10 * min + next;
734 Advance();
735 }
736 int max = 0;
737 if (current() == '}') {
738 max = min;
739 Advance();
740 } else if (current() == ',') {
741 Advance();
742 if (current() == '}') {
743 max = RegExpTree::kInfinity;
744 Advance();
745 } else {
746 while (IsDecimalDigit(current())) {
747 int next = current() - '0';
748 if (max > (RegExpTree::kInfinity - next) / 10) {
749 do {
750 Advance();
751 } while (IsDecimalDigit(current()));
752 max = RegExpTree::kInfinity;
753 break;
754 }
755 max = 10 * max + next;
756 Advance();
757 }
758 if (current() != '}') {
759 Reset(start);
760 return false;
761 }
762 Advance();
763 }
764 } else {
765 Reset(start);
766 return false;
767 }
768 *min_out = min;
769 *max_out = max;
770 return true;
771}
772
773
774uc32 RegExpParser::ParseOctalLiteral() {
775 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
776 // For compatibility with some other browsers (not all), we parse
777 // up to three octal digits with a value below 256.
778 uc32 value = current() - '0';
779 Advance();
780 if ('0' <= current() && current() <= '7') {
781 value = value * 8 + current() - '0';
782 Advance();
783 if (value < 32 && '0' <= current() && current() <= '7') {
784 value = value * 8 + current() - '0';
785 Advance();
786 }
787 }
788 return value;
789}
790
791
792bool RegExpParser::ParseHexEscape(int length, uc32* value) {
793 int start = position();
794 uc32 val = 0;
795 for (int i = 0; i < length; ++i) {
796 uc32 c = current();
797 int d = HexValue(c);
798 if (d < 0) {
799 Reset(start);
800 return false;
801 }
802 val = val * 16 + d;
803 Advance();
804 }
805 *value = val;
806 return true;
807}
808
Ben Murdoch097c5b22016-05-18 11:27:45 +0100809// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000810bool RegExpParser::ParseUnicodeEscape(uc32* value) {
811 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
812 // allowed). In the latter case, the number of hex digits between { } is
813 // arbitrary. \ and u have already been read.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100814 if (current() == '{' && unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000815 int start = position();
816 Advance();
817 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
818 if (current() == '}') {
819 Advance();
820 return true;
821 }
822 }
823 Reset(start);
824 return false;
825 }
826 // \u but no {, or \u{...} escapes not allowed.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100827 bool result = ParseHexEscape(4, value);
828 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
829 current() == '\\') {
830 // Attempt to read trail surrogate.
831 int start = position();
832 if (Next() == 'u') {
833 Advance(2);
834 uc32 trail;
835 if (ParseHexEscape(4, &trail) &&
836 unibrow::Utf16::IsTrailSurrogate(trail)) {
837 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
838 static_cast<uc16>(trail));
839 return true;
840 }
841 }
842 Reset(start);
843 }
844 return result;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000845}
846
Ben Murdoch097c5b22016-05-18 11:27:45 +0100847#ifdef V8_I18N_SUPPORT
Ben Murdochc5610432016-08-08 18:44:38 +0100848bool IsExactPropertyAlias(const char* property_name, UProperty property) {
849 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
Ben Murdochda12d292016-06-02 14:46:10 +0100850 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
851 for (int i = 0;; i++) {
Ben Murdochc5610432016-08-08 18:44:38 +0100852 const char* long_name = u_getPropertyName(
853 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
Ben Murdochda12d292016-06-02 14:46:10 +0100854 if (long_name == NULL) break;
855 if (strcmp(property_name, long_name) == 0) return true;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100856 }
Ben Murdochda12d292016-06-02 14:46:10 +0100857 return false;
858}
Ben Murdoch097c5b22016-05-18 11:27:45 +0100859
Ben Murdochc5610432016-08-08 18:44:38 +0100860bool IsExactPropertyValueAlias(const char* property_value_name,
861 UProperty property, int32_t property_value) {
862 const char* short_name =
863 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
864 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
865 return true;
866 }
867 for (int i = 0;; i++) {
868 const char* long_name = u_getPropertyValueName(
869 property, property_value,
870 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
871 if (long_name == NULL) break;
872 if (strcmp(property_value_name, long_name) == 0) return true;
873 }
874 return false;
875}
876
877bool LookupPropertyValueName(UProperty property,
878 const char* property_value_name,
879 ZoneList<CharacterRange>* result, Zone* zone) {
880 int32_t property_value =
881 u_getPropertyValueEnum(property, property_value_name);
Ben Murdochda12d292016-06-02 14:46:10 +0100882 if (property_value == UCHAR_INVALID_CODE) return false;
883
884 // We require the property name to match exactly to one of the property value
885 // aliases. However, u_getPropertyValueEnum uses loose matching.
Ben Murdochc5610432016-08-08 18:44:38 +0100886 if (!IsExactPropertyValueAlias(property_value_name, property,
887 property_value)) {
Ben Murdochda12d292016-06-02 14:46:10 +0100888 return false;
889 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100890
891 USet* set = uset_openEmpty();
892 UErrorCode ec = U_ZERO_ERROR;
Ben Murdochda12d292016-06-02 14:46:10 +0100893 uset_applyIntPropertyValue(set, property, property_value, &ec);
894 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
895
896 if (success) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100897 uset_removeAllStrings(set);
898 int item_count = uset_getItemCount(set);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100899 int item_result = 0;
900 for (int i = 0; i < item_count; i++) {
901 uc32 start = 0;
902 uc32 end = 0;
903 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
Ben Murdochda12d292016-06-02 14:46:10 +0100904 result->Add(CharacterRange::Range(start, end), zone);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100905 }
906 DCHECK_EQ(U_ZERO_ERROR, ec);
907 DCHECK_EQ(0, item_result);
908 }
909 uset_close(set);
Ben Murdochda12d292016-06-02 14:46:10 +0100910 return success;
911}
Ben Murdochda12d292016-06-02 14:46:10 +0100912
913bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
Ben Murdochc5610432016-08-08 18:44:38 +0100914 // Parse the property class as follows:
915 // - \pN with a single-character N is equivalent to \p{N}
916 // - In \p{name}, 'name' is interpreted
917 // - either as a general category property value name.
918 // - or as a binary property name.
919 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
920 // and 'value' is interpreted as one of the available property value names.
921 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
922 // - Loose matching is not applied.
923 List<char> first_part;
924 List<char> second_part;
Ben Murdochda12d292016-06-02 14:46:10 +0100925 if (current() == '{') {
Ben Murdochc5610432016-08-08 18:44:38 +0100926 // Parse \p{[PropertyName=]PropertyNameValue}
927 for (Advance(); current() != '}' && current() != '='; Advance()) {
Ben Murdochda12d292016-06-02 14:46:10 +0100928 if (!has_next()) return false;
Ben Murdochc5610432016-08-08 18:44:38 +0100929 first_part.Add(static_cast<char>(current()));
930 }
931 if (current() == '=') {
932 for (Advance(); current() != '}'; Advance()) {
933 if (!has_next()) return false;
934 second_part.Add(static_cast<char>(current()));
935 }
936 second_part.Add(0); // null-terminate string.
Ben Murdochda12d292016-06-02 14:46:10 +0100937 }
938 } else if (current() != kEndMarker) {
Ben Murdochc5610432016-08-08 18:44:38 +0100939 // Parse \pN, where N is a single-character property name value.
940 first_part.Add(static_cast<char>(current()));
Ben Murdochda12d292016-06-02 14:46:10 +0100941 } else {
942 return false;
943 }
944 Advance();
Ben Murdochc5610432016-08-08 18:44:38 +0100945 first_part.Add(0); // null-terminate string.
Ben Murdochda12d292016-06-02 14:46:10 +0100946
Ben Murdochc5610432016-08-08 18:44:38 +0100947 if (second_part.is_empty()) {
948 // First attempt to interpret as general category property value name.
949 const char* name = first_part.ToConstVector().start();
950 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result,
951 zone())) {
952 return true;
953 }
954 // Then attempt to interpret as binary property name with value name 'Y'.
955 UProperty property = u_getPropertyEnum(name);
956 if (property < UCHAR_BINARY_START) return false;
957 if (property >= UCHAR_BINARY_LIMIT) return false;
958 if (!IsExactPropertyAlias(name, property)) return false;
959 return LookupPropertyValueName(property, "Y", result, zone());
960 } else {
961 // Both property name and value name are specified. Attempt to interpret
962 // the property name as enumerated property.
963 const char* property_name = first_part.ToConstVector().start();
964 const char* value_name = second_part.ToConstVector().start();
965 UProperty property = u_getPropertyEnum(property_name);
966 if (property < UCHAR_INT_START) return false;
967 if (property >= UCHAR_INT_LIMIT) return false;
968 if (!IsExactPropertyAlias(property_name, property)) return false;
969 return LookupPropertyValueName(property, value_name, result, zone());
Ben Murdochda12d292016-06-02 14:46:10 +0100970 }
Ben Murdochc5610432016-08-08 18:44:38 +0100971}
972
973#else // V8_I18N_SUPPORT
974
975bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
Ben Murdochda12d292016-06-02 14:46:10 +0100976 return false;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100977}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000978
Ben Murdochc5610432016-08-08 18:44:38 +0100979#endif // V8_I18N_SUPPORT
980
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000981bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
982 uc32 x = 0;
983 int d = HexValue(current());
984 if (d < 0) {
985 return false;
986 }
987 while (d >= 0) {
988 x = x * 16 + d;
989 if (x > max_value) {
990 return false;
991 }
992 Advance();
993 d = HexValue(current());
994 }
995 *value = x;
996 return true;
997}
998
999
1000uc32 RegExpParser::ParseClassCharacterEscape() {
1001 DCHECK(current() == '\\');
1002 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1003 Advance();
1004 switch (current()) {
1005 case 'b':
1006 Advance();
1007 return '\b';
1008 // ControlEscape :: one of
1009 // f n r t v
1010 case 'f':
1011 Advance();
1012 return '\f';
1013 case 'n':
1014 Advance();
1015 return '\n';
1016 case 'r':
1017 Advance();
1018 return '\r';
1019 case 't':
1020 Advance();
1021 return '\t';
1022 case 'v':
1023 Advance();
1024 return '\v';
1025 case 'c': {
1026 uc32 controlLetter = Next();
1027 uc32 letter = controlLetter & ~('A' ^ 'a');
Ben Murdoch097c5b22016-05-18 11:27:45 +01001028 // For compatibility with JSC, inside a character class. We also accept
1029 // digits and underscore as control characters, unless with /u.
1030 if (letter >= 'A' && letter <= 'Z') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001031 Advance(2);
1032 // Control letters mapped to ASCII control characters in the range
1033 // 0x00-0x1f.
1034 return controlLetter & 0x1f;
1035 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001036 if (unicode()) {
1037 // With /u, invalid escapes are not treated as identity escapes.
1038 ReportError(CStrVector("Invalid class escape"));
1039 return 0;
1040 }
1041 if ((controlLetter >= '0' && controlLetter <= '9') ||
1042 controlLetter == '_') {
1043 Advance(2);
1044 return controlLetter & 0x1f;
1045 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001046 // We match JSC in reading the backslash as a literal
1047 // character instead of as starting an escape.
1048 return '\\';
1049 }
1050 case '0':
Ben Murdoch097c5b22016-05-18 11:27:45 +01001051 // With /u, \0 is interpreted as NUL if not followed by another digit.
1052 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1053 Advance();
1054 return 0;
1055 }
1056 // Fall through.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001057 case '1':
1058 case '2':
1059 case '3':
1060 case '4':
1061 case '5':
1062 case '6':
1063 case '7':
1064 // For compatibility, we interpret a decimal escape that isn't
1065 // a back reference (and therefore either \0 or not valid according
1066 // to the specification) as a 1..3 digit octal character code.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001067 if (unicode()) {
1068 // With /u, decimal escape is not interpreted as octal character code.
1069 ReportError(CStrVector("Invalid class escape"));
1070 return 0;
1071 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001072 return ParseOctalLiteral();
1073 case 'x': {
1074 Advance();
1075 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001076 if (ParseHexEscape(2, &value)) return value;
1077 if (unicode()) {
1078 // With /u, invalid escapes are not treated as identity escapes.
1079 ReportError(CStrVector("Invalid escape"));
1080 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001081 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001082 // If \x is not followed by a two-digit hexadecimal, treat it
1083 // as an identity escape.
1084 return 'x';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001085 }
1086 case 'u': {
1087 Advance();
1088 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001089 if (ParseUnicodeEscape(&value)) return value;
1090 if (unicode()) {
1091 // With /u, invalid escapes are not treated as identity escapes.
1092 ReportError(CStrVector("Invalid unicode escape"));
1093 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001094 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001095 // If \u is not followed by a two-digit hexadecimal, treat it
1096 // as an identity escape.
1097 return 'u';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001098 }
1099 default: {
1100 uc32 result = current();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001101 // With /u, no identity escapes except for syntax characters and '-' are
1102 // allowed. Otherwise, all identity escapes are allowed.
1103 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001104 Advance();
1105 return result;
1106 }
1107 ReportError(CStrVector("Invalid escape"));
1108 return 0;
1109 }
1110 }
1111 return 0;
1112}
1113
1114
1115CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1116 DCHECK_EQ(0, *char_class);
1117 uc32 first = current();
1118 if (first == '\\') {
1119 switch (Next()) {
1120 case 'w':
1121 case 'W':
1122 case 'd':
1123 case 'D':
1124 case 's':
1125 case 'S': {
1126 *char_class = Next();
1127 Advance(2);
1128 return CharacterRange::Singleton(0); // Return dummy value.
1129 }
1130 case kEndMarker:
1131 return ReportError(CStrVector("\\ at end of pattern"));
1132 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +01001133 first = ParseClassCharacterEscape(CHECK_FAILED);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001134 }
1135 } else {
1136 Advance();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001137 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001138
1139 return CharacterRange::Singleton(first);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001140}
1141
1142
1143static const uc16 kNoCharClass = 0;
1144
1145// Adds range or pre-defined character class to character ranges.
1146// If char_class is not kInvalidClass, it's interpreted as a class
1147// escape (i.e., 's' means whitespace, from '\s').
1148static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1149 uc16 char_class, CharacterRange range,
1150 Zone* zone) {
1151 if (char_class != kNoCharClass) {
1152 CharacterRange::AddClassEscape(char_class, ranges, zone);
1153 } else {
1154 ranges->Add(range, zone);
1155 }
1156}
1157
Ben Murdochda12d292016-06-02 14:46:10 +01001158bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1159 if (!FLAG_harmony_regexp_property) return false;
1160 if (!unicode()) return false;
1161 if (current() != '\\') return false;
1162 uc32 next = Next();
1163 bool parse_success = false;
1164 if (next == 'p') {
1165 Advance(2);
1166 parse_success = ParsePropertyClass(ranges);
1167 } else if (next == 'P') {
1168 Advance(2);
1169 ZoneList<CharacterRange>* property_class =
1170 new (zone()) ZoneList<CharacterRange>(2, zone());
1171 parse_success = ParsePropertyClass(property_class);
1172 if (parse_success) {
1173 ZoneList<CharacterRange>* negated =
1174 new (zone()) ZoneList<CharacterRange>(2, zone());
1175 CharacterRange::Negate(property_class, negated, zone());
1176 const Vector<CharacterRange> negated_vector = negated->ToVector();
1177 ranges->AddAll(negated_vector, zone());
1178 }
1179 } else {
1180 return false;
1181 }
1182 if (!parse_success)
1183 ReportError(CStrVector("Invalid property name in character class"));
1184 return parse_success;
1185}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001186
1187RegExpTree* RegExpParser::ParseCharacterClass() {
1188 static const char* kUnterminated = "Unterminated character class";
Ben Murdoch097c5b22016-05-18 11:27:45 +01001189 static const char* kRangeInvalid = "Invalid character class";
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001190 static const char* kRangeOutOfOrder = "Range out of order in character class";
1191
1192 DCHECK_EQ(current(), '[');
1193 Advance();
1194 bool is_negated = false;
1195 if (current() == '^') {
1196 is_negated = true;
1197 Advance();
1198 }
1199 ZoneList<CharacterRange>* ranges =
1200 new (zone()) ZoneList<CharacterRange>(2, zone());
1201 while (has_more() && current() != ']') {
Ben Murdochda12d292016-06-02 14:46:10 +01001202 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1203 if (parsed_property) continue;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001204 uc16 char_class = kNoCharClass;
1205 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1206 if (current() == '-') {
1207 Advance();
1208 if (current() == kEndMarker) {
1209 // If we reach the end we break out of the loop and let the
1210 // following code report an error.
1211 break;
1212 } else if (current() == ']') {
1213 AddRangeOrEscape(ranges, char_class, first, zone());
1214 ranges->Add(CharacterRange::Singleton('-'), zone());
1215 break;
1216 }
1217 uc16 char_class_2 = kNoCharClass;
1218 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1219 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1220 // Either end is an escaped character class. Treat the '-' verbatim.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001221 if (unicode()) {
1222 // ES2015 21.2.2.15.1 step 1.
1223 return ReportError(CStrVector(kRangeInvalid));
1224 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001225 AddRangeOrEscape(ranges, char_class, first, zone());
1226 ranges->Add(CharacterRange::Singleton('-'), zone());
1227 AddRangeOrEscape(ranges, char_class_2, next, zone());
1228 continue;
1229 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001230 // ES2015 21.2.2.15.1 step 6.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001231 if (first.from() > next.to()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001232 return ReportError(CStrVector(kRangeOutOfOrder));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001233 }
1234 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1235 } else {
1236 AddRangeOrEscape(ranges, char_class, first, zone());
1237 }
1238 }
1239 if (!has_more()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001240 return ReportError(CStrVector(kUnterminated));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001241 }
1242 Advance();
1243 if (ranges->length() == 0) {
1244 ranges->Add(CharacterRange::Everything(), zone());
1245 is_negated = !is_negated;
1246 }
1247 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1248}
1249
1250
1251#undef CHECK_FAILED
1252
1253
1254bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001255 FlatStringReader* input, JSRegExp::Flags flags,
1256 RegExpCompileData* result) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001257 DCHECK(result != NULL);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001258 RegExpParser parser(input, &result->error, flags, isolate, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001259 RegExpTree* tree = parser.ParsePattern();
1260 if (parser.failed()) {
1261 DCHECK(tree == NULL);
1262 DCHECK(!result->error.is_null());
1263 } else {
1264 DCHECK(tree != NULL);
1265 DCHECK(result->error.is_null());
1266 if (FLAG_trace_regexp_parser) {
1267 OFStream os(stdout);
1268 tree->Print(os, zone);
1269 os << "\n";
1270 }
1271 result->tree = tree;
1272 int capture_count = parser.captures_started();
1273 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1274 result->contains_anchor = parser.contains_anchor();
1275 result->capture_count = capture_count;
1276 }
1277 return !parser.failed();
1278}
1279
Ben Murdoch097c5b22016-05-18 11:27:45 +01001280RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001281 : zone_(zone),
1282 pending_empty_(false),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001283 ignore_case_(ignore_case),
1284 unicode_(unicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001285 characters_(NULL),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001286 pending_surrogate_(kNoPendingSurrogate),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001287 terms_(),
1288 alternatives_()
1289#ifdef DEBUG
1290 ,
1291 last_added_(ADD_NONE)
1292#endif
1293{
1294}
1295
1296
Ben Murdoch097c5b22016-05-18 11:27:45 +01001297void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1298 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1299 FlushPendingSurrogate();
1300 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1301 pending_surrogate_ = lead_surrogate;
1302}
1303
1304
1305void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1306 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1307 if (pending_surrogate_ != kNoPendingSurrogate) {
1308 uc16 lead_surrogate = pending_surrogate_;
1309 pending_surrogate_ = kNoPendingSurrogate;
1310 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1311 uc32 combined =
1312 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1313 if (NeedsDesugaringForIgnoreCase(combined)) {
1314 AddCharacterClassForDesugaring(combined);
1315 } else {
1316 ZoneList<uc16> surrogate_pair(2, zone());
1317 surrogate_pair.Add(lead_surrogate, zone());
1318 surrogate_pair.Add(trail_surrogate, zone());
1319 RegExpAtom* atom =
1320 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1321 AddAtom(atom);
1322 }
1323 } else {
1324 pending_surrogate_ = trail_surrogate;
1325 FlushPendingSurrogate();
1326 }
1327}
1328
1329
1330void RegExpBuilder::FlushPendingSurrogate() {
1331 if (pending_surrogate_ != kNoPendingSurrogate) {
1332 DCHECK(unicode());
1333 uc32 c = pending_surrogate_;
1334 pending_surrogate_ = kNoPendingSurrogate;
1335 AddCharacterClassForDesugaring(c);
1336 }
1337}
1338
1339
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001340void RegExpBuilder::FlushCharacters() {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001341 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001342 pending_empty_ = false;
1343 if (characters_ != NULL) {
1344 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1345 characters_ = NULL;
1346 text_.Add(atom, zone());
1347 LAST(ADD_ATOM);
1348 }
1349}
1350
1351
1352void RegExpBuilder::FlushText() {
1353 FlushCharacters();
1354 int num_text = text_.length();
1355 if (num_text == 0) {
1356 return;
1357 } else if (num_text == 1) {
1358 terms_.Add(text_.last(), zone());
1359 } else {
1360 RegExpText* text = new (zone()) RegExpText(zone());
1361 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1362 terms_.Add(text, zone());
1363 }
1364 text_.Clear();
1365}
1366
1367
1368void RegExpBuilder::AddCharacter(uc16 c) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001369 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001370 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001371 if (NeedsDesugaringForIgnoreCase(c)) {
1372 AddCharacterClassForDesugaring(c);
1373 } else {
1374 if (characters_ == NULL) {
1375 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1376 }
1377 characters_->Add(c, zone());
1378 LAST(ADD_CHAR);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001379 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001380}
1381
1382
1383void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1384 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001385 DCHECK(unicode());
1386 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1387 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1388 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1389 AddLeadSurrogate(c);
1390 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1391 AddTrailSurrogate(c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001392 } else {
1393 AddCharacter(static_cast<uc16>(c));
1394 }
1395}
1396
Ben Murdoch097c5b22016-05-18 11:27:45 +01001397void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1398 // A lead or trail surrogate parsed via escape sequence will not
1399 // pair up with any preceding lead or following trail surrogate.
1400 FlushPendingSurrogate();
1401 AddUnicodeCharacter(character);
1402 FlushPendingSurrogate();
1403}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001404
1405void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1406
1407
Ben Murdoch097c5b22016-05-18 11:27:45 +01001408void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1409 if (NeedsDesugaringForUnicode(cc)) {
1410 // With /u, character class needs to be desugared, so it
1411 // must be a standalone term instead of being part of a RegExpText.
1412 AddTerm(cc);
1413 } else {
1414 AddAtom(cc);
1415 }
1416}
1417
1418void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1419 AddTerm(new (zone()) RegExpCharacterClass(
1420 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1421}
1422
1423
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001424void RegExpBuilder::AddAtom(RegExpTree* term) {
1425 if (term->IsEmpty()) {
1426 AddEmpty();
1427 return;
1428 }
1429 if (term->IsTextElement()) {
1430 FlushCharacters();
1431 text_.Add(term, zone());
1432 } else {
1433 FlushText();
1434 terms_.Add(term, zone());
1435 }
1436 LAST(ADD_ATOM);
1437}
1438
1439
Ben Murdoch097c5b22016-05-18 11:27:45 +01001440void RegExpBuilder::AddTerm(RegExpTree* term) {
1441 FlushText();
1442 terms_.Add(term, zone());
1443 LAST(ADD_ATOM);
1444}
1445
1446
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001447void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1448 FlushText();
1449 terms_.Add(assert, zone());
1450 LAST(ADD_ASSERT);
1451}
1452
1453
1454void RegExpBuilder::NewAlternative() { FlushTerms(); }
1455
1456
1457void RegExpBuilder::FlushTerms() {
1458 FlushText();
1459 int num_terms = terms_.length();
1460 RegExpTree* alternative;
1461 if (num_terms == 0) {
1462 alternative = new (zone()) RegExpEmpty();
1463 } else if (num_terms == 1) {
1464 alternative = terms_.last();
1465 } else {
1466 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1467 }
1468 alternatives_.Add(alternative, zone());
1469 terms_.Clear();
1470 LAST(ADD_NONE);
1471}
1472
1473
Ben Murdoch097c5b22016-05-18 11:27:45 +01001474bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1475 if (!unicode()) return false;
Ben Murdochda12d292016-06-02 14:46:10 +01001476 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1477 // necessarily mean that we need to desugar. It's probably nicer to have a
1478 // separate pass to figure out unicode desugarings.
1479 if (ignore_case()) return true;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001480 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1481 CharacterRange::Canonicalize(ranges);
1482 for (int i = ranges->length() - 1; i >= 0; i--) {
1483 uc32 from = ranges->at(i).from();
1484 uc32 to = ranges->at(i).to();
1485 // Check for non-BMP characters.
1486 if (to >= kNonBmpStart) return true;
1487 // Check for lone surrogates.
1488 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1489 }
1490 return false;
1491}
1492
1493
1494bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1495#ifdef V8_I18N_SUPPORT
1496 if (unicode() && ignore_case()) {
1497 USet* set = uset_open(c, c);
1498 uset_closeOver(set, USET_CASE_INSENSITIVE);
1499 uset_removeAllStrings(set);
1500 bool result = uset_size(set) > 1;
1501 uset_close(set);
1502 return result;
1503 }
1504 // In the case where ICU is not included, we act as if the unicode flag is
1505 // not set, and do not desugar.
1506#endif // V8_I18N_SUPPORT
1507 return false;
1508}
1509
1510
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001511RegExpTree* RegExpBuilder::ToRegExp() {
1512 FlushTerms();
1513 int num_alternatives = alternatives_.length();
1514 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1515 if (num_alternatives == 1) return alternatives_.last();
1516 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1517}
1518
Ben Murdoch097c5b22016-05-18 11:27:45 +01001519bool RegExpBuilder::AddQuantifierToAtom(
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001520 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001521 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001522 if (pending_empty_) {
1523 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001524 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001525 }
1526 RegExpTree* atom;
1527 if (characters_ != NULL) {
1528 DCHECK(last_added_ == ADD_CHAR);
1529 // Last atom was character.
1530 Vector<const uc16> char_vector = characters_->ToConstVector();
1531 int num_chars = char_vector.length();
1532 if (num_chars > 1) {
1533 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1534 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1535 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1536 }
1537 characters_ = NULL;
1538 atom = new (zone()) RegExpAtom(char_vector);
1539 FlushText();
1540 } else if (text_.length() > 0) {
1541 DCHECK(last_added_ == ADD_ATOM);
1542 atom = text_.RemoveLast();
1543 FlushText();
1544 } else if (terms_.length() > 0) {
1545 DCHECK(last_added_ == ADD_ATOM);
1546 atom = terms_.RemoveLast();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001547 // With /u, lookarounds are not quantifiable.
1548 if (unicode() && atom->IsLookaround()) return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001549 if (atom->max_match() == 0) {
1550 // Guaranteed to only match an empty string.
1551 LAST(ADD_TERM);
1552 if (min == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001553 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001554 }
1555 terms_.Add(atom, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01001556 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001557 }
1558 } else {
1559 // Only call immediately after adding an atom or character!
1560 UNREACHABLE();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001561 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001562 }
1563 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1564 zone());
1565 LAST(ADD_TERM);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001566 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001567}
1568
1569} // namespace internal
1570} // namespace v8