blob: fa8900342cfc4878411a1c06d753254024f138fe [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
11#include "src/regexp/jsregexp.h"
12#include "src/utils.h"
13
14namespace v8 {
15namespace internal {
16
17RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18 bool multiline, bool unicode, Isolate* isolate,
19 Zone* zone)
20 : isolate_(isolate),
21 zone_(zone),
22 error_(error),
23 captures_(NULL),
24 in_(in),
25 current_(kEndMarker),
26 next_pos_(0),
27 captures_started_(0),
28 capture_count_(0),
29 has_more_(true),
30 multiline_(multiline),
31 unicode_(unicode),
32 simple_(false),
33 contains_anchor_(false),
34 is_scanned_for_captures_(false),
35 failed_(false) {
36 Advance();
37}
38
39
40uc32 RegExpParser::Next() {
41 if (has_next()) {
42 return in()->Get(next_pos_);
43 } else {
44 return kEndMarker;
45 }
46}
47
48
49void RegExpParser::Advance() {
50 if (next_pos_ < in()->length()) {
51 StackLimitCheck check(isolate());
52 if (check.HasOverflowed()) {
53 ReportError(CStrVector(Isolate::kStackOverflowMessage));
54 } else if (zone()->excess_allocation()) {
55 ReportError(CStrVector("Regular expression too large"));
56 } else {
57 current_ = in()->Get(next_pos_);
58 next_pos_++;
59 // Read the whole surrogate pair in case of unicode flag, if possible.
60 if (unicode_ && next_pos_ < in()->length() &&
61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
62 uc16 trail = in()->Get(next_pos_);
63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {
64 current_ = unibrow::Utf16::CombineSurrogatePair(
65 static_cast<uc16>(current_), trail);
66 next_pos_++;
67 }
68 }
69 }
70 } else {
71 current_ = kEndMarker;
72 // Advance so that position() points to 1-after-the-last-character. This is
73 // important so that Reset() to this position works correctly.
74 next_pos_ = in()->length() + 1;
75 has_more_ = false;
76 }
77}
78
79
80void RegExpParser::Reset(int pos) {
81 next_pos_ = pos;
82 has_more_ = (pos < in()->length());
83 Advance();
84}
85
86
87void RegExpParser::Advance(int dist) {
88 next_pos_ += dist - 1;
89 Advance();
90}
91
92
93bool RegExpParser::simple() { return simple_; }
94
95
96bool RegExpParser::IsSyntaxCharacter(uc32 c) {
97 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
98 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
99 c == '{' || c == '}' || c == '|';
100}
101
102
103RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
104 failed_ = true;
105 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
106 // Zip to the end to make sure the no more input is read.
107 current_ = kEndMarker;
108 next_pos_ = in()->length();
109 return NULL;
110}
111
112
113#define CHECK_FAILED /**/); \
114 if (failed_) return NULL; \
115 ((void)0
116
117
118// Pattern ::
119// Disjunction
120RegExpTree* RegExpParser::ParsePattern() {
121 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
122 DCHECK(!has_more());
123 // If the result of parsing is a literal string atom, and it has the
124 // same length as the input, then the atom is identical to the input.
125 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
126 simple_ = true;
127 }
128 return result;
129}
130
131
132// Disjunction ::
133// Alternative
134// Alternative | Disjunction
135// Alternative ::
136// [empty]
137// Term Alternative
138// Term ::
139// Assertion
140// Atom
141// Atom Quantifier
142RegExpTree* RegExpParser::ParseDisjunction() {
143 // Used to store current state while parsing subexpressions.
144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
145 zone());
146 RegExpParserState* state = &initial_state;
147 // Cache the builder in a local variable for quick access.
148 RegExpBuilder* builder = initial_state.builder();
149 while (true) {
150 switch (current()) {
151 case kEndMarker:
152 if (state->IsSubexpression()) {
153 // Inside a parenthesized group when hitting end of input.
154 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
155 }
156 DCHECK_EQ(INITIAL, state->group_type());
157 // Parsing completed successfully.
158 return builder->ToRegExp();
159 case ')': {
160 if (!state->IsSubexpression()) {
161 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
162 }
163 DCHECK_NE(INITIAL, state->group_type());
164
165 Advance();
166 // End disjunction parsing and convert builder content to new single
167 // regexp atom.
168 RegExpTree* body = builder->ToRegExp();
169
170 int end_capture_index = captures_started();
171
172 int capture_index = state->capture_index();
173 SubexpressionType group_type = state->group_type();
174
175 // Build result of subexpression.
176 if (group_type == CAPTURE) {
177 RegExpCapture* capture = GetCapture(capture_index);
178 capture->set_body(body);
179 body = capture;
180 } else if (group_type != GROUPING) {
181 DCHECK(group_type == POSITIVE_LOOKAROUND ||
182 group_type == NEGATIVE_LOOKAROUND);
183 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
184 body = new (zone()) RegExpLookaround(
185 body, is_positive, end_capture_index - capture_index,
186 capture_index, state->lookaround_type());
187 }
188
189 // Restore previous state.
190 state = state->previous_state();
191 builder = state->builder();
192
193 builder->AddAtom(body);
194 // For compatability with JSC and ES3, we allow quantifiers after
195 // lookaheads, and break in all cases.
196 break;
197 }
198 case '|': {
199 Advance();
200 builder->NewAlternative();
201 continue;
202 }
203 case '*':
204 case '+':
205 case '?':
206 return ReportError(CStrVector("Nothing to repeat"));
207 case '^': {
208 Advance();
209 if (multiline_) {
210 builder->AddAssertion(
211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
212 } else {
213 builder->AddAssertion(
214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
215 set_contains_anchor();
216 }
217 continue;
218 }
219 case '$': {
220 Advance();
221 RegExpAssertion::AssertionType assertion_type =
222 multiline_ ? RegExpAssertion::END_OF_LINE
223 : RegExpAssertion::END_OF_INPUT;
224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
225 continue;
226 }
227 case '.': {
228 Advance();
229 // everything except \x0a, \x0d, \u2028 and \u2029
230 ZoneList<CharacterRange>* ranges =
231 new (zone()) ZoneList<CharacterRange>(2, zone());
232 CharacterRange::AddClassEscape('.', ranges, zone());
233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
234 builder->AddAtom(atom);
235 break;
236 }
237 case '(': {
238 SubexpressionType subexpr_type = CAPTURE;
239 RegExpLookaround::Type lookaround_type = state->lookaround_type();
240 Advance();
241 if (current() == '?') {
242 switch (Next()) {
243 case ':':
244 subexpr_type = GROUPING;
245 break;
246 case '=':
247 lookaround_type = RegExpLookaround::LOOKAHEAD;
248 subexpr_type = POSITIVE_LOOKAROUND;
249 break;
250 case '!':
251 lookaround_type = RegExpLookaround::LOOKAHEAD;
252 subexpr_type = NEGATIVE_LOOKAROUND;
253 break;
254 case '<':
255 if (FLAG_harmony_regexp_lookbehind) {
256 Advance();
257 lookaround_type = RegExpLookaround::LOOKBEHIND;
258 if (Next() == '=') {
259 subexpr_type = POSITIVE_LOOKAROUND;
260 break;
261 } else if (Next() == '!') {
262 subexpr_type = NEGATIVE_LOOKAROUND;
263 break;
264 }
265 }
266 // Fall through.
267 default:
268 ReportError(CStrVector("Invalid group") CHECK_FAILED);
269 break;
270 }
271 Advance(2);
272 } else {
273 if (captures_started_ >= kMaxCaptures) {
274 ReportError(CStrVector("Too many captures") CHECK_FAILED);
275 }
276 captures_started_++;
277 }
278 // Store current state and begin new disjunction parsing.
279 state = new (zone()) RegExpParserState(
280 state, subexpr_type, lookaround_type, captures_started_, zone());
281 builder = state->builder();
282 continue;
283 }
284 case '[': {
285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
286 builder->AddAtom(atom);
287 break;
288 }
289 // Atom ::
290 // \ AtomEscape
291 case '\\':
292 switch (Next()) {
293 case kEndMarker:
294 return ReportError(CStrVector("\\ at end of pattern"));
295 case 'b':
296 Advance(2);
297 builder->AddAssertion(
298 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
299 continue;
300 case 'B':
301 Advance(2);
302 builder->AddAssertion(
303 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
304 continue;
305 // AtomEscape ::
306 // CharacterClassEscape
307 //
308 // CharacterClassEscape :: one of
309 // d D s S w W
310 case 'd':
311 case 'D':
312 case 's':
313 case 'S':
314 case 'w':
315 case 'W': {
316 uc32 c = Next();
317 Advance(2);
318 ZoneList<CharacterRange>* ranges =
319 new (zone()) ZoneList<CharacterRange>(2, zone());
320 CharacterRange::AddClassEscape(c, ranges, zone());
321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
322 builder->AddAtom(atom);
323 break;
324 }
325 case '1':
326 case '2':
327 case '3':
328 case '4':
329 case '5':
330 case '6':
331 case '7':
332 case '8':
333 case '9': {
334 int index = 0;
335 if (ParseBackReferenceIndex(&index)) {
336 if (state->IsInsideCaptureGroup(index)) {
337 // The back reference is inside the capture group it refers to.
338 // Nothing can possibly have been captured yet, so we use empty
339 // instead. This ensures that, when checking a back reference,
340 // the capture registers of the referenced capture are either
341 // both set or both cleared.
342 builder->AddEmpty();
343 } else {
344 RegExpCapture* capture = GetCapture(index);
345 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
346 builder->AddAtom(atom);
347 }
348 break;
349 }
350 uc32 first_digit = Next();
351 if (first_digit == '8' || first_digit == '9') {
352 // If the 'u' flag is present, only syntax characters can be
353 // escaped,
354 // no other identity escapes are allowed. If the 'u' flag is not
355 // present, all identity escapes are allowed.
356 if (!unicode_) {
357 builder->AddCharacter(first_digit);
358 Advance(2);
359 } else {
360 return ReportError(CStrVector("Invalid escape"));
361 }
362 break;
363 }
364 }
365 // FALLTHROUGH
366 case '0': {
367 Advance();
368 uc32 octal = ParseOctalLiteral();
369 builder->AddCharacter(octal);
370 break;
371 }
372 // ControlEscape :: one of
373 // f n r t v
374 case 'f':
375 Advance(2);
376 builder->AddCharacter('\f');
377 break;
378 case 'n':
379 Advance(2);
380 builder->AddCharacter('\n');
381 break;
382 case 'r':
383 Advance(2);
384 builder->AddCharacter('\r');
385 break;
386 case 't':
387 Advance(2);
388 builder->AddCharacter('\t');
389 break;
390 case 'v':
391 Advance(2);
392 builder->AddCharacter('\v');
393 break;
394 case 'c': {
395 Advance();
396 uc32 controlLetter = Next();
397 // Special case if it is an ASCII letter.
398 // Convert lower case letters to uppercase.
399 uc32 letter = controlLetter & ~('a' ^ 'A');
400 if (letter < 'A' || 'Z' < letter) {
401 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
402 // This is outside the specification. We match JSC in
403 // reading the backslash as a literal character instead
404 // of as starting an escape.
405 builder->AddCharacter('\\');
406 } else {
407 Advance(2);
408 builder->AddCharacter(controlLetter & 0x1f);
409 }
410 break;
411 }
412 case 'x': {
413 Advance(2);
414 uc32 value;
415 if (ParseHexEscape(2, &value)) {
416 builder->AddCharacter(value);
417 } else if (!unicode_) {
418 builder->AddCharacter('x');
419 } else {
420 // If the 'u' flag is present, invalid escapes are not treated as
421 // identity escapes.
422 return ReportError(CStrVector("Invalid escape"));
423 }
424 break;
425 }
426 case 'u': {
427 Advance(2);
428 uc32 value;
429 if (ParseUnicodeEscape(&value)) {
430 builder->AddUnicodeCharacter(value);
431 } else if (!unicode_) {
432 builder->AddCharacter('u');
433 } else {
434 // If the 'u' flag is present, invalid escapes are not treated as
435 // identity escapes.
436 return ReportError(CStrVector("Invalid unicode escape"));
437 }
438 break;
439 }
440 default:
441 Advance();
442 // If the 'u' flag is present, only syntax characters can be
443 // escaped, no
444 // other identity escapes are allowed. If the 'u' flag is not
445 // present,
446 // all identity escapes are allowed.
447 if (!unicode_ || IsSyntaxCharacter(current())) {
448 builder->AddCharacter(current());
449 Advance();
450 } else {
451 return ReportError(CStrVector("Invalid escape"));
452 }
453 break;
454 }
455 break;
456 case '{': {
457 int dummy;
458 if (ParseIntervalQuantifier(&dummy, &dummy)) {
459 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
460 }
461 // fallthrough
462 }
463 default:
464 builder->AddUnicodeCharacter(current());
465 Advance();
466 break;
467 } // end switch(current())
468
469 int min;
470 int max;
471 switch (current()) {
472 // QuantifierPrefix ::
473 // *
474 // +
475 // ?
476 // {
477 case '*':
478 min = 0;
479 max = RegExpTree::kInfinity;
480 Advance();
481 break;
482 case '+':
483 min = 1;
484 max = RegExpTree::kInfinity;
485 Advance();
486 break;
487 case '?':
488 min = 0;
489 max = 1;
490 Advance();
491 break;
492 case '{':
493 if (ParseIntervalQuantifier(&min, &max)) {
494 if (max < min) {
495 ReportError(CStrVector("numbers out of order in {} quantifier.")
496 CHECK_FAILED);
497 }
498 break;
499 } else {
500 continue;
501 }
502 default:
503 continue;
504 }
505 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
506 if (current() == '?') {
507 quantifier_type = RegExpQuantifier::NON_GREEDY;
508 Advance();
509 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
510 // FLAG_regexp_possessive_quantifier is a debug-only flag.
511 quantifier_type = RegExpQuantifier::POSSESSIVE;
512 Advance();
513 }
514 builder->AddQuantifierToAtom(min, max, quantifier_type);
515 }
516}
517
518
519#ifdef DEBUG
520// Currently only used in an DCHECK.
521static bool IsSpecialClassEscape(uc32 c) {
522 switch (c) {
523 case 'd':
524 case 'D':
525 case 's':
526 case 'S':
527 case 'w':
528 case 'W':
529 return true;
530 default:
531 return false;
532 }
533}
534#endif
535
536
537// In order to know whether an escape is a backreference or not we have to scan
538// the entire regexp and find the number of capturing parentheses. However we
539// don't want to scan the regexp twice unless it is necessary. This mini-parser
540// is called when needed. It can see the difference between capturing and
541// noncapturing parentheses and can skip character classes and backslash-escaped
542// characters.
543void RegExpParser::ScanForCaptures() {
544 // Start with captures started previous to current position
545 int capture_count = captures_started();
546 // Add count of captures after this position.
547 int n;
548 while ((n = current()) != kEndMarker) {
549 Advance();
550 switch (n) {
551 case '\\':
552 Advance();
553 break;
554 case '[': {
555 int c;
556 while ((c = current()) != kEndMarker) {
557 Advance();
558 if (c == '\\') {
559 Advance();
560 } else {
561 if (c == ']') break;
562 }
563 }
564 break;
565 }
566 case '(':
567 if (current() != '?') capture_count++;
568 break;
569 }
570 }
571 capture_count_ = capture_count;
572 is_scanned_for_captures_ = true;
573}
574
575
576bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
577 DCHECK_EQ('\\', current());
578 DCHECK('1' <= Next() && Next() <= '9');
579 // Try to parse a decimal literal that is no greater than the total number
580 // of left capturing parentheses in the input.
581 int start = position();
582 int value = Next() - '0';
583 Advance(2);
584 while (true) {
585 uc32 c = current();
586 if (IsDecimalDigit(c)) {
587 value = 10 * value + (c - '0');
588 if (value > kMaxCaptures) {
589 Reset(start);
590 return false;
591 }
592 Advance();
593 } else {
594 break;
595 }
596 }
597 if (value > captures_started()) {
598 if (!is_scanned_for_captures_) {
599 int saved_position = position();
600 ScanForCaptures();
601 Reset(saved_position);
602 }
603 if (value > capture_count_) {
604 Reset(start);
605 return false;
606 }
607 }
608 *index_out = value;
609 return true;
610}
611
612
613RegExpCapture* RegExpParser::GetCapture(int index) {
614 // The index for the capture groups are one-based. Its index in the list is
615 // zero-based.
616 int know_captures =
617 is_scanned_for_captures_ ? capture_count_ : captures_started_;
618 DCHECK(index <= know_captures);
619 if (captures_ == NULL) {
620 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
621 }
622 while (captures_->length() < know_captures) {
623 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
624 }
625 return captures_->at(index - 1);
626}
627
628
629bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
630 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
631 if (s->group_type() != CAPTURE) continue;
632 // Return true if we found the matching capture index.
633 if (index == s->capture_index()) return true;
634 // Abort if index is larger than what has been parsed up till this state.
635 if (index > s->capture_index()) return false;
636 }
637 return false;
638}
639
640
641// QuantifierPrefix ::
642// { DecimalDigits }
643// { DecimalDigits , }
644// { DecimalDigits , DecimalDigits }
645//
646// Returns true if parsing succeeds, and set the min_out and max_out
647// values. Values are truncated to RegExpTree::kInfinity if they overflow.
648bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
649 DCHECK_EQ(current(), '{');
650 int start = position();
651 Advance();
652 int min = 0;
653 if (!IsDecimalDigit(current())) {
654 Reset(start);
655 return false;
656 }
657 while (IsDecimalDigit(current())) {
658 int next = current() - '0';
659 if (min > (RegExpTree::kInfinity - next) / 10) {
660 // Overflow. Skip past remaining decimal digits and return -1.
661 do {
662 Advance();
663 } while (IsDecimalDigit(current()));
664 min = RegExpTree::kInfinity;
665 break;
666 }
667 min = 10 * min + next;
668 Advance();
669 }
670 int max = 0;
671 if (current() == '}') {
672 max = min;
673 Advance();
674 } else if (current() == ',') {
675 Advance();
676 if (current() == '}') {
677 max = RegExpTree::kInfinity;
678 Advance();
679 } else {
680 while (IsDecimalDigit(current())) {
681 int next = current() - '0';
682 if (max > (RegExpTree::kInfinity - next) / 10) {
683 do {
684 Advance();
685 } while (IsDecimalDigit(current()));
686 max = RegExpTree::kInfinity;
687 break;
688 }
689 max = 10 * max + next;
690 Advance();
691 }
692 if (current() != '}') {
693 Reset(start);
694 return false;
695 }
696 Advance();
697 }
698 } else {
699 Reset(start);
700 return false;
701 }
702 *min_out = min;
703 *max_out = max;
704 return true;
705}
706
707
708uc32 RegExpParser::ParseOctalLiteral() {
709 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
710 // For compatibility with some other browsers (not all), we parse
711 // up to three octal digits with a value below 256.
712 uc32 value = current() - '0';
713 Advance();
714 if ('0' <= current() && current() <= '7') {
715 value = value * 8 + current() - '0';
716 Advance();
717 if (value < 32 && '0' <= current() && current() <= '7') {
718 value = value * 8 + current() - '0';
719 Advance();
720 }
721 }
722 return value;
723}
724
725
726bool RegExpParser::ParseHexEscape(int length, uc32* value) {
727 int start = position();
728 uc32 val = 0;
729 for (int i = 0; i < length; ++i) {
730 uc32 c = current();
731 int d = HexValue(c);
732 if (d < 0) {
733 Reset(start);
734 return false;
735 }
736 val = val * 16 + d;
737 Advance();
738 }
739 *value = val;
740 return true;
741}
742
743
744bool RegExpParser::ParseUnicodeEscape(uc32* value) {
745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
746 // allowed). In the latter case, the number of hex digits between { } is
747 // arbitrary. \ and u have already been read.
748 if (current() == '{' && unicode_) {
749 int start = position();
750 Advance();
751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
752 if (current() == '}') {
753 Advance();
754 return true;
755 }
756 }
757 Reset(start);
758 return false;
759 }
760 // \u but no {, or \u{...} escapes not allowed.
761 return ParseHexEscape(4, value);
762}
763
764
765bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
766 uc32 x = 0;
767 int d = HexValue(current());
768 if (d < 0) {
769 return false;
770 }
771 while (d >= 0) {
772 x = x * 16 + d;
773 if (x > max_value) {
774 return false;
775 }
776 Advance();
777 d = HexValue(current());
778 }
779 *value = x;
780 return true;
781}
782
783
784uc32 RegExpParser::ParseClassCharacterEscape() {
785 DCHECK(current() == '\\');
786 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
787 Advance();
788 switch (current()) {
789 case 'b':
790 Advance();
791 return '\b';
792 // ControlEscape :: one of
793 // f n r t v
794 case 'f':
795 Advance();
796 return '\f';
797 case 'n':
798 Advance();
799 return '\n';
800 case 'r':
801 Advance();
802 return '\r';
803 case 't':
804 Advance();
805 return '\t';
806 case 'v':
807 Advance();
808 return '\v';
809 case 'c': {
810 uc32 controlLetter = Next();
811 uc32 letter = controlLetter & ~('A' ^ 'a');
812 // For compatibility with JSC, inside a character class
813 // we also accept digits and underscore as control characters.
814 if ((controlLetter >= '0' && controlLetter <= '9') ||
815 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
816 Advance(2);
817 // Control letters mapped to ASCII control characters in the range
818 // 0x00-0x1f.
819 return controlLetter & 0x1f;
820 }
821 // We match JSC in reading the backslash as a literal
822 // character instead of as starting an escape.
823 return '\\';
824 }
825 case '0':
826 case '1':
827 case '2':
828 case '3':
829 case '4':
830 case '5':
831 case '6':
832 case '7':
833 // For compatibility, we interpret a decimal escape that isn't
834 // a back reference (and therefore either \0 or not valid according
835 // to the specification) as a 1..3 digit octal character code.
836 return ParseOctalLiteral();
837 case 'x': {
838 Advance();
839 uc32 value;
840 if (ParseHexEscape(2, &value)) {
841 return value;
842 }
843 if (!unicode_) {
844 // If \x is not followed by a two-digit hexadecimal, treat it
845 // as an identity escape.
846 return 'x';
847 }
848 // If the 'u' flag is present, invalid escapes are not treated as
849 // identity escapes.
850 ReportError(CStrVector("Invalid escape"));
851 return 0;
852 }
853 case 'u': {
854 Advance();
855 uc32 value;
856 if (ParseUnicodeEscape(&value)) {
857 return value;
858 }
859 if (!unicode_) {
860 return 'u';
861 }
862 // If the 'u' flag is present, invalid escapes are not treated as
863 // identity escapes.
864 ReportError(CStrVector("Invalid unicode escape"));
865 return 0;
866 }
867 default: {
868 uc32 result = current();
869 // If the 'u' flag is present, only syntax characters can be escaped, no
870 // other identity escapes are allowed. If the 'u' flag is not present, all
871 // identity escapes are allowed.
872 if (!unicode_ || IsSyntaxCharacter(result)) {
873 Advance();
874 return result;
875 }
876 ReportError(CStrVector("Invalid escape"));
877 return 0;
878 }
879 }
880 return 0;
881}
882
883
884CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
885 DCHECK_EQ(0, *char_class);
886 uc32 first = current();
887 if (first == '\\') {
888 switch (Next()) {
889 case 'w':
890 case 'W':
891 case 'd':
892 case 'D':
893 case 's':
894 case 'S': {
895 *char_class = Next();
896 Advance(2);
897 return CharacterRange::Singleton(0); // Return dummy value.
898 }
899 case kEndMarker:
900 return ReportError(CStrVector("\\ at end of pattern"));
901 default:
902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
903 return CharacterRange::Singleton(c);
904 }
905 } else {
906 Advance();
907 return CharacterRange::Singleton(first);
908 }
909}
910
911
912static const uc16 kNoCharClass = 0;
913
914// Adds range or pre-defined character class to character ranges.
915// If char_class is not kInvalidClass, it's interpreted as a class
916// escape (i.e., 's' means whitespace, from '\s').
917static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
918 uc16 char_class, CharacterRange range,
919 Zone* zone) {
920 if (char_class != kNoCharClass) {
921 CharacterRange::AddClassEscape(char_class, ranges, zone);
922 } else {
923 ranges->Add(range, zone);
924 }
925}
926
927
928RegExpTree* RegExpParser::ParseCharacterClass() {
929 static const char* kUnterminated = "Unterminated character class";
930 static const char* kRangeOutOfOrder = "Range out of order in character class";
931
932 DCHECK_EQ(current(), '[');
933 Advance();
934 bool is_negated = false;
935 if (current() == '^') {
936 is_negated = true;
937 Advance();
938 }
939 ZoneList<CharacterRange>* ranges =
940 new (zone()) ZoneList<CharacterRange>(2, zone());
941 while (has_more() && current() != ']') {
942 uc16 char_class = kNoCharClass;
943 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
944 if (current() == '-') {
945 Advance();
946 if (current() == kEndMarker) {
947 // If we reach the end we break out of the loop and let the
948 // following code report an error.
949 break;
950 } else if (current() == ']') {
951 AddRangeOrEscape(ranges, char_class, first, zone());
952 ranges->Add(CharacterRange::Singleton('-'), zone());
953 break;
954 }
955 uc16 char_class_2 = kNoCharClass;
956 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
957 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
958 // Either end is an escaped character class. Treat the '-' verbatim.
959 AddRangeOrEscape(ranges, char_class, first, zone());
960 ranges->Add(CharacterRange::Singleton('-'), zone());
961 AddRangeOrEscape(ranges, char_class_2, next, zone());
962 continue;
963 }
964 if (first.from() > next.to()) {
965 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
966 }
967 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
968 } else {
969 AddRangeOrEscape(ranges, char_class, first, zone());
970 }
971 }
972 if (!has_more()) {
973 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
974 }
975 Advance();
976 if (ranges->length() == 0) {
977 ranges->Add(CharacterRange::Everything(), zone());
978 is_negated = !is_negated;
979 }
980 return new (zone()) RegExpCharacterClass(ranges, is_negated);
981}
982
983
984#undef CHECK_FAILED
985
986
987bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
988 FlatStringReader* input, bool multiline,
989 bool unicode, RegExpCompileData* result) {
990 DCHECK(result != NULL);
991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
992 RegExpTree* tree = parser.ParsePattern();
993 if (parser.failed()) {
994 DCHECK(tree == NULL);
995 DCHECK(!result->error.is_null());
996 } else {
997 DCHECK(tree != NULL);
998 DCHECK(result->error.is_null());
999 if (FLAG_trace_regexp_parser) {
1000 OFStream os(stdout);
1001 tree->Print(os, zone);
1002 os << "\n";
1003 }
1004 result->tree = tree;
1005 int capture_count = parser.captures_started();
1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1007 result->contains_anchor = parser.contains_anchor();
1008 result->capture_count = capture_count;
1009 }
1010 return !parser.failed();
1011}
1012
1013
1014RegExpBuilder::RegExpBuilder(Zone* zone)
1015 : zone_(zone),
1016 pending_empty_(false),
1017 characters_(NULL),
1018 terms_(),
1019 alternatives_()
1020#ifdef DEBUG
1021 ,
1022 last_added_(ADD_NONE)
1023#endif
1024{
1025}
1026
1027
1028void RegExpBuilder::FlushCharacters() {
1029 pending_empty_ = false;
1030 if (characters_ != NULL) {
1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1032 characters_ = NULL;
1033 text_.Add(atom, zone());
1034 LAST(ADD_ATOM);
1035 }
1036}
1037
1038
1039void RegExpBuilder::FlushText() {
1040 FlushCharacters();
1041 int num_text = text_.length();
1042 if (num_text == 0) {
1043 return;
1044 } else if (num_text == 1) {
1045 terms_.Add(text_.last(), zone());
1046 } else {
1047 RegExpText* text = new (zone()) RegExpText(zone());
1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1049 terms_.Add(text, zone());
1050 }
1051 text_.Clear();
1052}
1053
1054
1055void RegExpBuilder::AddCharacter(uc16 c) {
1056 pending_empty_ = false;
1057 if (characters_ == NULL) {
1058 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1059 }
1060 characters_->Add(c, zone());
1061 LAST(ADD_CHAR);
1062}
1063
1064
1065void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1067 ZoneList<uc16> surrogate_pair(2, zone());
1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1071 AddAtom(atom);
1072 } else {
1073 AddCharacter(static_cast<uc16>(c));
1074 }
1075}
1076
1077
1078void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1079
1080
1081void RegExpBuilder::AddAtom(RegExpTree* term) {
1082 if (term->IsEmpty()) {
1083 AddEmpty();
1084 return;
1085 }
1086 if (term->IsTextElement()) {
1087 FlushCharacters();
1088 text_.Add(term, zone());
1089 } else {
1090 FlushText();
1091 terms_.Add(term, zone());
1092 }
1093 LAST(ADD_ATOM);
1094}
1095
1096
1097void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1098 FlushText();
1099 terms_.Add(assert, zone());
1100 LAST(ADD_ASSERT);
1101}
1102
1103
1104void RegExpBuilder::NewAlternative() { FlushTerms(); }
1105
1106
1107void RegExpBuilder::FlushTerms() {
1108 FlushText();
1109 int num_terms = terms_.length();
1110 RegExpTree* alternative;
1111 if (num_terms == 0) {
1112 alternative = new (zone()) RegExpEmpty();
1113 } else if (num_terms == 1) {
1114 alternative = terms_.last();
1115 } else {
1116 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1117 }
1118 alternatives_.Add(alternative, zone());
1119 terms_.Clear();
1120 LAST(ADD_NONE);
1121}
1122
1123
1124RegExpTree* RegExpBuilder::ToRegExp() {
1125 FlushTerms();
1126 int num_alternatives = alternatives_.length();
1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1128 if (num_alternatives == 1) return alternatives_.last();
1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1130}
1131
1132
1133void RegExpBuilder::AddQuantifierToAtom(
1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1135 if (pending_empty_) {
1136 pending_empty_ = false;
1137 return;
1138 }
1139 RegExpTree* atom;
1140 if (characters_ != NULL) {
1141 DCHECK(last_added_ == ADD_CHAR);
1142 // Last atom was character.
1143 Vector<const uc16> char_vector = characters_->ToConstVector();
1144 int num_chars = char_vector.length();
1145 if (num_chars > 1) {
1146 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1147 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1148 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1149 }
1150 characters_ = NULL;
1151 atom = new (zone()) RegExpAtom(char_vector);
1152 FlushText();
1153 } else if (text_.length() > 0) {
1154 DCHECK(last_added_ == ADD_ATOM);
1155 atom = text_.RemoveLast();
1156 FlushText();
1157 } else if (terms_.length() > 0) {
1158 DCHECK(last_added_ == ADD_ATOM);
1159 atom = terms_.RemoveLast();
1160 if (atom->max_match() == 0) {
1161 // Guaranteed to only match an empty string.
1162 LAST(ADD_TERM);
1163 if (min == 0) {
1164 return;
1165 }
1166 terms_.Add(atom, zone());
1167 return;
1168 }
1169 } else {
1170 // Only call immediately after adding an atom or character!
1171 UNREACHABLE();
1172 return;
1173 }
1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1175 zone());
1176 LAST(ADD_TERM);
1177}
1178
1179} // namespace internal
1180} // namespace v8