blob: 46c593c2643d45a1cbe4c637158fe37b11c3d4da [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
Ben Murdoch097c5b22016-05-18 11:27:45 +010011#include "src/ostreams.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000012#include "src/regexp/jsregexp.h"
13#include "src/utils.h"
14
Ben Murdoch097c5b22016-05-18 11:27:45 +010015#ifdef V8_I18N_SUPPORT
16#include "unicode/uset.h"
17#endif // V8_I18N_SUPPORT
18
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000019namespace v8 {
20namespace internal {
21
22RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Ben Murdoch097c5b22016-05-18 11:27:45 +010023 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000024 : isolate_(isolate),
25 zone_(zone),
26 error_(error),
27 captures_(NULL),
28 in_(in),
29 current_(kEndMarker),
Ben Murdoch097c5b22016-05-18 11:27:45 +010030 ignore_case_(flags & JSRegExp::kIgnoreCase),
31 multiline_(flags & JSRegExp::kMultiline),
32 unicode_(flags & JSRegExp::kUnicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033 next_pos_(0),
34 captures_started_(0),
35 capture_count_(0),
36 has_more_(true),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000037 simple_(false),
38 contains_anchor_(false),
39 is_scanned_for_captures_(false),
40 failed_(false) {
41 Advance();
42}
43
Ben Murdoch097c5b22016-05-18 11:27:45 +010044template <bool update_position>
45inline uc32 RegExpParser::ReadNext() {
46 int position = next_pos_;
47 uc32 c0 = in()->Get(position);
48 position++;
49 // Read the whole surrogate pair in case of unicode flag, if possible.
50 if (unicode() && position < in()->length() &&
51 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
52 uc16 c1 = in()->Get(position);
53 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
54 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
55 position++;
56 }
57 }
58 if (update_position) next_pos_ = position;
59 return c0;
60}
61
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000062
63uc32 RegExpParser::Next() {
64 if (has_next()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +010065 return ReadNext<false>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000066 } else {
67 return kEndMarker;
68 }
69}
70
71
72void RegExpParser::Advance() {
Ben Murdoch097c5b22016-05-18 11:27:45 +010073 if (has_next()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000074 StackLimitCheck check(isolate());
75 if (check.HasOverflowed()) {
76 ReportError(CStrVector(Isolate::kStackOverflowMessage));
77 } else if (zone()->excess_allocation()) {
78 ReportError(CStrVector("Regular expression too large"));
79 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +010080 current_ = ReadNext<true>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000081 }
82 } else {
83 current_ = kEndMarker;
84 // Advance so that position() points to 1-after-the-last-character. This is
85 // important so that Reset() to this position works correctly.
86 next_pos_ = in()->length() + 1;
87 has_more_ = false;
88 }
89}
90
91
92void RegExpParser::Reset(int pos) {
93 next_pos_ = pos;
94 has_more_ = (pos < in()->length());
95 Advance();
96}
97
98
99void RegExpParser::Advance(int dist) {
100 next_pos_ += dist - 1;
101 Advance();
102}
103
104
105bool RegExpParser::simple() { return simple_; }
106
Ben Murdoch097c5b22016-05-18 11:27:45 +0100107bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
108 switch (c) {
109 case '^':
110 case '$':
111 case '\\':
112 case '.':
113 case '*':
114 case '+':
115 case '?':
116 case '(':
117 case ')':
118 case '[':
119 case ']':
120 case '{':
121 case '}':
122 case '|':
123 case '/':
124 return true;
125 default:
126 break;
127 }
128 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000129}
130
131
132RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
133 failed_ = true;
134 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
135 // Zip to the end to make sure the no more input is read.
136 current_ = kEndMarker;
137 next_pos_ = in()->length();
138 return NULL;
139}
140
141
142#define CHECK_FAILED /**/); \
143 if (failed_) return NULL; \
144 ((void)0
145
146
147// Pattern ::
148// Disjunction
149RegExpTree* RegExpParser::ParsePattern() {
150 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
151 DCHECK(!has_more());
152 // If the result of parsing is a literal string atom, and it has the
153 // same length as the input, then the atom is identical to the input.
154 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
155 simple_ = true;
156 }
157 return result;
158}
159
160
161// Disjunction ::
162// Alternative
163// Alternative | Disjunction
164// Alternative ::
165// [empty]
166// Term Alternative
167// Term ::
168// Assertion
169// Atom
170// Atom Quantifier
171RegExpTree* RegExpParser::ParseDisjunction() {
172 // Used to store current state while parsing subexpressions.
173 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100174 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000175 RegExpParserState* state = &initial_state;
176 // Cache the builder in a local variable for quick access.
177 RegExpBuilder* builder = initial_state.builder();
178 while (true) {
179 switch (current()) {
180 case kEndMarker:
181 if (state->IsSubexpression()) {
182 // Inside a parenthesized group when hitting end of input.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100183 return ReportError(CStrVector("Unterminated group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000184 }
185 DCHECK_EQ(INITIAL, state->group_type());
186 // Parsing completed successfully.
187 return builder->ToRegExp();
188 case ')': {
189 if (!state->IsSubexpression()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100190 return ReportError(CStrVector("Unmatched ')'"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000191 }
192 DCHECK_NE(INITIAL, state->group_type());
193
194 Advance();
195 // End disjunction parsing and convert builder content to new single
196 // regexp atom.
197 RegExpTree* body = builder->ToRegExp();
198
199 int end_capture_index = captures_started();
200
201 int capture_index = state->capture_index();
202 SubexpressionType group_type = state->group_type();
203
204 // Build result of subexpression.
205 if (group_type == CAPTURE) {
206 RegExpCapture* capture = GetCapture(capture_index);
207 capture->set_body(body);
208 body = capture;
209 } else if (group_type != GROUPING) {
210 DCHECK(group_type == POSITIVE_LOOKAROUND ||
211 group_type == NEGATIVE_LOOKAROUND);
212 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
213 body = new (zone()) RegExpLookaround(
214 body, is_positive, end_capture_index - capture_index,
215 capture_index, state->lookaround_type());
216 }
217
218 // Restore previous state.
219 state = state->previous_state();
220 builder = state->builder();
221
222 builder->AddAtom(body);
223 // For compatability with JSC and ES3, we allow quantifiers after
224 // lookaheads, and break in all cases.
225 break;
226 }
227 case '|': {
228 Advance();
229 builder->NewAlternative();
230 continue;
231 }
232 case '*':
233 case '+':
234 case '?':
235 return ReportError(CStrVector("Nothing to repeat"));
236 case '^': {
237 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100238 if (multiline()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000239 builder->AddAssertion(
240 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
241 } else {
242 builder->AddAssertion(
243 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
244 set_contains_anchor();
245 }
246 continue;
247 }
248 case '$': {
249 Advance();
250 RegExpAssertion::AssertionType assertion_type =
Ben Murdoch097c5b22016-05-18 11:27:45 +0100251 multiline() ? RegExpAssertion::END_OF_LINE
252 : RegExpAssertion::END_OF_INPUT;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000253 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
254 continue;
255 }
256 case '.': {
257 Advance();
258 // everything except \x0a, \x0d, \u2028 and \u2029
259 ZoneList<CharacterRange>* ranges =
260 new (zone()) ZoneList<CharacterRange>(2, zone());
261 CharacterRange::AddClassEscape('.', ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100262 RegExpCharacterClass* cc =
263 new (zone()) RegExpCharacterClass(ranges, false);
264 builder->AddCharacterClass(cc);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000265 break;
266 }
267 case '(': {
268 SubexpressionType subexpr_type = CAPTURE;
269 RegExpLookaround::Type lookaround_type = state->lookaround_type();
270 Advance();
271 if (current() == '?') {
272 switch (Next()) {
273 case ':':
274 subexpr_type = GROUPING;
275 break;
276 case '=':
277 lookaround_type = RegExpLookaround::LOOKAHEAD;
278 subexpr_type = POSITIVE_LOOKAROUND;
279 break;
280 case '!':
281 lookaround_type = RegExpLookaround::LOOKAHEAD;
282 subexpr_type = NEGATIVE_LOOKAROUND;
283 break;
284 case '<':
285 if (FLAG_harmony_regexp_lookbehind) {
286 Advance();
287 lookaround_type = RegExpLookaround::LOOKBEHIND;
288 if (Next() == '=') {
289 subexpr_type = POSITIVE_LOOKAROUND;
290 break;
291 } else if (Next() == '!') {
292 subexpr_type = NEGATIVE_LOOKAROUND;
293 break;
294 }
295 }
296 // Fall through.
297 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100298 return ReportError(CStrVector("Invalid group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000299 }
300 Advance(2);
301 } else {
302 if (captures_started_ >= kMaxCaptures) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100303 return ReportError(CStrVector("Too many captures"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000304 }
305 captures_started_++;
306 }
307 // Store current state and begin new disjunction parsing.
308 state = new (zone()) RegExpParserState(
Ben Murdoch097c5b22016-05-18 11:27:45 +0100309 state, subexpr_type, lookaround_type, captures_started_,
310 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000311 builder = state->builder();
312 continue;
313 }
314 case '[': {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100315 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
316 builder->AddCharacterClass(cc->AsCharacterClass());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000317 break;
318 }
319 // Atom ::
320 // \ AtomEscape
321 case '\\':
322 switch (Next()) {
323 case kEndMarker:
324 return ReportError(CStrVector("\\ at end of pattern"));
325 case 'b':
326 Advance(2);
327 builder->AddAssertion(
328 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
329 continue;
330 case 'B':
331 Advance(2);
332 builder->AddAssertion(
333 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
334 continue;
335 // AtomEscape ::
336 // CharacterClassEscape
337 //
338 // CharacterClassEscape :: one of
339 // d D s S w W
340 case 'd':
341 case 'D':
342 case 's':
343 case 'S':
344 case 'w':
345 case 'W': {
346 uc32 c = Next();
347 Advance(2);
348 ZoneList<CharacterRange>* ranges =
349 new (zone()) ZoneList<CharacterRange>(2, zone());
350 CharacterRange::AddClassEscape(c, ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100351 RegExpCharacterClass* cc =
352 new (zone()) RegExpCharacterClass(ranges, false);
353 builder->AddCharacterClass(cc);
354 break;
355 }
356 case 'p':
357 case 'P': {
358 uc32 p = Next();
359 Advance(2);
360 if (unicode()) {
361 if (FLAG_harmony_regexp_property) {
362 ZoneList<CharacterRange>* ranges = ParsePropertyClass();
363 if (ranges == nullptr) {
364 return ReportError(CStrVector("Invalid property name"));
365 }
366 RegExpCharacterClass* cc =
367 new (zone()) RegExpCharacterClass(ranges, p == 'P');
368 builder->AddCharacterClass(cc);
369 } else {
370 return ReportError(CStrVector("Invalid escape"));
371 }
372 } else {
373 builder->AddCharacter(p);
374 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000375 break;
376 }
377 case '1':
378 case '2':
379 case '3':
380 case '4':
381 case '5':
382 case '6':
383 case '7':
384 case '8':
385 case '9': {
386 int index = 0;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100387 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
388 if (is_backref) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000389 if (state->IsInsideCaptureGroup(index)) {
390 // The back reference is inside the capture group it refers to.
391 // Nothing can possibly have been captured yet, so we use empty
392 // instead. This ensures that, when checking a back reference,
393 // the capture registers of the referenced capture are either
394 // both set or both cleared.
395 builder->AddEmpty();
396 } else {
397 RegExpCapture* capture = GetCapture(index);
398 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
399 builder->AddAtom(atom);
400 }
401 break;
402 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100403 // With /u, no identity escapes except for syntax characters
404 // are allowed. Otherwise, all identity escapes are allowed.
405 if (unicode()) {
406 return ReportError(CStrVector("Invalid escape"));
407 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000408 uc32 first_digit = Next();
409 if (first_digit == '8' || first_digit == '9') {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100410 builder->AddCharacter(first_digit);
411 Advance(2);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000412 break;
413 }
414 }
415 // FALLTHROUGH
416 case '0': {
417 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100418 if (unicode() && Next() >= '0' && Next() <= '9') {
419 // With /u, decimal escape with leading 0 are not parsed as octal.
420 return ReportError(CStrVector("Invalid decimal escape"));
421 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000422 uc32 octal = ParseOctalLiteral();
423 builder->AddCharacter(octal);
424 break;
425 }
426 // ControlEscape :: one of
427 // f n r t v
428 case 'f':
429 Advance(2);
430 builder->AddCharacter('\f');
431 break;
432 case 'n':
433 Advance(2);
434 builder->AddCharacter('\n');
435 break;
436 case 'r':
437 Advance(2);
438 builder->AddCharacter('\r');
439 break;
440 case 't':
441 Advance(2);
442 builder->AddCharacter('\t');
443 break;
444 case 'v':
445 Advance(2);
446 builder->AddCharacter('\v');
447 break;
448 case 'c': {
449 Advance();
450 uc32 controlLetter = Next();
451 // Special case if it is an ASCII letter.
452 // Convert lower case letters to uppercase.
453 uc32 letter = controlLetter & ~('a' ^ 'A');
454 if (letter < 'A' || 'Z' < letter) {
455 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
456 // This is outside the specification. We match JSC in
457 // reading the backslash as a literal character instead
458 // of as starting an escape.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100459 if (unicode()) {
460 // With /u, invalid escapes are not treated as identity escapes.
461 return ReportError(CStrVector("Invalid unicode escape"));
462 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000463 builder->AddCharacter('\\');
464 } else {
465 Advance(2);
466 builder->AddCharacter(controlLetter & 0x1f);
467 }
468 break;
469 }
470 case 'x': {
471 Advance(2);
472 uc32 value;
473 if (ParseHexEscape(2, &value)) {
474 builder->AddCharacter(value);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100475 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000476 builder->AddCharacter('x');
477 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100478 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000479 return ReportError(CStrVector("Invalid escape"));
480 }
481 break;
482 }
483 case 'u': {
484 Advance(2);
485 uc32 value;
486 if (ParseUnicodeEscape(&value)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100487 builder->AddEscapedUnicodeCharacter(value);
488 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000489 builder->AddCharacter('u');
490 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100491 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000492 return ReportError(CStrVector("Invalid unicode escape"));
493 }
494 break;
495 }
496 default:
497 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100498 // With /u, no identity escapes except for syntax characters
499 // are allowed. Otherwise, all identity escapes are allowed.
500 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000501 builder->AddCharacter(current());
502 Advance();
503 } else {
504 return ReportError(CStrVector("Invalid escape"));
505 }
506 break;
507 }
508 break;
509 case '{': {
510 int dummy;
511 if (ParseIntervalQuantifier(&dummy, &dummy)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100512 return ReportError(CStrVector("Nothing to repeat"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000513 }
514 // fallthrough
515 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100516 case '}':
517 case ']':
518 if (unicode()) {
519 return ReportError(CStrVector("Lone quantifier brackets"));
520 }
521 // fallthrough
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000522 default:
523 builder->AddUnicodeCharacter(current());
524 Advance();
525 break;
526 } // end switch(current())
527
528 int min;
529 int max;
530 switch (current()) {
531 // QuantifierPrefix ::
532 // *
533 // +
534 // ?
535 // {
536 case '*':
537 min = 0;
538 max = RegExpTree::kInfinity;
539 Advance();
540 break;
541 case '+':
542 min = 1;
543 max = RegExpTree::kInfinity;
544 Advance();
545 break;
546 case '?':
547 min = 0;
548 max = 1;
549 Advance();
550 break;
551 case '{':
552 if (ParseIntervalQuantifier(&min, &max)) {
553 if (max < min) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100554 return ReportError(
555 CStrVector("numbers out of order in {} quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000556 }
557 break;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100558 } else if (unicode()) {
559 // With /u, incomplete quantifiers are not allowed.
560 return ReportError(CStrVector("Incomplete quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000561 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100562 continue;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000563 default:
564 continue;
565 }
566 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
567 if (current() == '?') {
568 quantifier_type = RegExpQuantifier::NON_GREEDY;
569 Advance();
570 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
571 // FLAG_regexp_possessive_quantifier is a debug-only flag.
572 quantifier_type = RegExpQuantifier::POSSESSIVE;
573 Advance();
574 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100575 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
576 return ReportError(CStrVector("Invalid quantifier"));
577 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000578 }
579}
580
581
582#ifdef DEBUG
583// Currently only used in an DCHECK.
584static bool IsSpecialClassEscape(uc32 c) {
585 switch (c) {
586 case 'd':
587 case 'D':
588 case 's':
589 case 'S':
590 case 'w':
591 case 'W':
592 return true;
593 default:
594 return false;
595 }
596}
597#endif
598
599
600// In order to know whether an escape is a backreference or not we have to scan
601// the entire regexp and find the number of capturing parentheses. However we
602// don't want to scan the regexp twice unless it is necessary. This mini-parser
603// is called when needed. It can see the difference between capturing and
604// noncapturing parentheses and can skip character classes and backslash-escaped
605// characters.
606void RegExpParser::ScanForCaptures() {
607 // Start with captures started previous to current position
608 int capture_count = captures_started();
609 // Add count of captures after this position.
610 int n;
611 while ((n = current()) != kEndMarker) {
612 Advance();
613 switch (n) {
614 case '\\':
615 Advance();
616 break;
617 case '[': {
618 int c;
619 while ((c = current()) != kEndMarker) {
620 Advance();
621 if (c == '\\') {
622 Advance();
623 } else {
624 if (c == ']') break;
625 }
626 }
627 break;
628 }
629 case '(':
630 if (current() != '?') capture_count++;
631 break;
632 }
633 }
634 capture_count_ = capture_count;
635 is_scanned_for_captures_ = true;
636}
637
638
639bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
640 DCHECK_EQ('\\', current());
641 DCHECK('1' <= Next() && Next() <= '9');
642 // Try to parse a decimal literal that is no greater than the total number
643 // of left capturing parentheses in the input.
644 int start = position();
645 int value = Next() - '0';
646 Advance(2);
647 while (true) {
648 uc32 c = current();
649 if (IsDecimalDigit(c)) {
650 value = 10 * value + (c - '0');
651 if (value > kMaxCaptures) {
652 Reset(start);
653 return false;
654 }
655 Advance();
656 } else {
657 break;
658 }
659 }
660 if (value > captures_started()) {
661 if (!is_scanned_for_captures_) {
662 int saved_position = position();
663 ScanForCaptures();
664 Reset(saved_position);
665 }
666 if (value > capture_count_) {
667 Reset(start);
668 return false;
669 }
670 }
671 *index_out = value;
672 return true;
673}
674
675
676RegExpCapture* RegExpParser::GetCapture(int index) {
677 // The index for the capture groups are one-based. Its index in the list is
678 // zero-based.
679 int know_captures =
680 is_scanned_for_captures_ ? capture_count_ : captures_started_;
681 DCHECK(index <= know_captures);
682 if (captures_ == NULL) {
683 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
684 }
685 while (captures_->length() < know_captures) {
686 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
687 }
688 return captures_->at(index - 1);
689}
690
691
692bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
693 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
694 if (s->group_type() != CAPTURE) continue;
695 // Return true if we found the matching capture index.
696 if (index == s->capture_index()) return true;
697 // Abort if index is larger than what has been parsed up till this state.
698 if (index > s->capture_index()) return false;
699 }
700 return false;
701}
702
703
704// QuantifierPrefix ::
705// { DecimalDigits }
706// { DecimalDigits , }
707// { DecimalDigits , DecimalDigits }
708//
709// Returns true if parsing succeeds, and set the min_out and max_out
710// values. Values are truncated to RegExpTree::kInfinity if they overflow.
711bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
712 DCHECK_EQ(current(), '{');
713 int start = position();
714 Advance();
715 int min = 0;
716 if (!IsDecimalDigit(current())) {
717 Reset(start);
718 return false;
719 }
720 while (IsDecimalDigit(current())) {
721 int next = current() - '0';
722 if (min > (RegExpTree::kInfinity - next) / 10) {
723 // Overflow. Skip past remaining decimal digits and return -1.
724 do {
725 Advance();
726 } while (IsDecimalDigit(current()));
727 min = RegExpTree::kInfinity;
728 break;
729 }
730 min = 10 * min + next;
731 Advance();
732 }
733 int max = 0;
734 if (current() == '}') {
735 max = min;
736 Advance();
737 } else if (current() == ',') {
738 Advance();
739 if (current() == '}') {
740 max = RegExpTree::kInfinity;
741 Advance();
742 } else {
743 while (IsDecimalDigit(current())) {
744 int next = current() - '0';
745 if (max > (RegExpTree::kInfinity - next) / 10) {
746 do {
747 Advance();
748 } while (IsDecimalDigit(current()));
749 max = RegExpTree::kInfinity;
750 break;
751 }
752 max = 10 * max + next;
753 Advance();
754 }
755 if (current() != '}') {
756 Reset(start);
757 return false;
758 }
759 Advance();
760 }
761 } else {
762 Reset(start);
763 return false;
764 }
765 *min_out = min;
766 *max_out = max;
767 return true;
768}
769
770
771uc32 RegExpParser::ParseOctalLiteral() {
772 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
773 // For compatibility with some other browsers (not all), we parse
774 // up to three octal digits with a value below 256.
775 uc32 value = current() - '0';
776 Advance();
777 if ('0' <= current() && current() <= '7') {
778 value = value * 8 + current() - '0';
779 Advance();
780 if (value < 32 && '0' <= current() && current() <= '7') {
781 value = value * 8 + current() - '0';
782 Advance();
783 }
784 }
785 return value;
786}
787
788
789bool RegExpParser::ParseHexEscape(int length, uc32* value) {
790 int start = position();
791 uc32 val = 0;
792 for (int i = 0; i < length; ++i) {
793 uc32 c = current();
794 int d = HexValue(c);
795 if (d < 0) {
796 Reset(start);
797 return false;
798 }
799 val = val * 16 + d;
800 Advance();
801 }
802 *value = val;
803 return true;
804}
805
Ben Murdoch097c5b22016-05-18 11:27:45 +0100806// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000807bool RegExpParser::ParseUnicodeEscape(uc32* value) {
808 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
809 // allowed). In the latter case, the number of hex digits between { } is
810 // arbitrary. \ and u have already been read.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100811 if (current() == '{' && unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000812 int start = position();
813 Advance();
814 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
815 if (current() == '}') {
816 Advance();
817 return true;
818 }
819 }
820 Reset(start);
821 return false;
822 }
823 // \u but no {, or \u{...} escapes not allowed.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100824 bool result = ParseHexEscape(4, value);
825 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
826 current() == '\\') {
827 // Attempt to read trail surrogate.
828 int start = position();
829 if (Next() == 'u') {
830 Advance(2);
831 uc32 trail;
832 if (ParseHexEscape(4, &trail) &&
833 unibrow::Utf16::IsTrailSurrogate(trail)) {
834 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
835 static_cast<uc16>(trail));
836 return true;
837 }
838 }
839 Reset(start);
840 }
841 return result;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000842}
843
Ben Murdoch097c5b22016-05-18 11:27:45 +0100844ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
845#ifdef V8_I18N_SUPPORT
846 char property_name[3];
847 memset(property_name, 0, sizeof(property_name));
848 if (current() == '{') {
849 Advance();
850 if (current() < 'A' || current() > 'Z') return nullptr;
851 property_name[0] = static_cast<char>(current());
852 Advance();
853 if (current() >= 'a' && current() <= 'z') {
854 property_name[1] = static_cast<char>(current());
855 Advance();
856 }
857 if (current() != '}') return nullptr;
858 } else if (current() >= 'A' && current() <= 'Z') {
859 property_name[0] = static_cast<char>(current());
860 } else {
861 return nullptr;
862 }
863 Advance();
864
865 int32_t category =
866 u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
867 if (category == UCHAR_INVALID_CODE) return nullptr;
868
869 USet* set = uset_openEmpty();
870 UErrorCode ec = U_ZERO_ERROR;
871 uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
872 ZoneList<CharacterRange>* ranges = nullptr;
873 if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
874 uset_removeAllStrings(set);
875 int item_count = uset_getItemCount(set);
876 ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
877 int item_result = 0;
878 for (int i = 0; i < item_count; i++) {
879 uc32 start = 0;
880 uc32 end = 0;
881 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
882 ranges->Add(CharacterRange::Range(start, end), zone());
883 }
884 DCHECK_EQ(U_ZERO_ERROR, ec);
885 DCHECK_EQ(0, item_result);
886 }
887 uset_close(set);
888 return ranges;
889#else // V8_I18N_SUPPORT
890 return nullptr;
891#endif // V8_I18N_SUPPORT
892}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000893
894bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
895 uc32 x = 0;
896 int d = HexValue(current());
897 if (d < 0) {
898 return false;
899 }
900 while (d >= 0) {
901 x = x * 16 + d;
902 if (x > max_value) {
903 return false;
904 }
905 Advance();
906 d = HexValue(current());
907 }
908 *value = x;
909 return true;
910}
911
912
913uc32 RegExpParser::ParseClassCharacterEscape() {
914 DCHECK(current() == '\\');
915 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
916 Advance();
917 switch (current()) {
918 case 'b':
919 Advance();
920 return '\b';
921 // ControlEscape :: one of
922 // f n r t v
923 case 'f':
924 Advance();
925 return '\f';
926 case 'n':
927 Advance();
928 return '\n';
929 case 'r':
930 Advance();
931 return '\r';
932 case 't':
933 Advance();
934 return '\t';
935 case 'v':
936 Advance();
937 return '\v';
938 case 'c': {
939 uc32 controlLetter = Next();
940 uc32 letter = controlLetter & ~('A' ^ 'a');
Ben Murdoch097c5b22016-05-18 11:27:45 +0100941 // For compatibility with JSC, inside a character class. We also accept
942 // digits and underscore as control characters, unless with /u.
943 if (letter >= 'A' && letter <= 'Z') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000944 Advance(2);
945 // Control letters mapped to ASCII control characters in the range
946 // 0x00-0x1f.
947 return controlLetter & 0x1f;
948 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100949 if (unicode()) {
950 // With /u, invalid escapes are not treated as identity escapes.
951 ReportError(CStrVector("Invalid class escape"));
952 return 0;
953 }
954 if ((controlLetter >= '0' && controlLetter <= '9') ||
955 controlLetter == '_') {
956 Advance(2);
957 return controlLetter & 0x1f;
958 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000959 // We match JSC in reading the backslash as a literal
960 // character instead of as starting an escape.
961 return '\\';
962 }
963 case '0':
Ben Murdoch097c5b22016-05-18 11:27:45 +0100964 // With /u, \0 is interpreted as NUL if not followed by another digit.
965 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
966 Advance();
967 return 0;
968 }
969 // Fall through.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000970 case '1':
971 case '2':
972 case '3':
973 case '4':
974 case '5':
975 case '6':
976 case '7':
977 // For compatibility, we interpret a decimal escape that isn't
978 // a back reference (and therefore either \0 or not valid according
979 // to the specification) as a 1..3 digit octal character code.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100980 if (unicode()) {
981 // With /u, decimal escape is not interpreted as octal character code.
982 ReportError(CStrVector("Invalid class escape"));
983 return 0;
984 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000985 return ParseOctalLiteral();
986 case 'x': {
987 Advance();
988 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100989 if (ParseHexEscape(2, &value)) return value;
990 if (unicode()) {
991 // With /u, invalid escapes are not treated as identity escapes.
992 ReportError(CStrVector("Invalid escape"));
993 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000994 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100995 // If \x is not followed by a two-digit hexadecimal, treat it
996 // as an identity escape.
997 return 'x';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000998 }
999 case 'u': {
1000 Advance();
1001 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001002 if (ParseUnicodeEscape(&value)) return value;
1003 if (unicode()) {
1004 // With /u, invalid escapes are not treated as identity escapes.
1005 ReportError(CStrVector("Invalid unicode escape"));
1006 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001007 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001008 // If \u is not followed by a two-digit hexadecimal, treat it
1009 // as an identity escape.
1010 return 'u';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001011 }
1012 default: {
1013 uc32 result = current();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001014 // With /u, no identity escapes except for syntax characters and '-' are
1015 // allowed. Otherwise, all identity escapes are allowed.
1016 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001017 Advance();
1018 return result;
1019 }
1020 ReportError(CStrVector("Invalid escape"));
1021 return 0;
1022 }
1023 }
1024 return 0;
1025}
1026
1027
1028CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1029 DCHECK_EQ(0, *char_class);
1030 uc32 first = current();
1031 if (first == '\\') {
1032 switch (Next()) {
1033 case 'w':
1034 case 'W':
1035 case 'd':
1036 case 'D':
1037 case 's':
1038 case 'S': {
1039 *char_class = Next();
1040 Advance(2);
1041 return CharacterRange::Singleton(0); // Return dummy value.
1042 }
1043 case kEndMarker:
1044 return ReportError(CStrVector("\\ at end of pattern"));
1045 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +01001046 first = ParseClassCharacterEscape(CHECK_FAILED);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001047 }
1048 } else {
1049 Advance();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001050 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001051
1052 return CharacterRange::Singleton(first);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001053}
1054
1055
1056static const uc16 kNoCharClass = 0;
1057
1058// Adds range or pre-defined character class to character ranges.
1059// If char_class is not kInvalidClass, it's interpreted as a class
1060// escape (i.e., 's' means whitespace, from '\s').
1061static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1062 uc16 char_class, CharacterRange range,
1063 Zone* zone) {
1064 if (char_class != kNoCharClass) {
1065 CharacterRange::AddClassEscape(char_class, ranges, zone);
1066 } else {
1067 ranges->Add(range, zone);
1068 }
1069}
1070
1071
1072RegExpTree* RegExpParser::ParseCharacterClass() {
1073 static const char* kUnterminated = "Unterminated character class";
Ben Murdoch097c5b22016-05-18 11:27:45 +01001074 static const char* kRangeInvalid = "Invalid character class";
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001075 static const char* kRangeOutOfOrder = "Range out of order in character class";
1076
1077 DCHECK_EQ(current(), '[');
1078 Advance();
1079 bool is_negated = false;
1080 if (current() == '^') {
1081 is_negated = true;
1082 Advance();
1083 }
1084 ZoneList<CharacterRange>* ranges =
1085 new (zone()) ZoneList<CharacterRange>(2, zone());
1086 while (has_more() && current() != ']') {
1087 uc16 char_class = kNoCharClass;
1088 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1089 if (current() == '-') {
1090 Advance();
1091 if (current() == kEndMarker) {
1092 // If we reach the end we break out of the loop and let the
1093 // following code report an error.
1094 break;
1095 } else if (current() == ']') {
1096 AddRangeOrEscape(ranges, char_class, first, zone());
1097 ranges->Add(CharacterRange::Singleton('-'), zone());
1098 break;
1099 }
1100 uc16 char_class_2 = kNoCharClass;
1101 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1102 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1103 // Either end is an escaped character class. Treat the '-' verbatim.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001104 if (unicode()) {
1105 // ES2015 21.2.2.15.1 step 1.
1106 return ReportError(CStrVector(kRangeInvalid));
1107 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001108 AddRangeOrEscape(ranges, char_class, first, zone());
1109 ranges->Add(CharacterRange::Singleton('-'), zone());
1110 AddRangeOrEscape(ranges, char_class_2, next, zone());
1111 continue;
1112 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001113 // ES2015 21.2.2.15.1 step 6.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001114 if (first.from() > next.to()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001115 return ReportError(CStrVector(kRangeOutOfOrder));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001116 }
1117 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1118 } else {
1119 AddRangeOrEscape(ranges, char_class, first, zone());
1120 }
1121 }
1122 if (!has_more()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001123 return ReportError(CStrVector(kUnterminated));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001124 }
1125 Advance();
1126 if (ranges->length() == 0) {
1127 ranges->Add(CharacterRange::Everything(), zone());
1128 is_negated = !is_negated;
1129 }
1130 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1131}
1132
1133
1134#undef CHECK_FAILED
1135
1136
1137bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001138 FlatStringReader* input, JSRegExp::Flags flags,
1139 RegExpCompileData* result) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001140 DCHECK(result != NULL);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001141 RegExpParser parser(input, &result->error, flags, isolate, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001142 RegExpTree* tree = parser.ParsePattern();
1143 if (parser.failed()) {
1144 DCHECK(tree == NULL);
1145 DCHECK(!result->error.is_null());
1146 } else {
1147 DCHECK(tree != NULL);
1148 DCHECK(result->error.is_null());
1149 if (FLAG_trace_regexp_parser) {
1150 OFStream os(stdout);
1151 tree->Print(os, zone);
1152 os << "\n";
1153 }
1154 result->tree = tree;
1155 int capture_count = parser.captures_started();
1156 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1157 result->contains_anchor = parser.contains_anchor();
1158 result->capture_count = capture_count;
1159 }
1160 return !parser.failed();
1161}
1162
Ben Murdoch097c5b22016-05-18 11:27:45 +01001163RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001164 : zone_(zone),
1165 pending_empty_(false),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001166 ignore_case_(ignore_case),
1167 unicode_(unicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001168 characters_(NULL),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001169 pending_surrogate_(kNoPendingSurrogate),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001170 terms_(),
1171 alternatives_()
1172#ifdef DEBUG
1173 ,
1174 last_added_(ADD_NONE)
1175#endif
1176{
1177}
1178
1179
Ben Murdoch097c5b22016-05-18 11:27:45 +01001180void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1181 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1182 FlushPendingSurrogate();
1183 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1184 pending_surrogate_ = lead_surrogate;
1185}
1186
1187
1188void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1189 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1190 if (pending_surrogate_ != kNoPendingSurrogate) {
1191 uc16 lead_surrogate = pending_surrogate_;
1192 pending_surrogate_ = kNoPendingSurrogate;
1193 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1194 uc32 combined =
1195 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1196 if (NeedsDesugaringForIgnoreCase(combined)) {
1197 AddCharacterClassForDesugaring(combined);
1198 } else {
1199 ZoneList<uc16> surrogate_pair(2, zone());
1200 surrogate_pair.Add(lead_surrogate, zone());
1201 surrogate_pair.Add(trail_surrogate, zone());
1202 RegExpAtom* atom =
1203 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1204 AddAtom(atom);
1205 }
1206 } else {
1207 pending_surrogate_ = trail_surrogate;
1208 FlushPendingSurrogate();
1209 }
1210}
1211
1212
1213void RegExpBuilder::FlushPendingSurrogate() {
1214 if (pending_surrogate_ != kNoPendingSurrogate) {
1215 DCHECK(unicode());
1216 uc32 c = pending_surrogate_;
1217 pending_surrogate_ = kNoPendingSurrogate;
1218 AddCharacterClassForDesugaring(c);
1219 }
1220}
1221
1222
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001223void RegExpBuilder::FlushCharacters() {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001224 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001225 pending_empty_ = false;
1226 if (characters_ != NULL) {
1227 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1228 characters_ = NULL;
1229 text_.Add(atom, zone());
1230 LAST(ADD_ATOM);
1231 }
1232}
1233
1234
1235void RegExpBuilder::FlushText() {
1236 FlushCharacters();
1237 int num_text = text_.length();
1238 if (num_text == 0) {
1239 return;
1240 } else if (num_text == 1) {
1241 terms_.Add(text_.last(), zone());
1242 } else {
1243 RegExpText* text = new (zone()) RegExpText(zone());
1244 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1245 terms_.Add(text, zone());
1246 }
1247 text_.Clear();
1248}
1249
1250
1251void RegExpBuilder::AddCharacter(uc16 c) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001252 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001253 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001254 if (NeedsDesugaringForIgnoreCase(c)) {
1255 AddCharacterClassForDesugaring(c);
1256 } else {
1257 if (characters_ == NULL) {
1258 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1259 }
1260 characters_->Add(c, zone());
1261 LAST(ADD_CHAR);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001262 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001263}
1264
1265
1266void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1267 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001268 DCHECK(unicode());
1269 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1270 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1271 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1272 AddLeadSurrogate(c);
1273 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1274 AddTrailSurrogate(c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001275 } else {
1276 AddCharacter(static_cast<uc16>(c));
1277 }
1278}
1279
Ben Murdoch097c5b22016-05-18 11:27:45 +01001280void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1281 // A lead or trail surrogate parsed via escape sequence will not
1282 // pair up with any preceding lead or following trail surrogate.
1283 FlushPendingSurrogate();
1284 AddUnicodeCharacter(character);
1285 FlushPendingSurrogate();
1286}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001287
1288void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1289
1290
Ben Murdoch097c5b22016-05-18 11:27:45 +01001291void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1292 if (NeedsDesugaringForUnicode(cc)) {
1293 // With /u, character class needs to be desugared, so it
1294 // must be a standalone term instead of being part of a RegExpText.
1295 AddTerm(cc);
1296 } else {
1297 AddAtom(cc);
1298 }
1299}
1300
1301void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1302 AddTerm(new (zone()) RegExpCharacterClass(
1303 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1304}
1305
1306
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001307void RegExpBuilder::AddAtom(RegExpTree* term) {
1308 if (term->IsEmpty()) {
1309 AddEmpty();
1310 return;
1311 }
1312 if (term->IsTextElement()) {
1313 FlushCharacters();
1314 text_.Add(term, zone());
1315 } else {
1316 FlushText();
1317 terms_.Add(term, zone());
1318 }
1319 LAST(ADD_ATOM);
1320}
1321
1322
Ben Murdoch097c5b22016-05-18 11:27:45 +01001323void RegExpBuilder::AddTerm(RegExpTree* term) {
1324 FlushText();
1325 terms_.Add(term, zone());
1326 LAST(ADD_ATOM);
1327}
1328
1329
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001330void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1331 FlushText();
1332 terms_.Add(assert, zone());
1333 LAST(ADD_ASSERT);
1334}
1335
1336
1337void RegExpBuilder::NewAlternative() { FlushTerms(); }
1338
1339
1340void RegExpBuilder::FlushTerms() {
1341 FlushText();
1342 int num_terms = terms_.length();
1343 RegExpTree* alternative;
1344 if (num_terms == 0) {
1345 alternative = new (zone()) RegExpEmpty();
1346 } else if (num_terms == 1) {
1347 alternative = terms_.last();
1348 } else {
1349 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1350 }
1351 alternatives_.Add(alternative, zone());
1352 terms_.Clear();
1353 LAST(ADD_NONE);
1354}
1355
1356
Ben Murdoch097c5b22016-05-18 11:27:45 +01001357bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1358 if (!unicode()) return false;
1359 switch (cc->standard_type()) {
1360 case 's': // white space
1361 case 'w': // ASCII word character
1362 case 'd': // ASCII digit
1363 return false; // These characters do not need desugaring.
1364 default:
1365 break;
1366 }
1367 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1368 CharacterRange::Canonicalize(ranges);
1369 for (int i = ranges->length() - 1; i >= 0; i--) {
1370 uc32 from = ranges->at(i).from();
1371 uc32 to = ranges->at(i).to();
1372 // Check for non-BMP characters.
1373 if (to >= kNonBmpStart) return true;
1374 // Check for lone surrogates.
1375 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1376 }
1377 return false;
1378}
1379
1380
1381bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1382#ifdef V8_I18N_SUPPORT
1383 if (unicode() && ignore_case()) {
1384 USet* set = uset_open(c, c);
1385 uset_closeOver(set, USET_CASE_INSENSITIVE);
1386 uset_removeAllStrings(set);
1387 bool result = uset_size(set) > 1;
1388 uset_close(set);
1389 return result;
1390 }
1391 // In the case where ICU is not included, we act as if the unicode flag is
1392 // not set, and do not desugar.
1393#endif // V8_I18N_SUPPORT
1394 return false;
1395}
1396
1397
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001398RegExpTree* RegExpBuilder::ToRegExp() {
1399 FlushTerms();
1400 int num_alternatives = alternatives_.length();
1401 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1402 if (num_alternatives == 1) return alternatives_.last();
1403 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1404}
1405
Ben Murdoch097c5b22016-05-18 11:27:45 +01001406bool RegExpBuilder::AddQuantifierToAtom(
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001407 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001408 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001409 if (pending_empty_) {
1410 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001411 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001412 }
1413 RegExpTree* atom;
1414 if (characters_ != NULL) {
1415 DCHECK(last_added_ == ADD_CHAR);
1416 // Last atom was character.
1417 Vector<const uc16> char_vector = characters_->ToConstVector();
1418 int num_chars = char_vector.length();
1419 if (num_chars > 1) {
1420 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1421 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1422 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1423 }
1424 characters_ = NULL;
1425 atom = new (zone()) RegExpAtom(char_vector);
1426 FlushText();
1427 } else if (text_.length() > 0) {
1428 DCHECK(last_added_ == ADD_ATOM);
1429 atom = text_.RemoveLast();
1430 FlushText();
1431 } else if (terms_.length() > 0) {
1432 DCHECK(last_added_ == ADD_ATOM);
1433 atom = terms_.RemoveLast();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001434 // With /u, lookarounds are not quantifiable.
1435 if (unicode() && atom->IsLookaround()) return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001436 if (atom->max_match() == 0) {
1437 // Guaranteed to only match an empty string.
1438 LAST(ADD_TERM);
1439 if (min == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001440 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001441 }
1442 terms_.Add(atom, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01001443 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001444 }
1445 } else {
1446 // Only call immediately after adding an atom or character!
1447 UNREACHABLE();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001448 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001449 }
1450 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1451 zone());
1452 LAST(ADD_TERM);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001453 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001454}
1455
1456} // namespace internal
1457} // namespace v8