blob: d433fc8578a81d7a29c08dde3d405a815851d517 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
Ben Murdoch097c5b22016-05-18 11:27:45 +010011#include "src/ostreams.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000012#include "src/regexp/jsregexp.h"
13#include "src/utils.h"
14
Ben Murdoch097c5b22016-05-18 11:27:45 +010015#ifdef V8_I18N_SUPPORT
16#include "unicode/uset.h"
17#endif // V8_I18N_SUPPORT
18
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000019namespace v8 {
20namespace internal {
21
22RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Ben Murdoch097c5b22016-05-18 11:27:45 +010023 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000024 : isolate_(isolate),
25 zone_(zone),
26 error_(error),
27 captures_(NULL),
28 in_(in),
29 current_(kEndMarker),
Ben Murdoch097c5b22016-05-18 11:27:45 +010030 ignore_case_(flags & JSRegExp::kIgnoreCase),
31 multiline_(flags & JSRegExp::kMultiline),
32 unicode_(flags & JSRegExp::kUnicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000033 next_pos_(0),
34 captures_started_(0),
35 capture_count_(0),
36 has_more_(true),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000037 simple_(false),
38 contains_anchor_(false),
39 is_scanned_for_captures_(false),
40 failed_(false) {
41 Advance();
42}
43
Ben Murdoch097c5b22016-05-18 11:27:45 +010044template <bool update_position>
45inline uc32 RegExpParser::ReadNext() {
46 int position = next_pos_;
47 uc32 c0 = in()->Get(position);
48 position++;
49 // Read the whole surrogate pair in case of unicode flag, if possible.
50 if (unicode() && position < in()->length() &&
51 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
52 uc16 c1 = in()->Get(position);
53 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
54 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
55 position++;
56 }
57 }
58 if (update_position) next_pos_ = position;
59 return c0;
60}
61
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000062
63uc32 RegExpParser::Next() {
64 if (has_next()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +010065 return ReadNext<false>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000066 } else {
67 return kEndMarker;
68 }
69}
70
71
72void RegExpParser::Advance() {
Ben Murdoch097c5b22016-05-18 11:27:45 +010073 if (has_next()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000074 StackLimitCheck check(isolate());
75 if (check.HasOverflowed()) {
76 ReportError(CStrVector(Isolate::kStackOverflowMessage));
77 } else if (zone()->excess_allocation()) {
78 ReportError(CStrVector("Regular expression too large"));
79 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +010080 current_ = ReadNext<true>();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000081 }
82 } else {
83 current_ = kEndMarker;
84 // Advance so that position() points to 1-after-the-last-character. This is
85 // important so that Reset() to this position works correctly.
86 next_pos_ = in()->length() + 1;
87 has_more_ = false;
88 }
89}
90
91
92void RegExpParser::Reset(int pos) {
93 next_pos_ = pos;
94 has_more_ = (pos < in()->length());
95 Advance();
96}
97
98
99void RegExpParser::Advance(int dist) {
100 next_pos_ += dist - 1;
101 Advance();
102}
103
104
105bool RegExpParser::simple() { return simple_; }
106
Ben Murdoch097c5b22016-05-18 11:27:45 +0100107bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
108 switch (c) {
109 case '^':
110 case '$':
111 case '\\':
112 case '.':
113 case '*':
114 case '+':
115 case '?':
116 case '(':
117 case ')':
118 case '[':
119 case ']':
120 case '{':
121 case '}':
122 case '|':
123 case '/':
124 return true;
125 default:
126 break;
127 }
128 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000129}
130
131
132RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
133 failed_ = true;
134 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
135 // Zip to the end to make sure the no more input is read.
136 current_ = kEndMarker;
137 next_pos_ = in()->length();
138 return NULL;
139}
140
141
142#define CHECK_FAILED /**/); \
143 if (failed_) return NULL; \
144 ((void)0
145
146
147// Pattern ::
148// Disjunction
149RegExpTree* RegExpParser::ParsePattern() {
150 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
151 DCHECK(!has_more());
152 // If the result of parsing is a literal string atom, and it has the
153 // same length as the input, then the atom is identical to the input.
154 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
155 simple_ = true;
156 }
157 return result;
158}
159
160
161// Disjunction ::
162// Alternative
163// Alternative | Disjunction
164// Alternative ::
165// [empty]
166// Term Alternative
167// Term ::
168// Assertion
169// Atom
170// Atom Quantifier
171RegExpTree* RegExpParser::ParseDisjunction() {
172 // Used to store current state while parsing subexpressions.
173 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100174 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000175 RegExpParserState* state = &initial_state;
176 // Cache the builder in a local variable for quick access.
177 RegExpBuilder* builder = initial_state.builder();
178 while (true) {
179 switch (current()) {
180 case kEndMarker:
181 if (state->IsSubexpression()) {
182 // Inside a parenthesized group when hitting end of input.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100183 return ReportError(CStrVector("Unterminated group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000184 }
185 DCHECK_EQ(INITIAL, state->group_type());
186 // Parsing completed successfully.
187 return builder->ToRegExp();
188 case ')': {
189 if (!state->IsSubexpression()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100190 return ReportError(CStrVector("Unmatched ')'"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000191 }
192 DCHECK_NE(INITIAL, state->group_type());
193
194 Advance();
195 // End disjunction parsing and convert builder content to new single
196 // regexp atom.
197 RegExpTree* body = builder->ToRegExp();
198
199 int end_capture_index = captures_started();
200
201 int capture_index = state->capture_index();
202 SubexpressionType group_type = state->group_type();
203
204 // Build result of subexpression.
205 if (group_type == CAPTURE) {
206 RegExpCapture* capture = GetCapture(capture_index);
207 capture->set_body(body);
208 body = capture;
209 } else if (group_type != GROUPING) {
210 DCHECK(group_type == POSITIVE_LOOKAROUND ||
211 group_type == NEGATIVE_LOOKAROUND);
212 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
213 body = new (zone()) RegExpLookaround(
214 body, is_positive, end_capture_index - capture_index,
215 capture_index, state->lookaround_type());
216 }
217
218 // Restore previous state.
219 state = state->previous_state();
220 builder = state->builder();
221
222 builder->AddAtom(body);
223 // For compatability with JSC and ES3, we allow quantifiers after
224 // lookaheads, and break in all cases.
225 break;
226 }
227 case '|': {
228 Advance();
229 builder->NewAlternative();
230 continue;
231 }
232 case '*':
233 case '+':
234 case '?':
235 return ReportError(CStrVector("Nothing to repeat"));
236 case '^': {
237 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100238 if (multiline()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000239 builder->AddAssertion(
240 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
241 } else {
242 builder->AddAssertion(
243 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
244 set_contains_anchor();
245 }
246 continue;
247 }
248 case '$': {
249 Advance();
250 RegExpAssertion::AssertionType assertion_type =
Ben Murdoch097c5b22016-05-18 11:27:45 +0100251 multiline() ? RegExpAssertion::END_OF_LINE
252 : RegExpAssertion::END_OF_INPUT;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000253 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
254 continue;
255 }
256 case '.': {
257 Advance();
258 // everything except \x0a, \x0d, \u2028 and \u2029
259 ZoneList<CharacterRange>* ranges =
260 new (zone()) ZoneList<CharacterRange>(2, zone());
261 CharacterRange::AddClassEscape('.', ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100262 RegExpCharacterClass* cc =
263 new (zone()) RegExpCharacterClass(ranges, false);
264 builder->AddCharacterClass(cc);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000265 break;
266 }
267 case '(': {
268 SubexpressionType subexpr_type = CAPTURE;
269 RegExpLookaround::Type lookaround_type = state->lookaround_type();
270 Advance();
271 if (current() == '?') {
272 switch (Next()) {
273 case ':':
274 subexpr_type = GROUPING;
275 break;
276 case '=':
277 lookaround_type = RegExpLookaround::LOOKAHEAD;
278 subexpr_type = POSITIVE_LOOKAROUND;
279 break;
280 case '!':
281 lookaround_type = RegExpLookaround::LOOKAHEAD;
282 subexpr_type = NEGATIVE_LOOKAROUND;
283 break;
284 case '<':
285 if (FLAG_harmony_regexp_lookbehind) {
286 Advance();
287 lookaround_type = RegExpLookaround::LOOKBEHIND;
288 if (Next() == '=') {
289 subexpr_type = POSITIVE_LOOKAROUND;
290 break;
291 } else if (Next() == '!') {
292 subexpr_type = NEGATIVE_LOOKAROUND;
293 break;
294 }
295 }
296 // Fall through.
297 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100298 return ReportError(CStrVector("Invalid group"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000299 }
300 Advance(2);
301 } else {
302 if (captures_started_ >= kMaxCaptures) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100303 return ReportError(CStrVector("Too many captures"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000304 }
305 captures_started_++;
306 }
307 // Store current state and begin new disjunction parsing.
308 state = new (zone()) RegExpParserState(
Ben Murdoch097c5b22016-05-18 11:27:45 +0100309 state, subexpr_type, lookaround_type, captures_started_,
310 ignore_case(), unicode(), zone());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000311 builder = state->builder();
312 continue;
313 }
314 case '[': {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100315 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
316 builder->AddCharacterClass(cc->AsCharacterClass());
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000317 break;
318 }
319 // Atom ::
320 // \ AtomEscape
321 case '\\':
322 switch (Next()) {
323 case kEndMarker:
324 return ReportError(CStrVector("\\ at end of pattern"));
325 case 'b':
326 Advance(2);
327 builder->AddAssertion(
328 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
329 continue;
330 case 'B':
331 Advance(2);
332 builder->AddAssertion(
333 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
334 continue;
335 // AtomEscape ::
336 // CharacterClassEscape
337 //
338 // CharacterClassEscape :: one of
339 // d D s S w W
340 case 'd':
341 case 'D':
342 case 's':
343 case 'S':
344 case 'w':
345 case 'W': {
346 uc32 c = Next();
347 Advance(2);
348 ZoneList<CharacterRange>* ranges =
349 new (zone()) ZoneList<CharacterRange>(2, zone());
350 CharacterRange::AddClassEscape(c, ranges, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +0100351 RegExpCharacterClass* cc =
352 new (zone()) RegExpCharacterClass(ranges, false);
353 builder->AddCharacterClass(cc);
354 break;
355 }
356 case 'p':
357 case 'P': {
358 uc32 p = Next();
359 Advance(2);
360 if (unicode()) {
361 if (FLAG_harmony_regexp_property) {
Ben Murdochda12d292016-06-02 14:46:10 +0100362 ZoneList<CharacterRange>* ranges =
363 new (zone()) ZoneList<CharacterRange>(2, zone());
364 if (!ParsePropertyClass(ranges)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100365 return ReportError(CStrVector("Invalid property name"));
366 }
367 RegExpCharacterClass* cc =
368 new (zone()) RegExpCharacterClass(ranges, p == 'P');
369 builder->AddCharacterClass(cc);
370 } else {
Ben Murdochda12d292016-06-02 14:46:10 +0100371 // With /u, no identity escapes except for syntax characters
372 // are allowed. Otherwise, all identity escapes are allowed.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100373 return ReportError(CStrVector("Invalid escape"));
374 }
375 } else {
376 builder->AddCharacter(p);
377 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000378 break;
379 }
380 case '1':
381 case '2':
382 case '3':
383 case '4':
384 case '5':
385 case '6':
386 case '7':
387 case '8':
388 case '9': {
389 int index = 0;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100390 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
391 if (is_backref) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000392 if (state->IsInsideCaptureGroup(index)) {
393 // The back reference is inside the capture group it refers to.
394 // Nothing can possibly have been captured yet, so we use empty
395 // instead. This ensures that, when checking a back reference,
396 // the capture registers of the referenced capture are either
397 // both set or both cleared.
398 builder->AddEmpty();
399 } else {
400 RegExpCapture* capture = GetCapture(index);
401 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
402 builder->AddAtom(atom);
403 }
404 break;
405 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100406 // With /u, no identity escapes except for syntax characters
407 // are allowed. Otherwise, all identity escapes are allowed.
408 if (unicode()) {
409 return ReportError(CStrVector("Invalid escape"));
410 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000411 uc32 first_digit = Next();
412 if (first_digit == '8' || first_digit == '9') {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100413 builder->AddCharacter(first_digit);
414 Advance(2);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000415 break;
416 }
417 }
418 // FALLTHROUGH
419 case '0': {
420 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100421 if (unicode() && Next() >= '0' && Next() <= '9') {
422 // With /u, decimal escape with leading 0 are not parsed as octal.
423 return ReportError(CStrVector("Invalid decimal escape"));
424 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000425 uc32 octal = ParseOctalLiteral();
426 builder->AddCharacter(octal);
427 break;
428 }
429 // ControlEscape :: one of
430 // f n r t v
431 case 'f':
432 Advance(2);
433 builder->AddCharacter('\f');
434 break;
435 case 'n':
436 Advance(2);
437 builder->AddCharacter('\n');
438 break;
439 case 'r':
440 Advance(2);
441 builder->AddCharacter('\r');
442 break;
443 case 't':
444 Advance(2);
445 builder->AddCharacter('\t');
446 break;
447 case 'v':
448 Advance(2);
449 builder->AddCharacter('\v');
450 break;
451 case 'c': {
452 Advance();
453 uc32 controlLetter = Next();
454 // Special case if it is an ASCII letter.
455 // Convert lower case letters to uppercase.
456 uc32 letter = controlLetter & ~('a' ^ 'A');
457 if (letter < 'A' || 'Z' < letter) {
458 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
459 // This is outside the specification. We match JSC in
460 // reading the backslash as a literal character instead
461 // of as starting an escape.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100462 if (unicode()) {
463 // With /u, invalid escapes are not treated as identity escapes.
464 return ReportError(CStrVector("Invalid unicode escape"));
465 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000466 builder->AddCharacter('\\');
467 } else {
468 Advance(2);
469 builder->AddCharacter(controlLetter & 0x1f);
470 }
471 break;
472 }
473 case 'x': {
474 Advance(2);
475 uc32 value;
476 if (ParseHexEscape(2, &value)) {
477 builder->AddCharacter(value);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100478 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000479 builder->AddCharacter('x');
480 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100481 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000482 return ReportError(CStrVector("Invalid escape"));
483 }
484 break;
485 }
486 case 'u': {
487 Advance(2);
488 uc32 value;
489 if (ParseUnicodeEscape(&value)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100490 builder->AddEscapedUnicodeCharacter(value);
491 } else if (!unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000492 builder->AddCharacter('u');
493 } else {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100494 // With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000495 return ReportError(CStrVector("Invalid unicode escape"));
496 }
497 break;
498 }
499 default:
500 Advance();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100501 // With /u, no identity escapes except for syntax characters
502 // are allowed. Otherwise, all identity escapes are allowed.
503 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000504 builder->AddCharacter(current());
505 Advance();
506 } else {
507 return ReportError(CStrVector("Invalid escape"));
508 }
509 break;
510 }
511 break;
512 case '{': {
513 int dummy;
514 if (ParseIntervalQuantifier(&dummy, &dummy)) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100515 return ReportError(CStrVector("Nothing to repeat"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000516 }
517 // fallthrough
518 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100519 case '}':
520 case ']':
521 if (unicode()) {
522 return ReportError(CStrVector("Lone quantifier brackets"));
523 }
524 // fallthrough
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000525 default:
526 builder->AddUnicodeCharacter(current());
527 Advance();
528 break;
529 } // end switch(current())
530
531 int min;
532 int max;
533 switch (current()) {
534 // QuantifierPrefix ::
535 // *
536 // +
537 // ?
538 // {
539 case '*':
540 min = 0;
541 max = RegExpTree::kInfinity;
542 Advance();
543 break;
544 case '+':
545 min = 1;
546 max = RegExpTree::kInfinity;
547 Advance();
548 break;
549 case '?':
550 min = 0;
551 max = 1;
552 Advance();
553 break;
554 case '{':
555 if (ParseIntervalQuantifier(&min, &max)) {
556 if (max < min) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100557 return ReportError(
558 CStrVector("numbers out of order in {} quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000559 }
560 break;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100561 } else if (unicode()) {
562 // With /u, incomplete quantifiers are not allowed.
563 return ReportError(CStrVector("Incomplete quantifier"));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000564 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100565 continue;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000566 default:
567 continue;
568 }
569 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
570 if (current() == '?') {
571 quantifier_type = RegExpQuantifier::NON_GREEDY;
572 Advance();
573 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
574 // FLAG_regexp_possessive_quantifier is a debug-only flag.
575 quantifier_type = RegExpQuantifier::POSSESSIVE;
576 Advance();
577 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100578 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
579 return ReportError(CStrVector("Invalid quantifier"));
580 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000581 }
582}
583
584
585#ifdef DEBUG
586// Currently only used in an DCHECK.
587static bool IsSpecialClassEscape(uc32 c) {
588 switch (c) {
589 case 'd':
590 case 'D':
591 case 's':
592 case 'S':
593 case 'w':
594 case 'W':
595 return true;
596 default:
597 return false;
598 }
599}
600#endif
601
602
603// In order to know whether an escape is a backreference or not we have to scan
604// the entire regexp and find the number of capturing parentheses. However we
605// don't want to scan the regexp twice unless it is necessary. This mini-parser
606// is called when needed. It can see the difference between capturing and
607// noncapturing parentheses and can skip character classes and backslash-escaped
608// characters.
609void RegExpParser::ScanForCaptures() {
610 // Start with captures started previous to current position
611 int capture_count = captures_started();
612 // Add count of captures after this position.
613 int n;
614 while ((n = current()) != kEndMarker) {
615 Advance();
616 switch (n) {
617 case '\\':
618 Advance();
619 break;
620 case '[': {
621 int c;
622 while ((c = current()) != kEndMarker) {
623 Advance();
624 if (c == '\\') {
625 Advance();
626 } else {
627 if (c == ']') break;
628 }
629 }
630 break;
631 }
632 case '(':
633 if (current() != '?') capture_count++;
634 break;
635 }
636 }
637 capture_count_ = capture_count;
638 is_scanned_for_captures_ = true;
639}
640
641
642bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
643 DCHECK_EQ('\\', current());
644 DCHECK('1' <= Next() && Next() <= '9');
645 // Try to parse a decimal literal that is no greater than the total number
646 // of left capturing parentheses in the input.
647 int start = position();
648 int value = Next() - '0';
649 Advance(2);
650 while (true) {
651 uc32 c = current();
652 if (IsDecimalDigit(c)) {
653 value = 10 * value + (c - '0');
654 if (value > kMaxCaptures) {
655 Reset(start);
656 return false;
657 }
658 Advance();
659 } else {
660 break;
661 }
662 }
663 if (value > captures_started()) {
664 if (!is_scanned_for_captures_) {
665 int saved_position = position();
666 ScanForCaptures();
667 Reset(saved_position);
668 }
669 if (value > capture_count_) {
670 Reset(start);
671 return false;
672 }
673 }
674 *index_out = value;
675 return true;
676}
677
678
679RegExpCapture* RegExpParser::GetCapture(int index) {
680 // The index for the capture groups are one-based. Its index in the list is
681 // zero-based.
682 int know_captures =
683 is_scanned_for_captures_ ? capture_count_ : captures_started_;
684 DCHECK(index <= know_captures);
685 if (captures_ == NULL) {
686 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
687 }
688 while (captures_->length() < know_captures) {
689 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
690 }
691 return captures_->at(index - 1);
692}
693
694
695bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
696 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
697 if (s->group_type() != CAPTURE) continue;
698 // Return true if we found the matching capture index.
699 if (index == s->capture_index()) return true;
700 // Abort if index is larger than what has been parsed up till this state.
701 if (index > s->capture_index()) return false;
702 }
703 return false;
704}
705
706
707// QuantifierPrefix ::
708// { DecimalDigits }
709// { DecimalDigits , }
710// { DecimalDigits , DecimalDigits }
711//
712// Returns true if parsing succeeds, and set the min_out and max_out
713// values. Values are truncated to RegExpTree::kInfinity if they overflow.
714bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
715 DCHECK_EQ(current(), '{');
716 int start = position();
717 Advance();
718 int min = 0;
719 if (!IsDecimalDigit(current())) {
720 Reset(start);
721 return false;
722 }
723 while (IsDecimalDigit(current())) {
724 int next = current() - '0';
725 if (min > (RegExpTree::kInfinity - next) / 10) {
726 // Overflow. Skip past remaining decimal digits and return -1.
727 do {
728 Advance();
729 } while (IsDecimalDigit(current()));
730 min = RegExpTree::kInfinity;
731 break;
732 }
733 min = 10 * min + next;
734 Advance();
735 }
736 int max = 0;
737 if (current() == '}') {
738 max = min;
739 Advance();
740 } else if (current() == ',') {
741 Advance();
742 if (current() == '}') {
743 max = RegExpTree::kInfinity;
744 Advance();
745 } else {
746 while (IsDecimalDigit(current())) {
747 int next = current() - '0';
748 if (max > (RegExpTree::kInfinity - next) / 10) {
749 do {
750 Advance();
751 } while (IsDecimalDigit(current()));
752 max = RegExpTree::kInfinity;
753 break;
754 }
755 max = 10 * max + next;
756 Advance();
757 }
758 if (current() != '}') {
759 Reset(start);
760 return false;
761 }
762 Advance();
763 }
764 } else {
765 Reset(start);
766 return false;
767 }
768 *min_out = min;
769 *max_out = max;
770 return true;
771}
772
773
774uc32 RegExpParser::ParseOctalLiteral() {
775 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
776 // For compatibility with some other browsers (not all), we parse
777 // up to three octal digits with a value below 256.
778 uc32 value = current() - '0';
779 Advance();
780 if ('0' <= current() && current() <= '7') {
781 value = value * 8 + current() - '0';
782 Advance();
783 if (value < 32 && '0' <= current() && current() <= '7') {
784 value = value * 8 + current() - '0';
785 Advance();
786 }
787 }
788 return value;
789}
790
791
792bool RegExpParser::ParseHexEscape(int length, uc32* value) {
793 int start = position();
794 uc32 val = 0;
795 for (int i = 0; i < length; ++i) {
796 uc32 c = current();
797 int d = HexValue(c);
798 if (d < 0) {
799 Reset(start);
800 return false;
801 }
802 val = val * 16 + d;
803 Advance();
804 }
805 *value = val;
806 return true;
807}
808
Ben Murdoch097c5b22016-05-18 11:27:45 +0100809// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000810bool RegExpParser::ParseUnicodeEscape(uc32* value) {
811 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
812 // allowed). In the latter case, the number of hex digits between { } is
813 // arbitrary. \ and u have already been read.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100814 if (current() == '{' && unicode()) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000815 int start = position();
816 Advance();
817 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
818 if (current() == '}') {
819 Advance();
820 return true;
821 }
822 }
823 Reset(start);
824 return false;
825 }
826 // \u but no {, or \u{...} escapes not allowed.
Ben Murdoch097c5b22016-05-18 11:27:45 +0100827 bool result = ParseHexEscape(4, value);
828 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
829 current() == '\\') {
830 // Attempt to read trail surrogate.
831 int start = position();
832 if (Next() == 'u') {
833 Advance(2);
834 uc32 trail;
835 if (ParseHexEscape(4, &trail) &&
836 unibrow::Utf16::IsTrailSurrogate(trail)) {
837 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
838 static_cast<uc16>(trail));
839 return true;
840 }
841 }
842 Reset(start);
843 }
844 return result;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000845}
846
Ben Murdoch097c5b22016-05-18 11:27:45 +0100847#ifdef V8_I18N_SUPPORT
Ben Murdochda12d292016-06-02 14:46:10 +0100848bool IsExactPropertyValueAlias(const char* property_name, UProperty property,
849 int32_t property_value) {
850 const char* short_name =
851 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
852 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
853 for (int i = 0;; i++) {
854 const char* long_name = u_getPropertyValueName(
855 property, property_value,
856 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
857 if (long_name == NULL) break;
858 if (strcmp(property_name, long_name) == 0) return true;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100859 }
Ben Murdochda12d292016-06-02 14:46:10 +0100860 return false;
861}
Ben Murdoch097c5b22016-05-18 11:27:45 +0100862
Ben Murdochda12d292016-06-02 14:46:10 +0100863bool LookupPropertyClass(UProperty property, const char* property_name,
864 ZoneList<CharacterRange>* result, Zone* zone) {
865 int32_t property_value = u_getPropertyValueEnum(property, property_name);
866 if (property_value == UCHAR_INVALID_CODE) return false;
867
868 // We require the property name to match exactly to one of the property value
869 // aliases. However, u_getPropertyValueEnum uses loose matching.
870 if (!IsExactPropertyValueAlias(property_name, property, property_value)) {
871 return false;
872 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100873
874 USet* set = uset_openEmpty();
875 UErrorCode ec = U_ZERO_ERROR;
Ben Murdochda12d292016-06-02 14:46:10 +0100876 uset_applyIntPropertyValue(set, property, property_value, &ec);
877 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
878
879 if (success) {
Ben Murdoch097c5b22016-05-18 11:27:45 +0100880 uset_removeAllStrings(set);
881 int item_count = uset_getItemCount(set);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100882 int item_result = 0;
883 for (int i = 0; i < item_count; i++) {
884 uc32 start = 0;
885 uc32 end = 0;
886 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
Ben Murdochda12d292016-06-02 14:46:10 +0100887 result->Add(CharacterRange::Range(start, end), zone);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100888 }
889 DCHECK_EQ(U_ZERO_ERROR, ec);
890 DCHECK_EQ(0, item_result);
891 }
892 uset_close(set);
Ben Murdochda12d292016-06-02 14:46:10 +0100893 return success;
894}
Ben Murdoch097c5b22016-05-18 11:27:45 +0100895#endif // V8_I18N_SUPPORT
Ben Murdochda12d292016-06-02 14:46:10 +0100896
897bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
898#ifdef V8_I18N_SUPPORT
899 List<char> property_name_list;
900 if (current() == '{') {
901 for (Advance(); current() != '}'; Advance()) {
902 if (!has_next()) return false;
903 property_name_list.Add(static_cast<char>(current()));
904 }
905 } else if (current() != kEndMarker) {
906 property_name_list.Add(static_cast<char>(current()));
907 } else {
908 return false;
909 }
910 Advance();
911 property_name_list.Add(0); // null-terminate string.
912
913 const char* property_name = property_name_list.ToConstVector().start();
914
915#define PROPERTY_NAME_LOOKUP(PROPERTY) \
916 do { \
917 if (LookupPropertyClass(PROPERTY, property_name, result, zone())) { \
918 return true; \
919 } \
920 } while (false)
921
922 // General_Category (gc) found in PropertyValueAliases.txt
923 PROPERTY_NAME_LOOKUP(UCHAR_GENERAL_CATEGORY_MASK);
924 // Script (sc) found in Scripts.txt
925 PROPERTY_NAME_LOOKUP(UCHAR_SCRIPT);
926 // To disambiguate from script names, block names have an "In"-prefix.
927 if (property_name_list.length() > 3 && property_name[0] == 'I' &&
928 property_name[1] == 'n') {
929 // Block (blk) found in Blocks.txt
930 property_name += 2;
931 PROPERTY_NAME_LOOKUP(UCHAR_BLOCK);
932 }
933#undef PROPERTY_NAME_LOOKUP
934#endif // V8_I18N_SUPPORT
935 return false;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100936}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000937
938bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
939 uc32 x = 0;
940 int d = HexValue(current());
941 if (d < 0) {
942 return false;
943 }
944 while (d >= 0) {
945 x = x * 16 + d;
946 if (x > max_value) {
947 return false;
948 }
949 Advance();
950 d = HexValue(current());
951 }
952 *value = x;
953 return true;
954}
955
956
957uc32 RegExpParser::ParseClassCharacterEscape() {
958 DCHECK(current() == '\\');
959 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
960 Advance();
961 switch (current()) {
962 case 'b':
963 Advance();
964 return '\b';
965 // ControlEscape :: one of
966 // f n r t v
967 case 'f':
968 Advance();
969 return '\f';
970 case 'n':
971 Advance();
972 return '\n';
973 case 'r':
974 Advance();
975 return '\r';
976 case 't':
977 Advance();
978 return '\t';
979 case 'v':
980 Advance();
981 return '\v';
982 case 'c': {
983 uc32 controlLetter = Next();
984 uc32 letter = controlLetter & ~('A' ^ 'a');
Ben Murdoch097c5b22016-05-18 11:27:45 +0100985 // For compatibility with JSC, inside a character class. We also accept
986 // digits and underscore as control characters, unless with /u.
987 if (letter >= 'A' && letter <= 'Z') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000988 Advance(2);
989 // Control letters mapped to ASCII control characters in the range
990 // 0x00-0x1f.
991 return controlLetter & 0x1f;
992 }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100993 if (unicode()) {
994 // With /u, invalid escapes are not treated as identity escapes.
995 ReportError(CStrVector("Invalid class escape"));
996 return 0;
997 }
998 if ((controlLetter >= '0' && controlLetter <= '9') ||
999 controlLetter == '_') {
1000 Advance(2);
1001 return controlLetter & 0x1f;
1002 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001003 // We match JSC in reading the backslash as a literal
1004 // character instead of as starting an escape.
1005 return '\\';
1006 }
1007 case '0':
Ben Murdoch097c5b22016-05-18 11:27:45 +01001008 // With /u, \0 is interpreted as NUL if not followed by another digit.
1009 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1010 Advance();
1011 return 0;
1012 }
1013 // Fall through.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001014 case '1':
1015 case '2':
1016 case '3':
1017 case '4':
1018 case '5':
1019 case '6':
1020 case '7':
1021 // For compatibility, we interpret a decimal escape that isn't
1022 // a back reference (and therefore either \0 or not valid according
1023 // to the specification) as a 1..3 digit octal character code.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001024 if (unicode()) {
1025 // With /u, decimal escape is not interpreted as octal character code.
1026 ReportError(CStrVector("Invalid class escape"));
1027 return 0;
1028 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001029 return ParseOctalLiteral();
1030 case 'x': {
1031 Advance();
1032 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001033 if (ParseHexEscape(2, &value)) return value;
1034 if (unicode()) {
1035 // With /u, invalid escapes are not treated as identity escapes.
1036 ReportError(CStrVector("Invalid escape"));
1037 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001038 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001039 // If \x is not followed by a two-digit hexadecimal, treat it
1040 // as an identity escape.
1041 return 'x';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001042 }
1043 case 'u': {
1044 Advance();
1045 uc32 value;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001046 if (ParseUnicodeEscape(&value)) return value;
1047 if (unicode()) {
1048 // With /u, invalid escapes are not treated as identity escapes.
1049 ReportError(CStrVector("Invalid unicode escape"));
1050 return 0;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001051 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001052 // If \u is not followed by a two-digit hexadecimal, treat it
1053 // as an identity escape.
1054 return 'u';
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001055 }
1056 default: {
1057 uc32 result = current();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001058 // With /u, no identity escapes except for syntax characters and '-' are
1059 // allowed. Otherwise, all identity escapes are allowed.
1060 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001061 Advance();
1062 return result;
1063 }
1064 ReportError(CStrVector("Invalid escape"));
1065 return 0;
1066 }
1067 }
1068 return 0;
1069}
1070
1071
1072CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1073 DCHECK_EQ(0, *char_class);
1074 uc32 first = current();
1075 if (first == '\\') {
1076 switch (Next()) {
1077 case 'w':
1078 case 'W':
1079 case 'd':
1080 case 'D':
1081 case 's':
1082 case 'S': {
1083 *char_class = Next();
1084 Advance(2);
1085 return CharacterRange::Singleton(0); // Return dummy value.
1086 }
1087 case kEndMarker:
1088 return ReportError(CStrVector("\\ at end of pattern"));
1089 default:
Ben Murdoch097c5b22016-05-18 11:27:45 +01001090 first = ParseClassCharacterEscape(CHECK_FAILED);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001091 }
1092 } else {
1093 Advance();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001094 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001095
1096 return CharacterRange::Singleton(first);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001097}
1098
1099
1100static const uc16 kNoCharClass = 0;
1101
1102// Adds range or pre-defined character class to character ranges.
1103// If char_class is not kInvalidClass, it's interpreted as a class
1104// escape (i.e., 's' means whitespace, from '\s').
1105static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1106 uc16 char_class, CharacterRange range,
1107 Zone* zone) {
1108 if (char_class != kNoCharClass) {
1109 CharacterRange::AddClassEscape(char_class, ranges, zone);
1110 } else {
1111 ranges->Add(range, zone);
1112 }
1113}
1114
Ben Murdochda12d292016-06-02 14:46:10 +01001115bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1116 if (!FLAG_harmony_regexp_property) return false;
1117 if (!unicode()) return false;
1118 if (current() != '\\') return false;
1119 uc32 next = Next();
1120 bool parse_success = false;
1121 if (next == 'p') {
1122 Advance(2);
1123 parse_success = ParsePropertyClass(ranges);
1124 } else if (next == 'P') {
1125 Advance(2);
1126 ZoneList<CharacterRange>* property_class =
1127 new (zone()) ZoneList<CharacterRange>(2, zone());
1128 parse_success = ParsePropertyClass(property_class);
1129 if (parse_success) {
1130 ZoneList<CharacterRange>* negated =
1131 new (zone()) ZoneList<CharacterRange>(2, zone());
1132 CharacterRange::Negate(property_class, negated, zone());
1133 const Vector<CharacterRange> negated_vector = negated->ToVector();
1134 ranges->AddAll(negated_vector, zone());
1135 }
1136 } else {
1137 return false;
1138 }
1139 if (!parse_success)
1140 ReportError(CStrVector("Invalid property name in character class"));
1141 return parse_success;
1142}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001143
1144RegExpTree* RegExpParser::ParseCharacterClass() {
1145 static const char* kUnterminated = "Unterminated character class";
Ben Murdoch097c5b22016-05-18 11:27:45 +01001146 static const char* kRangeInvalid = "Invalid character class";
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001147 static const char* kRangeOutOfOrder = "Range out of order in character class";
1148
1149 DCHECK_EQ(current(), '[');
1150 Advance();
1151 bool is_negated = false;
1152 if (current() == '^') {
1153 is_negated = true;
1154 Advance();
1155 }
1156 ZoneList<CharacterRange>* ranges =
1157 new (zone()) ZoneList<CharacterRange>(2, zone());
1158 while (has_more() && current() != ']') {
Ben Murdochda12d292016-06-02 14:46:10 +01001159 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1160 if (parsed_property) continue;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001161 uc16 char_class = kNoCharClass;
1162 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1163 if (current() == '-') {
1164 Advance();
1165 if (current() == kEndMarker) {
1166 // If we reach the end we break out of the loop and let the
1167 // following code report an error.
1168 break;
1169 } else if (current() == ']') {
1170 AddRangeOrEscape(ranges, char_class, first, zone());
1171 ranges->Add(CharacterRange::Singleton('-'), zone());
1172 break;
1173 }
1174 uc16 char_class_2 = kNoCharClass;
1175 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1176 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1177 // Either end is an escaped character class. Treat the '-' verbatim.
Ben Murdoch097c5b22016-05-18 11:27:45 +01001178 if (unicode()) {
1179 // ES2015 21.2.2.15.1 step 1.
1180 return ReportError(CStrVector(kRangeInvalid));
1181 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001182 AddRangeOrEscape(ranges, char_class, first, zone());
1183 ranges->Add(CharacterRange::Singleton('-'), zone());
1184 AddRangeOrEscape(ranges, char_class_2, next, zone());
1185 continue;
1186 }
Ben Murdoch097c5b22016-05-18 11:27:45 +01001187 // ES2015 21.2.2.15.1 step 6.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001188 if (first.from() > next.to()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001189 return ReportError(CStrVector(kRangeOutOfOrder));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001190 }
1191 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1192 } else {
1193 AddRangeOrEscape(ranges, char_class, first, zone());
1194 }
1195 }
1196 if (!has_more()) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001197 return ReportError(CStrVector(kUnterminated));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001198 }
1199 Advance();
1200 if (ranges->length() == 0) {
1201 ranges->Add(CharacterRange::Everything(), zone());
1202 is_negated = !is_negated;
1203 }
1204 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1205}
1206
1207
1208#undef CHECK_FAILED
1209
1210
1211bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
Ben Murdoch097c5b22016-05-18 11:27:45 +01001212 FlatStringReader* input, JSRegExp::Flags flags,
1213 RegExpCompileData* result) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001214 DCHECK(result != NULL);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001215 RegExpParser parser(input, &result->error, flags, isolate, zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001216 RegExpTree* tree = parser.ParsePattern();
1217 if (parser.failed()) {
1218 DCHECK(tree == NULL);
1219 DCHECK(!result->error.is_null());
1220 } else {
1221 DCHECK(tree != NULL);
1222 DCHECK(result->error.is_null());
1223 if (FLAG_trace_regexp_parser) {
1224 OFStream os(stdout);
1225 tree->Print(os, zone);
1226 os << "\n";
1227 }
1228 result->tree = tree;
1229 int capture_count = parser.captures_started();
1230 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1231 result->contains_anchor = parser.contains_anchor();
1232 result->capture_count = capture_count;
1233 }
1234 return !parser.failed();
1235}
1236
Ben Murdoch097c5b22016-05-18 11:27:45 +01001237RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001238 : zone_(zone),
1239 pending_empty_(false),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001240 ignore_case_(ignore_case),
1241 unicode_(unicode),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001242 characters_(NULL),
Ben Murdoch097c5b22016-05-18 11:27:45 +01001243 pending_surrogate_(kNoPendingSurrogate),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001244 terms_(),
1245 alternatives_()
1246#ifdef DEBUG
1247 ,
1248 last_added_(ADD_NONE)
1249#endif
1250{
1251}
1252
1253
Ben Murdoch097c5b22016-05-18 11:27:45 +01001254void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1255 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1256 FlushPendingSurrogate();
1257 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1258 pending_surrogate_ = lead_surrogate;
1259}
1260
1261
1262void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1263 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1264 if (pending_surrogate_ != kNoPendingSurrogate) {
1265 uc16 lead_surrogate = pending_surrogate_;
1266 pending_surrogate_ = kNoPendingSurrogate;
1267 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1268 uc32 combined =
1269 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1270 if (NeedsDesugaringForIgnoreCase(combined)) {
1271 AddCharacterClassForDesugaring(combined);
1272 } else {
1273 ZoneList<uc16> surrogate_pair(2, zone());
1274 surrogate_pair.Add(lead_surrogate, zone());
1275 surrogate_pair.Add(trail_surrogate, zone());
1276 RegExpAtom* atom =
1277 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1278 AddAtom(atom);
1279 }
1280 } else {
1281 pending_surrogate_ = trail_surrogate;
1282 FlushPendingSurrogate();
1283 }
1284}
1285
1286
1287void RegExpBuilder::FlushPendingSurrogate() {
1288 if (pending_surrogate_ != kNoPendingSurrogate) {
1289 DCHECK(unicode());
1290 uc32 c = pending_surrogate_;
1291 pending_surrogate_ = kNoPendingSurrogate;
1292 AddCharacterClassForDesugaring(c);
1293 }
1294}
1295
1296
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001297void RegExpBuilder::FlushCharacters() {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001298 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001299 pending_empty_ = false;
1300 if (characters_ != NULL) {
1301 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1302 characters_ = NULL;
1303 text_.Add(atom, zone());
1304 LAST(ADD_ATOM);
1305 }
1306}
1307
1308
1309void RegExpBuilder::FlushText() {
1310 FlushCharacters();
1311 int num_text = text_.length();
1312 if (num_text == 0) {
1313 return;
1314 } else if (num_text == 1) {
1315 terms_.Add(text_.last(), zone());
1316 } else {
1317 RegExpText* text = new (zone()) RegExpText(zone());
1318 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1319 terms_.Add(text, zone());
1320 }
1321 text_.Clear();
1322}
1323
1324
1325void RegExpBuilder::AddCharacter(uc16 c) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001326 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001327 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001328 if (NeedsDesugaringForIgnoreCase(c)) {
1329 AddCharacterClassForDesugaring(c);
1330 } else {
1331 if (characters_ == NULL) {
1332 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1333 }
1334 characters_->Add(c, zone());
1335 LAST(ADD_CHAR);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001336 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001337}
1338
1339
1340void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1341 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001342 DCHECK(unicode());
1343 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1344 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1345 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1346 AddLeadSurrogate(c);
1347 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1348 AddTrailSurrogate(c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001349 } else {
1350 AddCharacter(static_cast<uc16>(c));
1351 }
1352}
1353
Ben Murdoch097c5b22016-05-18 11:27:45 +01001354void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1355 // A lead or trail surrogate parsed via escape sequence will not
1356 // pair up with any preceding lead or following trail surrogate.
1357 FlushPendingSurrogate();
1358 AddUnicodeCharacter(character);
1359 FlushPendingSurrogate();
1360}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001361
1362void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1363
1364
Ben Murdoch097c5b22016-05-18 11:27:45 +01001365void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1366 if (NeedsDesugaringForUnicode(cc)) {
1367 // With /u, character class needs to be desugared, so it
1368 // must be a standalone term instead of being part of a RegExpText.
1369 AddTerm(cc);
1370 } else {
1371 AddAtom(cc);
1372 }
1373}
1374
1375void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1376 AddTerm(new (zone()) RegExpCharacterClass(
1377 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1378}
1379
1380
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001381void RegExpBuilder::AddAtom(RegExpTree* term) {
1382 if (term->IsEmpty()) {
1383 AddEmpty();
1384 return;
1385 }
1386 if (term->IsTextElement()) {
1387 FlushCharacters();
1388 text_.Add(term, zone());
1389 } else {
1390 FlushText();
1391 terms_.Add(term, zone());
1392 }
1393 LAST(ADD_ATOM);
1394}
1395
1396
Ben Murdoch097c5b22016-05-18 11:27:45 +01001397void RegExpBuilder::AddTerm(RegExpTree* term) {
1398 FlushText();
1399 terms_.Add(term, zone());
1400 LAST(ADD_ATOM);
1401}
1402
1403
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001404void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1405 FlushText();
1406 terms_.Add(assert, zone());
1407 LAST(ADD_ASSERT);
1408}
1409
1410
1411void RegExpBuilder::NewAlternative() { FlushTerms(); }
1412
1413
1414void RegExpBuilder::FlushTerms() {
1415 FlushText();
1416 int num_terms = terms_.length();
1417 RegExpTree* alternative;
1418 if (num_terms == 0) {
1419 alternative = new (zone()) RegExpEmpty();
1420 } else if (num_terms == 1) {
1421 alternative = terms_.last();
1422 } else {
1423 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1424 }
1425 alternatives_.Add(alternative, zone());
1426 terms_.Clear();
1427 LAST(ADD_NONE);
1428}
1429
1430
Ben Murdoch097c5b22016-05-18 11:27:45 +01001431bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1432 if (!unicode()) return false;
Ben Murdochda12d292016-06-02 14:46:10 +01001433 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1434 // necessarily mean that we need to desugar. It's probably nicer to have a
1435 // separate pass to figure out unicode desugarings.
1436 if (ignore_case()) return true;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001437 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1438 CharacterRange::Canonicalize(ranges);
1439 for (int i = ranges->length() - 1; i >= 0; i--) {
1440 uc32 from = ranges->at(i).from();
1441 uc32 to = ranges->at(i).to();
1442 // Check for non-BMP characters.
1443 if (to >= kNonBmpStart) return true;
1444 // Check for lone surrogates.
1445 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1446 }
1447 return false;
1448}
1449
1450
1451bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1452#ifdef V8_I18N_SUPPORT
1453 if (unicode() && ignore_case()) {
1454 USet* set = uset_open(c, c);
1455 uset_closeOver(set, USET_CASE_INSENSITIVE);
1456 uset_removeAllStrings(set);
1457 bool result = uset_size(set) > 1;
1458 uset_close(set);
1459 return result;
1460 }
1461 // In the case where ICU is not included, we act as if the unicode flag is
1462 // not set, and do not desugar.
1463#endif // V8_I18N_SUPPORT
1464 return false;
1465}
1466
1467
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001468RegExpTree* RegExpBuilder::ToRegExp() {
1469 FlushTerms();
1470 int num_alternatives = alternatives_.length();
1471 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1472 if (num_alternatives == 1) return alternatives_.last();
1473 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1474}
1475
Ben Murdoch097c5b22016-05-18 11:27:45 +01001476bool RegExpBuilder::AddQuantifierToAtom(
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001477 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001478 FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001479 if (pending_empty_) {
1480 pending_empty_ = false;
Ben Murdoch097c5b22016-05-18 11:27:45 +01001481 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001482 }
1483 RegExpTree* atom;
1484 if (characters_ != NULL) {
1485 DCHECK(last_added_ == ADD_CHAR);
1486 // Last atom was character.
1487 Vector<const uc16> char_vector = characters_->ToConstVector();
1488 int num_chars = char_vector.length();
1489 if (num_chars > 1) {
1490 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1491 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1492 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1493 }
1494 characters_ = NULL;
1495 atom = new (zone()) RegExpAtom(char_vector);
1496 FlushText();
1497 } else if (text_.length() > 0) {
1498 DCHECK(last_added_ == ADD_ATOM);
1499 atom = text_.RemoveLast();
1500 FlushText();
1501 } else if (terms_.length() > 0) {
1502 DCHECK(last_added_ == ADD_ATOM);
1503 atom = terms_.RemoveLast();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001504 // With /u, lookarounds are not quantifiable.
1505 if (unicode() && atom->IsLookaround()) return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001506 if (atom->max_match() == 0) {
1507 // Guaranteed to only match an empty string.
1508 LAST(ADD_TERM);
1509 if (min == 0) {
Ben Murdoch097c5b22016-05-18 11:27:45 +01001510 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001511 }
1512 terms_.Add(atom, zone());
Ben Murdoch097c5b22016-05-18 11:27:45 +01001513 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001514 }
1515 } else {
1516 // Only call immediately after adding an atom or character!
1517 UNREACHABLE();
Ben Murdoch097c5b22016-05-18 11:27:45 +01001518 return false;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001519 }
1520 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1521 zone());
1522 LAST(ADD_TERM);
Ben Murdoch097c5b22016-05-18 11:27:45 +01001523 return true;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001524}
1525
1526} // namespace internal
1527} // namespace v8