Blame - src/regexp/regexp-parser.cc - fp2-dev/platform/external/v8

blob: 46c593c2643d45a1cbe4c637158fe37b11c3d4da [file] [log] [blame]

Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2016 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/regexp-parser.h"
				6
				7	#include "src/char-predicates-inl.h"
				8	#include "src/factory.h"
				9	#include "src/isolate.h"
				10	#include "src/objects-inl.h"
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	11	#include "src/ostreams.h"
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	12	#include "src/regexp/jsregexp.h"
				13	#include "src/utils.h"
				14
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	15	#ifdef V8_I18N_SUPPORT
				16	#include "unicode/uset.h"
				17	#endif // V8_I18N_SUPPORT
				18
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	19	namespace v8 {
				20	namespace internal {
				21
				22	RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	23	JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	24	: isolate_(isolate),
				25	zone_(zone),
				26	error_(error),
				27	captures_(NULL),
				28	in_(in),
				29	current_(kEndMarker),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	30	ignore_case_(flags & JSRegExp::kIgnoreCase),
				31	multiline_(flags & JSRegExp::kMultiline),
				32	unicode_(flags & JSRegExp::kUnicode),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	33	next_pos_(0),
				34	captures_started_(0),
				35	capture_count_(0),
				36	has_more_(true),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	37	simple_(false),
				38	contains_anchor_(false),
				39	is_scanned_for_captures_(false),
				40	failed_(false) {
				41	Advance();
				42	}
				43
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	44	template <bool update_position>
				45	inline uc32 RegExpParser::ReadNext() {
				46	int position = next_pos_;
				47	uc32 c0 = in()->Get(position);
				48	position++;
				49	// Read the whole surrogate pair in case of unicode flag, if possible.
				50	if (unicode() && position < in()->length() &&
				51	unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
				52	uc16 c1 = in()->Get(position);
				53	if (unibrow::Utf16::IsTrailSurrogate(c1)) {
				54	c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
				55	position++;
				56	}
				57	}
				58	if (update_position) next_pos_ = position;
				59	return c0;
				60	}
				61
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	62
				63	uc32 RegExpParser::Next() {
				64	if (has_next()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	65	return ReadNext<false>();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	66	} else {
				67	return kEndMarker;
				68	}
				69	}
				70
				71
				72	void RegExpParser::Advance() {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	73	if (has_next()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	74	StackLimitCheck check(isolate());
				75	if (check.HasOverflowed()) {
				76	ReportError(CStrVector(Isolate::kStackOverflowMessage));
				77	} else if (zone()->excess_allocation()) {
				78	ReportError(CStrVector("Regular expression too large"));
				79	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	80	current_ = ReadNext<true>();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	81	}
				82	} else {
				83	current_ = kEndMarker;
				84	// Advance so that position() points to 1-after-the-last-character. This is
				85	// important so that Reset() to this position works correctly.
				86	next_pos_ = in()->length() + 1;
				87	has_more_ = false;
				88	}
				89	}
				90
				91
				92	void RegExpParser::Reset(int pos) {
				93	next_pos_ = pos;
				94	has_more_ = (pos < in()->length());
				95	Advance();
				96	}
				97
				98
				99	void RegExpParser::Advance(int dist) {
				100	next_pos_ += dist - 1;
				101	Advance();
				102	}
				103
				104
				105	bool RegExpParser::simple() { return simple_; }
				106
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	107	bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
				108	switch (c) {
				109	case '^':
				110	case '$':
				111	case '\\':
				112	case '.':
				113	case '*':
				114	case '+':
				115	case '?':
				116	case '(':
				117	case ')':
				118	case '[':
				119	case ']':
				120	case '{':
				121	case '}':
				122	case '\|':
				123	case '/':
				124	return true;
				125	default:
				126	break;
				127	}
				128	return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	129	}
				130
				131
				132	RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
				133	failed_ = true;
				134	*error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
				135	// Zip to the end to make sure the no more input is read.
				136	current_ = kEndMarker;
				137	next_pos_ = in()->length();
				138	return NULL;
				139	}
				140
				141
				142	#define CHECK_FAILED /**/); \
				143	if (failed_) return NULL; \
				144	((void)0
				145
				146
				147	// Pattern ::
				148	// Disjunction
				149	RegExpTree* RegExpParser::ParsePattern() {
				150	RegExpTree* result = ParseDisjunction(CHECK_FAILED);
				151	DCHECK(!has_more());
				152	// If the result of parsing is a literal string atom, and it has the
				153	// same length as the input, then the atom is identical to the input.
				154	if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
				155	simple_ = true;
				156	}
				157	return result;
				158	}
				159
				160
				161	// Disjunction ::
				162	// Alternative
				163	// Alternative \| Disjunction
				164	// Alternative ::
				165	// [empty]
				166	// Term Alternative
				167	// Term ::
				168	// Assertion
				169	// Atom
				170	// Atom Quantifier
				171	RegExpTree* RegExpParser::ParseDisjunction() {
				172	// Used to store current state while parsing subexpressions.
				173	RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	174	ignore_case(), unicode(), zone());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	175	RegExpParserState* state = &initial_state;
				176	// Cache the builder in a local variable for quick access.
				177	RegExpBuilder* builder = initial_state.builder();
				178	while (true) {
				179	switch (current()) {
				180	case kEndMarker:
				181	if (state->IsSubexpression()) {
				182	// Inside a parenthesized group when hitting end of input.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	183	return ReportError(CStrVector("Unterminated group"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	184	}
				185	DCHECK_EQ(INITIAL, state->group_type());
				186	// Parsing completed successfully.
				187	return builder->ToRegExp();
				188	case ')': {
				189	if (!state->IsSubexpression()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	190	return ReportError(CStrVector("Unmatched ')'"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	191	}
				192	DCHECK_NE(INITIAL, state->group_type());
				193
				194	Advance();
				195	// End disjunction parsing and convert builder content to new single
				196	// regexp atom.
				197	RegExpTree* body = builder->ToRegExp();
				198
				199	int end_capture_index = captures_started();
				200
				201	int capture_index = state->capture_index();
				202	SubexpressionType group_type = state->group_type();
				203
				204	// Build result of subexpression.
				205	if (group_type == CAPTURE) {
				206	RegExpCapture* capture = GetCapture(capture_index);
				207	capture->set_body(body);
				208	body = capture;
				209	} else if (group_type != GROUPING) {
				210	DCHECK(group_type == POSITIVE_LOOKAROUND \|\|
				211	group_type == NEGATIVE_LOOKAROUND);
				212	bool is_positive = (group_type == POSITIVE_LOOKAROUND);
				213	body = new (zone()) RegExpLookaround(
				214	body, is_positive, end_capture_index - capture_index,
				215	capture_index, state->lookaround_type());
				216	}
				217
				218	// Restore previous state.
				219	state = state->previous_state();
				220	builder = state->builder();
				221
				222	builder->AddAtom(body);
				223	// For compatability with JSC and ES3, we allow quantifiers after
				224	// lookaheads, and break in all cases.
				225	break;
				226	}
				227	case '\|': {
				228	Advance();
				229	builder->NewAlternative();
				230	continue;
				231	}
				232	case '*':
				233	case '+':
				234	case '?':
				235	return ReportError(CStrVector("Nothing to repeat"));
				236	case '^': {
				237	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	238	if (multiline()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	239	builder->AddAssertion(
				240	new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
				241	} else {
				242	builder->AddAssertion(
				243	new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
				244	set_contains_anchor();
				245	}
				246	continue;
				247	}
				248	case '$': {
				249	Advance();
				250	RegExpAssertion::AssertionType assertion_type =
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	251	multiline() ? RegExpAssertion::END_OF_LINE
				252	: RegExpAssertion::END_OF_INPUT;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	253	builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
				254	continue;
				255	}
				256	case '.': {
				257	Advance();
				258	// everything except \x0a, \x0d, \u2028 and \u2029
				259	ZoneList<CharacterRange>* ranges =
				260	new (zone()) ZoneList<CharacterRange>(2, zone());
				261	CharacterRange::AddClassEscape('.', ranges, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	262	RegExpCharacterClass* cc =
				263	new (zone()) RegExpCharacterClass(ranges, false);
				264	builder->AddCharacterClass(cc);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	265	break;
				266	}
				267	case '(': {
				268	SubexpressionType subexpr_type = CAPTURE;
				269	RegExpLookaround::Type lookaround_type = state->lookaround_type();
				270	Advance();
				271	if (current() == '?') {
				272	switch (Next()) {
				273	case ':':
				274	subexpr_type = GROUPING;
				275	break;
				276	case '=':
				277	lookaround_type = RegExpLookaround::LOOKAHEAD;
				278	subexpr_type = POSITIVE_LOOKAROUND;
				279	break;
				280	case '!':
				281	lookaround_type = RegExpLookaround::LOOKAHEAD;
				282	subexpr_type = NEGATIVE_LOOKAROUND;
				283	break;
				284	case '<':
				285	if (FLAG_harmony_regexp_lookbehind) {
				286	Advance();
				287	lookaround_type = RegExpLookaround::LOOKBEHIND;
				288	if (Next() == '=') {
				289	subexpr_type = POSITIVE_LOOKAROUND;
				290	break;
				291	} else if (Next() == '!') {
				292	subexpr_type = NEGATIVE_LOOKAROUND;
				293	break;
				294	}
				295	}
				296	// Fall through.
				297	default:
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	298	return ReportError(CStrVector("Invalid group"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	299	}
				300	Advance(2);
				301	} else {
				302	if (captures_started_ >= kMaxCaptures) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	303	return ReportError(CStrVector("Too many captures"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	304	}
				305	captures_started_++;
				306	}
				307	// Store current state and begin new disjunction parsing.
				308	state = new (zone()) RegExpParserState(
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	309	state, subexpr_type, lookaround_type, captures_started_,
				310	ignore_case(), unicode(), zone());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	311	builder = state->builder();
				312	continue;
				313	}
				314	case '[': {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	315	RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
				316	builder->AddCharacterClass(cc->AsCharacterClass());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	317	break;
				318	}
				319	// Atom ::
				320	// \ AtomEscape
				321	case '\\':
				322	switch (Next()) {
				323	case kEndMarker:
				324	return ReportError(CStrVector("\\ at end of pattern"));
				325	case 'b':
				326	Advance(2);
				327	builder->AddAssertion(
				328	new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
				329	continue;
				330	case 'B':
				331	Advance(2);
				332	builder->AddAssertion(
				333	new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
				334	continue;
				335	// AtomEscape ::
				336	// CharacterClassEscape
				337	//
				338	// CharacterClassEscape :: one of
				339	// d D s S w W
				340	case 'd':
				341	case 'D':
				342	case 's':
				343	case 'S':
				344	case 'w':
				345	case 'W': {
				346	uc32 c = Next();
				347	Advance(2);
				348	ZoneList<CharacterRange>* ranges =
				349	new (zone()) ZoneList<CharacterRange>(2, zone());
				350	CharacterRange::AddClassEscape(c, ranges, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	351	RegExpCharacterClass* cc =
				352	new (zone()) RegExpCharacterClass(ranges, false);
				353	builder->AddCharacterClass(cc);
				354	break;
				355	}
				356	case 'p':
				357	case 'P': {
				358	uc32 p = Next();
				359	Advance(2);
				360	if (unicode()) {
				361	if (FLAG_harmony_regexp_property) {
				362	ZoneList<CharacterRange>* ranges = ParsePropertyClass();
				363	if (ranges == nullptr) {
				364	return ReportError(CStrVector("Invalid property name"));
				365	}
				366	RegExpCharacterClass* cc =
				367	new (zone()) RegExpCharacterClass(ranges, p == 'P');
				368	builder->AddCharacterClass(cc);
				369	} else {
				370	return ReportError(CStrVector("Invalid escape"));
				371	}
				372	} else {
				373	builder->AddCharacter(p);
				374	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	375	break;
				376	}
				377	case '1':
				378	case '2':
				379	case '3':
				380	case '4':
				381	case '5':
				382	case '6':
				383	case '7':
				384	case '8':
				385	case '9': {
				386	int index = 0;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	387	bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
				388	if (is_backref) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	389	if (state->IsInsideCaptureGroup(index)) {
				390	// The back reference is inside the capture group it refers to.
				391	// Nothing can possibly have been captured yet, so we use empty
				392	// instead. This ensures that, when checking a back reference,
				393	// the capture registers of the referenced capture are either
				394	// both set or both cleared.
				395	builder->AddEmpty();
				396	} else {
				397	RegExpCapture* capture = GetCapture(index);
				398	RegExpTree* atom = new (zone()) RegExpBackReference(capture);
				399	builder->AddAtom(atom);
				400	}
				401	break;
				402	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	403	// With /u, no identity escapes except for syntax characters
				404	// are allowed. Otherwise, all identity escapes are allowed.
				405	if (unicode()) {
				406	return ReportError(CStrVector("Invalid escape"));
				407	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	408	uc32 first_digit = Next();
				409	if (first_digit == '8' \|\| first_digit == '9') {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	410	builder->AddCharacter(first_digit);
				411	Advance(2);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	412	break;
				413	}
				414	}
				415	// FALLTHROUGH
				416	case '0': {
				417	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	418	if (unicode() && Next() >= '0' && Next() <= '9') {
				419	// With /u, decimal escape with leading 0 are not parsed as octal.
				420	return ReportError(CStrVector("Invalid decimal escape"));
				421	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	422	uc32 octal = ParseOctalLiteral();
				423	builder->AddCharacter(octal);
				424	break;
				425	}
				426	// ControlEscape :: one of
				427	// f n r t v
				428	case 'f':
				429	Advance(2);
				430	builder->AddCharacter('\f');
				431	break;
				432	case 'n':
				433	Advance(2);
				434	builder->AddCharacter('\n');
				435	break;
				436	case 'r':
				437	Advance(2);
				438	builder->AddCharacter('\r');
				439	break;
				440	case 't':
				441	Advance(2);
				442	builder->AddCharacter('\t');
				443	break;
				444	case 'v':
				445	Advance(2);
				446	builder->AddCharacter('\v');
				447	break;
				448	case 'c': {
				449	Advance();
				450	uc32 controlLetter = Next();
				451	// Special case if it is an ASCII letter.
				452	// Convert lower case letters to uppercase.
				453	uc32 letter = controlLetter & ~('a' ^ 'A');
				454	if (letter < 'A' \|\| 'Z' < letter) {
				455	// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
				456	// This is outside the specification. We match JSC in
				457	// reading the backslash as a literal character instead
				458	// of as starting an escape.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	459	if (unicode()) {
				460	// With /u, invalid escapes are not treated as identity escapes.
				461	return ReportError(CStrVector("Invalid unicode escape"));
				462	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	463	builder->AddCharacter('\\');
				464	} else {
				465	Advance(2);
				466	builder->AddCharacter(controlLetter & 0x1f);
				467	}
				468	break;
				469	}
				470	case 'x': {
				471	Advance(2);
				472	uc32 value;
				473	if (ParseHexEscape(2, &value)) {
				474	builder->AddCharacter(value);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	475	} else if (!unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	476	builder->AddCharacter('x');
				477	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	478	// With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	479	return ReportError(CStrVector("Invalid escape"));
				480	}
				481	break;
				482	}
				483	case 'u': {
				484	Advance(2);
				485	uc32 value;
				486	if (ParseUnicodeEscape(&value)) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	487	builder->AddEscapedUnicodeCharacter(value);
				488	} else if (!unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	489	builder->AddCharacter('u');
				490	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	491	// With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	492	return ReportError(CStrVector("Invalid unicode escape"));
				493	}
				494	break;
				495	}
				496	default:
				497	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	498	// With /u, no identity escapes except for syntax characters
				499	// are allowed. Otherwise, all identity escapes are allowed.
				500	if (!unicode() \|\| IsSyntaxCharacterOrSlash(current())) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	501	builder->AddCharacter(current());
				502	Advance();
				503	} else {
				504	return ReportError(CStrVector("Invalid escape"));
				505	}
				506	break;
				507	}
				508	break;
				509	case '{': {
				510	int dummy;
				511	if (ParseIntervalQuantifier(&dummy, &dummy)) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	512	return ReportError(CStrVector("Nothing to repeat"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	513	}
				514	// fallthrough
				515	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	516	case '}':
				517	case ']':
				518	if (unicode()) {
				519	return ReportError(CStrVector("Lone quantifier brackets"));
				520	}
				521	// fallthrough
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	522	default:
				523	builder->AddUnicodeCharacter(current());
				524	Advance();
				525	break;
				526	} // end switch(current())
				527
				528	int min;
				529	int max;
				530	switch (current()) {
				531	// QuantifierPrefix ::
				532	// *
				533	// +
				534	// ?
				535	// {
				536	case '*':
				537	min = 0;
				538	max = RegExpTree::kInfinity;
				539	Advance();
				540	break;
				541	case '+':
				542	min = 1;
				543	max = RegExpTree::kInfinity;
				544	Advance();
				545	break;
				546	case '?':
				547	min = 0;
				548	max = 1;
				549	Advance();
				550	break;
				551	case '{':
				552	if (ParseIntervalQuantifier(&min, &max)) {
				553	if (max < min) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	554	return ReportError(
				555	CStrVector("numbers out of order in {} quantifier"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	556	}
				557	break;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	558	} else if (unicode()) {
				559	// With /u, incomplete quantifiers are not allowed.
				560	return ReportError(CStrVector("Incomplete quantifier"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	561	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	562	continue;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	563	default:
				564	continue;
				565	}
				566	RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
				567	if (current() == '?') {
				568	quantifier_type = RegExpQuantifier::NON_GREEDY;
				569	Advance();
				570	} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
				571	// FLAG_regexp_possessive_quantifier is a debug-only flag.
				572	quantifier_type = RegExpQuantifier::POSSESSIVE;
				573	Advance();
				574	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	575	if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
				576	return ReportError(CStrVector("Invalid quantifier"));
				577	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	578	}
				579	}
				580
				581
				582	#ifdef DEBUG
				583	// Currently only used in an DCHECK.
				584	static bool IsSpecialClassEscape(uc32 c) {
				585	switch (c) {
				586	case 'd':
				587	case 'D':
				588	case 's':
				589	case 'S':
				590	case 'w':
				591	case 'W':
				592	return true;
				593	default:
				594	return false;
				595	}
				596	}
				597	#endif
				598
				599
				600	// In order to know whether an escape is a backreference or not we have to scan
				601	// the entire regexp and find the number of capturing parentheses. However we
				602	// don't want to scan the regexp twice unless it is necessary. This mini-parser
				603	// is called when needed. It can see the difference between capturing and
				604	// noncapturing parentheses and can skip character classes and backslash-escaped
				605	// characters.
				606	void RegExpParser::ScanForCaptures() {
				607	// Start with captures started previous to current position
				608	int capture_count = captures_started();
				609	// Add count of captures after this position.
				610	int n;
				611	while ((n = current()) != kEndMarker) {
				612	Advance();
				613	switch (n) {
				614	case '\\':
				615	Advance();
				616	break;
				617	case '[': {
				618	int c;
				619	while ((c = current()) != kEndMarker) {
				620	Advance();
				621	if (c == '\\') {
				622	Advance();
				623	} else {
				624	if (c == ']') break;
				625	}
				626	}
				627	break;
				628	}
				629	case '(':
				630	if (current() != '?') capture_count++;
				631	break;
				632	}
				633	}
				634	capture_count_ = capture_count;
				635	is_scanned_for_captures_ = true;
				636	}
				637
				638
				639	bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
				640	DCHECK_EQ('\\', current());
				641	DCHECK('1' <= Next() && Next() <= '9');
				642	// Try to parse a decimal literal that is no greater than the total number
				643	// of left capturing parentheses in the input.
				644	int start = position();
				645	int value = Next() - '0';
				646	Advance(2);
				647	while (true) {
				648	uc32 c = current();
				649	if (IsDecimalDigit(c)) {
				650	value = 10 * value + (c - '0');
				651	if (value > kMaxCaptures) {
				652	Reset(start);
				653	return false;
				654	}
				655	Advance();
				656	} else {
				657	break;
				658	}
				659	}
				660	if (value > captures_started()) {
				661	if (!is_scanned_for_captures_) {
				662	int saved_position = position();
				663	ScanForCaptures();
				664	Reset(saved_position);
				665	}
				666	if (value > capture_count_) {
				667	Reset(start);
				668	return false;
				669	}
				670	}
				671	*index_out = value;
				672	return true;
				673	}
				674
				675
				676	RegExpCapture* RegExpParser::GetCapture(int index) {
				677	// The index for the capture groups are one-based. Its index in the list is
				678	// zero-based.
				679	int know_captures =
				680	is_scanned_for_captures_ ? capture_count_ : captures_started_;
				681	DCHECK(index <= know_captures);
				682	if (captures_ == NULL) {
				683	captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
				684	}
				685	while (captures_->length() < know_captures) {
				686	captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
				687	}
				688	return captures_->at(index - 1);
				689	}
				690
				691
				692	bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
				693	for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
				694	if (s->group_type() != CAPTURE) continue;
				695	// Return true if we found the matching capture index.
				696	if (index == s->capture_index()) return true;
				697	// Abort if index is larger than what has been parsed up till this state.
				698	if (index > s->capture_index()) return false;
				699	}
				700	return false;
				701	}
				702
				703
				704	// QuantifierPrefix ::
				705	// { DecimalDigits }
				706	// { DecimalDigits , }
				707	// { DecimalDigits , DecimalDigits }
				708	//
				709	// Returns true if parsing succeeds, and set the min_out and max_out
				710	// values. Values are truncated to RegExpTree::kInfinity if they overflow.
				711	bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
				712	DCHECK_EQ(current(), '{');
				713	int start = position();
				714	Advance();
				715	int min = 0;
				716	if (!IsDecimalDigit(current())) {
				717	Reset(start);
				718	return false;
				719	}
				720	while (IsDecimalDigit(current())) {
				721	int next = current() - '0';
				722	if (min > (RegExpTree::kInfinity - next) / 10) {
				723	// Overflow. Skip past remaining decimal digits and return -1.
				724	do {
				725	Advance();
				726	} while (IsDecimalDigit(current()));
				727	min = RegExpTree::kInfinity;
				728	break;
				729	}
				730	min = 10 * min + next;
				731	Advance();
				732	}
				733	int max = 0;
				734	if (current() == '}') {
				735	max = min;
				736	Advance();
				737	} else if (current() == ',') {
				738	Advance();
				739	if (current() == '}') {
				740	max = RegExpTree::kInfinity;
				741	Advance();
				742	} else {
				743	while (IsDecimalDigit(current())) {
				744	int next = current() - '0';
				745	if (max > (RegExpTree::kInfinity - next) / 10) {
				746	do {
				747	Advance();
				748	} while (IsDecimalDigit(current()));
				749	max = RegExpTree::kInfinity;
				750	break;
				751	}
				752	max = 10 * max + next;
				753	Advance();
				754	}
				755	if (current() != '}') {
				756	Reset(start);
				757	return false;
				758	}
				759	Advance();
				760	}
				761	} else {
				762	Reset(start);
				763	return false;
				764	}
				765	*min_out = min;
				766	*max_out = max;
				767	return true;
				768	}
				769
				770
				771	uc32 RegExpParser::ParseOctalLiteral() {
				772	DCHECK(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);
				773	// For compatibility with some other browsers (not all), we parse
				774	// up to three octal digits with a value below 256.
				775	uc32 value = current() - '0';
				776	Advance();
				777	if ('0' <= current() && current() <= '7') {
				778	value = value * 8 + current() - '0';
				779	Advance();
				780	if (value < 32 && '0' <= current() && current() <= '7') {
				781	value = value * 8 + current() - '0';
				782	Advance();
				783	}
				784	}
				785	return value;
				786	}
				787
				788
				789	bool RegExpParser::ParseHexEscape(int length, uc32* value) {
				790	int start = position();
				791	uc32 val = 0;
				792	for (int i = 0; i < length; ++i) {
				793	uc32 c = current();
				794	int d = HexValue(c);
				795	if (d < 0) {
				796	Reset(start);
				797	return false;
				798	}
				799	val = val * 16 + d;
				800	Advance();
				801	}
				802	*value = val;
				803	return true;
				804	}
				805
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	806	// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	807	bool RegExpParser::ParseUnicodeEscape(uc32* value) {
				808	// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
				809	// allowed). In the latter case, the number of hex digits between { } is
				810	// arbitrary. \ and u have already been read.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	811	if (current() == '{' && unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	812	int start = position();
				813	Advance();
				814	if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
				815	if (current() == '}') {
				816	Advance();
				817	return true;
				818	}
				819	}
				820	Reset(start);
				821	return false;
				822	}
				823	// \u but no {, or \u{...} escapes not allowed.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	824	bool result = ParseHexEscape(4, value);
				825	if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
				826	current() == '\\') {
				827	// Attempt to read trail surrogate.
				828	int start = position();
				829	if (Next() == 'u') {
				830	Advance(2);
				831	uc32 trail;
				832	if (ParseHexEscape(4, &trail) &&
				833	unibrow::Utf16::IsTrailSurrogate(trail)) {
				834	value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(value),
				835	static_cast<uc16>(trail));
				836	return true;
				837	}
				838	}
				839	Reset(start);
				840	}
				841	return result;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	842	}
				843
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	844	ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
				845	#ifdef V8_I18N_SUPPORT
				846	char property_name[3];
				847	memset(property_name, 0, sizeof(property_name));
				848	if (current() == '{') {
				849	Advance();
				850	if (current() < 'A' \|\| current() > 'Z') return nullptr;
				851	property_name[0] = static_cast<char>(current());
				852	Advance();
				853	if (current() >= 'a' && current() <= 'z') {
				854	property_name[1] = static_cast<char>(current());
				855	Advance();
				856	}
				857	if (current() != '}') return nullptr;
				858	} else if (current() >= 'A' && current() <= 'Z') {
				859	property_name[0] = static_cast<char>(current());
				860	} else {
				861	return nullptr;
				862	}
				863	Advance();
				864
				865	int32_t category =
				866	u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
				867	if (category == UCHAR_INVALID_CODE) return nullptr;
				868
				869	USet* set = uset_openEmpty();
				870	UErrorCode ec = U_ZERO_ERROR;
				871	uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
				872	ZoneList<CharacterRange>* ranges = nullptr;
				873	if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
				874	uset_removeAllStrings(set);
				875	int item_count = uset_getItemCount(set);
				876	ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
				877	int item_result = 0;
				878	for (int i = 0; i < item_count; i++) {
				879	uc32 start = 0;
				880	uc32 end = 0;
				881	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
				882	ranges->Add(CharacterRange::Range(start, end), zone());
				883	}
				884	DCHECK_EQ(U_ZERO_ERROR, ec);
				885	DCHECK_EQ(0, item_result);
				886	}
				887	uset_close(set);
				888	return ranges;
				889	#else // V8_I18N_SUPPORT
				890	return nullptr;
				891	#endif // V8_I18N_SUPPORT
				892	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	893
				894	bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
				895	uc32 x = 0;
				896	int d = HexValue(current());
				897	if (d < 0) {
				898	return false;
				899	}
				900	while (d >= 0) {
				901	x = x * 16 + d;
				902	if (x > max_value) {
				903	return false;
				904	}
				905	Advance();
				906	d = HexValue(current());
				907	}
				908	*value = x;
				909	return true;
				910	}
				911
				912
				913	uc32 RegExpParser::ParseClassCharacterEscape() {
				914	DCHECK(current() == '\\');
				915	DCHECK(has_next() && !IsSpecialClassEscape(Next()));
				916	Advance();
				917	switch (current()) {
				918	case 'b':
				919	Advance();
				920	return '\b';
				921	// ControlEscape :: one of
				922	// f n r t v
				923	case 'f':
				924	Advance();
				925	return '\f';
				926	case 'n':
				927	Advance();
				928	return '\n';
				929	case 'r':
				930	Advance();
				931	return '\r';
				932	case 't':
				933	Advance();
				934	return '\t';
				935	case 'v':
				936	Advance();
				937	return '\v';
				938	case 'c': {
				939	uc32 controlLetter = Next();
				940	uc32 letter = controlLetter & ~('A' ^ 'a');
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	941	// For compatibility with JSC, inside a character class. We also accept
				942	// digits and underscore as control characters, unless with /u.
				943	if (letter >= 'A' && letter <= 'Z') {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	944	Advance(2);
				945	// Control letters mapped to ASCII control characters in the range
				946	// 0x00-0x1f.
				947	return controlLetter & 0x1f;
				948	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	949	if (unicode()) {
				950	// With /u, invalid escapes are not treated as identity escapes.
				951	ReportError(CStrVector("Invalid class escape"));
				952	return 0;
				953	}
				954	if ((controlLetter >= '0' && controlLetter <= '9') \|\|
				955	controlLetter == '_') {
				956	Advance(2);
				957	return controlLetter & 0x1f;
				958	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	959	// We match JSC in reading the backslash as a literal
				960	// character instead of as starting an escape.
				961	return '\\';
				962	}
				963	case '0':
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	964	// With /u, \0 is interpreted as NUL if not followed by another digit.
				965	if (unicode() && !(Next() >= '0' && Next() <= '9')) {
				966	Advance();
				967	return 0;
				968	}
				969	// Fall through.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	970	case '1':
				971	case '2':
				972	case '3':
				973	case '4':
				974	case '5':
				975	case '6':
				976	case '7':
				977	// For compatibility, we interpret a decimal escape that isn't
				978	// a back reference (and therefore either \0 or not valid according
				979	// to the specification) as a 1..3 digit octal character code.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	980	if (unicode()) {
				981	// With /u, decimal escape is not interpreted as octal character code.
				982	ReportError(CStrVector("Invalid class escape"));
				983	return 0;
				984	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	985	return ParseOctalLiteral();
				986	case 'x': {
				987	Advance();
				988	uc32 value;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	989	if (ParseHexEscape(2, &value)) return value;
				990	if (unicode()) {
				991	// With /u, invalid escapes are not treated as identity escapes.
				992	ReportError(CStrVector("Invalid escape"));
				993	return 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	994	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	995	// If \x is not followed by a two-digit hexadecimal, treat it
				996	// as an identity escape.
				997	return 'x';
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	998	}
				999	case 'u': {
				1000	Advance();
				1001	uc32 value;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1002	if (ParseUnicodeEscape(&value)) return value;
				1003	if (unicode()) {
				1004	// With /u, invalid escapes are not treated as identity escapes.
				1005	ReportError(CStrVector("Invalid unicode escape"));
				1006	return 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1007	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1008	// If \u is not followed by a two-digit hexadecimal, treat it
				1009	// as an identity escape.
				1010	return 'u';
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1011	}
				1012	default: {
				1013	uc32 result = current();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1014	// With /u, no identity escapes except for syntax characters and '-' are
				1015	// allowed. Otherwise, all identity escapes are allowed.
				1016	if (!unicode() \|\| IsSyntaxCharacterOrSlash(result) \|\| result == '-') {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1017	Advance();
				1018	return result;
				1019	}
				1020	ReportError(CStrVector("Invalid escape"));
				1021	return 0;
				1022	}
				1023	}
				1024	return 0;
				1025	}
				1026
				1027
				1028	CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
				1029	DCHECK_EQ(0, *char_class);
				1030	uc32 first = current();
				1031	if (first == '\\') {
				1032	switch (Next()) {
				1033	case 'w':
				1034	case 'W':
				1035	case 'd':
				1036	case 'D':
				1037	case 's':
				1038	case 'S': {
				1039	*char_class = Next();
				1040	Advance(2);
				1041	return CharacterRange::Singleton(0); // Return dummy value.
				1042	}
				1043	case kEndMarker:
				1044	return ReportError(CStrVector("\\ at end of pattern"));
				1045	default:
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1046	first = ParseClassCharacterEscape(CHECK_FAILED);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1047	}
				1048	} else {
				1049	Advance();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1050	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1051
				1052	return CharacterRange::Singleton(first);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1053	}
				1054
				1055
				1056	static const uc16 kNoCharClass = 0;
				1057
				1058	// Adds range or pre-defined character class to character ranges.
				1059	// If char_class is not kInvalidClass, it's interpreted as a class
				1060	// escape (i.e., 's' means whitespace, from '\s').
				1061	static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
				1062	uc16 char_class, CharacterRange range,
				1063	Zone* zone) {
				1064	if (char_class != kNoCharClass) {
				1065	CharacterRange::AddClassEscape(char_class, ranges, zone);
				1066	} else {
				1067	ranges->Add(range, zone);
				1068	}
				1069	}
				1070
				1071
				1072	RegExpTree* RegExpParser::ParseCharacterClass() {
				1073	static const char* kUnterminated = "Unterminated character class";
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1074	static const char* kRangeInvalid = "Invalid character class";
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1075	static const char* kRangeOutOfOrder = "Range out of order in character class";
				1076
				1077	DCHECK_EQ(current(), '[');
				1078	Advance();
				1079	bool is_negated = false;
				1080	if (current() == '^') {
				1081	is_negated = true;
				1082	Advance();
				1083	}
				1084	ZoneList<CharacterRange>* ranges =
				1085	new (zone()) ZoneList<CharacterRange>(2, zone());
				1086	while (has_more() && current() != ']') {
				1087	uc16 char_class = kNoCharClass;
				1088	CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
				1089	if (current() == '-') {
				1090	Advance();
				1091	if (current() == kEndMarker) {
				1092	// If we reach the end we break out of the loop and let the
				1093	// following code report an error.
				1094	break;
				1095	} else if (current() == ']') {
				1096	AddRangeOrEscape(ranges, char_class, first, zone());
				1097	ranges->Add(CharacterRange::Singleton('-'), zone());
				1098	break;
				1099	}
				1100	uc16 char_class_2 = kNoCharClass;
				1101	CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
				1102	if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {
				1103	// Either end is an escaped character class. Treat the '-' verbatim.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1104	if (unicode()) {
				1105	// ES2015 21.2.2.15.1 step 1.
				1106	return ReportError(CStrVector(kRangeInvalid));
				1107	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1108	AddRangeOrEscape(ranges, char_class, first, zone());
				1109	ranges->Add(CharacterRange::Singleton('-'), zone());
				1110	AddRangeOrEscape(ranges, char_class_2, next, zone());
				1111	continue;
				1112	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1113	// ES2015 21.2.2.15.1 step 6.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1114	if (first.from() > next.to()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1115	return ReportError(CStrVector(kRangeOutOfOrder));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1116	}
				1117	ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
				1118	} else {
				1119	AddRangeOrEscape(ranges, char_class, first, zone());
				1120	}
				1121	}
				1122	if (!has_more()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1123	return ReportError(CStrVector(kUnterminated));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1124	}
				1125	Advance();
				1126	if (ranges->length() == 0) {
				1127	ranges->Add(CharacterRange::Everything(), zone());
				1128	is_negated = !is_negated;
				1129	}
				1130	return new (zone()) RegExpCharacterClass(ranges, is_negated);
				1131	}
				1132
				1133
				1134	#undef CHECK_FAILED
				1135
				1136
				1137	bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1138	FlatStringReader* input, JSRegExp::Flags flags,
				1139	RegExpCompileData* result) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1140	DCHECK(result != NULL);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1141	RegExpParser parser(input, &result->error, flags, isolate, zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1142	RegExpTree* tree = parser.ParsePattern();
				1143	if (parser.failed()) {
				1144	DCHECK(tree == NULL);
				1145	DCHECK(!result->error.is_null());
				1146	} else {
				1147	DCHECK(tree != NULL);
				1148	DCHECK(result->error.is_null());
				1149	if (FLAG_trace_regexp_parser) {
				1150	OFStream os(stdout);
				1151	tree->Print(os, zone);
				1152	os << "\n";
				1153	}
				1154	result->tree = tree;
				1155	int capture_count = parser.captures_started();
				1156	result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
				1157	result->contains_anchor = parser.contains_anchor();
				1158	result->capture_count = capture_count;
				1159	}
				1160	return !parser.failed();
				1161	}
				1162
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1163	RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1164	: zone_(zone),
				1165	pending_empty_(false),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1166	ignore_case_(ignore_case),
				1167	unicode_(unicode),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1168	characters_(NULL),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1169	pending_surrogate_(kNoPendingSurrogate),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1170	terms_(),
				1171	alternatives_()
				1172	#ifdef DEBUG
				1173	,
				1174	last_added_(ADD_NONE)
				1175	#endif
				1176	{
				1177	}
				1178
				1179
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1180	void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
				1181	DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
				1182	FlushPendingSurrogate();
				1183	// Hold onto the lead surrogate, waiting for a trail surrogate to follow.
				1184	pending_surrogate_ = lead_surrogate;
				1185	}
				1186
				1187
				1188	void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
				1189	DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
				1190	if (pending_surrogate_ != kNoPendingSurrogate) {
				1191	uc16 lead_surrogate = pending_surrogate_;
				1192	pending_surrogate_ = kNoPendingSurrogate;
				1193	DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
				1194	uc32 combined =
				1195	unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
				1196	if (NeedsDesugaringForIgnoreCase(combined)) {
				1197	AddCharacterClassForDesugaring(combined);
				1198	} else {
				1199	ZoneList<uc16> surrogate_pair(2, zone());
				1200	surrogate_pair.Add(lead_surrogate, zone());
				1201	surrogate_pair.Add(trail_surrogate, zone());
				1202	RegExpAtom* atom =
				1203	new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
				1204	AddAtom(atom);
				1205	}
				1206	} else {
				1207	pending_surrogate_ = trail_surrogate;
				1208	FlushPendingSurrogate();
				1209	}
				1210	}
				1211
				1212
				1213	void RegExpBuilder::FlushPendingSurrogate() {
				1214	if (pending_surrogate_ != kNoPendingSurrogate) {
				1215	DCHECK(unicode());
				1216	uc32 c = pending_surrogate_;
				1217	pending_surrogate_ = kNoPendingSurrogate;
				1218	AddCharacterClassForDesugaring(c);
				1219	}
				1220	}
				1221
				1222
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1223	void RegExpBuilder::FlushCharacters() {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1224	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1225	pending_empty_ = false;
				1226	if (characters_ != NULL) {
				1227	RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
				1228	characters_ = NULL;
				1229	text_.Add(atom, zone());
				1230	LAST(ADD_ATOM);
				1231	}
				1232	}
				1233
				1234
				1235	void RegExpBuilder::FlushText() {
				1236	FlushCharacters();
				1237	int num_text = text_.length();
				1238	if (num_text == 0) {
				1239	return;
				1240	} else if (num_text == 1) {
				1241	terms_.Add(text_.last(), zone());
				1242	} else {
				1243	RegExpText* text = new (zone()) RegExpText(zone());
				1244	for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
				1245	terms_.Add(text, zone());
				1246	}
				1247	text_.Clear();
				1248	}
				1249
				1250
				1251	void RegExpBuilder::AddCharacter(uc16 c) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1252	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1253	pending_empty_ = false;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1254	if (NeedsDesugaringForIgnoreCase(c)) {
				1255	AddCharacterClassForDesugaring(c);
				1256	} else {
				1257	if (characters_ == NULL) {
				1258	characters_ = new (zone()) ZoneList<uc16>(4, zone());
				1259	}
				1260	characters_->Add(c, zone());
				1261	LAST(ADD_CHAR);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1262	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1263	}
				1264
				1265
				1266	void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
				1267	if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1268	DCHECK(unicode());
				1269	AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
				1270	AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
				1271	} else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
				1272	AddLeadSurrogate(c);
				1273	} else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
				1274	AddTrailSurrogate(c);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1275	} else {
				1276	AddCharacter(static_cast<uc16>(c));
				1277	}
				1278	}
				1279
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1280	void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
				1281	// A lead or trail surrogate parsed via escape sequence will not
				1282	// pair up with any preceding lead or following trail surrogate.
				1283	FlushPendingSurrogate();
				1284	AddUnicodeCharacter(character);
				1285	FlushPendingSurrogate();
				1286	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1287
				1288	void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
				1289
				1290
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1291	void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
				1292	if (NeedsDesugaringForUnicode(cc)) {
				1293	// With /u, character class needs to be desugared, so it
				1294	// must be a standalone term instead of being part of a RegExpText.
				1295	AddTerm(cc);
				1296	} else {
				1297	AddAtom(cc);
				1298	}
				1299	}
				1300
				1301	void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
				1302	AddTerm(new (zone()) RegExpCharacterClass(
				1303	CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
				1304	}
				1305
				1306
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1307	void RegExpBuilder::AddAtom(RegExpTree* term) {
				1308	if (term->IsEmpty()) {
				1309	AddEmpty();
				1310	return;
				1311	}
				1312	if (term->IsTextElement()) {
				1313	FlushCharacters();
				1314	text_.Add(term, zone());
				1315	} else {
				1316	FlushText();
				1317	terms_.Add(term, zone());
				1318	}
				1319	LAST(ADD_ATOM);
				1320	}
				1321
				1322
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1323	void RegExpBuilder::AddTerm(RegExpTree* term) {
				1324	FlushText();
				1325	terms_.Add(term, zone());
				1326	LAST(ADD_ATOM);
				1327	}
				1328
				1329
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1330	void RegExpBuilder::AddAssertion(RegExpTree* assert) {
				1331	FlushText();
				1332	terms_.Add(assert, zone());
				1333	LAST(ADD_ASSERT);
				1334	}
				1335
				1336
				1337	void RegExpBuilder::NewAlternative() { FlushTerms(); }
				1338
				1339
				1340	void RegExpBuilder::FlushTerms() {
				1341	FlushText();
				1342	int num_terms = terms_.length();
				1343	RegExpTree* alternative;
				1344	if (num_terms == 0) {
				1345	alternative = new (zone()) RegExpEmpty();
				1346	} else if (num_terms == 1) {
				1347	alternative = terms_.last();
				1348	} else {
				1349	alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
				1350	}
				1351	alternatives_.Add(alternative, zone());
				1352	terms_.Clear();
				1353	LAST(ADD_NONE);
				1354	}
				1355
				1356
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1357	bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
				1358	if (!unicode()) return false;
				1359	switch (cc->standard_type()) {
				1360	case 's': // white space
				1361	case 'w': // ASCII word character
				1362	case 'd': // ASCII digit
				1363	return false; // These characters do not need desugaring.
				1364	default:
				1365	break;
				1366	}
				1367	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
				1368	CharacterRange::Canonicalize(ranges);
				1369	for (int i = ranges->length() - 1; i >= 0; i--) {
				1370	uc32 from = ranges->at(i).from();
				1371	uc32 to = ranges->at(i).to();
				1372	// Check for non-BMP characters.
				1373	if (to >= kNonBmpStart) return true;
				1374	// Check for lone surrogates.
				1375	if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
				1376	}
				1377	return false;
				1378	}
				1379
				1380
				1381	bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
				1382	#ifdef V8_I18N_SUPPORT
				1383	if (unicode() && ignore_case()) {
				1384	USet* set = uset_open(c, c);
				1385	uset_closeOver(set, USET_CASE_INSENSITIVE);
				1386	uset_removeAllStrings(set);
				1387	bool result = uset_size(set) > 1;
				1388	uset_close(set);
				1389	return result;
				1390	}
				1391	// In the case where ICU is not included, we act as if the unicode flag is
				1392	// not set, and do not desugar.
				1393	#endif // V8_I18N_SUPPORT
				1394	return false;
				1395	}
				1396
				1397
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1398	RegExpTree* RegExpBuilder::ToRegExp() {
				1399	FlushTerms();
				1400	int num_alternatives = alternatives_.length();
				1401	if (num_alternatives == 0) return new (zone()) RegExpEmpty();
				1402	if (num_alternatives == 1) return alternatives_.last();
				1403	return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
				1404	}
				1405
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1406	bool RegExpBuilder::AddQuantifierToAtom(
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1407	int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1408	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1409	if (pending_empty_) {
				1410	pending_empty_ = false;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1411	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1412	}
				1413	RegExpTree* atom;
				1414	if (characters_ != NULL) {
				1415	DCHECK(last_added_ == ADD_CHAR);
				1416	// Last atom was character.
				1417	Vector<const uc16> char_vector = characters_->ToConstVector();
				1418	int num_chars = char_vector.length();
				1419	if (num_chars > 1) {
				1420	Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
				1421	text_.Add(new (zone()) RegExpAtom(prefix), zone());
				1422	char_vector = char_vector.SubVector(num_chars - 1, num_chars);
				1423	}
				1424	characters_ = NULL;
				1425	atom = new (zone()) RegExpAtom(char_vector);
				1426	FlushText();
				1427	} else if (text_.length() > 0) {
				1428	DCHECK(last_added_ == ADD_ATOM);
				1429	atom = text_.RemoveLast();
				1430	FlushText();
				1431	} else if (terms_.length() > 0) {
				1432	DCHECK(last_added_ == ADD_ATOM);
				1433	atom = terms_.RemoveLast();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1434	// With /u, lookarounds are not quantifiable.
				1435	if (unicode() && atom->IsLookaround()) return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1436	if (atom->max_match() == 0) {
				1437	// Guaranteed to only match an empty string.
				1438	LAST(ADD_TERM);
				1439	if (min == 0) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1440	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1441	}
				1442	terms_.Add(atom, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1443	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1444	}
				1445	} else {
				1446	// Only call immediately after adding an atom or character!
				1447	UNREACHABLE();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1448	return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1449	}
				1450	terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
				1451	zone());
				1452	LAST(ADD_TERM);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1453	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1454	}
				1455
				1456	} // namespace internal
				1457	} // namespace v8