Blame - src/regexp/regexp-parser.cc - fp2-dev/platform/external/v8

blob: d433fc8578a81d7a29c08dde3d405a815851d517 [file] [log] [blame]

Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2016 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/regexp-parser.h"
				6
				7	#include "src/char-predicates-inl.h"
				8	#include "src/factory.h"
				9	#include "src/isolate.h"
				10	#include "src/objects-inl.h"
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	11	#include "src/ostreams.h"
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	12	#include "src/regexp/jsregexp.h"
				13	#include "src/utils.h"
				14
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	15	#ifdef V8_I18N_SUPPORT
				16	#include "unicode/uset.h"
				17	#endif // V8_I18N_SUPPORT
				18
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	19	namespace v8 {
				20	namespace internal {
				21
				22	RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	23	JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	24	: isolate_(isolate),
				25	zone_(zone),
				26	error_(error),
				27	captures_(NULL),
				28	in_(in),
				29	current_(kEndMarker),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	30	ignore_case_(flags & JSRegExp::kIgnoreCase),
				31	multiline_(flags & JSRegExp::kMultiline),
				32	unicode_(flags & JSRegExp::kUnicode),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	33	next_pos_(0),
				34	captures_started_(0),
				35	capture_count_(0),
				36	has_more_(true),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	37	simple_(false),
				38	contains_anchor_(false),
				39	is_scanned_for_captures_(false),
				40	failed_(false) {
				41	Advance();
				42	}
				43
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	44	template <bool update_position>
				45	inline uc32 RegExpParser::ReadNext() {
				46	int position = next_pos_;
				47	uc32 c0 = in()->Get(position);
				48	position++;
				49	// Read the whole surrogate pair in case of unicode flag, if possible.
				50	if (unicode() && position < in()->length() &&
				51	unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
				52	uc16 c1 = in()->Get(position);
				53	if (unibrow::Utf16::IsTrailSurrogate(c1)) {
				54	c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
				55	position++;
				56	}
				57	}
				58	if (update_position) next_pos_ = position;
				59	return c0;
				60	}
				61
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	62
				63	uc32 RegExpParser::Next() {
				64	if (has_next()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	65	return ReadNext<false>();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	66	} else {
				67	return kEndMarker;
				68	}
				69	}
				70
				71
				72	void RegExpParser::Advance() {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	73	if (has_next()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	74	StackLimitCheck check(isolate());
				75	if (check.HasOverflowed()) {
				76	ReportError(CStrVector(Isolate::kStackOverflowMessage));
				77	} else if (zone()->excess_allocation()) {
				78	ReportError(CStrVector("Regular expression too large"));
				79	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	80	current_ = ReadNext<true>();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	81	}
				82	} else {
				83	current_ = kEndMarker;
				84	// Advance so that position() points to 1-after-the-last-character. This is
				85	// important so that Reset() to this position works correctly.
				86	next_pos_ = in()->length() + 1;
				87	has_more_ = false;
				88	}
				89	}
				90
				91
				92	void RegExpParser::Reset(int pos) {
				93	next_pos_ = pos;
				94	has_more_ = (pos < in()->length());
				95	Advance();
				96	}
				97
				98
				99	void RegExpParser::Advance(int dist) {
				100	next_pos_ += dist - 1;
				101	Advance();
				102	}
				103
				104
				105	bool RegExpParser::simple() { return simple_; }
				106
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	107	bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
				108	switch (c) {
				109	case '^':
				110	case '$':
				111	case '\\':
				112	case '.':
				113	case '*':
				114	case '+':
				115	case '?':
				116	case '(':
				117	case ')':
				118	case '[':
				119	case ']':
				120	case '{':
				121	case '}':
				122	case '\|':
				123	case '/':
				124	return true;
				125	default:
				126	break;
				127	}
				128	return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	129	}
				130
				131
				132	RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
				133	failed_ = true;
				134	*error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
				135	// Zip to the end to make sure the no more input is read.
				136	current_ = kEndMarker;
				137	next_pos_ = in()->length();
				138	return NULL;
				139	}
				140
				141
				142	#define CHECK_FAILED /**/); \
				143	if (failed_) return NULL; \
				144	((void)0
				145
				146
				147	// Pattern ::
				148	// Disjunction
				149	RegExpTree* RegExpParser::ParsePattern() {
				150	RegExpTree* result = ParseDisjunction(CHECK_FAILED);
				151	DCHECK(!has_more());
				152	// If the result of parsing is a literal string atom, and it has the
				153	// same length as the input, then the atom is identical to the input.
				154	if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
				155	simple_ = true;
				156	}
				157	return result;
				158	}
				159
				160
				161	// Disjunction ::
				162	// Alternative
				163	// Alternative \| Disjunction
				164	// Alternative ::
				165	// [empty]
				166	// Term Alternative
				167	// Term ::
				168	// Assertion
				169	// Atom
				170	// Atom Quantifier
				171	RegExpTree* RegExpParser::ParseDisjunction() {
				172	// Used to store current state while parsing subexpressions.
				173	RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	174	ignore_case(), unicode(), zone());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	175	RegExpParserState* state = &initial_state;
				176	// Cache the builder in a local variable for quick access.
				177	RegExpBuilder* builder = initial_state.builder();
				178	while (true) {
				179	switch (current()) {
				180	case kEndMarker:
				181	if (state->IsSubexpression()) {
				182	// Inside a parenthesized group when hitting end of input.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	183	return ReportError(CStrVector("Unterminated group"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	184	}
				185	DCHECK_EQ(INITIAL, state->group_type());
				186	// Parsing completed successfully.
				187	return builder->ToRegExp();
				188	case ')': {
				189	if (!state->IsSubexpression()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	190	return ReportError(CStrVector("Unmatched ')'"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	191	}
				192	DCHECK_NE(INITIAL, state->group_type());
				193
				194	Advance();
				195	// End disjunction parsing and convert builder content to new single
				196	// regexp atom.
				197	RegExpTree* body = builder->ToRegExp();
				198
				199	int end_capture_index = captures_started();
				200
				201	int capture_index = state->capture_index();
				202	SubexpressionType group_type = state->group_type();
				203
				204	// Build result of subexpression.
				205	if (group_type == CAPTURE) {
				206	RegExpCapture* capture = GetCapture(capture_index);
				207	capture->set_body(body);
				208	body = capture;
				209	} else if (group_type != GROUPING) {
				210	DCHECK(group_type == POSITIVE_LOOKAROUND \|\|
				211	group_type == NEGATIVE_LOOKAROUND);
				212	bool is_positive = (group_type == POSITIVE_LOOKAROUND);
				213	body = new (zone()) RegExpLookaround(
				214	body, is_positive, end_capture_index - capture_index,
				215	capture_index, state->lookaround_type());
				216	}
				217
				218	// Restore previous state.
				219	state = state->previous_state();
				220	builder = state->builder();
				221
				222	builder->AddAtom(body);
				223	// For compatability with JSC and ES3, we allow quantifiers after
				224	// lookaheads, and break in all cases.
				225	break;
				226	}
				227	case '\|': {
				228	Advance();
				229	builder->NewAlternative();
				230	continue;
				231	}
				232	case '*':
				233	case '+':
				234	case '?':
				235	return ReportError(CStrVector("Nothing to repeat"));
				236	case '^': {
				237	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	238	if (multiline()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	239	builder->AddAssertion(
				240	new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
				241	} else {
				242	builder->AddAssertion(
				243	new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
				244	set_contains_anchor();
				245	}
				246	continue;
				247	}
				248	case '$': {
				249	Advance();
				250	RegExpAssertion::AssertionType assertion_type =
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	251	multiline() ? RegExpAssertion::END_OF_LINE
				252	: RegExpAssertion::END_OF_INPUT;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	253	builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
				254	continue;
				255	}
				256	case '.': {
				257	Advance();
				258	// everything except \x0a, \x0d, \u2028 and \u2029
				259	ZoneList<CharacterRange>* ranges =
				260	new (zone()) ZoneList<CharacterRange>(2, zone());
				261	CharacterRange::AddClassEscape('.', ranges, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	262	RegExpCharacterClass* cc =
				263	new (zone()) RegExpCharacterClass(ranges, false);
				264	builder->AddCharacterClass(cc);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	265	break;
				266	}
				267	case '(': {
				268	SubexpressionType subexpr_type = CAPTURE;
				269	RegExpLookaround::Type lookaround_type = state->lookaround_type();
				270	Advance();
				271	if (current() == '?') {
				272	switch (Next()) {
				273	case ':':
				274	subexpr_type = GROUPING;
				275	break;
				276	case '=':
				277	lookaround_type = RegExpLookaround::LOOKAHEAD;
				278	subexpr_type = POSITIVE_LOOKAROUND;
				279	break;
				280	case '!':
				281	lookaround_type = RegExpLookaround::LOOKAHEAD;
				282	subexpr_type = NEGATIVE_LOOKAROUND;
				283	break;
				284	case '<':
				285	if (FLAG_harmony_regexp_lookbehind) {
				286	Advance();
				287	lookaround_type = RegExpLookaround::LOOKBEHIND;
				288	if (Next() == '=') {
				289	subexpr_type = POSITIVE_LOOKAROUND;
				290	break;
				291	} else if (Next() == '!') {
				292	subexpr_type = NEGATIVE_LOOKAROUND;
				293	break;
				294	}
				295	}
				296	// Fall through.
				297	default:
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	298	return ReportError(CStrVector("Invalid group"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	299	}
				300	Advance(2);
				301	} else {
				302	if (captures_started_ >= kMaxCaptures) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	303	return ReportError(CStrVector("Too many captures"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	304	}
				305	captures_started_++;
				306	}
				307	// Store current state and begin new disjunction parsing.
				308	state = new (zone()) RegExpParserState(
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	309	state, subexpr_type, lookaround_type, captures_started_,
				310	ignore_case(), unicode(), zone());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	311	builder = state->builder();
				312	continue;
				313	}
				314	case '[': {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	315	RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
				316	builder->AddCharacterClass(cc->AsCharacterClass());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	317	break;
				318	}
				319	// Atom ::
				320	// \ AtomEscape
				321	case '\\':
				322	switch (Next()) {
				323	case kEndMarker:
				324	return ReportError(CStrVector("\\ at end of pattern"));
				325	case 'b':
				326	Advance(2);
				327	builder->AddAssertion(
				328	new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
				329	continue;
				330	case 'B':
				331	Advance(2);
				332	builder->AddAssertion(
				333	new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
				334	continue;
				335	// AtomEscape ::
				336	// CharacterClassEscape
				337	//
				338	// CharacterClassEscape :: one of
				339	// d D s S w W
				340	case 'd':
				341	case 'D':
				342	case 's':
				343	case 'S':
				344	case 'w':
				345	case 'W': {
				346	uc32 c = Next();
				347	Advance(2);
				348	ZoneList<CharacterRange>* ranges =
				349	new (zone()) ZoneList<CharacterRange>(2, zone());
				350	CharacterRange::AddClassEscape(c, ranges, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	351	RegExpCharacterClass* cc =
				352	new (zone()) RegExpCharacterClass(ranges, false);
				353	builder->AddCharacterClass(cc);
				354	break;
				355	}
				356	case 'p':
				357	case 'P': {
				358	uc32 p = Next();
				359	Advance(2);
				360	if (unicode()) {
				361	if (FLAG_harmony_regexp_property) {
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	362	ZoneList<CharacterRange>* ranges =
				363	new (zone()) ZoneList<CharacterRange>(2, zone());
				364	if (!ParsePropertyClass(ranges)) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	365	return ReportError(CStrVector("Invalid property name"));
				366	}
				367	RegExpCharacterClass* cc =
				368	new (zone()) RegExpCharacterClass(ranges, p == 'P');
				369	builder->AddCharacterClass(cc);
				370	} else {
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	371	// With /u, no identity escapes except for syntax characters
				372	// are allowed. Otherwise, all identity escapes are allowed.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	373	return ReportError(CStrVector("Invalid escape"));
				374	}
				375	} else {
				376	builder->AddCharacter(p);
				377	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	378	break;
				379	}
				380	case '1':
				381	case '2':
				382	case '3':
				383	case '4':
				384	case '5':
				385	case '6':
				386	case '7':
				387	case '8':
				388	case '9': {
				389	int index = 0;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	390	bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
				391	if (is_backref) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	392	if (state->IsInsideCaptureGroup(index)) {
				393	// The back reference is inside the capture group it refers to.
				394	// Nothing can possibly have been captured yet, so we use empty
				395	// instead. This ensures that, when checking a back reference,
				396	// the capture registers of the referenced capture are either
				397	// both set or both cleared.
				398	builder->AddEmpty();
				399	} else {
				400	RegExpCapture* capture = GetCapture(index);
				401	RegExpTree* atom = new (zone()) RegExpBackReference(capture);
				402	builder->AddAtom(atom);
				403	}
				404	break;
				405	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	406	// With /u, no identity escapes except for syntax characters
				407	// are allowed. Otherwise, all identity escapes are allowed.
				408	if (unicode()) {
				409	return ReportError(CStrVector("Invalid escape"));
				410	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	411	uc32 first_digit = Next();
				412	if (first_digit == '8' \|\| first_digit == '9') {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	413	builder->AddCharacter(first_digit);
				414	Advance(2);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	415	break;
				416	}
				417	}
				418	// FALLTHROUGH
				419	case '0': {
				420	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	421	if (unicode() && Next() >= '0' && Next() <= '9') {
				422	// With /u, decimal escape with leading 0 are not parsed as octal.
				423	return ReportError(CStrVector("Invalid decimal escape"));
				424	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	425	uc32 octal = ParseOctalLiteral();
				426	builder->AddCharacter(octal);
				427	break;
				428	}
				429	// ControlEscape :: one of
				430	// f n r t v
				431	case 'f':
				432	Advance(2);
				433	builder->AddCharacter('\f');
				434	break;
				435	case 'n':
				436	Advance(2);
				437	builder->AddCharacter('\n');
				438	break;
				439	case 'r':
				440	Advance(2);
				441	builder->AddCharacter('\r');
				442	break;
				443	case 't':
				444	Advance(2);
				445	builder->AddCharacter('\t');
				446	break;
				447	case 'v':
				448	Advance(2);
				449	builder->AddCharacter('\v');
				450	break;
				451	case 'c': {
				452	Advance();
				453	uc32 controlLetter = Next();
				454	// Special case if it is an ASCII letter.
				455	// Convert lower case letters to uppercase.
				456	uc32 letter = controlLetter & ~('a' ^ 'A');
				457	if (letter < 'A' \|\| 'Z' < letter) {
				458	// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
				459	// This is outside the specification. We match JSC in
				460	// reading the backslash as a literal character instead
				461	// of as starting an escape.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	462	if (unicode()) {
				463	// With /u, invalid escapes are not treated as identity escapes.
				464	return ReportError(CStrVector("Invalid unicode escape"));
				465	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	466	builder->AddCharacter('\\');
				467	} else {
				468	Advance(2);
				469	builder->AddCharacter(controlLetter & 0x1f);
				470	}
				471	break;
				472	}
				473	case 'x': {
				474	Advance(2);
				475	uc32 value;
				476	if (ParseHexEscape(2, &value)) {
				477	builder->AddCharacter(value);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	478	} else if (!unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	479	builder->AddCharacter('x');
				480	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	481	// With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	482	return ReportError(CStrVector("Invalid escape"));
				483	}
				484	break;
				485	}
				486	case 'u': {
				487	Advance(2);
				488	uc32 value;
				489	if (ParseUnicodeEscape(&value)) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	490	builder->AddEscapedUnicodeCharacter(value);
				491	} else if (!unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	492	builder->AddCharacter('u');
				493	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	494	// With /u, invalid escapes are not treated as identity escapes.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	495	return ReportError(CStrVector("Invalid unicode escape"));
				496	}
				497	break;
				498	}
				499	default:
				500	Advance();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	501	// With /u, no identity escapes except for syntax characters
				502	// are allowed. Otherwise, all identity escapes are allowed.
				503	if (!unicode() \|\| IsSyntaxCharacterOrSlash(current())) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	504	builder->AddCharacter(current());
				505	Advance();
				506	} else {
				507	return ReportError(CStrVector("Invalid escape"));
				508	}
				509	break;
				510	}
				511	break;
				512	case '{': {
				513	int dummy;
				514	if (ParseIntervalQuantifier(&dummy, &dummy)) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	515	return ReportError(CStrVector("Nothing to repeat"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	516	}
				517	// fallthrough
				518	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	519	case '}':
				520	case ']':
				521	if (unicode()) {
				522	return ReportError(CStrVector("Lone quantifier brackets"));
				523	}
				524	// fallthrough
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	525	default:
				526	builder->AddUnicodeCharacter(current());
				527	Advance();
				528	break;
				529	} // end switch(current())
				530
				531	int min;
				532	int max;
				533	switch (current()) {
				534	// QuantifierPrefix ::
				535	// *
				536	// +
				537	// ?
				538	// {
				539	case '*':
				540	min = 0;
				541	max = RegExpTree::kInfinity;
				542	Advance();
				543	break;
				544	case '+':
				545	min = 1;
				546	max = RegExpTree::kInfinity;
				547	Advance();
				548	break;
				549	case '?':
				550	min = 0;
				551	max = 1;
				552	Advance();
				553	break;
				554	case '{':
				555	if (ParseIntervalQuantifier(&min, &max)) {
				556	if (max < min) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	557	return ReportError(
				558	CStrVector("numbers out of order in {} quantifier"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	559	}
				560	break;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	561	} else if (unicode()) {
				562	// With /u, incomplete quantifiers are not allowed.
				563	return ReportError(CStrVector("Incomplete quantifier"));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	564	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	565	continue;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	566	default:
				567	continue;
				568	}
				569	RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
				570	if (current() == '?') {
				571	quantifier_type = RegExpQuantifier::NON_GREEDY;
				572	Advance();
				573	} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
				574	// FLAG_regexp_possessive_quantifier is a debug-only flag.
				575	quantifier_type = RegExpQuantifier::POSSESSIVE;
				576	Advance();
				577	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	578	if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
				579	return ReportError(CStrVector("Invalid quantifier"));
				580	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	581	}
				582	}
				583
				584
				585	#ifdef DEBUG
				586	// Currently only used in an DCHECK.
				587	static bool IsSpecialClassEscape(uc32 c) {
				588	switch (c) {
				589	case 'd':
				590	case 'D':
				591	case 's':
				592	case 'S':
				593	case 'w':
				594	case 'W':
				595	return true;
				596	default:
				597	return false;
				598	}
				599	}
				600	#endif
				601
				602
				603	// In order to know whether an escape is a backreference or not we have to scan
				604	// the entire regexp and find the number of capturing parentheses. However we
				605	// don't want to scan the regexp twice unless it is necessary. This mini-parser
				606	// is called when needed. It can see the difference between capturing and
				607	// noncapturing parentheses and can skip character classes and backslash-escaped
				608	// characters.
				609	void RegExpParser::ScanForCaptures() {
				610	// Start with captures started previous to current position
				611	int capture_count = captures_started();
				612	// Add count of captures after this position.
				613	int n;
				614	while ((n = current()) != kEndMarker) {
				615	Advance();
				616	switch (n) {
				617	case '\\':
				618	Advance();
				619	break;
				620	case '[': {
				621	int c;
				622	while ((c = current()) != kEndMarker) {
				623	Advance();
				624	if (c == '\\') {
				625	Advance();
				626	} else {
				627	if (c == ']') break;
				628	}
				629	}
				630	break;
				631	}
				632	case '(':
				633	if (current() != '?') capture_count++;
				634	break;
				635	}
				636	}
				637	capture_count_ = capture_count;
				638	is_scanned_for_captures_ = true;
				639	}
				640
				641
				642	bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
				643	DCHECK_EQ('\\', current());
				644	DCHECK('1' <= Next() && Next() <= '9');
				645	// Try to parse a decimal literal that is no greater than the total number
				646	// of left capturing parentheses in the input.
				647	int start = position();
				648	int value = Next() - '0';
				649	Advance(2);
				650	while (true) {
				651	uc32 c = current();
				652	if (IsDecimalDigit(c)) {
				653	value = 10 * value + (c - '0');
				654	if (value > kMaxCaptures) {
				655	Reset(start);
				656	return false;
				657	}
				658	Advance();
				659	} else {
				660	break;
				661	}
				662	}
				663	if (value > captures_started()) {
				664	if (!is_scanned_for_captures_) {
				665	int saved_position = position();
				666	ScanForCaptures();
				667	Reset(saved_position);
				668	}
				669	if (value > capture_count_) {
				670	Reset(start);
				671	return false;
				672	}
				673	}
				674	*index_out = value;
				675	return true;
				676	}
				677
				678
				679	RegExpCapture* RegExpParser::GetCapture(int index) {
				680	// The index for the capture groups are one-based. Its index in the list is
				681	// zero-based.
				682	int know_captures =
				683	is_scanned_for_captures_ ? capture_count_ : captures_started_;
				684	DCHECK(index <= know_captures);
				685	if (captures_ == NULL) {
				686	captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
				687	}
				688	while (captures_->length() < know_captures) {
				689	captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
				690	}
				691	return captures_->at(index - 1);
				692	}
				693
				694
				695	bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
				696	for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
				697	if (s->group_type() != CAPTURE) continue;
				698	// Return true if we found the matching capture index.
				699	if (index == s->capture_index()) return true;
				700	// Abort if index is larger than what has been parsed up till this state.
				701	if (index > s->capture_index()) return false;
				702	}
				703	return false;
				704	}
				705
				706
				707	// QuantifierPrefix ::
				708	// { DecimalDigits }
				709	// { DecimalDigits , }
				710	// { DecimalDigits , DecimalDigits }
				711	//
				712	// Returns true if parsing succeeds, and set the min_out and max_out
				713	// values. Values are truncated to RegExpTree::kInfinity if they overflow.
				714	bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
				715	DCHECK_EQ(current(), '{');
				716	int start = position();
				717	Advance();
				718	int min = 0;
				719	if (!IsDecimalDigit(current())) {
				720	Reset(start);
				721	return false;
				722	}
				723	while (IsDecimalDigit(current())) {
				724	int next = current() - '0';
				725	if (min > (RegExpTree::kInfinity - next) / 10) {
				726	// Overflow. Skip past remaining decimal digits and return -1.
				727	do {
				728	Advance();
				729	} while (IsDecimalDigit(current()));
				730	min = RegExpTree::kInfinity;
				731	break;
				732	}
				733	min = 10 * min + next;
				734	Advance();
				735	}
				736	int max = 0;
				737	if (current() == '}') {
				738	max = min;
				739	Advance();
				740	} else if (current() == ',') {
				741	Advance();
				742	if (current() == '}') {
				743	max = RegExpTree::kInfinity;
				744	Advance();
				745	} else {
				746	while (IsDecimalDigit(current())) {
				747	int next = current() - '0';
				748	if (max > (RegExpTree::kInfinity - next) / 10) {
				749	do {
				750	Advance();
				751	} while (IsDecimalDigit(current()));
				752	max = RegExpTree::kInfinity;
				753	break;
				754	}
				755	max = 10 * max + next;
				756	Advance();
				757	}
				758	if (current() != '}') {
				759	Reset(start);
				760	return false;
				761	}
				762	Advance();
				763	}
				764	} else {
				765	Reset(start);
				766	return false;
				767	}
				768	*min_out = min;
				769	*max_out = max;
				770	return true;
				771	}
				772
				773
				774	uc32 RegExpParser::ParseOctalLiteral() {
				775	DCHECK(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);
				776	// For compatibility with some other browsers (not all), we parse
				777	// up to three octal digits with a value below 256.
				778	uc32 value = current() - '0';
				779	Advance();
				780	if ('0' <= current() && current() <= '7') {
				781	value = value * 8 + current() - '0';
				782	Advance();
				783	if (value < 32 && '0' <= current() && current() <= '7') {
				784	value = value * 8 + current() - '0';
				785	Advance();
				786	}
				787	}
				788	return value;
				789	}
				790
				791
				792	bool RegExpParser::ParseHexEscape(int length, uc32* value) {
				793	int start = position();
				794	uc32 val = 0;
				795	for (int i = 0; i < length; ++i) {
				796	uc32 c = current();
				797	int d = HexValue(c);
				798	if (d < 0) {
				799	Reset(start);
				800	return false;
				801	}
				802	val = val * 16 + d;
				803	Advance();
				804	}
				805	*value = val;
				806	return true;
				807	}
				808
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	809	// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	810	bool RegExpParser::ParseUnicodeEscape(uc32* value) {
				811	// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
				812	// allowed). In the latter case, the number of hex digits between { } is
				813	// arbitrary. \ and u have already been read.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	814	if (current() == '{' && unicode()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	815	int start = position();
				816	Advance();
				817	if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
				818	if (current() == '}') {
				819	Advance();
				820	return true;
				821	}
				822	}
				823	Reset(start);
				824	return false;
				825	}
				826	// \u but no {, or \u{...} escapes not allowed.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	827	bool result = ParseHexEscape(4, value);
				828	if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
				829	current() == '\\') {
				830	// Attempt to read trail surrogate.
				831	int start = position();
				832	if (Next() == 'u') {
				833	Advance(2);
				834	uc32 trail;
				835	if (ParseHexEscape(4, &trail) &&
				836	unibrow::Utf16::IsTrailSurrogate(trail)) {
				837	value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(value),
				838	static_cast<uc16>(trail));
				839	return true;
				840	}
				841	}
				842	Reset(start);
				843	}
				844	return result;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	845	}
				846
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	847	#ifdef V8_I18N_SUPPORT
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	848	bool IsExactPropertyValueAlias(const char* property_name, UProperty property,
				849	int32_t property_value) {
				850	const char* short_name =
				851	u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
				852	if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
				853	for (int i = 0;; i++) {
				854	const char* long_name = u_getPropertyValueName(
				855	property, property_value,
				856	static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
				857	if (long_name == NULL) break;
				858	if (strcmp(property_name, long_name) == 0) return true;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	859	}
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	860	return false;
				861	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	862
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	863	bool LookupPropertyClass(UProperty property, const char* property_name,
				864	ZoneList<CharacterRange>* result, Zone* zone) {
				865	int32_t property_value = u_getPropertyValueEnum(property, property_name);
				866	if (property_value == UCHAR_INVALID_CODE) return false;
				867
				868	// We require the property name to match exactly to one of the property value
				869	// aliases. However, u_getPropertyValueEnum uses loose matching.
				870	if (!IsExactPropertyValueAlias(property_name, property, property_value)) {
				871	return false;
				872	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	873
				874	USet* set = uset_openEmpty();
				875	UErrorCode ec = U_ZERO_ERROR;
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	876	uset_applyIntPropertyValue(set, property, property_value, &ec);
				877	bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
				878
				879	if (success) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	880	uset_removeAllStrings(set);
				881	int item_count = uset_getItemCount(set);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	882	int item_result = 0;
				883	for (int i = 0; i < item_count; i++) {
				884	uc32 start = 0;
				885	uc32 end = 0;
				886	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	887	result->Add(CharacterRange::Range(start, end), zone);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	888	}
				889	DCHECK_EQ(U_ZERO_ERROR, ec);
				890	DCHECK_EQ(0, item_result);
				891	}
				892	uset_close(set);
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	893	return success;
				894	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	895	#endif // V8_I18N_SUPPORT
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	896
				897	bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
				898	#ifdef V8_I18N_SUPPORT
				899	List<char> property_name_list;
				900	if (current() == '{') {
				901	for (Advance(); current() != '}'; Advance()) {
				902	if (!has_next()) return false;
				903	property_name_list.Add(static_cast<char>(current()));
				904	}
				905	} else if (current() != kEndMarker) {
				906	property_name_list.Add(static_cast<char>(current()));
				907	} else {
				908	return false;
				909	}
				910	Advance();
				911	property_name_list.Add(0); // null-terminate string.
				912
				913	const char* property_name = property_name_list.ToConstVector().start();
				914
				915	#define PROPERTY_NAME_LOOKUP(PROPERTY) \
				916	do { \
				917	if (LookupPropertyClass(PROPERTY, property_name, result, zone())) { \
				918	return true; \
				919	} \
				920	} while (false)
				921
				922	// General_Category (gc) found in PropertyValueAliases.txt
				923	PROPERTY_NAME_LOOKUP(UCHAR_GENERAL_CATEGORY_MASK);
				924	// Script (sc) found in Scripts.txt
				925	PROPERTY_NAME_LOOKUP(UCHAR_SCRIPT);
				926	// To disambiguate from script names, block names have an "In"-prefix.
				927	if (property_name_list.length() > 3 && property_name[0] == 'I' &&
				928	property_name[1] == 'n') {
				929	// Block (blk) found in Blocks.txt
				930	property_name += 2;
				931	PROPERTY_NAME_LOOKUP(UCHAR_BLOCK);
				932	}
				933	#undef PROPERTY_NAME_LOOKUP
				934	#endif // V8_I18N_SUPPORT
				935	return false;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	936	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	937
				938	bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
				939	uc32 x = 0;
				940	int d = HexValue(current());
				941	if (d < 0) {
				942	return false;
				943	}
				944	while (d >= 0) {
				945	x = x * 16 + d;
				946	if (x > max_value) {
				947	return false;
				948	}
				949	Advance();
				950	d = HexValue(current());
				951	}
				952	*value = x;
				953	return true;
				954	}
				955
				956
				957	uc32 RegExpParser::ParseClassCharacterEscape() {
				958	DCHECK(current() == '\\');
				959	DCHECK(has_next() && !IsSpecialClassEscape(Next()));
				960	Advance();
				961	switch (current()) {
				962	case 'b':
				963	Advance();
				964	return '\b';
				965	// ControlEscape :: one of
				966	// f n r t v
				967	case 'f':
				968	Advance();
				969	return '\f';
				970	case 'n':
				971	Advance();
				972	return '\n';
				973	case 'r':
				974	Advance();
				975	return '\r';
				976	case 't':
				977	Advance();
				978	return '\t';
				979	case 'v':
				980	Advance();
				981	return '\v';
				982	case 'c': {
				983	uc32 controlLetter = Next();
				984	uc32 letter = controlLetter & ~('A' ^ 'a');
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	985	// For compatibility with JSC, inside a character class. We also accept
				986	// digits and underscore as control characters, unless with /u.
				987	if (letter >= 'A' && letter <= 'Z') {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	988	Advance(2);
				989	// Control letters mapped to ASCII control characters in the range
				990	// 0x00-0x1f.
				991	return controlLetter & 0x1f;
				992	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	993	if (unicode()) {
				994	// With /u, invalid escapes are not treated as identity escapes.
				995	ReportError(CStrVector("Invalid class escape"));
				996	return 0;
				997	}
				998	if ((controlLetter >= '0' && controlLetter <= '9') \|\|
				999	controlLetter == '_') {
				1000	Advance(2);
				1001	return controlLetter & 0x1f;
				1002	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1003	// We match JSC in reading the backslash as a literal
				1004	// character instead of as starting an escape.
				1005	return '\\';
				1006	}
				1007	case '0':
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1008	// With /u, \0 is interpreted as NUL if not followed by another digit.
				1009	if (unicode() && !(Next() >= '0' && Next() <= '9')) {
				1010	Advance();
				1011	return 0;
				1012	}
				1013	// Fall through.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1014	case '1':
				1015	case '2':
				1016	case '3':
				1017	case '4':
				1018	case '5':
				1019	case '6':
				1020	case '7':
				1021	// For compatibility, we interpret a decimal escape that isn't
				1022	// a back reference (and therefore either \0 or not valid according
				1023	// to the specification) as a 1..3 digit octal character code.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1024	if (unicode()) {
				1025	// With /u, decimal escape is not interpreted as octal character code.
				1026	ReportError(CStrVector("Invalid class escape"));
				1027	return 0;
				1028	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1029	return ParseOctalLiteral();
				1030	case 'x': {
				1031	Advance();
				1032	uc32 value;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1033	if (ParseHexEscape(2, &value)) return value;
				1034	if (unicode()) {
				1035	// With /u, invalid escapes are not treated as identity escapes.
				1036	ReportError(CStrVector("Invalid escape"));
				1037	return 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1038	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1039	// If \x is not followed by a two-digit hexadecimal, treat it
				1040	// as an identity escape.
				1041	return 'x';
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1042	}
				1043	case 'u': {
				1044	Advance();
				1045	uc32 value;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1046	if (ParseUnicodeEscape(&value)) return value;
				1047	if (unicode()) {
				1048	// With /u, invalid escapes are not treated as identity escapes.
				1049	ReportError(CStrVector("Invalid unicode escape"));
				1050	return 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1051	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1052	// If \u is not followed by a two-digit hexadecimal, treat it
				1053	// as an identity escape.
				1054	return 'u';
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1055	}
				1056	default: {
				1057	uc32 result = current();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1058	// With /u, no identity escapes except for syntax characters and '-' are
				1059	// allowed. Otherwise, all identity escapes are allowed.
				1060	if (!unicode() \|\| IsSyntaxCharacterOrSlash(result) \|\| result == '-') {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1061	Advance();
				1062	return result;
				1063	}
				1064	ReportError(CStrVector("Invalid escape"));
				1065	return 0;
				1066	}
				1067	}
				1068	return 0;
				1069	}
				1070
				1071
				1072	CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
				1073	DCHECK_EQ(0, *char_class);
				1074	uc32 first = current();
				1075	if (first == '\\') {
				1076	switch (Next()) {
				1077	case 'w':
				1078	case 'W':
				1079	case 'd':
				1080	case 'D':
				1081	case 's':
				1082	case 'S': {
				1083	*char_class = Next();
				1084	Advance(2);
				1085	return CharacterRange::Singleton(0); // Return dummy value.
				1086	}
				1087	case kEndMarker:
				1088	return ReportError(CStrVector("\\ at end of pattern"));
				1089	default:
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1090	first = ParseClassCharacterEscape(CHECK_FAILED);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1091	}
				1092	} else {
				1093	Advance();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1094	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1095
				1096	return CharacterRange::Singleton(first);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1097	}
				1098
				1099
				1100	static const uc16 kNoCharClass = 0;
				1101
				1102	// Adds range or pre-defined character class to character ranges.
				1103	// If char_class is not kInvalidClass, it's interpreted as a class
				1104	// escape (i.e., 's' means whitespace, from '\s').
				1105	static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
				1106	uc16 char_class, CharacterRange range,
				1107	Zone* zone) {
				1108	if (char_class != kNoCharClass) {
				1109	CharacterRange::AddClassEscape(char_class, ranges, zone);
				1110	} else {
				1111	ranges->Add(range, zone);
				1112	}
				1113	}
				1114
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	1115	bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
				1116	if (!FLAG_harmony_regexp_property) return false;
				1117	if (!unicode()) return false;
				1118	if (current() != '\\') return false;
				1119	uc32 next = Next();
				1120	bool parse_success = false;
				1121	if (next == 'p') {
				1122	Advance(2);
				1123	parse_success = ParsePropertyClass(ranges);
				1124	} else if (next == 'P') {
				1125	Advance(2);
				1126	ZoneList<CharacterRange>* property_class =
				1127	new (zone()) ZoneList<CharacterRange>(2, zone());
				1128	parse_success = ParsePropertyClass(property_class);
				1129	if (parse_success) {
				1130	ZoneList<CharacterRange>* negated =
				1131	new (zone()) ZoneList<CharacterRange>(2, zone());
				1132	CharacterRange::Negate(property_class, negated, zone());
				1133	const Vector<CharacterRange> negated_vector = negated->ToVector();
				1134	ranges->AddAll(negated_vector, zone());
				1135	}
				1136	} else {
				1137	return false;
				1138	}
				1139	if (!parse_success)
				1140	ReportError(CStrVector("Invalid property name in character class"));
				1141	return parse_success;
				1142	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1143
				1144	RegExpTree* RegExpParser::ParseCharacterClass() {
				1145	static const char* kUnterminated = "Unterminated character class";
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1146	static const char* kRangeInvalid = "Invalid character class";
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1147	static const char* kRangeOutOfOrder = "Range out of order in character class";
				1148
				1149	DCHECK_EQ(current(), '[');
				1150	Advance();
				1151	bool is_negated = false;
				1152	if (current() == '^') {
				1153	is_negated = true;
				1154	Advance();
				1155	}
				1156	ZoneList<CharacterRange>* ranges =
				1157	new (zone()) ZoneList<CharacterRange>(2, zone());
				1158	while (has_more() && current() != ']') {
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	1159	bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
				1160	if (parsed_property) continue;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1161	uc16 char_class = kNoCharClass;
				1162	CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
				1163	if (current() == '-') {
				1164	Advance();
				1165	if (current() == kEndMarker) {
				1166	// If we reach the end we break out of the loop and let the
				1167	// following code report an error.
				1168	break;
				1169	} else if (current() == ']') {
				1170	AddRangeOrEscape(ranges, char_class, first, zone());
				1171	ranges->Add(CharacterRange::Singleton('-'), zone());
				1172	break;
				1173	}
				1174	uc16 char_class_2 = kNoCharClass;
				1175	CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
				1176	if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {
				1177	// Either end is an escaped character class. Treat the '-' verbatim.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1178	if (unicode()) {
				1179	// ES2015 21.2.2.15.1 step 1.
				1180	return ReportError(CStrVector(kRangeInvalid));
				1181	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1182	AddRangeOrEscape(ranges, char_class, first, zone());
				1183	ranges->Add(CharacterRange::Singleton('-'), zone());
				1184	AddRangeOrEscape(ranges, char_class_2, next, zone());
				1185	continue;
				1186	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1187	// ES2015 21.2.2.15.1 step 6.
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1188	if (first.from() > next.to()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1189	return ReportError(CStrVector(kRangeOutOfOrder));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1190	}
				1191	ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
				1192	} else {
				1193	AddRangeOrEscape(ranges, char_class, first, zone());
				1194	}
				1195	}
				1196	if (!has_more()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1197	return ReportError(CStrVector(kUnterminated));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1198	}
				1199	Advance();
				1200	if (ranges->length() == 0) {
				1201	ranges->Add(CharacterRange::Everything(), zone());
				1202	is_negated = !is_negated;
				1203	}
				1204	return new (zone()) RegExpCharacterClass(ranges, is_negated);
				1205	}
				1206
				1207
				1208	#undef CHECK_FAILED
				1209
				1210
				1211	bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1212	FlatStringReader* input, JSRegExp::Flags flags,
				1213	RegExpCompileData* result) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1214	DCHECK(result != NULL);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1215	RegExpParser parser(input, &result->error, flags, isolate, zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1216	RegExpTree* tree = parser.ParsePattern();
				1217	if (parser.failed()) {
				1218	DCHECK(tree == NULL);
				1219	DCHECK(!result->error.is_null());
				1220	} else {
				1221	DCHECK(tree != NULL);
				1222	DCHECK(result->error.is_null());
				1223	if (FLAG_trace_regexp_parser) {
				1224	OFStream os(stdout);
				1225	tree->Print(os, zone);
				1226	os << "\n";
				1227	}
				1228	result->tree = tree;
				1229	int capture_count = parser.captures_started();
				1230	result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
				1231	result->contains_anchor = parser.contains_anchor();
				1232	result->capture_count = capture_count;
				1233	}
				1234	return !parser.failed();
				1235	}
				1236
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1237	RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1238	: zone_(zone),
				1239	pending_empty_(false),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1240	ignore_case_(ignore_case),
				1241	unicode_(unicode),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1242	characters_(NULL),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1243	pending_surrogate_(kNoPendingSurrogate),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1244	terms_(),
				1245	alternatives_()
				1246	#ifdef DEBUG
				1247	,
				1248	last_added_(ADD_NONE)
				1249	#endif
				1250	{
				1251	}
				1252
				1253
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1254	void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
				1255	DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
				1256	FlushPendingSurrogate();
				1257	// Hold onto the lead surrogate, waiting for a trail surrogate to follow.
				1258	pending_surrogate_ = lead_surrogate;
				1259	}
				1260
				1261
				1262	void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
				1263	DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
				1264	if (pending_surrogate_ != kNoPendingSurrogate) {
				1265	uc16 lead_surrogate = pending_surrogate_;
				1266	pending_surrogate_ = kNoPendingSurrogate;
				1267	DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
				1268	uc32 combined =
				1269	unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
				1270	if (NeedsDesugaringForIgnoreCase(combined)) {
				1271	AddCharacterClassForDesugaring(combined);
				1272	} else {
				1273	ZoneList<uc16> surrogate_pair(2, zone());
				1274	surrogate_pair.Add(lead_surrogate, zone());
				1275	surrogate_pair.Add(trail_surrogate, zone());
				1276	RegExpAtom* atom =
				1277	new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
				1278	AddAtom(atom);
				1279	}
				1280	} else {
				1281	pending_surrogate_ = trail_surrogate;
				1282	FlushPendingSurrogate();
				1283	}
				1284	}
				1285
				1286
				1287	void RegExpBuilder::FlushPendingSurrogate() {
				1288	if (pending_surrogate_ != kNoPendingSurrogate) {
				1289	DCHECK(unicode());
				1290	uc32 c = pending_surrogate_;
				1291	pending_surrogate_ = kNoPendingSurrogate;
				1292	AddCharacterClassForDesugaring(c);
				1293	}
				1294	}
				1295
				1296
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1297	void RegExpBuilder::FlushCharacters() {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1298	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1299	pending_empty_ = false;
				1300	if (characters_ != NULL) {
				1301	RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
				1302	characters_ = NULL;
				1303	text_.Add(atom, zone());
				1304	LAST(ADD_ATOM);
				1305	}
				1306	}
				1307
				1308
				1309	void RegExpBuilder::FlushText() {
				1310	FlushCharacters();
				1311	int num_text = text_.length();
				1312	if (num_text == 0) {
				1313	return;
				1314	} else if (num_text == 1) {
				1315	terms_.Add(text_.last(), zone());
				1316	} else {
				1317	RegExpText* text = new (zone()) RegExpText(zone());
				1318	for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
				1319	terms_.Add(text, zone());
				1320	}
				1321	text_.Clear();
				1322	}
				1323
				1324
				1325	void RegExpBuilder::AddCharacter(uc16 c) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1326	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1327	pending_empty_ = false;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1328	if (NeedsDesugaringForIgnoreCase(c)) {
				1329	AddCharacterClassForDesugaring(c);
				1330	} else {
				1331	if (characters_ == NULL) {
				1332	characters_ = new (zone()) ZoneList<uc16>(4, zone());
				1333	}
				1334	characters_->Add(c, zone());
				1335	LAST(ADD_CHAR);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1336	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1337	}
				1338
				1339
				1340	void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
				1341	if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1342	DCHECK(unicode());
				1343	AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
				1344	AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
				1345	} else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
				1346	AddLeadSurrogate(c);
				1347	} else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
				1348	AddTrailSurrogate(c);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1349	} else {
				1350	AddCharacter(static_cast<uc16>(c));
				1351	}
				1352	}
				1353
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1354	void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
				1355	// A lead or trail surrogate parsed via escape sequence will not
				1356	// pair up with any preceding lead or following trail surrogate.
				1357	FlushPendingSurrogate();
				1358	AddUnicodeCharacter(character);
				1359	FlushPendingSurrogate();
				1360	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1361
				1362	void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
				1363
				1364
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1365	void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
				1366	if (NeedsDesugaringForUnicode(cc)) {
				1367	// With /u, character class needs to be desugared, so it
				1368	// must be a standalone term instead of being part of a RegExpText.
				1369	AddTerm(cc);
				1370	} else {
				1371	AddAtom(cc);
				1372	}
				1373	}
				1374
				1375	void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
				1376	AddTerm(new (zone()) RegExpCharacterClass(
				1377	CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
				1378	}
				1379
				1380
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1381	void RegExpBuilder::AddAtom(RegExpTree* term) {
				1382	if (term->IsEmpty()) {
				1383	AddEmpty();
				1384	return;
				1385	}
				1386	if (term->IsTextElement()) {
				1387	FlushCharacters();
				1388	text_.Add(term, zone());
				1389	} else {
				1390	FlushText();
				1391	terms_.Add(term, zone());
				1392	}
				1393	LAST(ADD_ATOM);
				1394	}
				1395
				1396
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1397	void RegExpBuilder::AddTerm(RegExpTree* term) {
				1398	FlushText();
				1399	terms_.Add(term, zone());
				1400	LAST(ADD_ATOM);
				1401	}
				1402
				1403
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1404	void RegExpBuilder::AddAssertion(RegExpTree* assert) {
				1405	FlushText();
				1406	terms_.Add(assert, zone());
				1407	LAST(ADD_ASSERT);
				1408	}
				1409
				1410
				1411	void RegExpBuilder::NewAlternative() { FlushTerms(); }
				1412
				1413
				1414	void RegExpBuilder::FlushTerms() {
				1415	FlushText();
				1416	int num_terms = terms_.length();
				1417	RegExpTree* alternative;
				1418	if (num_terms == 0) {
				1419	alternative = new (zone()) RegExpEmpty();
				1420	} else if (num_terms == 1) {
				1421	alternative = terms_.last();
				1422	} else {
				1423	alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
				1424	}
				1425	alternatives_.Add(alternative, zone());
				1426	terms_.Clear();
				1427	LAST(ADD_NONE);
				1428	}
				1429
				1430
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1431	bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
				1432	if (!unicode()) return false;
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame^]	1433	// TODO(yangguo): we could be smarter than this. Case-insensitivity does not
				1434	// necessarily mean that we need to desugar. It's probably nicer to have a
				1435	// separate pass to figure out unicode desugarings.
				1436	if (ignore_case()) return true;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1437	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
				1438	CharacterRange::Canonicalize(ranges);
				1439	for (int i = ranges->length() - 1; i >= 0; i--) {
				1440	uc32 from = ranges->at(i).from();
				1441	uc32 to = ranges->at(i).to();
				1442	// Check for non-BMP characters.
				1443	if (to >= kNonBmpStart) return true;
				1444	// Check for lone surrogates.
				1445	if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
				1446	}
				1447	return false;
				1448	}
				1449
				1450
				1451	bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
				1452	#ifdef V8_I18N_SUPPORT
				1453	if (unicode() && ignore_case()) {
				1454	USet* set = uset_open(c, c);
				1455	uset_closeOver(set, USET_CASE_INSENSITIVE);
				1456	uset_removeAllStrings(set);
				1457	bool result = uset_size(set) > 1;
				1458	uset_close(set);
				1459	return result;
				1460	}
				1461	// In the case where ICU is not included, we act as if the unicode flag is
				1462	// not set, and do not desugar.
				1463	#endif // V8_I18N_SUPPORT
				1464	return false;
				1465	}
				1466
				1467
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1468	RegExpTree* RegExpBuilder::ToRegExp() {
				1469	FlushTerms();
				1470	int num_alternatives = alternatives_.length();
				1471	if (num_alternatives == 0) return new (zone()) RegExpEmpty();
				1472	if (num_alternatives == 1) return alternatives_.last();
				1473	return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
				1474	}
				1475
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1476	bool RegExpBuilder::AddQuantifierToAtom(
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1477	int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1478	FlushPendingSurrogate();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1479	if (pending_empty_) {
				1480	pending_empty_ = false;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1481	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1482	}
				1483	RegExpTree* atom;
				1484	if (characters_ != NULL) {
				1485	DCHECK(last_added_ == ADD_CHAR);
				1486	// Last atom was character.
				1487	Vector<const uc16> char_vector = characters_->ToConstVector();
				1488	int num_chars = char_vector.length();
				1489	if (num_chars > 1) {
				1490	Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
				1491	text_.Add(new (zone()) RegExpAtom(prefix), zone());
				1492	char_vector = char_vector.SubVector(num_chars - 1, num_chars);
				1493	}
				1494	characters_ = NULL;
				1495	atom = new (zone()) RegExpAtom(char_vector);
				1496	FlushText();
				1497	} else if (text_.length() > 0) {
				1498	DCHECK(last_added_ == ADD_ATOM);
				1499	atom = text_.RemoveLast();
				1500	FlushText();
				1501	} else if (terms_.length() > 0) {
				1502	DCHECK(last_added_ == ADD_ATOM);
				1503	atom = terms_.RemoveLast();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1504	// With /u, lookarounds are not quantifiable.
				1505	if (unicode() && atom->IsLookaround()) return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1506	if (atom->max_match() == 0) {
				1507	// Guaranteed to only match an empty string.
				1508	LAST(ADD_TERM);
				1509	if (min == 0) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1510	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1511	}
				1512	terms_.Add(atom, zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1513	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1514	}
				1515	} else {
				1516	// Only call immediately after adding an atom or character!
				1517	UNREACHABLE();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1518	return false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1519	}
				1520	terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
				1521	zone());
				1522	LAST(ADD_TERM);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1523	return true;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1524	}
				1525
				1526	} // namespace internal
				1527	} // namespace v8