Blame - src/regexp/jsregexp.cc - platform/external/v8

blob: 96a778cfb7ce2fd1f0f6158fd06aac7fa7f5fb39 [file] [log] [blame]

Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2012 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/jsregexp.h"
				6
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	7	#include <memory>
				8
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	9	#include "src/base/platform/platform.h"
				10	#include "src/compilation-cache.h"
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	11	#include "src/elements.h"
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	12	#include "src/execution.h"
				13	#include "src/factory.h"
				14	#include "src/isolate-inl.h"
				15	#include "src/messages.h"
				16	#include "src/ostreams.h"
				17	#include "src/regexp/interpreter-irregexp.h"
				18	#include "src/regexp/jsregexp-inl.h"
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	19	#include "src/regexp/regexp-macro-assembler-irregexp.h"
				20	#include "src/regexp/regexp-macro-assembler-tracer.h"
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	21	#include "src/regexp/regexp-macro-assembler.h"
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	22	#include "src/regexp/regexp-parser.h"
				23	#include "src/regexp/regexp-stack.h"
				24	#include "src/runtime/runtime.h"
				25	#include "src/splay-tree-inl.h"
				26	#include "src/string-search.h"
				27	#include "src/unicode-decoder.h"
				28
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	29	#ifdef V8_I18N_SUPPORT
				30	#include "unicode/uset.h"
				31	#include "unicode/utypes.h"
				32	#endif // V8_I18N_SUPPORT
				33
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	34	#ifndef V8_INTERPRETED_REGEXP
				35	#if V8_TARGET_ARCH_IA32
				36	#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
				37	#elif V8_TARGET_ARCH_X64
				38	#include "src/regexp/x64/regexp-macro-assembler-x64.h"
				39	#elif V8_TARGET_ARCH_ARM64
				40	#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
				41	#elif V8_TARGET_ARCH_ARM
				42	#include "src/regexp/arm/regexp-macro-assembler-arm.h"
				43	#elif V8_TARGET_ARCH_PPC
				44	#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdoch	3b9bc31	2016-06-02 14:46:10 +0100	[diff] [blame]	45	#elif V8_TARGET_ARCH_S390
				46	#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	47	#elif V8_TARGET_ARCH_MIPS
				48	#include "src/regexp/mips/regexp-macro-assembler-mips.h"
				49	#elif V8_TARGET_ARCH_MIPS64
				50	#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
				51	#elif V8_TARGET_ARCH_X87
				52	#include "src/regexp/x87/regexp-macro-assembler-x87.h"
				53	#else
				54	#error Unsupported target architecture.
				55	#endif
				56	#endif
				57
				58
				59	namespace v8 {
				60	namespace internal {
				61
				62	MUST_USE_RESULT
				63	static inline MaybeHandle<Object> ThrowRegExpException(
				64	Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
				65	Isolate* isolate = re->GetIsolate();
				66	THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
				67	pattern, error_text),
				68	Object);
				69	}
				70
				71
				72	inline void ThrowRegExpException(Handle<JSRegExp> re,
				73	Handle<String> error_text) {
				74	USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
				75	}
				76
				77
				78	ContainedInLattice AddRange(ContainedInLattice containment,
				79	const int* ranges,
				80	int ranges_length,
				81	Interval new_range) {
				82	DCHECK((ranges_length & 1) == 1);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	83	DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	84	if (containment == kLatticeUnknown) return containment;
				85	bool inside = false;
				86	int last = 0;
				87	for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
				88	// Consider the range from last to ranges[i].
				89	// We haven't got to the new range yet.
				90	if (ranges[i] <= new_range.from()) continue;
				91	// New range is wholly inside last-ranges[i]. Note that new_range.to() is
				92	// inclusive, but the values in ranges are not.
				93	if (last <= new_range.from() && new_range.to() < ranges[i]) {
				94	return Combine(containment, inside ? kLatticeIn : kLatticeOut);
				95	}
				96	return kLatticeUnknown;
				97	}
				98	return containment;
				99	}
				100
				101
				102	// More makes code generation slower, less makes V8 benchmark score lower.
				103	const int kMaxLookaheadForBoyerMoore = 8;
				104	// In a 3-character pattern you can maximally step forwards 3 characters
				105	// at a time, which is not always enough to pay for the extra logic.
				106	const int kPatternTooShortForBoyerMoore = 2;
				107
				108
				109	// Identifies the sort of regexps where the regexp engine is faster
				110	// than the code used for atom matches.
				111	static bool HasFewDifferentCharacters(Handle<String> pattern) {
				112	int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
				113	if (length <= kPatternTooShortForBoyerMoore) return false;
				114	const int kMod = 128;
				115	bool character_found[kMod];
				116	int different = 0;
				117	memset(&character_found[0], 0, sizeof(character_found));
				118	for (int i = 0; i < length; i++) {
				119	int ch = (pattern->Get(i) & (kMod - 1));
				120	if (!character_found[ch]) {
				121	character_found[ch] = true;
				122	different++;
				123	// We declare a regexp low-alphabet if it has at least 3 times as many
				124	// characters as it has different characters.
				125	if (different * 3 > length) return false;
				126	}
				127	}
				128	return true;
				129	}
				130
				131
				132	// Generic RegExp methods. Dispatches to implementation specific methods.
				133
				134
				135	MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				136	Handle<String> pattern,
				137	JSRegExp::Flags flags) {
				138	Isolate* isolate = re->GetIsolate();
Ben Murdoch	3b9bc31	2016-06-02 14:46:10 +0100	[diff] [blame]	139	Zone zone(isolate->allocator());
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	140	CompilationCache* compilation_cache = isolate->compilation_cache();
				141	MaybeHandle<FixedArray> maybe_cached =
				142	compilation_cache->LookupRegExp(pattern, flags);
				143	Handle<FixedArray> cached;
				144	bool in_cache = maybe_cached.ToHandle(&cached);
				145	LOG(isolate, RegExpCompileEvent(re, in_cache));
				146
				147	Handle<Object> result;
				148	if (in_cache) {
				149	re->set_data(*cached);
				150	return re;
				151	}
				152	pattern = String::Flatten(pattern);
				153	PostponeInterruptsScope postpone(isolate);
				154	RegExpCompileData parse_result;
				155	FlatStringReader reader(isolate, pattern);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	156	if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
				157	&parse_result)) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	158	// Throw an exception if we fail to parse the pattern.
				159	return ThrowRegExpException(re, pattern, parse_result.error);
				160	}
				161
				162	bool has_been_compiled = false;
				163
				164	if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
				165	!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
				166	// Parse-tree is a single atom that is equal to the pattern.
				167	AtomCompile(re, pattern, flags, pattern);
				168	has_been_compiled = true;
				169	} else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
				170	!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
				171	RegExpAtom* atom = parse_result.tree->AsAtom();
				172	Vector<const uc16> atom_pattern = atom->data();
				173	Handle<String> atom_string;
				174	ASSIGN_RETURN_ON_EXCEPTION(
				175	isolate, atom_string,
				176	isolate->factory()->NewStringFromTwoByte(atom_pattern),
				177	Object);
				178	if (!HasFewDifferentCharacters(atom_string)) {
				179	AtomCompile(re, pattern, flags, atom_string);
				180	has_been_compiled = true;
				181	}
				182	}
				183	if (!has_been_compiled) {
				184	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
				185	}
				186	DCHECK(re->data()->IsFixedArray());
				187	// Compilation succeeded so the data is set on the regexp
				188	// and we can store it in the cache.
				189	Handle<FixedArray> data(FixedArray::cast(re->data()));
				190	compilation_cache->PutRegExp(pattern, flags, data);
				191
				192	return re;
				193	}
				194
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	195	MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	196	Handle<String> subject, int index,
				197	Handle<JSObject> last_match_info) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	198	switch (regexp->TypeTag()) {
				199	case JSRegExp::ATOM:
				200	return AtomExec(regexp, subject, index, last_match_info);
				201	case JSRegExp::IRREGEXP: {
				202	return IrregexpExec(regexp, subject, index, last_match_info);
				203	}
				204	default:
				205	UNREACHABLE();
				206	return MaybeHandle<Object>();
				207	}
				208	}
				209
				210
				211	// RegExp Atom implementation: Simple string search using indexOf.
				212
				213
				214	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				215	Handle<String> pattern,
				216	JSRegExp::Flags flags,
				217	Handle<String> match_pattern) {
				218	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				219	JSRegExp::ATOM,
				220	pattern,
				221	flags,
				222	match_pattern);
				223	}
				224
				225
				226	static void SetAtomLastCapture(FixedArray* array,
				227	String* subject,
				228	int from,
				229	int to) {
				230	SealHandleScope shs(array->GetIsolate());
				231	RegExpImpl::SetLastCaptureCount(array, 2);
				232	RegExpImpl::SetLastSubject(array, subject);
				233	RegExpImpl::SetLastInput(array, subject);
				234	RegExpImpl::SetCapture(array, 0, from);
				235	RegExpImpl::SetCapture(array, 1, to);
				236	}
				237
				238
				239	int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
				240	Handle<String> subject,
				241	int index,
				242	int32_t* output,
				243	int output_size) {
				244	Isolate* isolate = regexp->GetIsolate();
				245
				246	DCHECK(0 <= index);
				247	DCHECK(index <= subject->length());
				248
				249	subject = String::Flatten(subject);
				250	DisallowHeapAllocation no_gc; // ensure vectors stay valid
				251
				252	String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
				253	int needle_len = needle->length();
				254	DCHECK(needle->IsFlat());
				255	DCHECK_LT(0, needle_len);
				256
				257	if (index + needle_len > subject->length()) {
				258	return RegExpImpl::RE_FAILURE;
				259	}
				260
				261	for (int i = 0; i < output_size; i += 2) {
				262	String::FlatContent needle_content = needle->GetFlatContent();
				263	String::FlatContent subject_content = subject->GetFlatContent();
				264	DCHECK(needle_content.IsFlat());
				265	DCHECK(subject_content.IsFlat());
				266	// dispatch on type of strings
				267	index =
				268	(needle_content.IsOneByte()
				269	? (subject_content.IsOneByte()
				270	? SearchString(isolate, subject_content.ToOneByteVector(),
				271	needle_content.ToOneByteVector(), index)
				272	: SearchString(isolate, subject_content.ToUC16Vector(),
				273	needle_content.ToOneByteVector(), index))
				274	: (subject_content.IsOneByte()
				275	? SearchString(isolate, subject_content.ToOneByteVector(),
				276	needle_content.ToUC16Vector(), index)
				277	: SearchString(isolate, subject_content.ToUC16Vector(),
				278	needle_content.ToUC16Vector(), index)));
				279	if (index == -1) {
				280	return i / 2; // Return number of matches.
				281	} else {
				282	output[i] = index;
				283	output[i+1] = index + needle_len;
				284	index += needle_len;
				285	}
				286	}
				287	return output_size / 2;
				288	}
				289
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	290	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	291	int index,
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	292	Handle<JSObject> last_match_info) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	293	Isolate* isolate = re->GetIsolate();
				294
				295	static const int kNumRegisters = 2;
				296	STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
				297	int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
				298
				299	int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
				300
				301	if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
				302
				303	DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
				304	SealHandleScope shs(isolate);
				305	FixedArray* array = FixedArray::cast(last_match_info->elements());
				306	SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
				307	return last_match_info;
				308	}
				309
				310
				311	// Irregexp implementation.
				312
				313	// Ensures that the regexp object contains a compiled version of the
				314	// source for either one-byte or two-byte subject strings.
				315	// If the compiled version doesn't already exist, it is compiled
				316	// from the source pattern.
				317	// If compilation fails, an exception is thrown and this function
				318	// returns false.
				319	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
				320	Handle<String> sample_subject,
				321	bool is_one_byte) {
				322	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
				323	#ifdef V8_INTERPRETED_REGEXP
				324	if (compiled_code->IsByteArray()) return true;
				325	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				326	if (compiled_code->IsCode()) return true;
				327	#endif
				328	// We could potentially have marked this as flushable, but have kept
				329	// a saved version if we did not flush it yet.
				330	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				331	if (saved_code->IsCode()) {
				332	// Reinstate the code in the original place.
				333	re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
				334	DCHECK(compiled_code->IsSmi());
				335	return true;
				336	}
				337	return CompileIrregexp(re, sample_subject, is_one_byte);
				338	}
				339
				340
				341	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
				342	Handle<String> sample_subject,
				343	bool is_one_byte) {
				344	// Compile the RegExp.
				345	Isolate* isolate = re->GetIsolate();
Ben Murdoch	3b9bc31	2016-06-02 14:46:10 +0100	[diff] [blame]	346	Zone zone(isolate->allocator());
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	347	PostponeInterruptsScope postpone(isolate);
				348	// If we had a compilation error the last time this is saved at the
				349	// saved code index.
				350	Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
				351	// When arriving here entry can only be a smi, either representing an
				352	// uncompiled regexp, a previous compilation error, or code that has
				353	// been flushed.
				354	DCHECK(entry->IsSmi());
				355	int entry_value = Smi::cast(entry)->value();
				356	DCHECK(entry_value == JSRegExp::kUninitializedValue \|\|
				357	entry_value == JSRegExp::kCompilationErrorValue \|\|
				358	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				359
				360	if (entry_value == JSRegExp::kCompilationErrorValue) {
				361	// A previous compilation failed and threw an error which we store in
				362	// the saved code index (we store the error message, not the actual
				363	// error). Recreate the error object and throw it.
				364	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				365	DCHECK(error_string->IsString());
				366	Handle<String> error_message(String::cast(error_string));
				367	ThrowRegExpException(re, error_message);
				368	return false;
				369	}
				370
				371	JSRegExp::Flags flags = re->GetFlags();
				372
				373	Handle<String> pattern(re->Pattern());
				374	pattern = String::Flatten(pattern);
				375	RegExpCompileData compile_data;
				376	FlatStringReader reader(isolate, pattern);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	377	if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
				378	&compile_data)) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	379	// Throw an exception if we fail to parse the pattern.
				380	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				381	USE(ThrowRegExpException(re, pattern, compile_data.error));
				382	return false;
				383	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	384	RegExpEngine::CompilationResult result =
				385	RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
				386	sample_subject, is_one_byte);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	387	if (result.error_message != NULL) {
				388	// Unable to compile regexp.
				389	Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
				390	CStrVector(result.error_message)).ToHandleChecked();
				391	ThrowRegExpException(re, error_message);
				392	return false;
				393	}
				394
				395	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				396	data->set(JSRegExp::code_index(is_one_byte), result.code);
Ben Murdoch	13e2dad	2016-09-16 13:49:30 +0100	[diff] [blame]	397	SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	398	int register_max = IrregexpMaxRegisterCount(*data);
				399	if (result.num_registers > register_max) {
				400	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				401	}
				402
				403	return true;
				404	}
				405
				406
				407	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				408	return Smi::cast(
				409	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				410	}
				411
				412
				413	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				414	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				415	}
				416
Ben Murdoch	13e2dad	2016-09-16 13:49:30 +0100	[diff] [blame]	417	void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
				418	Handle<FixedArray> value) {
				419	if (value.is_null()) {
				420	re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
				421	} else {
				422	re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
				423	}
				424	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	425
				426	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				427	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				428	}
				429
				430
				431	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				432	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				433	}
				434
				435
				436	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
				437	return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
				438	}
				439
				440
				441	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
				442	return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
				443	}
				444
				445
				446	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				447	Handle<String> pattern,
				448	JSRegExp::Flags flags,
				449	int capture_count) {
				450	// Initialize compiled code entries to null.
				451	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				452	JSRegExp::IRREGEXP,
				453	pattern,
				454	flags,
				455	capture_count);
				456	}
				457
				458
				459	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				460	Handle<String> subject) {
				461	subject = String::Flatten(subject);
				462
				463	// Check representation of the underlying storage.
				464	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				465	if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
				466
				467	#ifdef V8_INTERPRETED_REGEXP
				468	// Byte-code regexp needs space allocated for all its registers.
				469	// The result captures are copied to the start of the registers array
				470	// if the match succeeds. This way those registers are not clobbered
				471	// when we set the last match info from last successful match.
				472	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
				473	(IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				474	#else // V8_INTERPRETED_REGEXP
				475	// Native regexp only needs room to output captures. Registers are handled
				476	// internally.
				477	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				478	#endif // V8_INTERPRETED_REGEXP
				479	}
				480
				481
				482	int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
				483	Handle<String> subject,
				484	int index,
				485	int32_t* output,
				486	int output_size) {
				487	Isolate* isolate = regexp->GetIsolate();
				488
				489	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
				490
				491	DCHECK(index >= 0);
				492	DCHECK(index <= subject->length());
				493	DCHECK(subject->IsFlat());
				494
				495	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				496
				497	#ifndef V8_INTERPRETED_REGEXP
				498	DCHECK(output_size >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
				499	do {
				500	EnsureCompiledIrregexp(regexp, subject, is_one_byte);
				501	Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
				502	// The stack is used to allocate registers for the compiled regexp code.
				503	// This means that in case of failure, the output registers array is left
				504	// untouched and contains the capture results from the previous successful
				505	// match. We can use that to set the last match info lazily.
				506	NativeRegExpMacroAssembler::Result res =
				507	NativeRegExpMacroAssembler::Match(code,
				508	subject,
				509	output,
				510	output_size,
				511	index,
				512	isolate);
				513	if (res != NativeRegExpMacroAssembler::RETRY) {
				514	DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
				515	isolate->has_pending_exception());
				516	STATIC_ASSERT(
				517	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				518	STATIC_ASSERT(
				519	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				520	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				521	== RE_EXCEPTION);
				522	return static_cast<IrregexpResult>(res);
				523	}
				524	// If result is RETRY, the string has changed representation, and we
				525	// must restart from scratch.
				526	// In this case, it means we must make sure we are prepared to handle
				527	// the, potentially, different subject (the string can switch between
				528	// being internal and external, and even between being Latin1 and UC16,
				529	// but the characters are always the same).
				530	IrregexpPrepare(regexp, subject);
				531	is_one_byte = subject->IsOneByteRepresentationUnderneath();
				532	} while (true);
				533	UNREACHABLE();
				534	return RE_EXCEPTION;
				535	#else // V8_INTERPRETED_REGEXP
				536
				537	DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
				538	// We must have done EnsureCompiledIrregexp, so we can get the number of
				539	// registers.
				540	int number_of_capture_registers =
				541	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				542	int32_t* raw_output = &output[number_of_capture_registers];
				543	// We do not touch the actual capture result registers until we know there
				544	// has been a match so that we can use those capture results to set the
				545	// last match info.
				546	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				547	raw_output[i] = -1;
				548	}
				549	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
				550	isolate);
				551
				552	IrregexpResult result = IrregexpInterpreter::Match(isolate,
				553	byte_codes,
				554	subject,
				555	raw_output,
				556	index);
				557	if (result == RE_SUCCESS) {
				558	// Copy capture results to the start of the registers array.
				559	MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
				560	}
				561	if (result == RE_EXCEPTION) {
				562	DCHECK(!isolate->has_pending_exception());
				563	isolate->StackOverflow();
				564	}
				565	return result;
				566	#endif // V8_INTERPRETED_REGEXP
				567	}
				568
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	569	MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
				570	Handle<String> subject,
				571	int previous_index,
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	572	Handle<JSObject> last_match_info) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	573	Isolate* isolate = regexp->GetIsolate();
				574	DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
				575
				576	// Prepare space for the return values.
				577	#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
				578	if (FLAG_trace_regexp_bytecodes) {
				579	String* pattern = regexp->Pattern();
				580	PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
				581	PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
				582	}
				583	#endif
				584	int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
				585	if (required_registers < 0) {
				586	// Compiling failed with an exception.
				587	DCHECK(isolate->has_pending_exception());
				588	return MaybeHandle<Object>();
				589	}
				590
				591	int32_t* output_registers = NULL;
				592	if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				593	output_registers = NewArray<int32_t>(required_registers);
				594	}
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	595	std::unique_ptr<int32_t[]> auto_release(output_registers);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	596	if (output_registers == NULL) {
				597	output_registers = isolate->jsregexp_static_offsets_vector();
				598	}
				599
				600	int res = RegExpImpl::IrregexpExecRaw(
				601	regexp, subject, previous_index, output_registers, required_registers);
				602	if (res == RE_SUCCESS) {
				603	int capture_count =
				604	IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
				605	return SetLastMatchInfo(
				606	last_match_info, subject, capture_count, output_registers);
				607	}
				608	if (res == RE_EXCEPTION) {
				609	DCHECK(isolate->has_pending_exception());
				610	return MaybeHandle<Object>();
				611	}
				612	DCHECK(res == RE_FAILURE);
				613	return isolate->factory()->null_value();
				614	}
				615
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	616	static void EnsureSize(Handle<JSObject> array, uint32_t minimum_size) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	617	if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	618	array->GetElementsAccessor()->GrowCapacityAndConvert(array, minimum_size);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	619	}
				620	}
				621
Ben Murdoch	f91f061	2016-11-29 16:50:11 +0000	[diff] [blame]	622	Handle<JSObject> RegExpImpl::SetLastMatchInfo(Handle<JSObject> last_match_info,
				623	Handle<String> subject,
				624	int capture_count,
				625	int32_t* match) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	626	DCHECK(last_match_info->HasFastObjectElements());
				627	int capture_register_count = (capture_count + 1) * 2;
				628	EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
				629	DisallowHeapAllocation no_allocation;
				630	FixedArray* array = FixedArray::cast(last_match_info->elements());
				631	if (match != NULL) {
				632	for (int i = 0; i < capture_register_count; i += 2) {
				633	SetCapture(array, i, match[i]);
				634	SetCapture(array, i + 1, match[i + 1]);
				635	}
				636	}
				637	SetLastCaptureCount(array, capture_register_count);
				638	SetLastSubject(array, *subject);
				639	SetLastInput(array, *subject);
				640	return last_match_info;
				641	}
				642
				643
				644	RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
				645	Handle<String> subject,
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	646	Isolate* isolate)
				647	: register_array_(NULL),
				648	register_array_size_(0),
				649	regexp_(regexp),
				650	subject_(subject) {
				651	#ifdef V8_INTERPRETED_REGEXP
				652	bool interpreted = true;
				653	#else
				654	bool interpreted = false;
				655	#endif // V8_INTERPRETED_REGEXP
				656
				657	if (regexp_->TypeTag() == JSRegExp::ATOM) {
				658	static const int kAtomRegistersPerMatch = 2;
				659	registers_per_match_ = kAtomRegistersPerMatch;
				660	// There is no distinction between interpreted and native for atom regexps.
				661	interpreted = false;
				662	} else {
				663	registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
				664	if (registers_per_match_ < 0) {
				665	num_matches_ = -1; // Signal exception.
				666	return;
				667	}
				668	}
				669
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	670	DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
				671	if (!interpreted) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	672	register_array_size_ =
				673	Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
				674	max_matches_ = register_array_size_ / registers_per_match_;
				675	} else {
				676	// Global loop in interpreted regexp is not implemented. We choose
				677	// the size of the offsets vector so that it can only store one match.
				678	register_array_size_ = registers_per_match_;
				679	max_matches_ = 1;
				680	}
				681
				682	if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				683	register_array_ = NewArray<int32_t>(register_array_size_);
				684	} else {
				685	register_array_ = isolate->jsregexp_static_offsets_vector();
				686	}
				687
				688	// Set state so that fetching the results the first time triggers a call
				689	// to the compiled regexp.
				690	current_match_index_ = max_matches_ - 1;
				691	num_matches_ = max_matches_;
				692	DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
				693	DCHECK_GE(register_array_size_, registers_per_match_);
				694	int32_t* last_match =
				695	&register_array_[current_match_index_ * registers_per_match_];
				696	last_match[0] = -1;
				697	last_match[1] = 0;
				698	}
				699
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	700	int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
				701	if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
				702	last_index + 1 < subject_->length() &&
				703	unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
				704	unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
				705	// Advance over the surrogate pair.
				706	return last_index + 2;
				707	}
				708	return last_index + 1;
				709	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	710
				711	// -------------------------------------------------------------------
				712	// Implementation of the Irregexp regular expression engine.
				713	//
				714	// The Irregexp regular expression engine is intended to be a complete
				715	// implementation of ECMAScript regular expressions. It generates either
				716	// bytecodes or native code.
				717
				718	// The Irregexp regexp engine is structured in three steps.
				719	// 1) The parser generates an abstract syntax tree. See ast.cc.
				720	// 2) From the AST a node network is created. The nodes are all
				721	// subclasses of RegExpNode. The nodes represent states when
				722	// executing a regular expression. Several optimizations are
				723	// performed on the node network.
				724	// 3) From the nodes we generate either byte codes or native code
				725	// that can actually execute the regular expression (perform
				726	// the search). The code generation step is described in more
				727	// detail below.
				728
				729	// Code generation.
				730	//
				731	// The nodes are divided into four main categories.
				732	// * Choice nodes
				733	// These represent places where the regular expression can
				734	// match in more than one way. For example on entry to an
				735	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				736	// * Action nodes
				737	// These represent places where some action should be
				738	// performed. Examples include recording the current position
				739	// in the input string to a register (in order to implement
				740	// captures) or other actions on register for example in order
				741	// to implement the counters needed for {} repetitions.
				742	// * Matching nodes
				743	// These attempt to match some element part of the input string.
				744	// Examples of elements include character classes, plain strings
				745	// or back references.
				746	// * End nodes
				747	// These are used to implement the actions required on finding
				748	// a successful match or failing to find a match.
				749	//
				750	// The code generated (whether as byte codes or native code) maintains
				751	// some state as it runs. This consists of the following elements:
				752	//
				753	// * The capture registers. Used for string captures.
				754	// * Other registers. Used for counters etc.
				755	// * The current position.
				756	// * The stack of backtracking information. Used when a matching node
				757	// fails to find a match and needs to try an alternative.
				758	//
				759	// Conceptual regular expression execution model:
				760	//
				761	// There is a simple conceptual model of regular expression execution
				762	// which will be presented first. The actual code generated is a more
				763	// efficient simulation of the simple conceptual model:
				764	//
				765	// * Choice nodes are implemented as follows:
				766	// For each choice except the last {
				767	// push current position
				768	// push backtrack code location
				769	// <generate code to test for choice>
				770	// backtrack code location:
				771	// pop current position
				772	// }
				773	// <generate code to test for last choice>
				774	//
				775	// * Actions nodes are generated as follows
				776	// <push affected registers on backtrack stack>
				777	// <generate code to perform action>
				778	// push backtrack code location
				779	// <generate code to test for following nodes>
				780	// backtrack code location:
				781	// <pop affected registers to restore their state>
				782	// <pop backtrack location from stack and go to it>
				783	//
				784	// * Matching nodes are generated as follows:
				785	// if input string matches at current position
				786	// update current position
				787	// <generate code to test for following nodes>
				788	// else
				789	// <pop backtrack location from stack and go to it>
				790	//
				791	// Thus it can be seen that the current position is saved and restored
				792	// by the choice nodes, whereas the registers are saved and restored by
				793	// by the action nodes that manipulate them.
				794	//
				795	// The other interesting aspect of this model is that nodes are generated
				796	// at the point where they are needed by a recursive call to Emit(). If
				797	// the node has already been code generated then the Emit() call will
				798	// generate a jump to the previously generated code instead. In order to
				799	// limit recursion it is possible for the Emit() function to put the node
				800	// on a work list for later generation and instead generate a jump. The
				801	// destination of the jump is resolved later when the code is generated.
				802	//
				803	// Actual regular expression code generation.
				804	//
				805	// Code generation is actually more complicated than the above. In order
				806	// to improve the efficiency of the generated code some optimizations are
				807	// performed
				808	//
				809	// * Choice nodes have 1-character lookahead.
				810	// A choice node looks at the following character and eliminates some of
				811	// the choices immediately based on that character. This is not yet
				812	// implemented.
				813	// * Simple greedy loops store reduced backtracking information.
				814	// A quantifier like /.*foo/m will greedily match the whole input. It will
				815	// then need to backtrack to a point where it can match "foo". The naive
				816	// implementation of this would push each character position onto the
				817	// backtracking stack, then pop them off one by one. This would use space
				818	// proportional to the length of the input string. However since the "."
				819	// can only match in one way and always has a constant length (in this case
				820	// of 1) it suffices to store the current position on the top of the stack
				821	// once. Matching now becomes merely incrementing the current position and
				822	// backtracking becomes decrementing the current position and checking the
				823	// result against the stored current position. This is faster and saves
				824	// space.
				825	// * The current state is virtualized.
				826	// This is used to defer expensive operations until it is clear that they
				827	// are needed and to generate code for a node more than once, allowing
				828	// specialized an efficient versions of the code to be created. This is
				829	// explained in the section below.
				830	//
				831	// Execution state virtualization.
				832	//
				833	// Instead of emitting code, nodes that manipulate the state can record their
				834	// manipulation in an object called the Trace. The Trace object can record a
				835	// current position offset, an optional backtrack code location on the top of
				836	// the virtualized backtrack stack and some register changes. When a node is
				837	// to be emitted it can flush the Trace or update it. Flushing the Trace
				838	// will emit code to bring the actual state into line with the virtual state.
				839	// Avoiding flushing the state can postpone some work (e.g. updates of capture
				840	// registers). Postponing work can save time when executing the regular
				841	// expression since it may be found that the work never has to be done as a
				842	// failure to match can occur. In addition it is much faster to jump to a
				843	// known backtrack code location than it is to pop an unknown backtrack
				844	// location from the stack and jump there.
				845	//
				846	// The virtual state found in the Trace affects code generation. For example
				847	// the virtual state contains the difference between the actual current
				848	// position and the virtual current position, and matching code needs to use
				849	// this offset to attempt a match in the correct location of the input
				850	// string. Therefore code generated for a non-trivial trace is specialized
				851	// to that trace. The code generator therefore has the ability to generate
				852	// code for each node several times. In order to limit the size of the
				853	// generated code there is an arbitrary limit on how many specialized sets of
				854	// code may be generated for a given node. If the limit is reached, the
				855	// trace is flushed and a generic version of the code for a node is emitted.
				856	// This is subsequently used for that node. The code emitted for non-generic
				857	// trace is not recorded in the node and so it cannot currently be reused in
				858	// the event that code generation is requested for an identical trace.
				859
				860
				861	void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
				862	UNREACHABLE();
				863	}
				864
				865
				866	void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
				867	text->AddElement(TextElement::Atom(this), zone);
				868	}
				869
				870
				871	void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
				872	text->AddElement(TextElement::CharClass(this), zone);
				873	}
				874
				875
				876	void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
				877	for (int i = 0; i < elements()->length(); i++)
				878	text->AddElement(elements()->at(i), zone);
				879	}
				880
				881
				882	TextElement TextElement::Atom(RegExpAtom* atom) {
				883	return TextElement(ATOM, atom);
				884	}
				885
				886
				887	TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
				888	return TextElement(CHAR_CLASS, char_class);
				889	}
				890
				891
				892	int TextElement::length() const {
				893	switch (text_type()) {
				894	case ATOM:
				895	return atom()->length();
				896
				897	case CHAR_CLASS:
				898	return 1;
				899	}
				900	UNREACHABLE();
				901	return 0;
				902	}
				903
				904
				905	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				906	if (table_ == NULL) {
				907	table_ = new(zone()) DispatchTable(zone());
				908	DispatchTableConstructor cons(table_, ignore_case, zone());
				909	cons.BuildTable(this);
				910	}
				911	return table_;
				912	}
				913
				914
				915	class FrequencyCollator {
				916	public:
				917	FrequencyCollator() : total_samples_(0) {
				918	for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
				919	frequencies_[i] = CharacterFrequency(i);
				920	}
				921	}
				922
				923	void CountCharacter(int character) {
				924	int index = (character & RegExpMacroAssembler::kTableMask);
				925	frequencies_[index].Increment();
				926	total_samples_++;
				927	}
				928
				929	// Does not measure in percent, but rather per-128 (the table size from the
				930	// regexp macro assembler).
				931	int Frequency(int in_character) {
				932	DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
				933	if (total_samples_ < 1) return 1; // Division by zero.
				934	int freq_in_per128 =
				935	(frequencies_[in_character].counter() * 128) / total_samples_;
				936	return freq_in_per128;
				937	}
				938
				939	private:
				940	class CharacterFrequency {
				941	public:
				942	CharacterFrequency() : counter_(0), character_(-1) { }
				943	explicit CharacterFrequency(int character)
				944	: counter_(0), character_(character) { }
				945
				946	void Increment() { counter_++; }
				947	int counter() { return counter_; }
				948	int character() { return character_; }
				949
				950	private:
				951	int counter_;
				952	int character_;
				953	};
				954
				955
				956	private:
				957	CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
				958	int total_samples_;
				959	};
				960
				961
				962	class RegExpCompiler {
				963	public:
				964	RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	965	JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	966
				967	int AllocateRegister() {
				968	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				969	reg_exp_too_big_ = true;
				970	return next_register_;
				971	}
				972	return next_register_++;
				973	}
				974
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	975	// Lookarounds to match lone surrogates for unicode character class matches
				976	// are never nested. We can therefore reuse registers.
				977	int UnicodeLookaroundStackRegister() {
				978	if (unicode_lookaround_stack_register_ == kNoRegister) {
				979	unicode_lookaround_stack_register_ = AllocateRegister();
				980	}
				981	return unicode_lookaround_stack_register_;
				982	}
				983
				984	int UnicodeLookaroundPositionRegister() {
				985	if (unicode_lookaround_position_register_ == kNoRegister) {
				986	unicode_lookaround_position_register_ = AllocateRegister();
				987	}
				988	return unicode_lookaround_position_register_;
				989	}
				990
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	991	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				992	RegExpNode* start,
				993	int capture_count,
				994	Handle<String> pattern);
				995
				996	inline void AddWork(RegExpNode* node) {
				997	if (!node->on_work_list() && !node->label()->is_bound()) {
				998	node->set_on_work_list(true);
				999	work_list_->Add(node);
				1000	}
				1001	}
				1002
				1003	static const int kImplementationOffset = 0;
				1004	static const int kNumberOfRegistersOffset = 0;
				1005	static const int kCodeOffset = 1;
				1006
				1007	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				1008	EndNode* accept() { return accept_; }
				1009
				1010	static const int kMaxRecursion = 100;
				1011	inline int recursion_depth() { return recursion_depth_; }
				1012	inline void IncrementRecursionDepth() { recursion_depth_++; }
				1013	inline void DecrementRecursionDepth() { recursion_depth_--; }
				1014
				1015	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				1016
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1017	inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
				1018	inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1019	inline bool one_byte() { return one_byte_; }
				1020	inline bool optimize() { return optimize_; }
				1021	inline void set_optimize(bool value) { optimize_ = value; }
				1022	inline bool limiting_recursion() { return limiting_recursion_; }
				1023	inline void set_limiting_recursion(bool value) {
				1024	limiting_recursion_ = value;
				1025	}
				1026	bool read_backward() { return read_backward_; }
				1027	void set_read_backward(bool value) { read_backward_ = value; }
				1028	FrequencyCollator* frequency_collator() { return &frequency_collator_; }
				1029
				1030	int current_expansion_factor() { return current_expansion_factor_; }
				1031	void set_current_expansion_factor(int value) {
				1032	current_expansion_factor_ = value;
				1033	}
				1034
				1035	Isolate* isolate() const { return isolate_; }
				1036	Zone* zone() const { return zone_; }
				1037
				1038	static const int kNoRegister = -1;
				1039
				1040	private:
				1041	EndNode* accept_;
				1042	int next_register_;
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1043	int unicode_lookaround_stack_register_;
				1044	int unicode_lookaround_position_register_;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1045	List<RegExpNode> work_list_;
				1046	int recursion_depth_;
				1047	RegExpMacroAssembler* macro_assembler_;
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1048	JSRegExp::Flags flags_;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1049	bool one_byte_;
				1050	bool reg_exp_too_big_;
				1051	bool limiting_recursion_;
				1052	bool optimize_;
				1053	bool read_backward_;
				1054	int current_expansion_factor_;
				1055	FrequencyCollator frequency_collator_;
				1056	Isolate* isolate_;
				1057	Zone* zone_;
				1058	};
				1059
				1060
				1061	class RecursionCheck {
				1062	public:
				1063	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				1064	compiler->IncrementRecursionDepth();
				1065	}
				1066	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				1067	private:
				1068	RegExpCompiler* compiler_;
				1069	};
				1070
				1071
				1072	static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
				1073	return RegExpEngine::CompilationResult(isolate, "RegExp too big");
				1074	}
				1075
				1076
				1077	// Attempts to compile the regexp using an Irregexp code generator. Returns
				1078	// a fixed array or a null handle depending on whether it succeeded.
				1079	RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1080	JSRegExp::Flags flags, bool one_byte)
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1081	: next_register_(2 * (capture_count + 1)),
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1082	unicode_lookaround_stack_register_(kNoRegister),
				1083	unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1084	work_list_(NULL),
				1085	recursion_depth_(0),
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1086	flags_(flags),
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1087	one_byte_(one_byte),
				1088	reg_exp_too_big_(false),
				1089	limiting_recursion_(false),
				1090	optimize_(FLAG_regexp_optimization),
				1091	read_backward_(false),
				1092	current_expansion_factor_(1),
				1093	frequency_collator_(),
				1094	isolate_(isolate),
				1095	zone_(zone) {
				1096	accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
				1097	DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				1098	}
				1099
				1100
				1101	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				1102	RegExpMacroAssembler* macro_assembler,
				1103	RegExpNode* start,
				1104	int capture_count,
				1105	Handle<String> pattern) {
				1106	Heap* heap = pattern->GetHeap();
				1107
				1108	#ifdef DEBUG
				1109	if (FLAG_trace_regexp_assembler)
				1110	macro_assembler_ =
				1111	new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
				1112	else
				1113	#endif
				1114	macro_assembler_ = macro_assembler;
				1115
				1116	List <RegExpNode*> work_list(0);
				1117	work_list_ = &work_list;
				1118	Label fail;
				1119	macro_assembler_->PushBacktrack(&fail);
				1120	Trace new_trace;
				1121	start->Emit(this, &new_trace);
				1122	macro_assembler_->Bind(&fail);
				1123	macro_assembler_->Fail();
				1124	while (!work_list.is_empty()) {
				1125	RegExpNode* node = work_list.RemoveLast();
				1126	node->set_on_work_list(false);
				1127	if (!node->label()->is_bound()) node->Emit(this, &new_trace);
				1128	}
				1129	if (reg_exp_too_big_) {
				1130	macro_assembler_->AbortedCodeGeneration();
				1131	return IrregexpRegExpTooBig(isolate_);
				1132	}
				1133
				1134	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				1135	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
				1136	work_list_ = NULL;
				1137	#ifdef ENABLE_DISASSEMBLER
				1138	if (FLAG_print_code) {
				1139	CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
				1140	OFStream os(trace_scope.file());
				1141	Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
				1142	}
				1143	#endif
				1144	#ifdef DEBUG
				1145	if (FLAG_trace_regexp_assembler) {
				1146	delete macro_assembler_;
				1147	}
				1148	#endif
				1149	return RegExpEngine::CompilationResult(*code, next_register_);
				1150	}
				1151
				1152
				1153	bool Trace::DeferredAction::Mentions(int that) {
				1154	if (action_type() == ActionNode::CLEAR_CAPTURES) {
				1155	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				1156	return range.Contains(that);
				1157	} else {
				1158	return reg() == that;
				1159	}
				1160	}
				1161
				1162
				1163	bool Trace::mentions_reg(int reg) {
				1164	for (DeferredAction* action = actions_;
				1165	action != NULL;
				1166	action = action->next()) {
				1167	if (action->Mentions(reg))
				1168	return true;
				1169	}
				1170	return false;
				1171	}
				1172
				1173
				1174	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				1175	DCHECK_EQ(0, *cp_offset);
				1176	for (DeferredAction* action = actions_;
				1177	action != NULL;
				1178	action = action->next()) {
				1179	if (action->Mentions(reg)) {
				1180	if (action->action_type() == ActionNode::STORE_POSITION) {
				1181	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				1182	return true;
				1183	} else {
				1184	return false;
				1185	}
				1186	}
				1187	}
				1188	return false;
				1189	}
				1190
				1191
				1192	int Trace::FindAffectedRegisters(OutSet* affected_registers,
				1193	Zone* zone) {
				1194	int max_register = RegExpCompiler::kNoRegister;
				1195	for (DeferredAction* action = actions_;
				1196	action != NULL;
				1197	action = action->next()) {
				1198	if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
				1199	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				1200	for (int i = range.from(); i <= range.to(); i++)
				1201	affected_registers->Set(i, zone);
				1202	if (range.to() > max_register) max_register = range.to();
				1203	} else {
				1204	affected_registers->Set(action->reg(), zone);
				1205	if (action->reg() > max_register) max_register = action->reg();
				1206	}
				1207	}
				1208	return max_register;
				1209	}
				1210
				1211
				1212	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				1213	int max_register,
				1214	const OutSet& registers_to_pop,
				1215	const OutSet& registers_to_clear) {
				1216	for (int reg = max_register; reg >= 0; reg--) {
				1217	if (registers_to_pop.Get(reg)) {
				1218	assembler->PopRegister(reg);
				1219	} else if (registers_to_clear.Get(reg)) {
				1220	int clear_to = reg;
				1221	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				1222	reg--;
				1223	}
				1224	assembler->ClearRegisters(reg, clear_to);
				1225	}
				1226	}
				1227	}
				1228
				1229
				1230	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1231	int max_register,
				1232	const OutSet& affected_registers,
				1233	OutSet* registers_to_pop,
				1234	OutSet* registers_to_clear,
				1235	Zone* zone) {
				1236	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1237	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1238
				1239	// Count pushes performed to force a stack limit check occasionally.
				1240	int pushes = 0;
				1241
				1242	for (int reg = 0; reg <= max_register; reg++) {
				1243	if (!affected_registers.Get(reg)) {
				1244	continue;
				1245	}
				1246
				1247	// The chronologically first deferred action in the trace
				1248	// is used to infer the action needed to restore a register
				1249	// to its previous state (or not, if it's safe to ignore it).
				1250	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1251	DeferredActionUndoType undo_action = IGNORE;
				1252
				1253	int value = 0;
				1254	bool absolute = false;
				1255	bool clear = false;
				1256	static const int kNoStore = kMinInt;
				1257	int store_position = kNoStore;
				1258	// This is a little tricky because we are scanning the actions in reverse
				1259	// historical order (newest first).
				1260	for (DeferredAction* action = actions_;
				1261	action != NULL;
				1262	action = action->next()) {
				1263	if (action->Mentions(reg)) {
				1264	switch (action->action_type()) {
				1265	case ActionNode::SET_REGISTER: {
				1266	Trace::DeferredSetRegister* psr =
				1267	static_cast<Trace::DeferredSetRegister*>(action);
				1268	if (!absolute) {
				1269	value += psr->value();
				1270	absolute = true;
				1271	}
				1272	// SET_REGISTER is currently only used for newly introduced loop
				1273	// counters. They can have a significant previous value if they
				1274	// occour in a loop. TODO(lrn): Propagate this information, so
				1275	// we can set undo_action to IGNORE if we know there is no value to
				1276	// restore.
				1277	undo_action = RESTORE;
				1278	DCHECK_EQ(store_position, kNoStore);
				1279	DCHECK(!clear);
				1280	break;
				1281	}
				1282	case ActionNode::INCREMENT_REGISTER:
				1283	if (!absolute) {
				1284	value++;
				1285	}
				1286	DCHECK_EQ(store_position, kNoStore);
				1287	DCHECK(!clear);
				1288	undo_action = RESTORE;
				1289	break;
				1290	case ActionNode::STORE_POSITION: {
				1291	Trace::DeferredCapture* pc =
				1292	static_cast<Trace::DeferredCapture*>(action);
				1293	if (!clear && store_position == kNoStore) {
				1294	store_position = pc->cp_offset();
				1295	}
				1296
				1297	// For captures we know that stores and clears alternate.
				1298	// Other register, are never cleared, and if the occur
				1299	// inside a loop, they might be assigned more than once.
				1300	if (reg <= 1) {
				1301	// Registers zero and one, aka "capture zero", is
				1302	// always set correctly if we succeed. There is no
				1303	// need to undo a setting on backtrack, because we
				1304	// will set it again or fail.
				1305	undo_action = IGNORE;
				1306	} else {
				1307	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1308	}
				1309	DCHECK(!absolute);
				1310	DCHECK_EQ(value, 0);
				1311	break;
				1312	}
				1313	case ActionNode::CLEAR_CAPTURES: {
				1314	// Since we're scanning in reverse order, if we've already
				1315	// set the position we have to ignore historically earlier
				1316	// clearing operations.
				1317	if (store_position == kNoStore) {
				1318	clear = true;
				1319	}
				1320	undo_action = RESTORE;
				1321	DCHECK(!absolute);
				1322	DCHECK_EQ(value, 0);
				1323	break;
				1324	}
				1325	default:
				1326	UNREACHABLE();
				1327	break;
				1328	}
				1329	}
				1330	}
				1331	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1332	if (undo_action == RESTORE) {
				1333	pushes++;
				1334	RegExpMacroAssembler::StackCheckFlag stack_check =
				1335	RegExpMacroAssembler::kNoStackLimitCheck;
				1336	if (pushes == push_limit) {
				1337	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1338	pushes = 0;
				1339	}
				1340
				1341	assembler->PushRegister(reg, stack_check);
				1342	registers_to_pop->Set(reg, zone);
				1343	} else if (undo_action == CLEAR) {
				1344	registers_to_clear->Set(reg, zone);
				1345	}
				1346	// Perform the chronologically last action (or accumulated increment)
				1347	// for the register.
				1348	if (store_position != kNoStore) {
				1349	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1350	} else if (clear) {
				1351	assembler->ClearRegisters(reg, reg);
				1352	} else if (absolute) {
				1353	assembler->SetRegister(reg, value);
				1354	} else if (value != 0) {
				1355	assembler->AdvanceRegister(reg, value);
				1356	}
				1357	}
				1358	}
				1359
				1360
				1361	// This is called as we come into a loop choice node and some other tricky
				1362	// nodes. It normalizes the state of the code generator to ensure we can
				1363	// generate generic code.
				1364	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1365	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1366
				1367	DCHECK(!is_trivial());
				1368
				1369	if (actions_ == NULL && backtrack() == NULL) {
				1370	// Here we just have some deferred cp advances to fix and we are back to
				1371	// a normal situation. We may also have to forget some information gained
				1372	// through a quick check that was already performed.
				1373	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1374	// Create a new trivial state and generate the node with that.
				1375	Trace new_state;
				1376	successor->Emit(compiler, &new_state);
				1377	return;
				1378	}
				1379
				1380	// Generate deferred actions here along with code to undo them again.
				1381	OutSet affected_registers;
				1382
				1383	if (backtrack() != NULL) {
				1384	// Here we have a concrete backtrack location. These are set up by choice
				1385	// nodes and so they indicate that we have a deferred save of the current
				1386	// position which we may need to emit here.
				1387	assembler->PushCurrentPosition();
				1388	}
				1389
				1390	int max_register = FindAffectedRegisters(&affected_registers,
				1391	compiler->zone());
				1392	OutSet registers_to_pop;
				1393	OutSet registers_to_clear;
				1394	PerformDeferredActions(assembler,
				1395	max_register,
				1396	affected_registers,
				1397	&registers_to_pop,
				1398	&registers_to_clear,
				1399	compiler->zone());
				1400	if (cp_offset_ != 0) {
				1401	assembler->AdvanceCurrentPosition(cp_offset_);
				1402	}
				1403
				1404	// Create a new trivial state and generate the node with that.
				1405	Label undo;
				1406	assembler->PushBacktrack(&undo);
				1407	if (successor->KeepRecursing(compiler)) {
				1408	Trace new_state;
				1409	successor->Emit(compiler, &new_state);
				1410	} else {
				1411	compiler->AddWork(successor);
				1412	assembler->GoTo(successor->label());
				1413	}
				1414
				1415	// On backtrack we need to restore state.
				1416	assembler->Bind(&undo);
				1417	RestoreAffectedRegisters(assembler,
				1418	max_register,
				1419	registers_to_pop,
				1420	registers_to_clear);
				1421	if (backtrack() == NULL) {
				1422	assembler->Backtrack();
				1423	} else {
				1424	assembler->PopCurrentPosition();
				1425	assembler->GoTo(backtrack());
				1426	}
				1427	}
				1428
				1429
				1430	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1431	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1432
				1433	// Omit flushing the trace. We discard the entire stack frame anyway.
				1434
				1435	if (!label()->is_bound()) {
				1436	// We are completely independent of the trace, since we ignore it,
				1437	// so this code can be used as the generic version.
				1438	assembler->Bind(label());
				1439	}
				1440
				1441	// Throw away everything on the backtrack stack since the start
				1442	// of the negative submatch and restore the character position.
				1443	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1444	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1445	if (clear_capture_count_ > 0) {
				1446	// Clear any captures that might have been performed during the success
				1447	// of the body of the negative look-ahead.
				1448	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1449	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1450	}
				1451	// Now that we have unwound the stack we find at the top of the stack the
				1452	// backtrack that the BeginSubmatch node got.
				1453	assembler->Backtrack();
				1454	}
				1455
				1456
				1457	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1458	if (!trace->is_trivial()) {
				1459	trace->Flush(compiler, this);
				1460	return;
				1461	}
				1462	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1463	if (!label()->is_bound()) {
				1464	assembler->Bind(label());
				1465	}
				1466	switch (action_) {
				1467	case ACCEPT:
				1468	assembler->Succeed();
				1469	return;
				1470	case BACKTRACK:
				1471	assembler->GoTo(trace->backtrack());
				1472	return;
				1473	case NEGATIVE_SUBMATCH_SUCCESS:
				1474	// This case is handled in a different virtual method.
				1475	UNREACHABLE();
				1476	}
				1477	UNIMPLEMENTED();
				1478	}
				1479
				1480
				1481	void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
				1482	if (guards_ == NULL)
				1483	guards_ = new(zone) ZoneList<Guard*>(1, zone);
				1484	guards_->Add(guard, zone);
				1485	}
				1486
				1487
				1488	ActionNode* ActionNode::SetRegister(int reg,
				1489	int val,
				1490	RegExpNode* on_success) {
				1491	ActionNode* result =
				1492	new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
				1493	result->data_.u_store_register.reg = reg;
				1494	result->data_.u_store_register.value = val;
				1495	return result;
				1496	}
				1497
				1498
				1499	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1500	ActionNode* result =
				1501	new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
				1502	result->data_.u_increment_register.reg = reg;
				1503	return result;
				1504	}
				1505
				1506
				1507	ActionNode* ActionNode::StorePosition(int reg,
				1508	bool is_capture,
				1509	RegExpNode* on_success) {
				1510	ActionNode* result =
				1511	new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
				1512	result->data_.u_position_register.reg = reg;
				1513	result->data_.u_position_register.is_capture = is_capture;
				1514	return result;
				1515	}
				1516
				1517
				1518	ActionNode* ActionNode::ClearCaptures(Interval range,
				1519	RegExpNode* on_success) {
				1520	ActionNode* result =
				1521	new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
				1522	result->data_.u_clear_captures.range_from = range.from();
				1523	result->data_.u_clear_captures.range_to = range.to();
				1524	return result;
				1525	}
				1526
				1527
				1528	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1529	int position_reg,
				1530	RegExpNode* on_success) {
				1531	ActionNode* result =
				1532	new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
				1533	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1534	result->data_.u_submatch.current_position_register = position_reg;
				1535	return result;
				1536	}
				1537
				1538
				1539	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1540	int position_reg,
				1541	int clear_register_count,
				1542	int clear_register_from,
				1543	RegExpNode* on_success) {
				1544	ActionNode* result =
				1545	new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1546	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1547	result->data_.u_submatch.current_position_register = position_reg;
				1548	result->data_.u_submatch.clear_register_count = clear_register_count;
				1549	result->data_.u_submatch.clear_register_from = clear_register_from;
				1550	return result;
				1551	}
				1552
				1553
				1554	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1555	int repetition_register,
				1556	int repetition_limit,
				1557	RegExpNode* on_success) {
				1558	ActionNode* result =
				1559	new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
				1560	result->data_.u_empty_match_check.start_register = start_register;
				1561	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1562	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1563	return result;
				1564	}
				1565
				1566
				1567	#define DEFINE_ACCEPT(Type) \
				1568	void Type##Node::Accept(NodeVisitor* visitor) { \
				1569	visitor->Visit##Type(this); \
				1570	}
				1571	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1572	#undef DEFINE_ACCEPT
				1573
				1574
				1575	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1576	visitor->VisitLoopChoice(this);
				1577	}
				1578
				1579
				1580	// -------------------------------------------------------------------
				1581	// Emit code.
				1582
				1583
				1584	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1585	Guard* guard,
				1586	Trace* trace) {
				1587	switch (guard->op()) {
				1588	case Guard::LT:
				1589	DCHECK(!trace->mentions_reg(guard->reg()));
				1590	macro_assembler->IfRegisterGE(guard->reg(),
				1591	guard->value(),
				1592	trace->backtrack());
				1593	break;
				1594	case Guard::GEQ:
				1595	DCHECK(!trace->mentions_reg(guard->reg()));
				1596	macro_assembler->IfRegisterLT(guard->reg(),
				1597	guard->value(),
				1598	trace->backtrack());
				1599	break;
				1600	}
				1601	}
				1602
				1603
				1604	// Returns the number of characters in the equivalence class, omitting those
				1605	// that cannot occur in the source string because it is Latin1.
				1606	static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
				1607	bool one_byte_subject,
				1608	unibrow::uchar* letters) {
				1609	int length =
				1610	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
				1611	// Unibrow returns 0 or 1 for characters where case independence is
				1612	// trivial.
				1613	if (length == 0) {
				1614	letters[0] = character;
				1615	length = 1;
				1616	}
				1617
				1618	if (one_byte_subject) {
				1619	int new_length = 0;
				1620	for (int i = 0; i < length; i++) {
				1621	if (letters[i] <= String::kMaxOneByteCharCode) {
				1622	letters[new_length++] = letters[i];
				1623	}
				1624	}
				1625	length = new_length;
				1626	}
				1627
				1628	return length;
				1629	}
				1630
				1631
				1632	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1633	RegExpCompiler* compiler,
				1634	uc16 c,
				1635	Label* on_failure,
				1636	int cp_offset,
				1637	bool check,
				1638	bool preloaded) {
				1639	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1640	bool bound_checked = false;
				1641	if (!preloaded) {
				1642	assembler->LoadCurrentCharacter(
				1643	cp_offset,
				1644	on_failure,
				1645	check);
				1646	bound_checked = true;
				1647	}
				1648	assembler->CheckNotCharacter(c, on_failure);
				1649	return bound_checked;
				1650	}
				1651
				1652
				1653	// Only emits non-letters (things that don't have case). Only used for case
				1654	// independent matches.
				1655	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1656	RegExpCompiler* compiler,
				1657	uc16 c,
				1658	Label* on_failure,
				1659	int cp_offset,
				1660	bool check,
				1661	bool preloaded) {
				1662	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1663	bool one_byte = compiler->one_byte();
				1664	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1665	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1666	if (length < 1) {
				1667	// This can't match. Must be an one-byte subject and a non-one-byte
				1668	// character. We do not need to do anything since the one-byte pass
				1669	// already handled this.
				1670	return false; // Bounds not checked.
				1671	}
				1672	bool checked = false;
				1673	// We handle the length > 1 case in a later pass.
				1674	if (length == 1) {
				1675	if (one_byte && c > String::kMaxOneByteCharCodeU) {
				1676	// Can't match - see above.
				1677	return false; // Bounds not checked.
				1678	}
				1679	if (!preloaded) {
				1680	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1681	checked = check;
				1682	}
				1683	macro_assembler->CheckNotCharacter(c, on_failure);
				1684	}
				1685	return checked;
				1686	}
				1687
				1688
				1689	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1690	bool one_byte, uc16 c1, uc16 c2,
				1691	Label* on_failure) {
				1692	uc16 char_mask;
				1693	if (one_byte) {
				1694	char_mask = String::kMaxOneByteCharCode;
				1695	} else {
				1696	char_mask = String::kMaxUtf16CodeUnit;
				1697	}
				1698	uc16 exor = c1 ^ c2;
				1699	// Check whether exor has only one bit set.
				1700	if (((exor - 1) & exor) == 0) {
				1701	// If c1 and c2 differ only by one bit.
				1702	// Ecma262UnCanonicalize always gives the highest number last.
				1703	DCHECK(c2 > c1);
				1704	uc16 mask = char_mask ^ exor;
				1705	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1706	return true;
				1707	}
				1708	DCHECK(c2 > c1);
				1709	uc16 diff = c2 - c1;
				1710	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1711	// If the characters differ by 2^n but don't differ by one bit then
				1712	// subtract the difference from the found character, then do the or
				1713	// trick. We avoid the theoretical case where negative numbers are
				1714	// involved in order to simplify code generation.
				1715	uc16 mask = char_mask ^ diff;
				1716	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1717	diff,
				1718	mask,
				1719	on_failure);
				1720	return true;
				1721	}
				1722	return false;
				1723	}
				1724
				1725
				1726	typedef bool EmitCharacterFunction(Isolate* isolate,
				1727	RegExpCompiler* compiler,
				1728	uc16 c,
				1729	Label* on_failure,
				1730	int cp_offset,
				1731	bool check,
				1732	bool preloaded);
				1733
				1734	// Only emits letters (things that have case). Only used for case independent
				1735	// matches.
				1736	static inline bool EmitAtomLetter(Isolate* isolate,
				1737	RegExpCompiler* compiler,
				1738	uc16 c,
				1739	Label* on_failure,
				1740	int cp_offset,
				1741	bool check,
				1742	bool preloaded) {
				1743	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1744	bool one_byte = compiler->one_byte();
				1745	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1746	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1747	if (length <= 1) return false;
				1748	// We may not need to check against the end of the input string
				1749	// if this character lies before a character that matched.
				1750	if (!preloaded) {
				1751	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1752	}
				1753	Label ok;
				1754	DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1755	switch (length) {
				1756	case 2: {
				1757	if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
				1758	chars[1], on_failure)) {
				1759	} else {
				1760	macro_assembler->CheckCharacter(chars[0], &ok);
				1761	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1762	macro_assembler->Bind(&ok);
				1763	}
				1764	break;
				1765	}
				1766	case 4:
				1767	macro_assembler->CheckCharacter(chars[3], &ok);
				1768	// Fall through!
				1769	case 3:
				1770	macro_assembler->CheckCharacter(chars[0], &ok);
				1771	macro_assembler->CheckCharacter(chars[1], &ok);
				1772	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1773	macro_assembler->Bind(&ok);
				1774	break;
				1775	default:
				1776	UNREACHABLE();
				1777	break;
				1778	}
				1779	return true;
				1780	}
				1781
				1782
				1783	static void EmitBoundaryTest(RegExpMacroAssembler* masm,
				1784	int border,
				1785	Label* fall_through,
				1786	Label* above_or_equal,
				1787	Label* below) {
				1788	if (below != fall_through) {
				1789	masm->CheckCharacterLT(border, below);
				1790	if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
				1791	} else {
				1792	masm->CheckCharacterGT(border - 1, above_or_equal);
				1793	}
				1794	}
				1795
				1796
				1797	static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
				1798	int first,
				1799	int last,
				1800	Label* fall_through,
				1801	Label* in_range,
				1802	Label* out_of_range) {
				1803	if (in_range == fall_through) {
				1804	if (first == last) {
				1805	masm->CheckNotCharacter(first, out_of_range);
				1806	} else {
				1807	masm->CheckCharacterNotInRange(first, last, out_of_range);
				1808	}
				1809	} else {
				1810	if (first == last) {
				1811	masm->CheckCharacter(first, in_range);
				1812	} else {
				1813	masm->CheckCharacterInRange(first, last, in_range);
				1814	}
				1815	if (out_of_range != fall_through) masm->GoTo(out_of_range);
				1816	}
				1817	}
				1818
				1819
				1820	// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
				1821	// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
				1822	static void EmitUseLookupTable(
				1823	RegExpMacroAssembler* masm,
				1824	ZoneList<int>* ranges,
				1825	int start_index,
				1826	int end_index,
				1827	int min_char,
				1828	Label* fall_through,
				1829	Label* even_label,
				1830	Label* odd_label) {
				1831	static const int kSize = RegExpMacroAssembler::kTableSize;
				1832	static const int kMask = RegExpMacroAssembler::kTableMask;
				1833
				1834	int base = (min_char & ~kMask);
				1835	USE(base);
				1836
				1837	// Assert that everything is on one kTableSize page.
				1838	for (int i = start_index; i <= end_index; i++) {
				1839	DCHECK_EQ(ranges->at(i) & ~kMask, base);
				1840	}
				1841	DCHECK(start_index == 0 \|\| (ranges->at(start_index - 1) & ~kMask) <= base);
				1842
				1843	char templ[kSize];
				1844	Label* on_bit_set;
				1845	Label* on_bit_clear;
				1846	int bit;
				1847	if (even_label == fall_through) {
				1848	on_bit_set = odd_label;
				1849	on_bit_clear = even_label;
				1850	bit = 1;
				1851	} else {
				1852	on_bit_set = even_label;
				1853	on_bit_clear = odd_label;
				1854	bit = 0;
				1855	}
				1856	for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
				1857	templ[i] = bit;
				1858	}
				1859	int j = 0;
				1860	bit ^= 1;
				1861	for (int i = start_index; i < end_index; i++) {
				1862	for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
				1863	templ[j] = bit;
				1864	}
				1865	bit ^= 1;
				1866	}
				1867	for (int i = j; i < kSize; i++) {
				1868	templ[i] = bit;
				1869	}
				1870	Factory* factory = masm->isolate()->factory();
				1871	// TODO(erikcorry): Cache these.
				1872	Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
				1873	for (int i = 0; i < kSize; i++) {
				1874	ba->set(i, templ[i]);
				1875	}
				1876	masm->CheckBitInTable(ba, on_bit_set);
				1877	if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
				1878	}
				1879
				1880
				1881	static void CutOutRange(RegExpMacroAssembler* masm,
				1882	ZoneList<int>* ranges,
				1883	int start_index,
				1884	int end_index,
				1885	int cut_index,
				1886	Label* even_label,
				1887	Label* odd_label) {
				1888	bool odd = (((cut_index - start_index) & 1) == 1);
				1889	Label* in_range_label = odd ? odd_label : even_label;
				1890	Label dummy;
				1891	EmitDoubleBoundaryTest(masm,
				1892	ranges->at(cut_index),
				1893	ranges->at(cut_index + 1) - 1,
				1894	&dummy,
				1895	in_range_label,
				1896	&dummy);
				1897	DCHECK(!dummy.is_linked());
				1898	// Cut out the single range by rewriting the array. This creates a new
				1899	// range that is a merger of the two ranges on either side of the one we
				1900	// are cutting out. The oddity of the labels is preserved.
				1901	for (int j = cut_index; j > start_index; j--) {
				1902	ranges->at(j) = ranges->at(j - 1);
				1903	}
				1904	for (int j = cut_index + 1; j < end_index; j++) {
				1905	ranges->at(j) = ranges->at(j + 1);
				1906	}
				1907	}
				1908
				1909
				1910	// Unicode case. Split the search space into kSize spaces that are handled
				1911	// with recursion.
				1912	static void SplitSearchSpace(ZoneList<int>* ranges,
				1913	int start_index,
				1914	int end_index,
				1915	int* new_start_index,
				1916	int* new_end_index,
				1917	int* border) {
				1918	static const int kSize = RegExpMacroAssembler::kTableSize;
				1919	static const int kMask = RegExpMacroAssembler::kTableMask;
				1920
				1921	int first = ranges->at(start_index);
				1922	int last = ranges->at(end_index) - 1;
				1923
				1924	*new_start_index = start_index;
				1925	*border = (ranges->at(start_index) & ~kMask) + kSize;
				1926	while (*new_start_index < end_index) {
				1927	if (ranges->at(new_start_index) > border) break;
				1928	(*new_start_index)++;
				1929	}
				1930	// new_start_index is the index of the first edge that is beyond the
				1931	// current kSize space.
				1932
				1933	// For very large search spaces we do a binary chop search of the non-Latin1
				1934	// space instead of just going to the end of the current kSize space. The
				1935	// heuristics are complicated a little by the fact that any 128-character
				1936	// encoding space can be quickly tested with a table lookup, so we don't
				1937	// wish to do binary chop search at a smaller granularity than that. A
				1938	// 128-character space can take up a lot of space in the ranges array if,
				1939	// for example, we only want to match every second character (eg. the lower
				1940	// case characters on some Unicode pages).
				1941	int binary_chop_index = (end_index + start_index) / 2;
				1942	// The first test ensures that we get to the code that handles the Latin1
				1943	// range with a single not-taken branch, speeding up this important
				1944	// character range (even non-Latin1 charset-based text has spaces and
				1945	// punctuation).
				1946	if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
				1947	end_index - start_index > (new_start_index - start_index) 2 &&
				1948	last - first > kSize * 2 && binary_chop_index > *new_start_index &&
				1949	ranges->at(binary_chop_index) >= first + 2 * kSize) {
				1950	int scan_forward_for_section_border = binary_chop_index;;
				1951	int new_border = (ranges->at(binary_chop_index) \| kMask) + 1;
				1952
				1953	while (scan_forward_for_section_border < end_index) {
				1954	if (ranges->at(scan_forward_for_section_border) > new_border) {
				1955	*new_start_index = scan_forward_for_section_border;
				1956	*border = new_border;
				1957	break;
				1958	}
				1959	scan_forward_for_section_border++;
				1960	}
				1961	}
				1962
				1963	DCHECK(*new_start_index > start_index);
				1964	new_end_index = new_start_index - 1;
				1965	if (ranges->at(new_end_index) == border) {
				1966	(*new_end_index)--;
				1967	}
				1968	if (*border >= ranges->at(end_index)) {
				1969	*border = ranges->at(end_index);
				1970	*new_start_index = end_index; // Won't be used.
				1971	*new_end_index = end_index - 1;
				1972	}
				1973	}
				1974
				1975
				1976	// Gets a series of segment boundaries representing a character class. If the
				1977	// character is in the range between an even and an odd boundary (counting from
				1978	// start_index) then go to even_label, otherwise go to odd_label. We already
				1979	// know that the character is in the range of min_char to max_char inclusive.
				1980	// Either label can be NULL indicating backtracking. Either label can also be
				1981	// equal to the fall_through label.
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	1982	static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
				1983	int start_index, int end_index, uc32 min_char,
				1984	uc32 max_char, Label* fall_through,
				1985	Label* even_label, Label* odd_label) {
				1986	DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
				1987	DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
				1988
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	1989	int first = ranges->at(start_index);
				1990	int last = ranges->at(end_index) - 1;
				1991
				1992	DCHECK_LT(min_char, first);
				1993
				1994	// Just need to test if the character is before or on-or-after
				1995	// a particular character.
				1996	if (start_index == end_index) {
				1997	EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
				1998	return;
				1999	}
				2000
				2001	// Another almost trivial case: There is one interval in the middle that is
				2002	// different from the end intervals.
				2003	if (start_index + 1 == end_index) {
				2004	EmitDoubleBoundaryTest(
				2005	masm, first, last, fall_through, even_label, odd_label);
				2006	return;
				2007	}
				2008
				2009	// It's not worth using table lookup if there are very few intervals in the
				2010	// character class.
				2011	if (end_index - start_index <= 6) {
				2012	// It is faster to test for individual characters, so we look for those
				2013	// first, then try arbitrary ranges in the second round.
				2014	static int kNoCutIndex = -1;
				2015	int cut = kNoCutIndex;
				2016	for (int i = start_index; i < end_index; i++) {
				2017	if (ranges->at(i) == ranges->at(i + 1) - 1) {
				2018	cut = i;
				2019	break;
				2020	}
				2021	}
				2022	if (cut == kNoCutIndex) cut = start_index;
				2023	CutOutRange(
				2024	masm, ranges, start_index, end_index, cut, even_label, odd_label);
				2025	DCHECK_GE(end_index - start_index, 2);
				2026	GenerateBranches(masm,
				2027	ranges,
				2028	start_index + 1,
				2029	end_index - 1,
				2030	min_char,
				2031	max_char,
				2032	fall_through,
				2033	even_label,
				2034	odd_label);
				2035	return;
				2036	}
				2037
				2038	// If there are a lot of intervals in the regexp, then we will use tables to
				2039	// determine whether the character is inside or outside the character class.
				2040	static const int kBits = RegExpMacroAssembler::kTableSizeBits;
				2041
				2042	if ((max_char >> kBits) == (min_char >> kBits)) {
				2043	EmitUseLookupTable(masm,
				2044	ranges,
				2045	start_index,
				2046	end_index,
				2047	min_char,
				2048	fall_through,
				2049	even_label,
				2050	odd_label);
				2051	return;
				2052	}
				2053
				2054	if ((min_char >> kBits) != (first >> kBits)) {
				2055	masm->CheckCharacterLT(first, odd_label);
				2056	GenerateBranches(masm,
				2057	ranges,
				2058	start_index + 1,
				2059	end_index,
				2060	first,
				2061	max_char,
				2062	fall_through,
				2063	odd_label,
				2064	even_label);
				2065	return;
				2066	}
				2067
				2068	int new_start_index = 0;
				2069	int new_end_index = 0;
				2070	int border = 0;
				2071
				2072	SplitSearchSpace(ranges,
				2073	start_index,
				2074	end_index,
				2075	&new_start_index,
				2076	&new_end_index,
				2077	&border);
				2078
				2079	Label handle_rest;
				2080	Label* above = &handle_rest;
				2081	if (border == last + 1) {
				2082	// We didn't find any section that started after the limit, so everything
				2083	// above the border is one of the terminal labels.
				2084	above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
				2085	DCHECK(new_end_index == end_index - 1);
				2086	}
				2087
				2088	DCHECK_LE(start_index, new_end_index);
				2089	DCHECK_LE(new_start_index, end_index);
				2090	DCHECK_LT(start_index, new_start_index);
				2091	DCHECK_LT(new_end_index, end_index);
				2092	DCHECK(new_end_index + 1 == new_start_index \|\|
				2093	(new_end_index + 2 == new_start_index &&
				2094	border == ranges->at(new_end_index + 1)));
				2095	DCHECK_LT(min_char, border - 1);
				2096	DCHECK_LT(border, max_char);
				2097	DCHECK_LT(ranges->at(new_end_index), border);
				2098	DCHECK(border < ranges->at(new_start_index) \|\|
				2099	(border == ranges->at(new_start_index) &&
				2100	new_start_index == end_index &&
				2101	new_end_index == end_index - 1 &&
				2102	border == last + 1));
				2103	DCHECK(new_start_index == 0 \|\| border >= ranges->at(new_start_index - 1));
				2104
				2105	masm->CheckCharacterGT(border - 1, above);
				2106	Label dummy;
				2107	GenerateBranches(masm,
				2108	ranges,
				2109	start_index,
				2110	new_end_index,
				2111	min_char,
				2112	border - 1,
				2113	&dummy,
				2114	even_label,
				2115	odd_label);
				2116	if (handle_rest.is_linked()) {
				2117	masm->Bind(&handle_rest);
				2118	bool flip = (new_start_index & 1) != (start_index & 1);
				2119	GenerateBranches(masm,
				2120	ranges,
				2121	new_start_index,
				2122	end_index,
				2123	border,
				2124	max_char,
				2125	&dummy,
				2126	flip ? odd_label : even_label,
				2127	flip ? even_label : odd_label);
				2128	}
				2129	}
				2130
				2131
				2132	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				2133	RegExpCharacterClass* cc, bool one_byte,
				2134	Label* on_failure, int cp_offset, bool check_offset,
				2135	bool preloaded, Zone* zone) {
				2136	ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2137	CharacterRange::Canonicalize(ranges);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2138
				2139	int max_char;
				2140	if (one_byte) {
				2141	max_char = String::kMaxOneByteCharCode;
				2142	} else {
				2143	max_char = String::kMaxUtf16CodeUnit;
				2144	}
				2145
				2146	int range_count = ranges->length();
				2147
				2148	int last_valid_range = range_count - 1;
				2149	while (last_valid_range >= 0) {
				2150	CharacterRange& range = ranges->at(last_valid_range);
				2151	if (range.from() <= max_char) {
				2152	break;
				2153	}
				2154	last_valid_range--;
				2155	}
				2156
				2157	if (last_valid_range < 0) {
				2158	if (!cc->is_negated()) {
				2159	macro_assembler->GoTo(on_failure);
				2160	}
				2161	if (check_offset) {
				2162	macro_assembler->CheckPosition(cp_offset, on_failure);
				2163	}
				2164	return;
				2165	}
				2166
				2167	if (last_valid_range == 0 &&
				2168	ranges->at(0).IsEverything(max_char)) {
				2169	if (cc->is_negated()) {
				2170	macro_assembler->GoTo(on_failure);
				2171	} else {
				2172	// This is a common case hit by non-anchored expressions.
				2173	if (check_offset) {
				2174	macro_assembler->CheckPosition(cp_offset, on_failure);
				2175	}
				2176	}
				2177	return;
				2178	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2179
				2180	if (!preloaded) {
				2181	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				2182	}
				2183
				2184	if (cc->is_standard(zone) &&
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2185	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				2186	on_failure)) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2187	return;
				2188	}
				2189
				2190
				2191	// A new list with ascending entries. Each entry is a code unit
				2192	// where there is a boundary between code units that are part of
				2193	// the class and code units that are not. Normally we insert an
				2194	// entry at zero which goes to the failure label, but if there
				2195	// was already one there we fall through for success on that entry.
				2196	// Subsequent entries have alternating meaning (success/failure).
				2197	ZoneList<int>* range_boundaries =
				2198	new(zone) ZoneList<int>(last_valid_range, zone);
				2199
				2200	bool zeroth_entry_is_failure = !cc->is_negated();
				2201
				2202	for (int i = 0; i <= last_valid_range; i++) {
				2203	CharacterRange& range = ranges->at(i);
				2204	if (range.from() == 0) {
				2205	DCHECK_EQ(i, 0);
				2206	zeroth_entry_is_failure = !zeroth_entry_is_failure;
				2207	} else {
				2208	range_boundaries->Add(range.from(), zone);
				2209	}
				2210	range_boundaries->Add(range.to() + 1, zone);
				2211	}
				2212	int end_index = range_boundaries->length() - 1;
				2213	if (range_boundaries->at(end_index) > max_char) {
				2214	end_index--;
				2215	}
				2216
				2217	Label fall_through;
				2218	GenerateBranches(macro_assembler,
				2219	range_boundaries,
				2220	0, // start_index.
				2221	end_index,
				2222	0, // min_char.
				2223	max_char,
				2224	&fall_through,
				2225	zeroth_entry_is_failure ? &fall_through : on_failure,
				2226	zeroth_entry_is_failure ? on_failure : &fall_through);
				2227	macro_assembler->Bind(&fall_through);
				2228	}
				2229
				2230
				2231	RegExpNode::~RegExpNode() {
				2232	}
				2233
				2234
				2235	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				2236	Trace* trace) {
				2237	// If we are generating a greedy loop then don't stop and don't reuse code.
				2238	if (trace->stop_node() != NULL) {
				2239	return CONTINUE;
				2240	}
				2241
				2242	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2243	if (trace->is_trivial()) {
				2244	if (label_.is_bound() \|\| on_work_list() \|\| !KeepRecursing(compiler)) {
				2245	// If a generic version is already scheduled to be generated or we have
				2246	// recursed too deeply then just generate a jump to that code.
				2247	macro_assembler->GoTo(&label_);
				2248	// This will queue it up for generation of a generic version if it hasn't
				2249	// already been queued.
				2250	compiler->AddWork(this);
				2251	return DONE;
				2252	}
				2253	// Generate generic version of the node and bind the label for later use.
				2254	macro_assembler->Bind(&label_);
				2255	return CONTINUE;
				2256	}
				2257
				2258	// We are being asked to make a non-generic version. Keep track of how many
				2259	// non-generic versions we generate so as not to overdo it.
				2260	trace_count_++;
				2261	if (KeepRecursing(compiler) && compiler->optimize() &&
				2262	trace_count_ < kMaxCopiesCodeGenerated) {
				2263	return CONTINUE;
				2264	}
				2265
				2266	// If we get here code has been generated for this node too many times or
				2267	// recursion is too deep. Time to switch to a generic version. The code for
				2268	// generic versions above can handle deep recursion properly.
				2269	bool was_limiting = compiler->limiting_recursion();
				2270	compiler->set_limiting_recursion(true);
				2271	trace->Flush(compiler, this);
				2272	compiler->set_limiting_recursion(was_limiting);
				2273	return DONE;
				2274	}
				2275
				2276
				2277	bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
				2278	return !compiler->limiting_recursion() &&
				2279	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
				2280	}
				2281
				2282
				2283	int ActionNode::EatsAtLeast(int still_to_find,
				2284	int budget,
				2285	bool not_at_start) {
				2286	if (budget <= 0) return 0;
				2287	if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				2288	return on_success()->EatsAtLeast(still_to_find,
				2289	budget - 1,
				2290	not_at_start);
				2291	}
				2292
				2293
				2294	void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2295	BoyerMooreLookahead* bm, bool not_at_start) {
				2296	if (action_type_ == BEGIN_SUBMATCH) {
				2297	bm->SetRest(offset);
				2298	} else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
				2299	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2300	}
				2301	SaveBMInfo(bm, not_at_start, offset);
				2302	}
				2303
				2304
				2305	int AssertionNode::EatsAtLeast(int still_to_find,
				2306	int budget,
				2307	bool not_at_start) {
				2308	if (budget <= 0) return 0;
				2309	// If we know we are not at the start and we are asked "how many characters
				2310	// will you match if you succeed?" then we can answer anything since false
				2311	// implies false. So lets just return the max answer (still_to_find) since
				2312	// that won't prevent us from preloading a lot of characters for the other
				2313	// branches in the node graph.
				2314	if (assertion_type() == AT_START && not_at_start) return still_to_find;
				2315	return on_success()->EatsAtLeast(still_to_find,
				2316	budget - 1,
				2317	not_at_start);
				2318	}
				2319
				2320
				2321	void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2322	BoyerMooreLookahead* bm, bool not_at_start) {
				2323	// Match the behaviour of EatsAtLeast on this node.
				2324	if (assertion_type() == AT_START && not_at_start) return;
				2325	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2326	SaveBMInfo(bm, not_at_start, offset);
				2327	}
				2328
				2329
				2330	int BackReferenceNode::EatsAtLeast(int still_to_find,
				2331	int budget,
				2332	bool not_at_start) {
				2333	if (read_backward()) return 0;
				2334	if (budget <= 0) return 0;
				2335	return on_success()->EatsAtLeast(still_to_find,
				2336	budget - 1,
				2337	not_at_start);
				2338	}
				2339
				2340
				2341	int TextNode::EatsAtLeast(int still_to_find,
				2342	int budget,
				2343	bool not_at_start) {
				2344	if (read_backward()) return 0;
				2345	int answer = Length();
				2346	if (answer >= still_to_find) return answer;
				2347	if (budget <= 0) return answer;
				2348	// We are not at start after this node so we set the last argument to 'true'.
				2349	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				2350	budget - 1,
				2351	true);
				2352	}
				2353
				2354
				2355	int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
				2356	bool not_at_start) {
				2357	if (budget <= 0) return 0;
				2358	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2359	// afterwards.
				2360	RegExpNode* node = alternatives_->at(1).node();
				2361	return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
				2362	}
				2363
				2364
				2365	void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
				2366	QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
				2367	bool not_at_start) {
				2368	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2369	// afterwards.
				2370	RegExpNode* node = alternatives_->at(1).node();
				2371	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				2372	}
				2373
				2374
				2375	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				2376	int budget,
				2377	RegExpNode* ignore_this_node,
				2378	bool not_at_start) {
				2379	if (budget <= 0) return 0;
				2380	int min = 100;
				2381	int choice_count = alternatives_->length();
				2382	budget = (budget - 1) / choice_count;
				2383	for (int i = 0; i < choice_count; i++) {
				2384	RegExpNode* node = alternatives_->at(i).node();
				2385	if (node == ignore_this_node) continue;
				2386	int node_eats_at_least =
				2387	node->EatsAtLeast(still_to_find, budget, not_at_start);
				2388	if (node_eats_at_least < min) min = node_eats_at_least;
				2389	if (min == 0) return 0;
				2390	}
				2391	return min;
				2392	}
				2393
				2394
				2395	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				2396	int budget,
				2397	bool not_at_start) {
				2398	return EatsAtLeastHelper(still_to_find,
				2399	budget - 1,
				2400	loop_node_,
				2401	not_at_start);
				2402	}
				2403
				2404
				2405	int ChoiceNode::EatsAtLeast(int still_to_find,
				2406	int budget,
				2407	bool not_at_start) {
				2408	return EatsAtLeastHelper(still_to_find,
				2409	budget,
				2410	NULL,
				2411	not_at_start);
				2412	}
				2413
				2414
				2415	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				2416	static inline uint32_t SmearBitsRight(uint32_t v) {
				2417	v \|= v >> 1;
				2418	v \|= v >> 2;
				2419	v \|= v >> 4;
				2420	v \|= v >> 8;
				2421	v \|= v >> 16;
				2422	return v;
				2423	}
				2424
				2425
				2426	bool QuickCheckDetails::Rationalize(bool asc) {
				2427	bool found_useful_op = false;
				2428	uint32_t char_mask;
				2429	if (asc) {
				2430	char_mask = String::kMaxOneByteCharCode;
				2431	} else {
				2432	char_mask = String::kMaxUtf16CodeUnit;
				2433	}
				2434	mask_ = 0;
				2435	value_ = 0;
				2436	int char_shift = 0;
				2437	for (int i = 0; i < characters_; i++) {
				2438	Position* pos = &positions_[i];
				2439	if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
				2440	found_useful_op = true;
				2441	}
				2442	mask_ \|= (pos->mask & char_mask) << char_shift;
				2443	value_ \|= (pos->value & char_mask) << char_shift;
				2444	char_shift += asc ? 8 : 16;
				2445	}
				2446	return found_useful_op;
				2447	}
				2448
				2449
				2450	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				2451	Trace* bounds_check_trace,
				2452	Trace* trace,
				2453	bool preload_has_checked_bounds,
				2454	Label* on_possible_success,
				2455	QuickCheckDetails* details,
				2456	bool fall_through_on_failure) {
				2457	if (details->characters() == 0) return false;
				2458	GetQuickCheckDetails(
				2459	details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
				2460	if (details->cannot_match()) return false;
				2461	if (!details->Rationalize(compiler->one_byte())) return false;
				2462	DCHECK(details->characters() == 1 \|\|
				2463	compiler->macro_assembler()->CanReadUnaligned());
				2464	uint32_t mask = details->mask();
				2465	uint32_t value = details->value();
				2466
				2467	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2468
				2469	if (trace->characters_preloaded() != details->characters()) {
				2470	DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
				2471	// We are attempting to preload the minimum number of characters
				2472	// any choice would eat, so if the bounds check fails, then none of the
				2473	// choices can succeed, so we can just immediately backtrack, rather
				2474	// than go to the next choice.
				2475	assembler->LoadCurrentCharacter(trace->cp_offset(),
				2476	bounds_check_trace->backtrack(),
				2477	!preload_has_checked_bounds,
				2478	details->characters());
				2479	}
				2480
				2481
				2482	bool need_mask = true;
				2483
				2484	if (details->characters() == 1) {
				2485	// If number of characters preloaded is 1 then we used a byte or 16 bit
				2486	// load so the value is already masked down.
				2487	uint32_t char_mask;
				2488	if (compiler->one_byte()) {
				2489	char_mask = String::kMaxOneByteCharCode;
				2490	} else {
				2491	char_mask = String::kMaxUtf16CodeUnit;
				2492	}
				2493	if ((mask & char_mask) == char_mask) need_mask = false;
				2494	mask &= char_mask;
				2495	} else {
				2496	// For 2-character preloads in one-byte mode or 1-character preloads in
				2497	// two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2498	static const uint32_t kTwoByteMask = 0xffff;
				2499	static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2500	if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2501	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2502	} else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2503	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2504	} else {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2505	if (mask == kFourByteMask) need_mask = false;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2506	}
				2507	}
				2508
				2509	if (fall_through_on_failure) {
				2510	if (need_mask) {
				2511	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				2512	} else {
				2513	assembler->CheckCharacter(value, on_possible_success);
				2514	}
				2515	} else {
				2516	if (need_mask) {
				2517	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				2518	} else {
				2519	assembler->CheckNotCharacter(value, trace->backtrack());
				2520	}
				2521	}
				2522	return true;
				2523	}
				2524
				2525
				2526	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				2527	// super-class in the .h file).
				2528	//
				2529	// We iterate along the text object, building up for each character a
				2530	// mask and value that can be used to test for a quick failure to match.
				2531	// The masks and values for the positions will be combined into a single
				2532	// machine word for the current character width in order to be used in
				2533	// generating a quick check.
				2534	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2535	RegExpCompiler* compiler,
				2536	int characters_filled_in,
				2537	bool not_at_start) {
				2538	// Do not collect any quick check details if the text node reads backward,
				2539	// since it reads in the opposite direction than we use for quick checks.
				2540	if (read_backward()) return;
				2541	Isolate* isolate = compiler->macro_assembler()->isolate();
				2542	DCHECK(characters_filled_in < details->characters());
				2543	int characters = details->characters();
				2544	int char_mask;
				2545	if (compiler->one_byte()) {
				2546	char_mask = String::kMaxOneByteCharCode;
				2547	} else {
				2548	char_mask = String::kMaxUtf16CodeUnit;
				2549	}
				2550	for (int k = 0; k < elements()->length(); k++) {
				2551	TextElement elm = elements()->at(k);
				2552	if (elm.text_type() == TextElement::ATOM) {
				2553	Vector<const uc16> quarks = elm.atom()->data();
				2554	for (int i = 0; i < characters && i < quarks.length(); i++) {
				2555	QuickCheckDetails::Position* pos =
				2556	details->positions(characters_filled_in);
				2557	uc16 c = quarks[i];
				2558	if (compiler->ignore_case()) {
				2559	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				2560	int length = GetCaseIndependentLetters(isolate, c,
				2561	compiler->one_byte(), chars);
				2562	if (length == 0) {
				2563	// This can happen because all case variants are non-Latin1, but we
				2564	// know the input is Latin1.
				2565	details->set_cannot_match();
				2566	pos->determines_perfectly = false;
				2567	return;
				2568	}
				2569	if (length == 1) {
				2570	// This letter has no case equivalents, so it's nice and simple
				2571	// and the mask-compare will determine definitely whether we have
				2572	// a match at this character position.
				2573	pos->mask = char_mask;
				2574	pos->value = c;
				2575	pos->determines_perfectly = true;
				2576	} else {
				2577	uint32_t common_bits = char_mask;
				2578	uint32_t bits = chars[0];
				2579	for (int j = 1; j < length; j++) {
				2580	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				2581	common_bits ^= differing_bits;
				2582	bits &= common_bits;
				2583	}
				2584	// If length is 2 and common bits has only one zero in it then
				2585	// our mask and compare instruction will determine definitely
				2586	// whether we have a match at this character position. Otherwise
				2587	// it can only be an approximate check.
				2588	uint32_t one_zero = (common_bits \| ~char_mask);
				2589	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				2590	pos->determines_perfectly = true;
				2591	}
				2592	pos->mask = common_bits;
				2593	pos->value = bits;
				2594	}
				2595	} else {
				2596	// Don't ignore case. Nice simple case where the mask-compare will
				2597	// determine definitely whether we have a match at this character
				2598	// position.
				2599	if (c > char_mask) {
				2600	details->set_cannot_match();
				2601	pos->determines_perfectly = false;
				2602	return;
				2603	}
				2604	pos->mask = char_mask;
				2605	pos->value = c;
				2606	pos->determines_perfectly = true;
				2607	}
				2608	characters_filled_in++;
				2609	DCHECK(characters_filled_in <= details->characters());
				2610	if (characters_filled_in == details->characters()) {
				2611	return;
				2612	}
				2613	}
				2614	} else {
				2615	QuickCheckDetails::Position* pos =
				2616	details->positions(characters_filled_in);
				2617	RegExpCharacterClass* tree = elm.char_class();
				2618	ZoneList<CharacterRange>* ranges = tree->ranges(zone());
				2619	if (tree->is_negated()) {
				2620	// A quick check uses multi-character mask and compare. There is no
				2621	// useful way to incorporate a negative char class into this scheme
				2622	// so we just conservatively create a mask and value that will always
				2623	// succeed.
				2624	pos->mask = 0;
				2625	pos->value = 0;
				2626	} else {
				2627	int first_range = 0;
				2628	while (ranges->at(first_range).from() > char_mask) {
				2629	first_range++;
				2630	if (first_range == ranges->length()) {
				2631	details->set_cannot_match();
				2632	pos->determines_perfectly = false;
				2633	return;
				2634	}
				2635	}
				2636	CharacterRange range = ranges->at(first_range);
				2637	uc16 from = range.from();
				2638	uc16 to = range.to();
				2639	if (to > char_mask) {
				2640	to = char_mask;
				2641	}
				2642	uint32_t differing_bits = (from ^ to);
				2643	// A mask and compare is only perfect if the differing bits form a
				2644	// number like 00011111 with one single block of trailing 1s.
				2645	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2646	from + differing_bits == to) {
				2647	pos->determines_perfectly = true;
				2648	}
				2649	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2650	uint32_t bits = (from & common_bits);
				2651	for (int i = first_range + 1; i < ranges->length(); i++) {
				2652	CharacterRange range = ranges->at(i);
				2653	uc16 from = range.from();
				2654	uc16 to = range.to();
				2655	if (from > char_mask) continue;
				2656	if (to > char_mask) to = char_mask;
				2657	// Here we are combining more ranges into the mask and compare
				2658	// value. With each new range the mask becomes more sparse and
				2659	// so the chances of a false positive rise. A character class
				2660	// with multiple ranges is assumed never to be equivalent to a
				2661	// mask and compare operation.
				2662	pos->determines_perfectly = false;
				2663	uint32_t new_common_bits = (from ^ to);
				2664	new_common_bits = ~SmearBitsRight(new_common_bits);
				2665	common_bits &= new_common_bits;
				2666	bits &= new_common_bits;
				2667	uint32_t differing_bits = (from & common_bits) ^ bits;
				2668	common_bits ^= differing_bits;
				2669	bits &= common_bits;
				2670	}
				2671	pos->mask = common_bits;
				2672	pos->value = bits;
				2673	}
				2674	characters_filled_in++;
				2675	DCHECK(characters_filled_in <= details->characters());
				2676	if (characters_filled_in == details->characters()) {
				2677	return;
				2678	}
				2679	}
				2680	}
				2681	DCHECK(characters_filled_in != details->characters());
				2682	if (!details->cannot_match()) {
				2683	on_success()-> GetQuickCheckDetails(details,
				2684	compiler,
				2685	characters_filled_in,
				2686	true);
				2687	}
				2688	}
				2689
				2690
				2691	void QuickCheckDetails::Clear() {
				2692	for (int i = 0; i < characters_; i++) {
				2693	positions_[i].mask = 0;
				2694	positions_[i].value = 0;
				2695	positions_[i].determines_perfectly = false;
				2696	}
				2697	characters_ = 0;
				2698	}
				2699
				2700
				2701	void QuickCheckDetails::Advance(int by, bool one_byte) {
				2702	if (by >= characters_ \|\| by < 0) {
				2703	DCHECK_IMPLIES(by < 0, characters_ == 0);
				2704	Clear();
				2705	return;
				2706	}
				2707	DCHECK_LE(characters_ - by, 4);
				2708	DCHECK_LE(characters_, 4);
				2709	for (int i = 0; i < characters_ - by; i++) {
				2710	positions_[i] = positions_[by + i];
				2711	}
				2712	for (int i = characters_ - by; i < characters_; i++) {
				2713	positions_[i].mask = 0;
				2714	positions_[i].value = 0;
				2715	positions_[i].determines_perfectly = false;
				2716	}
				2717	characters_ -= by;
				2718	// We could change mask_ and value_ here but we would never advance unless
				2719	// they had already been used in a check and they won't be used again because
				2720	// it would gain us nothing. So there's no point.
				2721	}
				2722
				2723
				2724	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2725	DCHECK(characters_ == other->characters_);
				2726	if (other->cannot_match_) {
				2727	return;
				2728	}
				2729	if (cannot_match_) {
				2730	this = other;
				2731	return;
				2732	}
				2733	for (int i = from_index; i < characters_; i++) {
				2734	QuickCheckDetails::Position* pos = positions(i);
				2735	QuickCheckDetails::Position* other_pos = other->positions(i);
				2736	if (pos->mask != other_pos->mask \|\|
				2737	pos->value != other_pos->value \|\|
				2738	!other_pos->determines_perfectly) {
				2739	// Our mask-compare operation will be approximate unless we have the
				2740	// exact same operation on both sides of the alternation.
				2741	pos->determines_perfectly = false;
				2742	}
				2743	pos->mask &= other_pos->mask;
				2744	pos->value &= pos->mask;
				2745	other_pos->value &= pos->mask;
				2746	uc16 differing_bits = (pos->value ^ other_pos->value);
				2747	pos->mask &= ~differing_bits;
				2748	pos->value &= pos->mask;
				2749	}
				2750	}
				2751
				2752
				2753	class VisitMarker {
				2754	public:
				2755	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2756	DCHECK(!info->visited);
				2757	info->visited = true;
				2758	}
				2759	~VisitMarker() {
				2760	info_->visited = false;
				2761	}
				2762	private:
				2763	NodeInfo* info_;
				2764	};
				2765
				2766
				2767	RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
				2768	if (info()->replacement_calculated) return replacement();
				2769	if (depth < 0) return this;
				2770	DCHECK(!info()->visited);
				2771	VisitMarker marker(info());
				2772	return FilterSuccessor(depth - 1, ignore_case);
				2773	}
				2774
				2775
				2776	RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
				2777	RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
				2778	if (next == NULL) return set_replacement(NULL);
				2779	on_success_ = next;
				2780	return set_replacement(this);
				2781	}
				2782
				2783
				2784	// We need to check for the following characters: 0x39c 0x3bc 0x178.
				2785	static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
				2786	// TODO(dcarney): this could be a lot more efficient.
				2787	return range.Contains(0x39c) \|\|
				2788	range.Contains(0x3bc) \|\| range.Contains(0x178);
				2789	}
				2790
				2791
				2792	static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
				2793	for (int i = 0; i < ranges->length(); i++) {
				2794	// TODO(dcarney): this could be a lot more efficient.
				2795	if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
				2796	}
				2797	return false;
				2798	}
				2799
				2800
				2801	RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
				2802	if (info()->replacement_calculated) return replacement();
				2803	if (depth < 0) return this;
				2804	DCHECK(!info()->visited);
				2805	VisitMarker marker(info());
				2806	int element_count = elements()->length();
				2807	for (int i = 0; i < element_count; i++) {
				2808	TextElement elm = elements()->at(i);
				2809	if (elm.text_type() == TextElement::ATOM) {
				2810	Vector<const uc16> quarks = elm.atom()->data();
				2811	for (int j = 0; j < quarks.length(); j++) {
				2812	uint16_t c = quarks[j];
				2813	if (c <= String::kMaxOneByteCharCode) continue;
				2814	if (!ignore_case) return set_replacement(NULL);
				2815	// Here, we need to check for characters whose upper and lower cases
				2816	// are outside the Latin-1 range.
				2817	uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
				2818	// Character is outside Latin-1 completely
				2819	if (converted == 0) return set_replacement(NULL);
				2820	// Convert quark to Latin-1 in place.
				2821	uint16_t* copy = const_cast<uint16_t*>(quarks.start());
				2822	copy[j] = converted;
				2823	}
				2824	} else {
				2825	DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
				2826	RegExpCharacterClass* cc = elm.char_class();
				2827	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	2828	CharacterRange::Canonicalize(ranges);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	2829	// Now they are in order so we only need to look at the first.
				2830	int range_count = ranges->length();
				2831	if (cc->is_negated()) {
				2832	if (range_count != 0 &&
				2833	ranges->at(0).from() == 0 &&
				2834	ranges->at(0).to() >= String::kMaxOneByteCharCode) {
				2835	// This will be handled in a later filter.
				2836	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2837	return set_replacement(NULL);
				2838	}
				2839	} else {
				2840	if (range_count == 0 \|\|
				2841	ranges->at(0).from() > String::kMaxOneByteCharCode) {
				2842	// This will be handled in a later filter.
				2843	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2844	return set_replacement(NULL);
				2845	}
				2846	}
				2847	}
				2848	}
				2849	return FilterSuccessor(depth - 1, ignore_case);
				2850	}
				2851
				2852
				2853	RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2854	if (info()->replacement_calculated) return replacement();
				2855	if (depth < 0) return this;
				2856	if (info()->visited) return this;
				2857	{
				2858	VisitMarker marker(info());
				2859
				2860	RegExpNode* continue_replacement =
				2861	continue_node_->FilterOneByte(depth - 1, ignore_case);
				2862	// If we can't continue after the loop then there is no sense in doing the
				2863	// loop.
				2864	if (continue_replacement == NULL) return set_replacement(NULL);
				2865	}
				2866
				2867	return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
				2868	}
				2869
				2870
				2871	RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2872	if (info()->replacement_calculated) return replacement();
				2873	if (depth < 0) return this;
				2874	if (info()->visited) return this;
				2875	VisitMarker marker(info());
				2876	int choice_count = alternatives_->length();
				2877
				2878	for (int i = 0; i < choice_count; i++) {
				2879	GuardedAlternative alternative = alternatives_->at(i);
				2880	if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
				2881	set_replacement(this);
				2882	return this;
				2883	}
				2884	}
				2885
				2886	int surviving = 0;
				2887	RegExpNode* survivor = NULL;
				2888	for (int i = 0; i < choice_count; i++) {
				2889	GuardedAlternative alternative = alternatives_->at(i);
				2890	RegExpNode* replacement =
				2891	alternative.node()->FilterOneByte(depth - 1, ignore_case);
				2892	DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
				2893	if (replacement != NULL) {
				2894	alternatives_->at(i).set_node(replacement);
				2895	surviving++;
				2896	survivor = replacement;
				2897	}
				2898	}
				2899	if (surviving < 2) return set_replacement(survivor);
				2900
				2901	set_replacement(this);
				2902	if (surviving == choice_count) {
				2903	return this;
				2904	}
				2905	// Only some of the nodes survived the filtering. We need to rebuild the
				2906	// alternatives list.
				2907	ZoneList<GuardedAlternative>* new_alternatives =
				2908	new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
				2909	for (int i = 0; i < choice_count; i++) {
				2910	RegExpNode* replacement =
				2911	alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
				2912	if (replacement != NULL) {
				2913	alternatives_->at(i).set_node(replacement);
				2914	new_alternatives->Add(alternatives_->at(i), zone());
				2915	}
				2916	}
				2917	alternatives_ = new_alternatives;
				2918	return this;
				2919	}
				2920
				2921
				2922	RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
				2923	bool ignore_case) {
				2924	if (info()->replacement_calculated) return replacement();
				2925	if (depth < 0) return this;
				2926	if (info()->visited) return this;
				2927	VisitMarker marker(info());
				2928	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2929	// afterwards.
				2930	RegExpNode* node = alternatives_->at(1).node();
				2931	RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
				2932	if (replacement == NULL) return set_replacement(NULL);
				2933	alternatives_->at(1).set_node(replacement);
				2934
				2935	RegExpNode* neg_node = alternatives_->at(0).node();
				2936	RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
				2937	// If the negative lookahead is always going to fail then
				2938	// we don't need to check it.
				2939	if (neg_replacement == NULL) return set_replacement(replacement);
				2940	alternatives_->at(0).set_node(neg_replacement);
				2941	return set_replacement(this);
				2942	}
				2943
				2944
				2945	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2946	RegExpCompiler* compiler,
				2947	int characters_filled_in,
				2948	bool not_at_start) {
				2949	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2950	VisitMarker marker(info());
				2951	return ChoiceNode::GetQuickCheckDetails(details,
				2952	compiler,
				2953	characters_filled_in,
				2954	not_at_start);
				2955	}
				2956
				2957
				2958	void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2959	BoyerMooreLookahead* bm, bool not_at_start) {
				2960	if (body_can_be_zero_length_ \|\| budget <= 0) {
				2961	bm->SetRest(offset);
				2962	SaveBMInfo(bm, not_at_start, offset);
				2963	return;
				2964	}
				2965	ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2966	SaveBMInfo(bm, not_at_start, offset);
				2967	}
				2968
				2969
				2970	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2971	RegExpCompiler* compiler,
				2972	int characters_filled_in,
				2973	bool not_at_start) {
				2974	not_at_start = (not_at_start \|\| not_at_start_);
				2975	int choice_count = alternatives_->length();
				2976	DCHECK(choice_count > 0);
				2977	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2978	compiler,
				2979	characters_filled_in,
				2980	not_at_start);
				2981	for (int i = 1; i < choice_count; i++) {
				2982	QuickCheckDetails new_details(details->characters());
				2983	RegExpNode* node = alternatives_->at(i).node();
				2984	node->GetQuickCheckDetails(&new_details, compiler,
				2985	characters_filled_in,
				2986	not_at_start);
				2987	// Here we merge the quick match details of the two branches.
				2988	details->Merge(&new_details, characters_filled_in);
				2989	}
				2990	}
				2991
				2992
				2993	// Check for [0-9A-Z_a-z].
				2994	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2995	Label* word,
				2996	Label* non_word,
				2997	bool fall_through_on_word) {
				2998	if (assembler->CheckSpecialCharacterClass(
				2999	fall_through_on_word ? 'w' : 'W',
				3000	fall_through_on_word ? non_word : word)) {
				3001	// Optimized implementation available.
				3002	return;
				3003	}
				3004	assembler->CheckCharacterGT('z', non_word);
				3005	assembler->CheckCharacterLT('0', non_word);
				3006	assembler->CheckCharacterGT('a' - 1, word);
				3007	assembler->CheckCharacterLT('9' + 1, word);
				3008	assembler->CheckCharacterLT('A', non_word);
				3009	assembler->CheckCharacterLT('Z' + 1, word);
				3010	if (fall_through_on_word) {
				3011	assembler->CheckNotCharacter('_', non_word);
				3012	} else {
				3013	assembler->CheckCharacter('_', word);
				3014	}
				3015	}
				3016
				3017
				3018	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				3019	// that matches newline or the start of input).
				3020	static void EmitHat(RegExpCompiler* compiler,
				3021	RegExpNode* on_success,
				3022	Trace* trace) {
				3023	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3024	// We will be loading the previous character into the current character
				3025	// register.
				3026	Trace new_trace(*trace);
				3027	new_trace.InvalidateCurrentCharacter();
				3028
				3029	Label ok;
				3030	if (new_trace.cp_offset() == 0) {
				3031	// The start of input counts as a newline in this context, so skip to
				3032	// ok if we are at the start.
				3033	assembler->CheckAtStart(&ok);
				3034	}
				3035	// We already checked that we are not at the start of input so it must be
				3036	// OK to load the previous character.
				3037	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				3038	new_trace.backtrack(),
				3039	false);
				3040	if (!assembler->CheckSpecialCharacterClass('n',
				3041	new_trace.backtrack())) {
				3042	// Newline means \n, \r, 0x2028 or 0x2029.
				3043	if (!compiler->one_byte()) {
				3044	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				3045	}
				3046	assembler->CheckCharacter('\n', &ok);
				3047	assembler->CheckNotCharacter('\r', new_trace.backtrack());
				3048	}
				3049	assembler->Bind(&ok);
				3050	on_success->Emit(compiler, &new_trace);
				3051	}
				3052
				3053
				3054	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				3055	void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
				3056	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3057	Isolate* isolate = assembler->isolate();
				3058	Trace::TriBool next_is_word_character = Trace::UNKNOWN;
				3059	bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
				3060	BoyerMooreLookahead* lookahead = bm_info(not_at_start);
				3061	if (lookahead == NULL) {
				3062	int eats_at_least =
				3063	Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
				3064	kRecursionBudget,
				3065	not_at_start));
				3066	if (eats_at_least >= 1) {
				3067	BoyerMooreLookahead* bm =
				3068	new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
				3069	FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
				3070	if (bm->at(0)->is_non_word())
				3071	next_is_word_character = Trace::FALSE_VALUE;
				3072	if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
				3073	}
				3074	} else {
				3075	if (lookahead->at(0)->is_non_word())
				3076	next_is_word_character = Trace::FALSE_VALUE;
				3077	if (lookahead->at(0)->is_word())
				3078	next_is_word_character = Trace::TRUE_VALUE;
				3079	}
				3080	bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
				3081	if (next_is_word_character == Trace::UNKNOWN) {
				3082	Label before_non_word;
				3083	Label before_word;
				3084	if (trace->characters_preloaded() != 1) {
				3085	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				3086	}
				3087	// Fall through on non-word.
				3088	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				3089	// Next character is not a word character.
				3090	assembler->Bind(&before_non_word);
				3091	Label ok;
				3092	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3093	assembler->GoTo(&ok);
				3094
				3095	assembler->Bind(&before_word);
				3096	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3097	assembler->Bind(&ok);
				3098	} else if (next_is_word_character == Trace::TRUE_VALUE) {
				3099	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3100	} else {
				3101	DCHECK(next_is_word_character == Trace::FALSE_VALUE);
				3102	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3103	}
				3104	}
				3105
				3106
				3107	void AssertionNode::BacktrackIfPrevious(
				3108	RegExpCompiler* compiler,
				3109	Trace* trace,
				3110	AssertionNode::IfPrevious backtrack_if_previous) {
				3111	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3112	Trace new_trace(*trace);
				3113	new_trace.InvalidateCurrentCharacter();
				3114
				3115	Label fall_through, dummy;
				3116
				3117	Label* non_word = backtrack_if_previous == kIsNonWord ?
				3118	new_trace.backtrack() :
				3119	&fall_through;
				3120	Label* word = backtrack_if_previous == kIsNonWord ?
				3121	&fall_through :
				3122	new_trace.backtrack();
				3123
				3124	if (new_trace.cp_offset() == 0) {
				3125	// The start of input counts as a non-word character, so the question is
				3126	// decided if we are at the start.
				3127	assembler->CheckAtStart(non_word);
				3128	}
				3129	// We already checked that we are not at the start of input so it must be
				3130	// OK to load the previous character.
				3131	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
				3132	EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
				3133
				3134	assembler->Bind(&fall_through);
				3135	on_success()->Emit(compiler, &new_trace);
				3136	}
				3137
				3138
				3139	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				3140	RegExpCompiler* compiler,
				3141	int filled_in,
				3142	bool not_at_start) {
				3143	if (assertion_type_ == AT_START && not_at_start) {
				3144	details->set_cannot_match();
				3145	return;
				3146	}
				3147	return on_success()->GetQuickCheckDetails(details,
				3148	compiler,
				3149	filled_in,
				3150	not_at_start);
				3151	}
				3152
				3153
				3154	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3155	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3156	switch (assertion_type_) {
				3157	case AT_END: {
				3158	Label ok;
				3159	assembler->CheckPosition(trace->cp_offset(), &ok);
				3160	assembler->GoTo(trace->backtrack());
				3161	assembler->Bind(&ok);
				3162	break;
				3163	}
				3164	case AT_START: {
				3165	if (trace->at_start() == Trace::FALSE_VALUE) {
				3166	assembler->GoTo(trace->backtrack());
				3167	return;
				3168	}
				3169	if (trace->at_start() == Trace::UNKNOWN) {
				3170	assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
				3171	Trace at_start_trace = *trace;
				3172	at_start_trace.set_at_start(Trace::TRUE_VALUE);
				3173	on_success()->Emit(compiler, &at_start_trace);
				3174	return;
				3175	}
				3176	}
				3177	break;
				3178	case AFTER_NEWLINE:
				3179	EmitHat(compiler, on_success(), trace);
				3180	return;
				3181	case AT_BOUNDARY:
				3182	case AT_NON_BOUNDARY: {
				3183	EmitBoundaryCheck(compiler, trace);
				3184	return;
				3185	}
				3186	}
				3187	on_success()->Emit(compiler, trace);
				3188	}
				3189
				3190
				3191	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				3192	if (quick_check == NULL) return false;
				3193	if (offset >= quick_check->characters()) return false;
				3194	return quick_check->positions(offset)->determines_perfectly;
				3195	}
				3196
				3197
				3198	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				3199	if (index > *checked_up_to) {
				3200	*checked_up_to = index;
				3201	}
				3202	}
				3203
				3204
				3205	// We call this repeatedly to generate code for each pass over the text node.
				3206	// The passes are in increasing order of difficulty because we hope one
				3207	// of the first passes will fail in which case we are saved the work of the
				3208	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				3209	// we will check the '%' in the first pass, the case independent 'a' in the
				3210	// second pass and the character class in the last pass.
				3211	//
				3212	// The passes are done from right to left, so for example to test for /bar/
				3213	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				3214	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				3215	// bounds check most of the time. In the example we only need to check for
				3216	// end-of-input when loading the putative 'r'.
				3217	//
				3218	// A slight complication involves the fact that the first character may already
				3219	// be fetched into a register by the previous node. In this case we want to
				3220	// do the test for that character first. We do this in separate passes. The
				3221	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				3222	// pass has been performed then subsequent passes will have true in
				3223	// first_element_checked to indicate that that character does not need to be
				3224	// checked again.
				3225	//
				3226	// In addition to all this we are passed a Trace, which can
				3227	// contain an AlternativeGeneration object. In this AlternativeGeneration
				3228	// object we can see details of any quick check that was already passed in
				3229	// order to get to the code we are now generating. The quick check can involve
				3230	// loading characters, which means we do not need to recheck the bounds
				3231	// up to the limit the quick check already checked. In addition the quick
				3232	// check can have involved a mask and compare operation which may simplify
				3233	// or obviate the need for further checks at some character positions.
				3234	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				3235	TextEmitPassType pass,
				3236	bool preloaded,
				3237	Trace* trace,
				3238	bool first_element_checked,
				3239	int* checked_up_to) {
				3240	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3241	Isolate* isolate = assembler->isolate();
				3242	bool one_byte = compiler->one_byte();
				3243	Label* backtrack = trace->backtrack();
				3244	QuickCheckDetails* quick_check = trace->quick_check_performed();
				3245	int element_count = elements()->length();
				3246	int backward_offset = read_backward() ? -Length() : 0;
				3247	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				3248	TextElement elm = elements()->at(i);
				3249	int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
				3250	if (elm.text_type() == TextElement::ATOM) {
				3251	Vector<const uc16> quarks = elm.atom()->data();
				3252	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				3253	if (first_element_checked && i == 0 && j == 0) continue;
				3254	if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
				3255	EmitCharacterFunction* emit_function = NULL;
				3256	switch (pass) {
				3257	case NON_LATIN1_MATCH:
				3258	DCHECK(one_byte);
				3259	if (quarks[j] > String::kMaxOneByteCharCode) {
				3260	assembler->GoTo(backtrack);
				3261	return;
				3262	}
				3263	break;
				3264	case NON_LETTER_CHARACTER_MATCH:
				3265	emit_function = &EmitAtomNonLetter;
				3266	break;
				3267	case SIMPLE_CHARACTER_MATCH:
				3268	emit_function = &EmitSimpleCharacter;
				3269	break;
				3270	case CASE_CHARACTER_MATCH:
				3271	emit_function = &EmitAtomLetter;
				3272	break;
				3273	default:
				3274	break;
				3275	}
				3276	if (emit_function != NULL) {
				3277	bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();
				3278	bool bound_checked =
				3279	emit_function(isolate, compiler, quarks[j], backtrack,
				3280	cp_offset + j, bounds_check, preloaded);
				3281	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				3282	}
				3283	}
				3284	} else {
				3285	DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
				3286	if (pass == CHARACTER_CLASS_MATCH) {
				3287	if (first_element_checked && i == 0) continue;
				3288	if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
				3289	RegExpCharacterClass* cc = elm.char_class();
				3290	bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();
				3291	EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
				3292	bounds_check, preloaded, zone());
				3293	UpdateBoundsCheck(cp_offset, checked_up_to);
				3294	}
				3295	}
				3296	}
				3297	}
				3298
				3299
				3300	int TextNode::Length() {
				3301	TextElement elm = elements()->last();
				3302	DCHECK(elm.cp_offset() >= 0);
				3303	return elm.cp_offset() + elm.length();
				3304	}
				3305
				3306
				3307	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				3308	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				3309	if (ignore_case) {
				3310	return pass == SIMPLE_CHARACTER_MATCH;
				3311	} else {
				3312	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				3313	}
				3314	}
				3315
				3316
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3317	TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
				3318	ZoneList<CharacterRange>* ranges,
				3319	bool read_backward,
				3320	RegExpNode* on_success) {
				3321	DCHECK_NOT_NULL(ranges);
				3322	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
				3323	elms->Add(
				3324	TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
				3325	zone);
				3326	return new (zone) TextNode(elms, read_backward, on_success);
				3327	}
				3328
				3329
				3330	TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
				3331	CharacterRange trail,
				3332	bool read_backward,
				3333	RegExpNode* on_success) {
				3334	ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
				3335	ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
				3336	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
				3337	elms->Add(TextElement::CharClass(
				3338	new (zone) RegExpCharacterClass(lead_ranges, false)),
				3339	zone);
				3340	elms->Add(TextElement::CharClass(
				3341	new (zone) RegExpCharacterClass(trail_ranges, false)),
				3342	zone);
				3343	return new (zone) TextNode(elms, read_backward, on_success);
				3344	}
				3345
				3346
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3347	// This generates the code to match a text node. A text node can contain
				3348	// straight character sequences (possibly to be matched in a case-independent
				3349	// way) and character classes. For efficiency we do not do this in a single
				3350	// pass from left to right. Instead we pass over the text node several times,
				3351	// emitting code for some character positions every time. See the comment on
				3352	// TextEmitPass for details.
				3353	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3354	LimitResult limit_result = LimitVersions(compiler, trace);
				3355	if (limit_result == DONE) return;
				3356	DCHECK(limit_result == CONTINUE);
				3357
				3358	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				3359	compiler->SetRegExpTooBig();
				3360	return;
				3361	}
				3362
				3363	if (compiler->one_byte()) {
				3364	int dummy = 0;
				3365	TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
				3366	}
				3367
				3368	bool first_elt_done = false;
				3369	int bound_checked_to = trace->cp_offset() - 1;
				3370	bound_checked_to += trace->bound_checked_up_to();
				3371
				3372	// If a character is preloaded into the current character register then
				3373	// check that now.
				3374	if (trace->characters_preloaded() == 1) {
				3375	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3376	if (!SkipPass(pass, compiler->ignore_case())) {
				3377	TextEmitPass(compiler,
				3378	static_cast<TextEmitPassType>(pass),
				3379	true,
				3380	trace,
				3381	false,
				3382	&bound_checked_to);
				3383	}
				3384	}
				3385	first_elt_done = true;
				3386	}
				3387
				3388	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3389	if (!SkipPass(pass, compiler->ignore_case())) {
				3390	TextEmitPass(compiler,
				3391	static_cast<TextEmitPassType>(pass),
				3392	false,
				3393	trace,
				3394	first_elt_done,
				3395	&bound_checked_to);
				3396	}
				3397	}
				3398
				3399	Trace successor_trace(*trace);
				3400	// If we advance backward, we may end up at the start.
				3401	successor_trace.AdvanceCurrentPositionInTrace(
				3402	read_backward() ? -Length() : Length(), compiler);
				3403	successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
				3404	: Trace::FALSE_VALUE);
				3405	RecursionCheck rc(compiler);
				3406	on_success()->Emit(compiler, &successor_trace);
				3407	}
				3408
				3409
				3410	void Trace::InvalidateCurrentCharacter() {
				3411	characters_preloaded_ = 0;
				3412	}
				3413
				3414
				3415	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				3416	// We don't have an instruction for shifting the current character register
				3417	// down or for using a shifted value for anything so lets just forget that
				3418	// we preloaded any characters into it.
				3419	characters_preloaded_ = 0;
				3420	// Adjust the offsets of the quick check performed information. This
				3421	// information is used to find out what we already determined about the
				3422	// characters by means of mask and compare.
				3423	quick_check_performed_.Advance(by, compiler->one_byte());
				3424	cp_offset_ += by;
				3425	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				3426	compiler->SetRegExpTooBig();
				3427	cp_offset_ = 0;
				3428	}
				3429	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				3430	}
				3431
				3432
				3433	void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
				3434	int element_count = elements()->length();
				3435	for (int i = 0; i < element_count; i++) {
				3436	TextElement elm = elements()->at(i);
				3437	if (elm.text_type() == TextElement::CHAR_CLASS) {
				3438	RegExpCharacterClass* cc = elm.char_class();
				3439	// None of the standard character classes is different in the case
				3440	// independent case and it slows us down if we don't know that.
				3441	if (cc->is_standard(zone())) continue;
				3442	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3443	CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3444	}
				3445	}
				3446	}
				3447
				3448
				3449	int TextNode::GreedyLoopTextLength() { return Length(); }
				3450
				3451
				3452	RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
				3453	RegExpCompiler* compiler) {
				3454	if (read_backward()) return NULL;
				3455	if (elements()->length() != 1) return NULL;
				3456	TextElement elm = elements()->at(0);
				3457	if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
				3458	RegExpCharacterClass* node = elm.char_class();
				3459	ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3460	CharacterRange::Canonicalize(ranges);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3461	if (node->is_negated()) {
				3462	return ranges->length() == 0 ? on_success() : NULL;
				3463	}
				3464	if (ranges->length() != 1) return NULL;
				3465	uint32_t max_char;
				3466	if (compiler->one_byte()) {
				3467	max_char = String::kMaxOneByteCharCode;
				3468	} else {
				3469	max_char = String::kMaxUtf16CodeUnit;
				3470	}
				3471	return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
				3472	}
				3473
				3474
				3475	// Finds the fixed match length of a sequence of nodes that goes from
				3476	// this alternative and back to this choice node. If there are variable
				3477	// length nodes or other complications in the way then return a sentinel
				3478	// value indicating that a greedy loop cannot be constructed.
				3479	int ChoiceNode::GreedyLoopTextLengthForAlternative(
				3480	GuardedAlternative* alternative) {
				3481	int length = 0;
				3482	RegExpNode* node = alternative->node();
				3483	// Later we will generate code for all these text nodes using recursion
				3484	// so we have to limit the max number.
				3485	int recursion_depth = 0;
				3486	while (node != this) {
				3487	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				3488	return kNodeIsTooComplexForGreedyLoops;
				3489	}
				3490	int node_length = node->GreedyLoopTextLength();
				3491	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				3492	return kNodeIsTooComplexForGreedyLoops;
				3493	}
				3494	length += node_length;
				3495	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				3496	node = seq_node->on_success();
				3497	}
				3498	return read_backward() ? -length : length;
				3499	}
				3500
				3501
				3502	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				3503	DCHECK_NULL(loop_node_);
				3504	AddAlternative(alt);
				3505	loop_node_ = alt.node();
				3506	}
				3507
				3508
				3509	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				3510	DCHECK_NULL(continue_node_);
				3511	AddAlternative(alt);
				3512	continue_node_ = alt.node();
				3513	}
				3514
				3515
				3516	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3517	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3518	if (trace->stop_node() == this) {
				3519	// Back edge of greedy optimized loop node graph.
				3520	int text_length =
				3521	GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
				3522	DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
				3523	// Update the counter-based backtracking info on the stack. This is an
				3524	// optimization for greedy loops (see below).
				3525	DCHECK(trace->cp_offset() == text_length);
				3526	macro_assembler->AdvanceCurrentPosition(text_length);
				3527	macro_assembler->GoTo(trace->loop_label());
				3528	return;
				3529	}
				3530	DCHECK_NULL(trace->stop_node());
				3531	if (!trace->is_trivial()) {
				3532	trace->Flush(compiler, this);
				3533	return;
				3534	}
				3535	ChoiceNode::Emit(compiler, trace);
				3536	}
				3537
				3538
				3539	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				3540	int eats_at_least) {
				3541	int preload_characters = Min(4, eats_at_least);
				3542	if (compiler->macro_assembler()->CanReadUnaligned()) {
				3543	bool one_byte = compiler->one_byte();
				3544	if (one_byte) {
				3545	if (preload_characters > 4) preload_characters = 4;
				3546	// We can't preload 3 characters because there is no machine instruction
				3547	// to do that. We can't just load 4 because we could be reading
				3548	// beyond the end of the string, which could cause a memory fault.
				3549	if (preload_characters == 3) preload_characters = 2;
				3550	} else {
				3551	if (preload_characters > 2) preload_characters = 2;
				3552	}
				3553	} else {
				3554	if (preload_characters > 1) preload_characters = 1;
				3555	}
				3556	return preload_characters;
				3557	}
				3558
				3559
				3560	// This class is used when generating the alternatives in a choice node. It
				3561	// records the way the alternative is being code generated.
				3562	class AlternativeGeneration: public Malloced {
				3563	public:
				3564	AlternativeGeneration()
				3565	: possible_success(),
				3566	expects_preload(false),
				3567	after(),
				3568	quick_check_details() { }
				3569	Label possible_success;
				3570	bool expects_preload;
				3571	Label after;
				3572	QuickCheckDetails quick_check_details;
				3573	};
				3574
				3575
				3576	// Creates a list of AlternativeGenerations. If the list has a reasonable
				3577	// size then it is on the stack, otherwise the excess is on the heap.
				3578	class AlternativeGenerationList {
				3579	public:
				3580	AlternativeGenerationList(int count, Zone* zone)
				3581	: alt_gens_(count, zone) {
				3582	for (int i = 0; i < count && i < kAFew; i++) {
				3583	alt_gens_.Add(a_few_alt_gens_ + i, zone);
				3584	}
				3585	for (int i = kAFew; i < count; i++) {
				3586	alt_gens_.Add(new AlternativeGeneration(), zone);
				3587	}
				3588	}
				3589	~AlternativeGenerationList() {
				3590	for (int i = kAFew; i < alt_gens_.length(); i++) {
				3591	delete alt_gens_[i];
				3592	alt_gens_[i] = NULL;
				3593	}
				3594	}
				3595
				3596	AlternativeGeneration* at(int i) {
				3597	return alt_gens_[i];
				3598	}
				3599
				3600	private:
				3601	static const int kAFew = 10;
				3602	ZoneList<AlternativeGeneration*> alt_gens_;
				3603	AlternativeGeneration a_few_alt_gens_[kAFew];
				3604	};
				3605
				3606
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3607	static const uc32 kRangeEndMarker = 0x110000;
				3608
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3609	// The '2' variant is has inclusive from and exclusive to.
				3610	// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
				3611	// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3612	static const int kSpaceRanges[] = {
				3613	'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
				3614	0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
				3615	0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3616	static const int kSpaceRangeCount = arraysize(kSpaceRanges);
				3617
				3618	static const int kWordRanges[] = {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3619	'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3620	static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3621	static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3622	static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3623	static const int kSurrogateRanges[] = {
				3624	kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3625	static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3626	static const int kLineTerminatorRanges[] = {
				3627	0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3628	static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
				3629
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3630	void BoyerMoorePositionInfo::Set(int character) {
				3631	SetInterval(Interval(character, character));
				3632	}
				3633
				3634
				3635	void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
				3636	s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
				3637	w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
				3638	d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
				3639	surrogate_ =
				3640	AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
				3641	if (interval.to() - interval.from() >= kMapSize - 1) {
				3642	if (map_count_ != kMapSize) {
				3643	map_count_ = kMapSize;
				3644	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3645	}
				3646	return;
				3647	}
				3648	for (int i = interval.from(); i <= interval.to(); i++) {
				3649	int mod_character = (i & kMask);
				3650	if (!map_->at(mod_character)) {
				3651	map_count_++;
				3652	map_->at(mod_character) = true;
				3653	}
				3654	if (map_count_ == kMapSize) return;
				3655	}
				3656	}
				3657
				3658
				3659	void BoyerMoorePositionInfo::SetAll() {
				3660	s_ = w_ = d_ = kLatticeUnknown;
				3661	if (map_count_ != kMapSize) {
				3662	map_count_ = kMapSize;
				3663	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3664	}
				3665	}
				3666
				3667
				3668	BoyerMooreLookahead::BoyerMooreLookahead(
				3669	int length, RegExpCompiler* compiler, Zone* zone)
				3670	: length_(length),
				3671	compiler_(compiler) {
				3672	if (compiler->one_byte()) {
				3673	max_char_ = String::kMaxOneByteCharCode;
				3674	} else {
				3675	max_char_ = String::kMaxUtf16CodeUnit;
				3676	}
				3677	bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
				3678	for (int i = 0; i < length; i++) {
				3679	bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
				3680	}
				3681	}
				3682
				3683
				3684	// Find the longest range of lookahead that has the fewest number of different
				3685	// characters that can occur at a given position. Since we are optimizing two
				3686	// different parameters at once this is a tradeoff.
				3687	bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
				3688	int biggest_points = 0;
				3689	// If more than 32 characters out of 128 can occur it is unlikely that we can
				3690	// be lucky enough to step forwards much of the time.
				3691	const int kMaxMax = 32;
				3692	for (int max_number_of_chars = 4;
				3693	max_number_of_chars < kMaxMax;
				3694	max_number_of_chars *= 2) {
				3695	biggest_points =
				3696	FindBestInterval(max_number_of_chars, biggest_points, from, to);
				3697	}
				3698	if (biggest_points == 0) return false;
				3699	return true;
				3700	}
				3701
				3702
				3703	// Find the highest-points range between 0 and length_ where the character
				3704	// information is not too vague. 'Too vague' means that there are more than
				3705	// max_number_of_chars that can occur at this position. Calculates the number
				3706	// of points as the product of width-of-the-range and
				3707	// probability-of-finding-one-of-the-characters, where the probability is
				3708	// calculated using the frequency distribution of the sample subject string.
				3709	int BoyerMooreLookahead::FindBestInterval(
				3710	int max_number_of_chars, int old_biggest_points, int* from, int* to) {
				3711	int biggest_points = old_biggest_points;
				3712	static const int kSize = RegExpMacroAssembler::kTableSize;
				3713	for (int i = 0; i < length_; ) {
				3714	while (i < length_ && Count(i) > max_number_of_chars) i++;
				3715	if (i == length_) break;
				3716	int remembered_from = i;
				3717	bool union_map[kSize];
				3718	for (int j = 0; j < kSize; j++) union_map[j] = false;
				3719	while (i < length_ && Count(i) <= max_number_of_chars) {
				3720	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3721	for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);
				3722	i++;
				3723	}
				3724	int frequency = 0;
				3725	for (int j = 0; j < kSize; j++) {
				3726	if (union_map[j]) {
				3727	// Add 1 to the frequency to give a small per-character boost for
				3728	// the cases where our sampling is not good enough and many
				3729	// characters have a frequency of zero. This means the frequency
				3730	// can theoretically be up to 2*kSize though we treat it mostly as
				3731	// a fraction of kSize.
				3732	frequency += compiler_->frequency_collator()->Frequency(j) + 1;
				3733	}
				3734	}
				3735	// We use the probability of skipping times the distance we are skipping to
				3736	// judge the effectiveness of this. Actually we have a cut-off: By
				3737	// dividing by 2 we switch off the skipping if the probability of skipping
				3738	// is less than 50%. This is because the multibyte mask-and-compare
				3739	// skipping in quickcheck is more likely to do well on this case.
				3740	bool in_quickcheck_range =
				3741	((i - remembered_from < 4) \|\|
				3742	(compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
				3743	// Called 'probability' but it is only a rough estimate and can actually
				3744	// be outside the 0-kSize range.
				3745	int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
				3746	int points = (i - remembered_from) * probability;
				3747	if (points > biggest_points) {
				3748	*from = remembered_from;
				3749	*to = i - 1;
				3750	biggest_points = points;
				3751	}
				3752	}
				3753	return biggest_points;
				3754	}
				3755
				3756
				3757	// Take all the characters that will not prevent a successful match if they
				3758	// occur in the subject string in the range between min_lookahead and
				3759	// max_lookahead (inclusive) measured from the current position. If the
				3760	// character at max_lookahead offset is not one of these characters, then we
				3761	// can safely skip forwards by the number of characters in the range.
				3762	int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
				3763	int max_lookahead,
				3764	Handle<ByteArray> boolean_skip_table) {
				3765	const int kSize = RegExpMacroAssembler::kTableSize;
				3766
				3767	const int kSkipArrayEntry = 0;
				3768	const int kDontSkipArrayEntry = 1;
				3769
				3770	for (int i = 0; i < kSize; i++) {
				3771	boolean_skip_table->set(i, kSkipArrayEntry);
				3772	}
				3773	int skip = max_lookahead + 1 - min_lookahead;
				3774
				3775	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3776	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3777	for (int j = 0; j < kSize; j++) {
				3778	if (map->at(j)) {
				3779	boolean_skip_table->set(j, kDontSkipArrayEntry);
				3780	}
				3781	}
				3782	}
				3783
				3784	return skip;
				3785	}
				3786
				3787
				3788	// See comment above on the implementation of GetSkipTable.
				3789	void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
				3790	const int kSize = RegExpMacroAssembler::kTableSize;
				3791
				3792	int min_lookahead = 0;
				3793	int max_lookahead = 0;
				3794
				3795	if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
				3796
				3797	bool found_single_character = false;
				3798	int single_character = 0;
				3799	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3800	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3801	if (map->map_count() > 1 \|\|
				3802	(found_single_character && map->map_count() != 0)) {
				3803	found_single_character = false;
				3804	break;
				3805	}
				3806	for (int j = 0; j < kSize; j++) {
				3807	if (map->at(j)) {
				3808	found_single_character = true;
				3809	single_character = j;
				3810	break;
				3811	}
				3812	}
				3813	}
				3814
				3815	int lookahead_width = max_lookahead + 1 - min_lookahead;
				3816
				3817	if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
				3818	// The mask-compare can probably handle this better.
				3819	return;
				3820	}
				3821
				3822	if (found_single_character) {
				3823	Label cont, again;
				3824	masm->Bind(&again);
				3825	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3826	if (max_char_ > kSize) {
				3827	masm->CheckCharacterAfterAnd(single_character,
				3828	RegExpMacroAssembler::kTableMask,
				3829	&cont);
				3830	} else {
				3831	masm->CheckCharacter(single_character, &cont);
				3832	}
				3833	masm->AdvanceCurrentPosition(lookahead_width);
				3834	masm->GoTo(&again);
				3835	masm->Bind(&cont);
				3836	return;
				3837	}
				3838
				3839	Factory* factory = masm->isolate()->factory();
				3840	Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
				3841	int skip_distance = GetSkipTable(
				3842	min_lookahead, max_lookahead, boolean_skip_table);
				3843	DCHECK(skip_distance != 0);
				3844
				3845	Label cont, again;
				3846	masm->Bind(&again);
				3847	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3848	masm->CheckBitInTable(boolean_skip_table, &cont);
				3849	masm->AdvanceCurrentPosition(skip_distance);
				3850	masm->GoTo(&again);
				3851	masm->Bind(&cont);
				3852	}
				3853
				3854
				3855	/* Code generation for choice nodes.
				3856	*
				3857	* We generate quick checks that do a mask and compare to eliminate a
				3858	* choice. If the quick check succeeds then it jumps to the continuation to
				3859	* do slow checks and check subsequent nodes. If it fails (the common case)
				3860	* it falls through to the next choice.
				3861	*
				3862	* Here is the desired flow graph. Nodes directly below each other imply
				3863	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				3864	* 3 doesn't have a quick check so we have to call the slow check.
				3865	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				3866	* regexp continuation is generated directly after the Sn node, up to the
				3867	* next GoTo if we decide to reuse some already generated code. Some
				3868	* nodes expect preload_characters to be preloaded into the current
				3869	* character register. R nodes do this preloading. Vertices are marked
				3870	* F for failures and S for success (possible success in the case of quick
				3871	* nodes). L, V, < and > are used as arrow heads.
				3872	*
				3873	* ----------> R
				3874	* \|
				3875	* V
				3876	* Q1 -----> S1
				3877	* \| S /
				3878	* F\| /
				3879	* \| F/
				3880	* \| /
				3881	* \| R
				3882	* \| /
				3883	* V L
				3884	* Q2 -----> S2
				3885	* \| S /
				3886	* F\| /
				3887	* \| F/
				3888	* \| /
				3889	* \| R
				3890	* \| /
				3891	* V L
				3892	* S3
				3893	* \|
				3894	* F\|
				3895	* \|
				3896	* R
				3897	* \|
				3898	* backtrack V
				3899	* <----------Q4
				3900	* \ F \|
				3901	* \ \|S
				3902	* \ F V
				3903	* \-----S4
				3904	*
				3905	* For greedy loops we push the current position, then generate the code that
				3906	* eats the input specially in EmitGreedyLoop. The other choice (the
				3907	* continuation) is generated by the normal code in EmitChoices, and steps back
				3908	* in the input to the starting position when it fails to match. The loop code
				3909	* looks like this (U is the unwind code that steps back in the greedy loop).
				3910	*
				3911	* _____
				3912	* / \
				3913	* V \|
				3914	* ----------> S1 \|
				3915	* /\| \|
				3916	* / \|S \|
				3917	* F/ \_____/
				3918	* /
				3919	* \|<-----
				3920	* \| \
				3921	* V \|S
				3922	* Q2 ---> U----->backtrack
				3923	* \| F /
				3924	* S\| /
				3925	* V F /
				3926	* S2--/
				3927	*/
				3928
				3929	GreedyLoopState::GreedyLoopState(bool not_at_start) {
				3930	counter_backtrack_trace_.set_backtrack(&label_);
				3931	if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
				3932	}
				3933
				3934
				3935	void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
				3936	#ifdef DEBUG
				3937	int choice_count = alternatives_->length();
				3938	for (int i = 0; i < choice_count - 1; i++) {
				3939	GuardedAlternative alternative = alternatives_->at(i);
				3940	ZoneList<Guard> guards = alternative.guards();
				3941	int guard_count = (guards == NULL) ? 0 : guards->length();
				3942	for (int j = 0; j < guard_count; j++) {
				3943	DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
				3944	}
				3945	}
				3946	#endif
				3947	}
				3948
				3949
				3950	void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
				3951	Trace* current_trace,
				3952	PreloadState* state) {
				3953	if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
				3954	// Save some time by looking at most one machine word ahead.
				3955	state->eats_at_least_ =
				3956	EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
				3957	current_trace->at_start() == Trace::FALSE_VALUE);
				3958	}
				3959	state->preload_characters_ =
				3960	CalculatePreloadCharacters(compiler, state->eats_at_least_);
				3961
				3962	state->preload_is_current_ =
				3963	(current_trace->characters_preloaded() == state->preload_characters_);
				3964	state->preload_has_checked_bounds_ = state->preload_is_current_;
				3965	}
				3966
				3967
				3968	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3969	int choice_count = alternatives_->length();
				3970
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	3971	if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
				3972	alternatives_->at(0).node()->Emit(compiler, trace);
				3973	return;
				3974	}
				3975
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	3976	AssertGuardsMentionRegisters(trace);
				3977
				3978	LimitResult limit_result = LimitVersions(compiler, trace);
				3979	if (limit_result == DONE) return;
				3980	DCHECK(limit_result == CONTINUE);
				3981
				3982	// For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
				3983	// other choice nodes we only flush if we are out of code size budget.
				3984	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				3985	trace->Flush(compiler, this);
				3986	return;
				3987	}
				3988
				3989	RecursionCheck rc(compiler);
				3990
				3991	PreloadState preload;
				3992	preload.init();
				3993	GreedyLoopState greedy_loop_state(not_at_start());
				3994
				3995	int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
				3996	AlternativeGenerationList alt_gens(choice_count, zone());
				3997
				3998	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				3999	trace = EmitGreedyLoop(compiler,
				4000	trace,
				4001	&alt_gens,
				4002	&preload,
				4003	&greedy_loop_state,
				4004	text_length);
				4005	} else {
				4006	// TODO(erikcorry): Delete this. We don't need this label, but it makes us
				4007	// match the traces produced pre-cleanup.
				4008	Label second_choice;
				4009	compiler->macro_assembler()->Bind(&second_choice);
				4010
				4011	preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
				4012
				4013	EmitChoices(compiler,
				4014	&alt_gens,
				4015	0,
				4016	trace,
				4017	&preload);
				4018	}
				4019
				4020	// At this point we need to generate slow checks for the alternatives where
				4021	// the quick check was inlined. We can recognize these because the associated
				4022	// label was bound.
				4023	int new_flush_budget = trace->flush_budget() / choice_count;
				4024	for (int i = 0; i < choice_count; i++) {
				4025	AlternativeGeneration* alt_gen = alt_gens.at(i);
				4026	Trace new_trace(*trace);
				4027	// If there are actions to be flushed we have to limit how many times
				4028	// they are flushed. Take the budget of the parent trace and distribute
				4029	// it fairly amongst the children.
				4030	if (new_trace.actions() != NULL) {
				4031	new_trace.set_flush_budget(new_flush_budget);
				4032	}
				4033	bool next_expects_preload =
				4034	i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
				4035	EmitOutOfLineContinuation(compiler,
				4036	&new_trace,
				4037	alternatives_->at(i),
				4038	alt_gen,
				4039	preload.preload_characters_,
				4040	next_expects_preload);
				4041	}
				4042	}
				4043
				4044
				4045	Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
				4046	Trace* trace,
				4047	AlternativeGenerationList* alt_gens,
				4048	PreloadState* preload,
				4049	GreedyLoopState* greedy_loop_state,
				4050	int text_length) {
				4051	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4052	// Here we have special handling for greedy loops containing only text nodes
				4053	// and other simple nodes. These are handled by pushing the current
				4054	// position on the stack and then incrementing the current position each
				4055	// time around the switch. On backtrack we decrement the current position
				4056	// and check it against the pushed value. This avoids pushing backtrack
				4057	// information for each iteration of the loop, which could take up a lot of
				4058	// space.
				4059	DCHECK(trace->stop_node() == NULL);
				4060	macro_assembler->PushCurrentPosition();
				4061	Label greedy_match_failed;
				4062	Trace greedy_match_trace;
				4063	if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
				4064	greedy_match_trace.set_backtrack(&greedy_match_failed);
				4065	Label loop_label;
				4066	macro_assembler->Bind(&loop_label);
				4067	greedy_match_trace.set_stop_node(this);
				4068	greedy_match_trace.set_loop_label(&loop_label);
				4069	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				4070	macro_assembler->Bind(&greedy_match_failed);
				4071
				4072	Label second_choice; // For use in greedy matches.
				4073	macro_assembler->Bind(&second_choice);
				4074
				4075	Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
				4076
				4077	EmitChoices(compiler,
				4078	alt_gens,
				4079	1,
				4080	new_trace,
				4081	preload);
				4082
				4083	macro_assembler->Bind(greedy_loop_state->label());
				4084	// If we have unwound to the bottom then backtrack.
				4085	macro_assembler->CheckGreedyLoop(trace->backtrack());
				4086	// Otherwise try the second priority at an earlier position.
				4087	macro_assembler->AdvanceCurrentPosition(-text_length);
				4088	macro_assembler->GoTo(&second_choice);
				4089	return new_trace;
				4090	}
				4091
				4092	int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
				4093	Trace* trace) {
				4094	int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
				4095	if (alternatives_->length() != 2) return eats_at_least;
				4096
				4097	GuardedAlternative alt1 = alternatives_->at(1);
				4098	if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
				4099	return eats_at_least;
				4100	}
				4101	RegExpNode* eats_anything_node = alt1.node();
				4102	if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
				4103	return eats_at_least;
				4104	}
				4105
				4106	// Really we should be creating a new trace when we execute this function,
				4107	// but there is no need, because the code it generates cannot backtrack, and
				4108	// we always arrive here with a trivial trace (since it's the entry to a
				4109	// loop. That also implies that there are no preloaded characters, which is
				4110	// good, because it means we won't be violating any assumptions by
				4111	// overwriting those characters with new load instructions.
				4112	DCHECK(trace->is_trivial());
				4113
				4114	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4115	Isolate* isolate = macro_assembler->isolate();
				4116	// At this point we know that we are at a non-greedy loop that will eat
				4117	// any character one at a time. Any non-anchored regexp has such a
				4118	// loop prepended to it in order to find where it starts. We look for
				4119	// a pattern of the form ...abc... where we can look 6 characters ahead
				4120	// and step forwards 3 if the character is not one of abc. Abc need
				4121	// not be atoms, they can be any reasonably limited character class or
				4122	// small alternation.
				4123	BoyerMooreLookahead* bm = bm_info(false);
				4124	if (bm == NULL) {
				4125	eats_at_least = Min(kMaxLookaheadForBoyerMoore,
				4126	EatsAtLeast(kMaxLookaheadForBoyerMoore,
				4127	kRecursionBudget,
				4128	false));
				4129	if (eats_at_least >= 1) {
				4130	bm = new(zone()) BoyerMooreLookahead(eats_at_least,
				4131	compiler,
				4132	zone());
				4133	GuardedAlternative alt0 = alternatives_->at(0);
				4134	alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
				4135	}
				4136	}
				4137	if (bm != NULL) {
				4138	bm->EmitSkipInstructions(macro_assembler);
				4139	}
				4140	return eats_at_least;
				4141	}
				4142
				4143
				4144	void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
				4145	AlternativeGenerationList* alt_gens,
				4146	int first_choice,
				4147	Trace* trace,
				4148	PreloadState* preload) {
				4149	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4150	SetUpPreLoad(compiler, trace, preload);
				4151
				4152	// For now we just call all choices one after the other. The idea ultimately
				4153	// is to use the Dispatch table to try only the relevant ones.
				4154	int choice_count = alternatives_->length();
				4155
				4156	int new_flush_budget = trace->flush_budget() / choice_count;
				4157
				4158	for (int i = first_choice; i < choice_count; i++) {
				4159	bool is_last = i == choice_count - 1;
				4160	bool fall_through_on_failure = !is_last;
				4161	GuardedAlternative alternative = alternatives_->at(i);
				4162	AlternativeGeneration* alt_gen = alt_gens->at(i);
				4163	alt_gen->quick_check_details.set_characters(preload->preload_characters_);
				4164	ZoneList<Guard> guards = alternative.guards();
				4165	int guard_count = (guards == NULL) ? 0 : guards->length();
				4166	Trace new_trace(*trace);
				4167	new_trace.set_characters_preloaded(preload->preload_is_current_ ?
				4168	preload->preload_characters_ :
				4169	0);
				4170	if (preload->preload_has_checked_bounds_) {
				4171	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4172	}
				4173	new_trace.quick_check_performed()->Clear();
				4174	if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
				4175	if (!is_last) {
				4176	new_trace.set_backtrack(&alt_gen->after);
				4177	}
				4178	alt_gen->expects_preload = preload->preload_is_current_;
				4179	bool generate_full_check_inline = false;
				4180	if (compiler->optimize() &&
				4181	try_to_emit_quick_check_for_alternative(i == 0) &&
				4182	alternative.node()->EmitQuickCheck(
				4183	compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
				4184	&alt_gen->possible_success, &alt_gen->quick_check_details,
				4185	fall_through_on_failure)) {
				4186	// Quick check was generated for this choice.
				4187	preload->preload_is_current_ = true;
				4188	preload->preload_has_checked_bounds_ = true;
				4189	// If we generated the quick check to fall through on possible success,
				4190	// we now need to generate the full check inline.
				4191	if (!fall_through_on_failure) {
				4192	macro_assembler->Bind(&alt_gen->possible_success);
				4193	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4194	new_trace.set_characters_preloaded(preload->preload_characters_);
				4195	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4196	generate_full_check_inline = true;
				4197	}
				4198	} else if (alt_gen->quick_check_details.cannot_match()) {
				4199	if (!fall_through_on_failure) {
				4200	macro_assembler->GoTo(trace->backtrack());
				4201	}
				4202	continue;
				4203	} else {
				4204	// No quick check was generated. Put the full code here.
				4205	// If this is not the first choice then there could be slow checks from
				4206	// previous cases that go here when they fail. There's no reason to
				4207	// insist that they preload characters since the slow check we are about
				4208	// to generate probably can't use it.
				4209	if (i != first_choice) {
				4210	alt_gen->expects_preload = false;
				4211	new_trace.InvalidateCurrentCharacter();
				4212	}
				4213	generate_full_check_inline = true;
				4214	}
				4215	if (generate_full_check_inline) {
				4216	if (new_trace.actions() != NULL) {
				4217	new_trace.set_flush_budget(new_flush_budget);
				4218	}
				4219	for (int j = 0; j < guard_count; j++) {
				4220	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				4221	}
				4222	alternative.node()->Emit(compiler, &new_trace);
				4223	preload->preload_is_current_ = false;
				4224	}
				4225	macro_assembler->Bind(&alt_gen->after);
				4226	}
				4227	}
				4228
				4229
				4230	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				4231	Trace* trace,
				4232	GuardedAlternative alternative,
				4233	AlternativeGeneration* alt_gen,
				4234	int preload_characters,
				4235	bool next_expects_preload) {
				4236	if (!alt_gen->possible_success.is_linked()) return;
				4237
				4238	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4239	macro_assembler->Bind(&alt_gen->possible_success);
				4240	Trace out_of_line_trace(*trace);
				4241	out_of_line_trace.set_characters_preloaded(preload_characters);
				4242	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4243	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
				4244	ZoneList<Guard> guards = alternative.guards();
				4245	int guard_count = (guards == NULL) ? 0 : guards->length();
				4246	if (next_expects_preload) {
				4247	Label reload_current_char;
				4248	out_of_line_trace.set_backtrack(&reload_current_char);
				4249	for (int j = 0; j < guard_count; j++) {
				4250	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4251	}
				4252	alternative.node()->Emit(compiler, &out_of_line_trace);
				4253	macro_assembler->Bind(&reload_current_char);
				4254	// Reload the current character, since the next quick check expects that.
				4255	// We don't need to check bounds here because we only get into this
				4256	// code through a quick check which already did the checked load.
				4257	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				4258	NULL,
				4259	false,
				4260	preload_characters);
				4261	macro_assembler->GoTo(&(alt_gen->after));
				4262	} else {
				4263	out_of_line_trace.set_backtrack(&(alt_gen->after));
				4264	for (int j = 0; j < guard_count; j++) {
				4265	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4266	}
				4267	alternative.node()->Emit(compiler, &out_of_line_trace);
				4268	}
				4269	}
				4270
				4271
				4272	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4273	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4274	LimitResult limit_result = LimitVersions(compiler, trace);
				4275	if (limit_result == DONE) return;
				4276	DCHECK(limit_result == CONTINUE);
				4277
				4278	RecursionCheck rc(compiler);
				4279
				4280	switch (action_type_) {
				4281	case STORE_POSITION: {
				4282	Trace::DeferredCapture
				4283	new_capture(data_.u_position_register.reg,
				4284	data_.u_position_register.is_capture,
				4285	trace);
				4286	Trace new_trace = *trace;
				4287	new_trace.add_action(&new_capture);
				4288	on_success()->Emit(compiler, &new_trace);
				4289	break;
				4290	}
				4291	case INCREMENT_REGISTER: {
				4292	Trace::DeferredIncrementRegister
				4293	new_increment(data_.u_increment_register.reg);
				4294	Trace new_trace = *trace;
				4295	new_trace.add_action(&new_increment);
				4296	on_success()->Emit(compiler, &new_trace);
				4297	break;
				4298	}
				4299	case SET_REGISTER: {
				4300	Trace::DeferredSetRegister
				4301	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				4302	Trace new_trace = *trace;
				4303	new_trace.add_action(&new_set);
				4304	on_success()->Emit(compiler, &new_trace);
				4305	break;
				4306	}
				4307	case CLEAR_CAPTURES: {
				4308	Trace::DeferredClearCaptures
				4309	new_capture(Interval(data_.u_clear_captures.range_from,
				4310	data_.u_clear_captures.range_to));
				4311	Trace new_trace = *trace;
				4312	new_trace.add_action(&new_capture);
				4313	on_success()->Emit(compiler, &new_trace);
				4314	break;
				4315	}
				4316	case BEGIN_SUBMATCH:
				4317	if (!trace->is_trivial()) {
				4318	trace->Flush(compiler, this);
				4319	} else {
				4320	assembler->WriteCurrentPositionToRegister(
				4321	data_.u_submatch.current_position_register, 0);
				4322	assembler->WriteStackPointerToRegister(
				4323	data_.u_submatch.stack_pointer_register);
				4324	on_success()->Emit(compiler, trace);
				4325	}
				4326	break;
				4327	case EMPTY_MATCH_CHECK: {
				4328	int start_pos_reg = data_.u_empty_match_check.start_register;
				4329	int stored_pos = 0;
				4330	int rep_reg = data_.u_empty_match_check.repetition_register;
				4331	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				4332	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				4333	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				4334	// If we know we haven't advanced and there is no minimum we
				4335	// can just backtrack immediately.
				4336	assembler->GoTo(trace->backtrack());
				4337	} else if (know_dist && stored_pos < trace->cp_offset()) {
				4338	// If we know we've advanced we can generate the continuation
				4339	// immediately.
				4340	on_success()->Emit(compiler, trace);
				4341	} else if (!trace->is_trivial()) {
				4342	trace->Flush(compiler, this);
				4343	} else {
				4344	Label skip_empty_check;
				4345	// If we have a minimum number of repetitions we check the current
				4346	// number first and skip the empty check if it's not enough.
				4347	if (has_minimum) {
				4348	int limit = data_.u_empty_match_check.repetition_limit;
				4349	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				4350	}
				4351	// If the match is empty we bail out, otherwise we fall through
				4352	// to the on-success continuation.
				4353	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				4354	trace->backtrack());
				4355	assembler->Bind(&skip_empty_check);
				4356	on_success()->Emit(compiler, trace);
				4357	}
				4358	break;
				4359	}
				4360	case POSITIVE_SUBMATCH_SUCCESS: {
				4361	if (!trace->is_trivial()) {
				4362	trace->Flush(compiler, this);
				4363	return;
				4364	}
				4365	assembler->ReadCurrentPositionFromRegister(
				4366	data_.u_submatch.current_position_register);
				4367	assembler->ReadStackPointerFromRegister(
				4368	data_.u_submatch.stack_pointer_register);
				4369	int clear_register_count = data_.u_submatch.clear_register_count;
				4370	if (clear_register_count == 0) {
				4371	on_success()->Emit(compiler, trace);
				4372	return;
				4373	}
				4374	int clear_registers_from = data_.u_submatch.clear_register_from;
				4375	Label clear_registers_backtrack;
				4376	Trace new_trace = *trace;
				4377	new_trace.set_backtrack(&clear_registers_backtrack);
				4378	on_success()->Emit(compiler, &new_trace);
				4379
				4380	assembler->Bind(&clear_registers_backtrack);
				4381	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				4382	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				4383
				4384	DCHECK(trace->backtrack() == NULL);
				4385	assembler->Backtrack();
				4386	return;
				4387	}
				4388	default:
				4389	UNREACHABLE();
				4390	}
				4391	}
				4392
				4393
				4394	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4395	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4396	if (!trace->is_trivial()) {
				4397	trace->Flush(compiler, this);
				4398	return;
				4399	}
				4400
				4401	LimitResult limit_result = LimitVersions(compiler, trace);
				4402	if (limit_result == DONE) return;
				4403	DCHECK(limit_result == CONTINUE);
				4404
				4405	RecursionCheck rc(compiler);
				4406
				4407	DCHECK_EQ(start_reg_ + 1, end_reg_);
				4408	if (compiler->ignore_case()) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4409	assembler->CheckNotBackReferenceIgnoreCase(
				4410	start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	4411	} else {
				4412	assembler->CheckNotBackReference(start_reg_, read_backward(),
				4413	trace->backtrack());
				4414	}
				4415	// We are going to advance backward, so we may end up at the start.
				4416	if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4417
				4418	// Check that the back reference does not end inside a surrogate pair.
				4419	if (compiler->unicode() && !compiler->one_byte()) {
				4420	assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
				4421	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	4422	on_success()->Emit(compiler, trace);
				4423	}
				4424
				4425
				4426	// -------------------------------------------------------------------
				4427	// Dot/dotty output
				4428
				4429
				4430	#ifdef DEBUG
				4431
				4432
				4433	class DotPrinter: public NodeVisitor {
				4434	public:
				4435	DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
				4436	: os_(os),
				4437	ignore_case_(ignore_case) {}
				4438	void PrintNode(const char* label, RegExpNode* node);
				4439	void Visit(RegExpNode* node);
				4440	void PrintAttributes(RegExpNode* from);
				4441	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				4442	#define DECLARE_VISIT(Type) \
				4443	virtual void Visit##Type(Type##Node* that);
				4444	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				4445	#undef DECLARE_VISIT
				4446	private:
				4447	std::ostream& os_;
				4448	bool ignore_case_;
				4449	};
				4450
				4451
				4452	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				4453	os_ << "digraph G {\n graph [label=\"";
				4454	for (int i = 0; label[i]; i++) {
				4455	switch (label[i]) {
				4456	case '\\':
				4457	os_ << "\\\\";
				4458	break;
				4459	case '"':
				4460	os_ << "\"";
				4461	break;
				4462	default:
				4463	os_ << label[i];
				4464	break;
				4465	}
				4466	}
				4467	os_ << "\"];\n";
				4468	Visit(node);
				4469	os_ << "}" << std::endl;
				4470	}
				4471
				4472
				4473	void DotPrinter::Visit(RegExpNode* node) {
				4474	if (node->info()->visited) return;
				4475	node->info()->visited = true;
				4476	node->Accept(this);
				4477	}
				4478
				4479
				4480	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				4481	os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
				4482	Visit(on_failure);
				4483	}
				4484
				4485
				4486	class TableEntryBodyPrinter {
				4487	public:
				4488	TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
				4489	: os_(os),
				4490	choice_(choice) {}
				4491	void Call(uc16 from, DispatchTable::Entry entry) {
				4492	OutSet* out_set = entry.out_set();
				4493	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4494	if (out_set->Get(i)) {
				4495	os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
				4496	<< choice()->alternatives()->at(i).node() << ";\n";
				4497	}
				4498	}
				4499	}
				4500	private:
				4501	ChoiceNode* choice() { return choice_; }
				4502	std::ostream& os_;
				4503	ChoiceNode* choice_;
				4504	};
				4505
				4506
				4507	class TableEntryHeaderPrinter {
				4508	public:
				4509	explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
				4510	: first_(true),
				4511	os_(os) {}
				4512	void Call(uc16 from, DispatchTable::Entry entry) {
				4513	if (first_) {
				4514	first_ = false;
				4515	} else {
				4516	os_ << "\|";
				4517	}
				4518	os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "\|{";
				4519	OutSet* out_set = entry.out_set();
				4520	int priority = 0;
				4521	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4522	if (out_set->Get(i)) {
				4523	if (priority > 0) os_ << "\|";
				4524	os_ << "<s" << from << "o" << i << "> " << priority;
				4525	priority++;
				4526	}
				4527	}
				4528	os_ << "}}";
				4529	}
				4530
				4531	private:
				4532	bool first_;
				4533	std::ostream& os_;
				4534	};
				4535
				4536
				4537	class AttributePrinter {
				4538	public:
				4539	explicit AttributePrinter(std::ostream& os) // NOLINT
				4540	: os_(os),
				4541	first_(true) {}
				4542	void PrintSeparator() {
				4543	if (first_) {
				4544	first_ = false;
				4545	} else {
				4546	os_ << "\|";
				4547	}
				4548	}
				4549	void PrintBit(const char* name, bool value) {
				4550	if (!value) return;
				4551	PrintSeparator();
				4552	os_ << "{" << name << "}";
				4553	}
				4554	void PrintPositive(const char* name, int value) {
				4555	if (value < 0) return;
				4556	PrintSeparator();
				4557	os_ << "{" << name << "\|" << value << "}";
				4558	}
				4559
				4560	private:
				4561	std::ostream& os_;
				4562	bool first_;
				4563	};
				4564
				4565
				4566	void DotPrinter::PrintAttributes(RegExpNode* that) {
				4567	os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
				4568	<< "margin=0.1, fontsize=10, label=\"{";
				4569	AttributePrinter printer(os_);
				4570	NodeInfo* info = that->info();
				4571	printer.PrintBit("NI", info->follows_newline_interest);
				4572	printer.PrintBit("WI", info->follows_word_interest);
				4573	printer.PrintBit("SI", info->follows_start_interest);
				4574	Label* label = that->label();
				4575	if (label->is_bound())
				4576	printer.PrintPositive("@", label->pos());
				4577	os_ << "}\"];\n"
				4578	<< " a" << that << " -> n" << that
				4579	<< " [style=dashed, color=grey, arrowhead=none];\n";
				4580	}
				4581
				4582
				4583	static const bool kPrintDispatchTable = false;
				4584	void DotPrinter::VisitChoice(ChoiceNode* that) {
				4585	if (kPrintDispatchTable) {
				4586	os_ << " n" << that << " [shape=Mrecord, label=\"";
				4587	TableEntryHeaderPrinter header_printer(os_);
				4588	that->GetTable(ignore_case_)->ForEach(&header_printer);
				4589	os_ << "\"]\n";
				4590	PrintAttributes(that);
				4591	TableEntryBodyPrinter body_printer(os_, that);
				4592	that->GetTable(ignore_case_)->ForEach(&body_printer);
				4593	} else {
				4594	os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
				4595	for (int i = 0; i < that->alternatives()->length(); i++) {
				4596	GuardedAlternative alt = that->alternatives()->at(i);
				4597	os_ << " n" << that << " -> n" << alt.node();
				4598	}
				4599	}
				4600	for (int i = 0; i < that->alternatives()->length(); i++) {
				4601	GuardedAlternative alt = that->alternatives()->at(i);
				4602	alt.node()->Accept(this);
				4603	}
				4604	}
				4605
				4606
				4607	void DotPrinter::VisitText(TextNode* that) {
				4608	Zone* zone = that->zone();
				4609	os_ << " n" << that << " [label=\"";
				4610	for (int i = 0; i < that->elements()->length(); i++) {
				4611	if (i > 0) os_ << " ";
				4612	TextElement elm = that->elements()->at(i);
				4613	switch (elm.text_type()) {
				4614	case TextElement::ATOM: {
				4615	Vector<const uc16> data = elm.atom()->data();
				4616	for (int i = 0; i < data.length(); i++) {
				4617	os_ << static_cast<char>(data[i]);
				4618	}
				4619	break;
				4620	}
				4621	case TextElement::CHAR_CLASS: {
				4622	RegExpCharacterClass* node = elm.char_class();
				4623	os_ << "[";
				4624	if (node->is_negated()) os_ << "^";
				4625	for (int j = 0; j < node->ranges(zone)->length(); j++) {
				4626	CharacterRange range = node->ranges(zone)->at(j);
				4627	os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
				4628	}
				4629	os_ << "]";
				4630	break;
				4631	}
				4632	default:
				4633	UNREACHABLE();
				4634	}
				4635	}
				4636	os_ << "\", shape=box, peripheries=2];\n";
				4637	PrintAttributes(that);
				4638	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4639	Visit(that->on_success());
				4640	}
				4641
				4642
				4643	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				4644	os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
				4645	<< that->end_register() << "\", shape=doubleoctagon];\n";
				4646	PrintAttributes(that);
				4647	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4648	Visit(that->on_success());
				4649	}
				4650
				4651
				4652	void DotPrinter::VisitEnd(EndNode* that) {
				4653	os_ << " n" << that << " [style=bold, shape=point];\n";
				4654	PrintAttributes(that);
				4655	}
				4656
				4657
				4658	void DotPrinter::VisitAssertion(AssertionNode* that) {
				4659	os_ << " n" << that << " [";
				4660	switch (that->assertion_type()) {
				4661	case AssertionNode::AT_END:
				4662	os_ << "label=\"$\", shape=septagon";
				4663	break;
				4664	case AssertionNode::AT_START:
				4665	os_ << "label=\"^\", shape=septagon";
				4666	break;
				4667	case AssertionNode::AT_BOUNDARY:
				4668	os_ << "label=\"\\b\", shape=septagon";
				4669	break;
				4670	case AssertionNode::AT_NON_BOUNDARY:
				4671	os_ << "label=\"\\B\", shape=septagon";
				4672	break;
				4673	case AssertionNode::AFTER_NEWLINE:
				4674	os_ << "label=\"(?<=\\n)\", shape=septagon";
				4675	break;
				4676	}
				4677	os_ << "];\n";
				4678	PrintAttributes(that);
				4679	RegExpNode* successor = that->on_success();
				4680	os_ << " n" << that << " -> n" << successor << ";\n";
				4681	Visit(successor);
				4682	}
				4683
				4684
				4685	void DotPrinter::VisitAction(ActionNode* that) {
				4686	os_ << " n" << that << " [";
				4687	switch (that->action_type_) {
				4688	case ActionNode::SET_REGISTER:
				4689	os_ << "label=\"$" << that->data_.u_store_register.reg
				4690	<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
				4691	break;
				4692	case ActionNode::INCREMENT_REGISTER:
				4693	os_ << "label=\"$" << that->data_.u_increment_register.reg
				4694	<< "++\", shape=octagon";
				4695	break;
				4696	case ActionNode::STORE_POSITION:
				4697	os_ << "label=\"$" << that->data_.u_position_register.reg
				4698	<< ":=$pos\", shape=octagon";
				4699	break;
				4700	case ActionNode::BEGIN_SUBMATCH:
				4701	os_ << "label=\"$" << that->data_.u_submatch.current_position_register
				4702	<< ":=$pos,begin\", shape=septagon";
				4703	break;
				4704	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				4705	os_ << "label=\"escape\", shape=septagon";
				4706	break;
				4707	case ActionNode::EMPTY_MATCH_CHECK:
				4708	os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
				4709	<< "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
				4710	<< "<" << that->data_.u_empty_match_check.repetition_limit
				4711	<< "?\", shape=septagon";
				4712	break;
				4713	case ActionNode::CLEAR_CAPTURES: {
				4714	os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
				4715	<< " to $" << that->data_.u_clear_captures.range_to
				4716	<< "\", shape=septagon";
				4717	break;
				4718	}
				4719	}
				4720	os_ << "];\n";
				4721	PrintAttributes(that);
				4722	RegExpNode* successor = that->on_success();
				4723	os_ << " n" << that << " -> n" << successor << ";\n";
				4724	Visit(successor);
				4725	}
				4726
				4727
				4728	class DispatchTableDumper {
				4729	public:
				4730	explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
				4731	void Call(uc16 key, DispatchTable::Entry entry);
				4732	private:
				4733	std::ostream& os_;
				4734	};
				4735
				4736
				4737	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				4738	os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
				4739	OutSet* set = entry.out_set();
				4740	bool first = true;
				4741	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4742	if (set->Get(i)) {
				4743	if (first) {
				4744	first = false;
				4745	} else {
				4746	os_ << ", ";
				4747	}
				4748	os_ << i;
				4749	}
				4750	}
				4751	os_ << "}\n";
				4752	}
				4753
				4754
				4755	void DispatchTable::Dump() {
				4756	OFStream os(stderr);
				4757	DispatchTableDumper dumper(os);
				4758	tree()->ForEach(&dumper);
				4759	}
				4760
				4761
				4762	void RegExpEngine::DotPrint(const char* label,
				4763	RegExpNode* node,
				4764	bool ignore_case) {
				4765	OFStream os(stdout);
				4766	DotPrinter printer(os, ignore_case);
				4767	printer.PrintNode(label, node);
				4768	}
				4769
				4770
				4771	#endif // DEBUG
				4772
				4773
				4774	// -------------------------------------------------------------------
				4775	// Tree to graph conversion
				4776
				4777	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				4778	RegExpNode* on_success) {
				4779	ZoneList<TextElement>* elms =
				4780	new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
				4781	elms->Add(TextElement::Atom(this), compiler->zone());
				4782	return new (compiler->zone())
				4783	TextNode(elms, compiler->read_backward(), on_success);
				4784	}
				4785
				4786
				4787	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				4788	RegExpNode* on_success) {
				4789	return new (compiler->zone())
				4790	TextNode(elements(), compiler->read_backward(), on_success);
				4791	}
				4792
				4793
				4794	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				4795	const int* special_class,
				4796	int length) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4797	length--; // Remove final marker.
				4798	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	4799	DCHECK(ranges->length() != 0);
				4800	DCHECK(length != 0);
				4801	DCHECK(special_class[0] != 0);
				4802	if (ranges->length() != (length >> 1) + 1) {
				4803	return false;
				4804	}
				4805	CharacterRange range = ranges->at(0);
				4806	if (range.from() != 0) {
				4807	return false;
				4808	}
				4809	for (int i = 0; i < length; i += 2) {
				4810	if (special_class[i] != (range.to() + 1)) {
				4811	return false;
				4812	}
				4813	range = ranges->at((i >> 1) + 1);
				4814	if (special_class[i+1] != range.from()) {
				4815	return false;
				4816	}
				4817	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4818	if (range.to() != String::kMaxCodePoint) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	4819	return false;
				4820	}
				4821	return true;
				4822	}
				4823
				4824
				4825	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				4826	const int* special_class,
				4827	int length) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4828	length--; // Remove final marker.
				4829	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	4830	if (ranges->length() * 2 != length) {
				4831	return false;
				4832	}
				4833	for (int i = 0; i < length; i += 2) {
				4834	CharacterRange range = ranges->at(i >> 1);
				4835	if (range.from() != special_class[i] \|\|
				4836	range.to() != special_class[i + 1] - 1) {
				4837	return false;
				4838	}
				4839	}
				4840	return true;
				4841	}
				4842
				4843
				4844	bool RegExpCharacterClass::is_standard(Zone* zone) {
				4845	// TODO(lrn): Remove need for this function, by not throwing away information
				4846	// along the way.
				4847	if (is_negated_) {
				4848	return false;
				4849	}
				4850	if (set_.is_standard()) {
				4851	return true;
				4852	}
				4853	if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4854	set_.set_standard_set_type('s');
				4855	return true;
				4856	}
				4857	if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4858	set_.set_standard_set_type('S');
				4859	return true;
				4860	}
				4861	if (CompareInverseRanges(set_.ranges(zone),
				4862	kLineTerminatorRanges,
				4863	kLineTerminatorRangeCount)) {
				4864	set_.set_standard_set_type('.');
				4865	return true;
				4866	}
				4867	if (CompareRanges(set_.ranges(zone),
				4868	kLineTerminatorRanges,
				4869	kLineTerminatorRangeCount)) {
				4870	set_.set_standard_set_type('n');
				4871	return true;
				4872	}
				4873	if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4874	set_.set_standard_set_type('w');
				4875	return true;
				4876	}
				4877	if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4878	set_.set_standard_set_type('W');
				4879	return true;
				4880	}
				4881	return false;
				4882	}
				4883
				4884
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	4885	UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
				4886	ZoneList<CharacterRange>* base)
				4887	: zone_(zone),
				4888	table_(zone),
				4889	bmp_(nullptr),
				4890	lead_surrogates_(nullptr),
				4891	trail_surrogates_(nullptr),
				4892	non_bmp_(nullptr) {
				4893	// The unicode range splitter categorizes given character ranges into:
				4894	// - Code points from the BMP representable by one code unit.
				4895	// - Code points outside the BMP that need to be split into surrogate pairs.
				4896	// - Lone lead surrogates.
				4897	// - Lone trail surrogates.
				4898	// Lone surrogates are valid code points, even though no actual characters.
				4899	// They require special matching to make sure we do not split surrogate pairs.
				4900	// We use the dispatch table to accomplish this. The base range is split up
				4901	// by the table by the overlay ranges, and the Call callback is used to
				4902	// filter and collect ranges for each category.
				4903	for (int i = 0; i < base->length(); i++) {
				4904	table_.AddRange(base->at(i), kBase, zone_);
				4905	}
				4906	// Add overlay ranges.
				4907	table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
				4908	kBmpCodePoints, zone_);
				4909	table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
				4910	kLeadSurrogates, zone_);
				4911	table_.AddRange(
				4912	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				4913	kTrailSurrogates, zone_);
				4914	table_.AddRange(
				4915	CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
				4916	kBmpCodePoints, zone_);
				4917	table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
				4918	kNonBmpCodePoints, zone_);
				4919	table_.ForEach(this);
				4920	}
				4921
				4922
				4923	void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
				4924	OutSet* outset = entry.out_set();
				4925	if (!outset->Get(kBase)) return;
				4926	ZoneList<CharacterRange>** target = NULL;
				4927	if (outset->Get(kBmpCodePoints)) {
				4928	target = &bmp_;
				4929	} else if (outset->Get(kLeadSurrogates)) {
				4930	target = &lead_surrogates_;
				4931	} else if (outset->Get(kTrailSurrogates)) {
				4932	target = &trail_surrogates_;
				4933	} else {
				4934	DCHECK(outset->Get(kNonBmpCodePoints));
				4935	target = &non_bmp_;
				4936	}
				4937	if (target == NULL) target = new (zone_) ZoneList<CharacterRange>(2, zone_);
				4938	(*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
				4939	}
				4940
				4941
				4942	void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
				4943	RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
				4944	ZoneList<CharacterRange>* bmp = splitter->bmp();
				4945	if (bmp == nullptr) return;
				4946	result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
				4947	compiler->zone(), bmp, compiler->read_backward(), on_success)));
				4948	}
				4949
				4950
				4951	void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
				4952	RegExpNode* on_success,
				4953	UnicodeRangeSplitter* splitter) {
				4954	ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
				4955	if (non_bmp == nullptr) return;
				4956	DCHECK(compiler->unicode());
				4957	DCHECK(!compiler->one_byte());
				4958	Zone* zone = compiler->zone();
				4959	CharacterRange::Canonicalize(non_bmp);
				4960	for (int i = 0; i < non_bmp->length(); i++) {
				4961	// Match surrogate pair.
				4962	// E.g. [\u10005-\u11005] becomes
				4963	// \ud800[\udc05-\udfff]\|
				4964	// [\ud801-\ud803][\udc00-\udfff]\|
				4965	// \ud804[\udc00-\udc05]
				4966	uc32 from = non_bmp->at(i).from();
				4967	uc32 to = non_bmp->at(i).to();
				4968	uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
				4969	uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
				4970	uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
				4971	uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
				4972	if (from_l == to_l) {
				4973	// The lead surrogate is the same.
				4974	result->AddAlternative(
				4975	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4976	zone, CharacterRange::Singleton(from_l),
				4977	CharacterRange::Range(from_t, to_t), compiler->read_backward(),
				4978	on_success)));
				4979	} else {
				4980	if (from_t != kTrailSurrogateStart) {
				4981	// Add [from_l][from_t-\udfff]
				4982	result->AddAlternative(
				4983	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4984	zone, CharacterRange::Singleton(from_l),
				4985	CharacterRange::Range(from_t, kTrailSurrogateEnd),
				4986	compiler->read_backward(), on_success)));
				4987	from_l++;
				4988	}
				4989	if (to_t != kTrailSurrogateEnd) {
				4990	// Add [to_l][\udc00-to_t]
				4991	result->AddAlternative(
				4992	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4993	zone, CharacterRange::Singleton(to_l),
				4994	CharacterRange::Range(kTrailSurrogateStart, to_t),
				4995	compiler->read_backward(), on_success)));
				4996	to_l--;
				4997	}
				4998	if (from_l <= to_l) {
				4999	// Add [from_l-to_l][\udc00-\udfff]
				5000	result->AddAlternative(
				5001	GuardedAlternative(TextNode::CreateForSurrogatePair(
				5002	zone, CharacterRange::Range(from_l, to_l),
				5003	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				5004	compiler->read_backward(), on_success)));
				5005	}
				5006	}
				5007	}
				5008	}
				5009
				5010
				5011	RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
				5012	RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
				5013	ZoneList<CharacterRange>* match, RegExpNode* on_success,
				5014	bool read_backward) {
				5015	Zone* zone = compiler->zone();
				5016	RegExpNode* match_node = TextNode::CreateForCharacterRanges(
				5017	zone, match, read_backward, on_success);
				5018	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5019	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5020	RegExpLookaround::Builder lookaround(false, match_node, stack_register,
				5021	position_register);
				5022	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5023	zone, lookbehind, !read_backward, lookaround.on_match_success());
				5024	return lookaround.ForMatch(negative_match);
				5025	}
				5026
				5027
				5028	RegExpNode* MatchAndNegativeLookaroundInReadDirection(
				5029	RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
				5030	ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
				5031	bool read_backward) {
				5032	Zone* zone = compiler->zone();
				5033	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5034	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5035	RegExpLookaround::Builder lookaround(false, on_success, stack_register,
				5036	position_register);
				5037	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5038	zone, lookahead, read_backward, lookaround.on_match_success());
				5039	return TextNode::CreateForCharacterRanges(
				5040	zone, match, read_backward, lookaround.ForMatch(negative_match));
				5041	}
				5042
				5043
				5044	void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5045	RegExpNode* on_success,
				5046	UnicodeRangeSplitter* splitter) {
				5047	ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
				5048	if (lead_surrogates == nullptr) return;
				5049	Zone* zone = compiler->zone();
				5050	// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
				5051	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				5052	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				5053
				5054	RegExpNode* match;
				5055	if (compiler->read_backward()) {
				5056	// Reading backward. Assert that reading forward, there is no trail
				5057	// surrogate, and then backward match the lead surrogate.
				5058	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5059	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5060	} else {
				5061	// Reading forward. Forward match the lead surrogate and assert that
				5062	// no trail surrogate follows.
				5063	match = MatchAndNegativeLookaroundInReadDirection(
				5064	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5065	}
				5066	result->AddAlternative(GuardedAlternative(match));
				5067	}
				5068
				5069
				5070	void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5071	RegExpNode* on_success,
				5072	UnicodeRangeSplitter* splitter) {
				5073	ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
				5074	if (trail_surrogates == nullptr) return;
				5075	Zone* zone = compiler->zone();
				5076	// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
				5077	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				5078	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				5079
				5080	RegExpNode* match;
				5081	if (compiler->read_backward()) {
				5082	// Reading backward. Backward match the trail surrogate and assert that no
				5083	// lead surrogate precedes it.
				5084	match = MatchAndNegativeLookaroundInReadDirection(
				5085	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5086	} else {
				5087	// Reading forward. Assert that reading backward, there is no lead
				5088	// surrogate, and then forward match the trail surrogate.
				5089	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5090	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5091	}
				5092	result->AddAlternative(GuardedAlternative(match));
				5093	}
				5094
				5095	RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
				5096	RegExpNode* on_success) {
				5097	// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
				5098	DCHECK(!compiler->read_backward());
				5099	Zone* zone = compiler->zone();
				5100	// Advance any character. If the character happens to be a lead surrogate and
				5101	// we advanced into the middle of a surrogate pair, it will work out, as
				5102	// nothing will match from there. We will have to advance again, consuming
				5103	// the associated trail surrogate.
				5104	ZoneList<CharacterRange>* range = CharacterRange::List(
				5105	zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
				5106	return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
				5107	}
				5108
				5109
				5110	void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
				5111	ZoneList<CharacterRange>* ranges) {
				5112	#ifdef V8_I18N_SUPPORT
				5113	// Use ICU to compute the case fold closure over the ranges.
				5114	DCHECK(compiler->unicode());
				5115	DCHECK(compiler->ignore_case());
				5116	USet* set = uset_openEmpty();
				5117	for (int i = 0; i < ranges->length(); i++) {
				5118	uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
				5119	}
				5120	ranges->Clear();
				5121	uset_closeOver(set, USET_CASE_INSENSITIVE);
				5122	// Full case mapping map single characters to multiple characters.
				5123	// Those are represented as strings in the set. Remove them so that
				5124	// we end up with only simple and common case mappings.
				5125	uset_removeAllStrings(set);
				5126	int item_count = uset_getItemCount(set);
				5127	int item_result = 0;
				5128	UErrorCode ec = U_ZERO_ERROR;
				5129	Zone* zone = compiler->zone();
				5130	for (int i = 0; i < item_count; i++) {
				5131	uc32 start = 0;
				5132	uc32 end = 0;
				5133	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
				5134	ranges->Add(CharacterRange::Range(start, end), zone);
				5135	}
				5136	// No errors and everything we collected have been ranges.
				5137	DCHECK_EQ(U_ZERO_ERROR, ec);
				5138	DCHECK_EQ(0, item_result);
				5139	uset_close(set);
				5140	#else
				5141	// Fallback if ICU is not included.
				5142	CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
				5143	ranges, compiler->one_byte());
				5144	#endif // V8_I18N_SUPPORT
				5145	CharacterRange::Canonicalize(ranges);
				5146	}
				5147
				5148
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5149	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				5150	RegExpNode* on_success) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5151	set_.Canonicalize();
				5152	Zone* zone = compiler->zone();
				5153	ZoneList<CharacterRange>* ranges = this->ranges(zone);
				5154	if (compiler->unicode() && compiler->ignore_case()) {
				5155	AddUnicodeCaseEquivalents(compiler, ranges);
				5156	}
				5157	if (compiler->unicode() && !compiler->one_byte()) {
				5158	if (is_negated()) {
				5159	ZoneList<CharacterRange>* negated =
				5160	new (zone) ZoneList<CharacterRange>(2, zone);
				5161	CharacterRange::Negate(ranges, negated, zone);
				5162	ranges = negated;
				5163	}
				5164	if (ranges->length() == 0) {
Ben Murdoch	bcf72ee	2016-08-08 18:44:38 +0100	[diff] [blame]	5165	ranges->Add(CharacterRange::Everything(), zone);
				5166	RegExpCharacterClass* fail =
				5167	new (zone) RegExpCharacterClass(ranges, true);
				5168	return new (zone) TextNode(fail, compiler->read_backward(), on_success);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5169	}
				5170	if (standard_type() == '*') {
				5171	return UnanchoredAdvance(compiler, on_success);
				5172	} else {
				5173	ChoiceNode* result = new (zone) ChoiceNode(2, zone);
				5174	UnicodeRangeSplitter splitter(zone, ranges);
				5175	AddBmpCharacters(compiler, result, on_success, &splitter);
				5176	AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
				5177	AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
				5178	AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
				5179	return result;
				5180	}
				5181	} else {
				5182	return new (zone) TextNode(this, compiler->read_backward(), on_success);
				5183	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5184	}
				5185
				5186
				5187	int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
				5188	RegExpAtom* atom1 = (*a)->AsAtom();
				5189	RegExpAtom* atom2 = (*b)->AsAtom();
				5190	uc16 character1 = atom1->data().at(0);
				5191	uc16 character2 = atom2->data().at(0);
				5192	if (character1 < character2) return -1;
				5193	if (character1 > character2) return 1;
				5194	return 0;
				5195	}
				5196
				5197
				5198	static unibrow::uchar Canonical(
				5199	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5200	unibrow::uchar c) {
				5201	unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
				5202	int length = canonicalize->get(c, '\0', chars);
				5203	DCHECK_LE(length, 1);
				5204	unibrow::uchar canonical = c;
				5205	if (length == 1) canonical = chars[0];
				5206	return canonical;
				5207	}
				5208
				5209
				5210	int CompareFirstCharCaseIndependent(
				5211	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5212	RegExpTree* const* a, RegExpTree* const* b) {
				5213	RegExpAtom* atom1 = (*a)->AsAtom();
				5214	RegExpAtom* atom2 = (*b)->AsAtom();
				5215	unibrow::uchar character1 = atom1->data().at(0);
				5216	unibrow::uchar character2 = atom2->data().at(0);
				5217	if (character1 == character2) return 0;
				5218	if (character1 >= 'a' \|\| character2 >= 'a') {
				5219	character1 = Canonical(canonicalize, character1);
				5220	character2 = Canonical(canonicalize, character2);
				5221	}
				5222	return static_cast<int>(character1) - static_cast<int>(character2);
				5223	}
				5224
				5225
				5226	// We can stable sort runs of atoms, since the order does not matter if they
				5227	// start with different characters.
				5228	// Returns true if any consecutive atoms were found.
				5229	bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
				5230	ZoneList<RegExpTree> alternatives = this->alternatives();
				5231	int length = alternatives->length();
				5232	bool found_consecutive_atoms = false;
				5233	for (int i = 0; i < length; i++) {
				5234	while (i < length) {
				5235	RegExpTree* alternative = alternatives->at(i);
				5236	if (alternative->IsAtom()) break;
				5237	i++;
				5238	}
				5239	// i is length or it is the index of an atom.
				5240	if (i == length) break;
				5241	int first_atom = i;
				5242	i++;
				5243	while (i < length) {
				5244	RegExpTree* alternative = alternatives->at(i);
				5245	if (!alternative->IsAtom()) break;
				5246	i++;
				5247	}
				5248	// Sort atoms to get ones with common prefixes together.
				5249	// This step is more tricky if we are in a case-independent regexp,
				5250	// because it would change /is\|I/ to /I\|is/, and order matters when
				5251	// the regexp parts don't match only disjoint starting points. To fix
				5252	// this we have a version of CompareFirstChar that uses case-
				5253	// independent character classes for comparison.
				5254	DCHECK_LT(first_atom, alternatives->length());
				5255	DCHECK_LE(i, alternatives->length());
				5256	DCHECK_LE(first_atom, i);
				5257	if (compiler->ignore_case()) {
				5258	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5259	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5260	auto compare_closure =
				5261	[canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
				5262	return CompareFirstCharCaseIndependent(canonicalize, a, b);
				5263	};
				5264	alternatives->StableSort(compare_closure, first_atom, i - first_atom);
				5265	} else {
				5266	alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
				5267	}
				5268	if (i - first_atom > 1) found_consecutive_atoms = true;
				5269	}
				5270	return found_consecutive_atoms;
				5271	}
				5272
				5273
				5274	// Optimizes ab\|ac\|az to a(?:b\|c\|d).
				5275	void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
				5276	Zone* zone = compiler->zone();
				5277	ZoneList<RegExpTree> alternatives = this->alternatives();
				5278	int length = alternatives->length();
				5279
				5280	int write_posn = 0;
				5281	int i = 0;
				5282	while (i < length) {
				5283	RegExpTree* alternative = alternatives->at(i);
				5284	if (!alternative->IsAtom()) {
				5285	alternatives->at(write_posn++) = alternatives->at(i);
				5286	i++;
				5287	continue;
				5288	}
				5289	RegExpAtom* atom = alternative->AsAtom();
				5290	unibrow::uchar common_prefix = atom->data().at(0);
				5291	int first_with_prefix = i;
				5292	int prefix_length = atom->length();
				5293	i++;
				5294	while (i < length) {
				5295	alternative = alternatives->at(i);
				5296	if (!alternative->IsAtom()) break;
				5297	atom = alternative->AsAtom();
				5298	unibrow::uchar new_prefix = atom->data().at(0);
				5299	if (new_prefix != common_prefix) {
				5300	if (!compiler->ignore_case()) break;
				5301	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5302	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5303	new_prefix = Canonical(canonicalize, new_prefix);
				5304	common_prefix = Canonical(canonicalize, common_prefix);
				5305	if (new_prefix != common_prefix) break;
				5306	}
				5307	prefix_length = Min(prefix_length, atom->length());
				5308	i++;
				5309	}
				5310	if (i > first_with_prefix + 2) {
				5311	// Found worthwhile run of alternatives with common prefix of at least one
				5312	// character. The sorting function above did not sort on more than one
				5313	// character for reasons of correctness, but there may still be a longer
				5314	// common prefix if the terms were similar or presorted in the input.
				5315	// Find out how long the common prefix is.
				5316	int run_length = i - first_with_prefix;
				5317	atom = alternatives->at(first_with_prefix)->AsAtom();
				5318	for (int j = 1; j < run_length && prefix_length > 1; j++) {
				5319	RegExpAtom* old_atom =
				5320	alternatives->at(j + first_with_prefix)->AsAtom();
				5321	for (int k = 1; k < prefix_length; k++) {
				5322	if (atom->data().at(k) != old_atom->data().at(k)) {
				5323	prefix_length = k;
				5324	break;
				5325	}
				5326	}
				5327	}
				5328	RegExpAtom* prefix =
				5329	new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
				5330	ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);
				5331	pair->Add(prefix, zone);
				5332	ZoneList<RegExpTree> suffixes =
				5333	new (zone) ZoneList<RegExpTree*>(run_length, zone);
				5334	for (int j = 0; j < run_length; j++) {
				5335	RegExpAtom* old_atom =
				5336	alternatives->at(j + first_with_prefix)->AsAtom();
				5337	int len = old_atom->length();
				5338	if (len == prefix_length) {
				5339	suffixes->Add(new (zone) RegExpEmpty(), zone);
				5340	} else {
				5341	RegExpTree* suffix = new (zone) RegExpAtom(
				5342	old_atom->data().SubVector(prefix_length, old_atom->length()));
				5343	suffixes->Add(suffix, zone);
				5344	}
				5345	}
				5346	pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
				5347	alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
				5348	} else {
				5349	// Just copy any non-worthwhile alternatives.
				5350	for (int j = first_with_prefix; j < i; j++) {
				5351	alternatives->at(write_posn++) = alternatives->at(j);
				5352	}
				5353	}
				5354	}
				5355	alternatives->Rewind(write_posn); // Trim end of array.
				5356	}
				5357
				5358
				5359	// Optimizes b\|c\|z to [bcz].
				5360	void RegExpDisjunction::FixSingleCharacterDisjunctions(
				5361	RegExpCompiler* compiler) {
				5362	Zone* zone = compiler->zone();
				5363	ZoneList<RegExpTree> alternatives = this->alternatives();
				5364	int length = alternatives->length();
				5365
				5366	int write_posn = 0;
				5367	int i = 0;
				5368	while (i < length) {
				5369	RegExpTree* alternative = alternatives->at(i);
				5370	if (!alternative->IsAtom()) {
				5371	alternatives->at(write_posn++) = alternatives->at(i);
				5372	i++;
				5373	continue;
				5374	}
				5375	RegExpAtom* atom = alternative->AsAtom();
				5376	if (atom->length() != 1) {
				5377	alternatives->at(write_posn++) = alternatives->at(i);
				5378	i++;
				5379	continue;
				5380	}
				5381	int first_in_run = i;
				5382	i++;
				5383	while (i < length) {
				5384	alternative = alternatives->at(i);
				5385	if (!alternative->IsAtom()) break;
				5386	atom = alternative->AsAtom();
				5387	if (atom->length() != 1) break;
				5388	i++;
				5389	}
				5390	if (i > first_in_run + 1) {
				5391	// Found non-trivial run of single-character alternatives.
				5392	int run_length = i - first_in_run;
				5393	ZoneList<CharacterRange>* ranges =
				5394	new (zone) ZoneList<CharacterRange>(2, zone);
				5395	for (int j = 0; j < run_length; j++) {
				5396	RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
				5397	DCHECK_EQ(old_atom->length(), 1);
				5398	ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
				5399	}
				5400	alternatives->at(write_posn++) =
				5401	new (zone) RegExpCharacterClass(ranges, false);
				5402	} else {
				5403	// Just copy any trivial alternatives.
				5404	for (int j = first_in_run; j < i; j++) {
				5405	alternatives->at(write_posn++) = alternatives->at(j);
				5406	}
				5407	}
				5408	}
				5409	alternatives->Rewind(write_posn); // Trim end of array.
				5410	}
				5411
				5412
				5413	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				5414	RegExpNode* on_success) {
				5415	ZoneList<RegExpTree> alternatives = this->alternatives();
				5416
				5417	if (alternatives->length() > 2) {
				5418	bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
				5419	if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
				5420	FixSingleCharacterDisjunctions(compiler);
				5421	if (alternatives->length() == 1) {
				5422	return alternatives->at(0)->ToNode(compiler, on_success);
				5423	}
				5424	}
				5425
				5426	int length = alternatives->length();
				5427
				5428	ChoiceNode* result =
				5429	new(compiler->zone()) ChoiceNode(length, compiler->zone());
				5430	for (int i = 0; i < length; i++) {
				5431	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				5432	on_success));
				5433	result->AddAlternative(alternative);
				5434	}
				5435	return result;
				5436	}
				5437
				5438
				5439	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				5440	RegExpNode* on_success) {
				5441	return ToNode(min(),
				5442	max(),
				5443	is_greedy(),
				5444	body(),
				5445	compiler,
				5446	on_success);
				5447	}
				5448
				5449
				5450	// Scoped object to keep track of how much we unroll quantifier loops in the
				5451	// regexp graph generator.
				5452	class RegExpExpansionLimiter {
				5453	public:
				5454	static const int kMaxExpansionFactor = 6;
				5455	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				5456	: compiler_(compiler),
				5457	saved_expansion_factor_(compiler->current_expansion_factor()),
				5458	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				5459	DCHECK(factor > 0);
				5460	if (ok_to_expand_) {
				5461	if (factor > kMaxExpansionFactor) {
				5462	// Avoid integer overflow of the current expansion factor.
				5463	ok_to_expand_ = false;
				5464	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				5465	} else {
				5466	int new_factor = saved_expansion_factor_ * factor;
				5467	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				5468	compiler->set_current_expansion_factor(new_factor);
				5469	}
				5470	}
				5471	}
				5472
				5473	~RegExpExpansionLimiter() {
				5474	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				5475	}
				5476
				5477	bool ok_to_expand() { return ok_to_expand_; }
				5478
				5479	private:
				5480	RegExpCompiler* compiler_;
				5481	int saved_expansion_factor_;
				5482	bool ok_to_expand_;
				5483
				5484	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				5485	};
				5486
				5487
				5488	RegExpNode* RegExpQuantifier::ToNode(int min,
				5489	int max,
				5490	bool is_greedy,
				5491	RegExpTree* body,
				5492	RegExpCompiler* compiler,
				5493	RegExpNode* on_success,
				5494	bool not_at_start) {
				5495	// x{f, t} becomes this:
				5496	//
				5497	// (r++)<-.
				5498	// \| `
				5499	// \| (x)
				5500	// v ^
				5501	// (r=0)-->(?)---/ [if r < t]
				5502	// \|
				5503	// [if r >= f] \----> ...
				5504	//
				5505
				5506	// 15.10.2.5 RepeatMatcher algorithm.
				5507	// The parser has already eliminated the case where max is 0. In the case
				5508	// where max_match is zero the parser has removed the quantifier if min was
				5509	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				5510
				5511	// If we know that we cannot match zero length then things are a little
				5512	// simpler since we don't need to make the special zero length match check
				5513	// from step 2.1. If the min and max are small we can unroll a little in
				5514	// this case.
				5515	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				5516	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				5517	if (max == 0) return on_success; // This can happen due to recursion.
				5518	bool body_can_be_empty = (body->min_match() == 0);
				5519	int body_start_reg = RegExpCompiler::kNoRegister;
				5520	Interval capture_registers = body->CaptureRegisters();
				5521	bool needs_capture_clearing = !capture_registers.is_empty();
				5522	Zone* zone = compiler->zone();
				5523
				5524	if (body_can_be_empty) {
				5525	body_start_reg = compiler->AllocateRegister();
				5526	} else if (compiler->optimize() && !needs_capture_clearing) {
				5527	// Only unroll if there are no captures and the body can't be
				5528	// empty.
				5529	{
				5530	RegExpExpansionLimiter limiter(
				5531	compiler, min + ((max != min) ? 1 : 0));
				5532	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				5533	int new_max = (max == kInfinity) ? max : max - min;
				5534	// Recurse once to get the loop or optional matches after the fixed
				5535	// ones.
				5536	RegExpNode* answer = ToNode(
				5537	0, new_max, is_greedy, body, compiler, on_success, true);
				5538	// Unroll the forced matches from 0 to min. This can cause chains of
				5539	// TextNodes (which the parser does not generate). These should be
				5540	// combined if it turns out they hinder good code generation.
				5541	for (int i = 0; i < min; i++) {
				5542	answer = body->ToNode(compiler, answer);
				5543	}
				5544	return answer;
				5545	}
				5546	}
				5547	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				5548	DCHECK(max > 0); // Due to the 'if' above.
				5549	RegExpExpansionLimiter limiter(compiler, max);
				5550	if (limiter.ok_to_expand()) {
				5551	// Unroll the optional matches up to max.
				5552	RegExpNode* answer = on_success;
				5553	for (int i = 0; i < max; i++) {
				5554	ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
				5555	if (is_greedy) {
				5556	alternation->AddAlternative(
				5557	GuardedAlternative(body->ToNode(compiler, answer)));
				5558	alternation->AddAlternative(GuardedAlternative(on_success));
				5559	} else {
				5560	alternation->AddAlternative(GuardedAlternative(on_success));
				5561	alternation->AddAlternative(
				5562	GuardedAlternative(body->ToNode(compiler, answer)));
				5563	}
				5564	answer = alternation;
				5565	if (not_at_start && !compiler->read_backward()) {
				5566	alternation->set_not_at_start();
				5567	}
				5568	}
				5569	return answer;
				5570	}
				5571	}
				5572	}
				5573	bool has_min = min > 0;
				5574	bool has_max = max < RegExpTree::kInfinity;
				5575	bool needs_counter = has_min \|\| has_max;
				5576	int reg_ctr = needs_counter
				5577	? compiler->AllocateRegister()
				5578	: RegExpCompiler::kNoRegister;
				5579	LoopChoiceNode* center = new (zone)
				5580	LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
				5581	if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
				5582	RegExpNode* loop_return = needs_counter
				5583	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				5584	: static_cast<RegExpNode*>(center);
				5585	if (body_can_be_empty) {
				5586	// If the body can be empty we need to check if it was and then
				5587	// backtrack.
				5588	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				5589	reg_ctr,
				5590	min,
				5591	loop_return);
				5592	}
				5593	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				5594	if (body_can_be_empty) {
				5595	// If the body can be empty we need to store the start position
				5596	// so we can bail out if it was empty.
				5597	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				5598	}
				5599	if (needs_capture_clearing) {
				5600	// Before entering the body of this loop we need to clear captures.
				5601	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				5602	}
				5603	GuardedAlternative body_alt(body_node);
				5604	if (has_max) {
				5605	Guard* body_guard =
				5606	new(zone) Guard(reg_ctr, Guard::LT, max);
				5607	body_alt.AddGuard(body_guard, zone);
				5608	}
				5609	GuardedAlternative rest_alt(on_success);
				5610	if (has_min) {
				5611	Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
				5612	rest_alt.AddGuard(rest_guard, zone);
				5613	}
				5614	if (is_greedy) {
				5615	center->AddLoopAlternative(body_alt);
				5616	center->AddContinueAlternative(rest_alt);
				5617	} else {
				5618	center->AddContinueAlternative(rest_alt);
				5619	center->AddLoopAlternative(body_alt);
				5620	}
				5621	if (needs_counter) {
				5622	return ActionNode::SetRegister(reg_ctr, 0, center);
				5623	} else {
				5624	return center;
				5625	}
				5626	}
				5627
				5628
				5629	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				5630	RegExpNode* on_success) {
				5631	NodeInfo info;
				5632	Zone* zone = compiler->zone();
				5633
				5634	switch (assertion_type()) {
				5635	case START_OF_LINE:
				5636	return AssertionNode::AfterNewline(on_success);
				5637	case START_OF_INPUT:
				5638	return AssertionNode::AtStart(on_success);
				5639	case BOUNDARY:
				5640	return AssertionNode::AtBoundary(on_success);
				5641	case NON_BOUNDARY:
				5642	return AssertionNode::AtNonBoundary(on_success);
				5643	case END_OF_INPUT:
				5644	return AssertionNode::AtEnd(on_success);
				5645	case END_OF_LINE: {
				5646	// Compile $ in multiline regexps as an alternation with a positive
				5647	// lookahead in one side and an end-of-input on the other side.
				5648	// We need two registers for the lookahead.
				5649	int stack_pointer_register = compiler->AllocateRegister();
				5650	int position_register = compiler->AllocateRegister();
				5651	// The ChoiceNode to distinguish between a newline and end-of-input.
				5652	ChoiceNode* result = new(zone) ChoiceNode(2, zone);
				5653	// Create a newline atom.
				5654	ZoneList<CharacterRange>* newline_ranges =
				5655	new(zone) ZoneList<CharacterRange>(3, zone);
				5656	CharacterRange::AddClassEscape('n', newline_ranges, zone);
				5657	RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
				5658	TextNode* newline_matcher = new (zone) TextNode(
				5659	newline_atom, false, ActionNode::PositiveSubmatchSuccess(
				5660	stack_pointer_register, position_register,
				5661	0, // No captures inside.
				5662	-1, // Ignored if no captures.
				5663	on_success));
				5664	// Create an end-of-input matcher.
				5665	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				5666	stack_pointer_register,
				5667	position_register,
				5668	newline_matcher);
				5669	// Add the two alternatives to the ChoiceNode.
				5670	GuardedAlternative eol_alternative(end_of_line);
				5671	result->AddAlternative(eol_alternative);
				5672	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				5673	result->AddAlternative(end_alternative);
				5674	return result;
				5675	}
				5676	default:
				5677	UNREACHABLE();
				5678	}
				5679	return on_success;
				5680	}
				5681
				5682
				5683	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				5684	RegExpNode* on_success) {
				5685	return new (compiler->zone())
				5686	BackReferenceNode(RegExpCapture::StartRegister(index()),
				5687	RegExpCapture::EndRegister(index()),
				5688	compiler->read_backward(), on_success);
				5689	}
				5690
				5691
				5692	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				5693	RegExpNode* on_success) {
				5694	return on_success;
				5695	}
				5696
				5697
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5698	RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
				5699	int stack_pointer_register,
				5700	int position_register,
				5701	int capture_register_count,
				5702	int capture_register_start)
				5703	: is_positive_(is_positive),
				5704	on_success_(on_success),
				5705	stack_pointer_register_(stack_pointer_register),
				5706	position_register_(position_register) {
				5707	if (is_positive_) {
				5708	on_match_success_ = ActionNode::PositiveSubmatchSuccess(
				5709	stack_pointer_register, position_register, capture_register_count,
				5710	capture_register_start, on_success_);
				5711	} else {
				5712	Zone* zone = on_success_->zone();
				5713	on_match_success_ = new (zone) NegativeSubmatchSuccess(
				5714	stack_pointer_register, position_register, capture_register_count,
				5715	capture_register_start, zone);
				5716	}
				5717	}
				5718
				5719
				5720	RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
				5721	if (is_positive_) {
				5722	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5723	position_register_, match);
				5724	} else {
				5725	Zone* zone = on_success_->zone();
				5726	// We use a ChoiceNode to represent the negative lookaround. The first
				5727	// alternative is the negative match. On success, the end node backtracks.
				5728	// On failure, the second alternative is tried and leads to success.
				5729	// NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
				5730	// first exit when calculating quick checks.
				5731	ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
				5732	GuardedAlternative(match), GuardedAlternative(on_success_), zone);
				5733	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5734	position_register_, choice_node);
				5735	}
				5736	}
				5737
				5738
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5739	RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
				5740	RegExpNode* on_success) {
				5741	int stack_pointer_register = compiler->AllocateRegister();
				5742	int position_register = compiler->AllocateRegister();
				5743
				5744	const int registers_per_capture = 2;
				5745	const int register_of_first_capture = 2;
				5746	int register_count = capture_count_ * registers_per_capture;
				5747	int register_start =
				5748	register_of_first_capture + capture_from_ * registers_per_capture;
				5749
				5750	RegExpNode* result;
				5751	bool was_reading_backward = compiler->read_backward();
				5752	compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5753	Builder builder(is_positive(), on_success, stack_pointer_register,
				5754	position_register, register_count, register_start);
				5755	RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
				5756	result = builder.ForMatch(match);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5757	compiler->set_read_backward(was_reading_backward);
				5758	return result;
				5759	}
				5760
				5761
				5762	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				5763	RegExpNode* on_success) {
				5764	return ToNode(body(), index(), compiler, on_success);
				5765	}
				5766
				5767
				5768	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				5769	int index,
				5770	RegExpCompiler* compiler,
				5771	RegExpNode* on_success) {
				5772	DCHECK_NOT_NULL(body);
				5773	int start_reg = RegExpCapture::StartRegister(index);
				5774	int end_reg = RegExpCapture::EndRegister(index);
				5775	if (compiler->read_backward()) std::swap(start_reg, end_reg);
				5776	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				5777	RegExpNode* body_node = body->ToNode(compiler, store_end);
				5778	return ActionNode::StorePosition(start_reg, true, body_node);
				5779	}
				5780
				5781
				5782	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				5783	RegExpNode* on_success) {
				5784	ZoneList<RegExpTree> children = nodes();
				5785	RegExpNode* current = on_success;
				5786	if (compiler->read_backward()) {
				5787	for (int i = 0; i < children->length(); i++) {
				5788	current = children->at(i)->ToNode(compiler, current);
				5789	}
				5790	} else {
				5791	for (int i = children->length() - 1; i >= 0; i--) {
				5792	current = children->at(i)->ToNode(compiler, current);
				5793	}
				5794	}
				5795	return current;
				5796	}
				5797
				5798
				5799	static void AddClass(const int* elmv,
				5800	int elmc,
				5801	ZoneList<CharacterRange>* ranges,
				5802	Zone* zone) {
				5803	elmc--;
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5804	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5805	for (int i = 0; i < elmc; i += 2) {
				5806	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5807	ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5808	}
				5809	}
				5810
				5811
				5812	static void AddClassNegated(const int *elmv,
				5813	int elmc,
				5814	ZoneList<CharacterRange>* ranges,
				5815	Zone* zone) {
				5816	elmc--;
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5817	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5818	DCHECK(elmv[0] != 0x0000);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5819	DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5820	uc16 last = 0x0000;
				5821	for (int i = 0; i < elmc; i += 2) {
				5822	DCHECK(last <= elmv[i] - 1);
				5823	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5824	ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5825	last = elmv[i + 1];
				5826	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5827	ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5828	}
				5829
				5830
				5831	void CharacterRange::AddClassEscape(uc16 type,
				5832	ZoneList<CharacterRange>* ranges,
				5833	Zone* zone) {
				5834	switch (type) {
				5835	case 's':
				5836	AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5837	break;
				5838	case 'S':
				5839	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5840	break;
				5841	case 'w':
				5842	AddClass(kWordRanges, kWordRangeCount, ranges, zone);
				5843	break;
				5844	case 'W':
				5845	AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
				5846	break;
				5847	case 'd':
				5848	AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
				5849	break;
				5850	case 'D':
				5851	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
				5852	break;
				5853	case '.':
				5854	AddClassNegated(kLineTerminatorRanges,
				5855	kLineTerminatorRangeCount,
				5856	ranges,
				5857	zone);
				5858	break;
				5859	// This is not a character range as defined by the spec but a
				5860	// convenient shorthand for a character class that matches any
				5861	// character.
				5862	case '*':
				5863	ranges->Add(CharacterRange::Everything(), zone);
				5864	break;
				5865	// This is the set of characters matched by the $ and ^ symbols
				5866	// in multiline mode.
				5867	case 'n':
				5868	AddClass(kLineTerminatorRanges,
				5869	kLineTerminatorRangeCount,
				5870	ranges,
				5871	zone);
				5872	break;
				5873	default:
				5874	UNREACHABLE();
				5875	}
				5876	}
				5877
				5878
				5879	Vector<const int> CharacterRange::GetWordBounds() {
				5880	return Vector<const int>(kWordRanges, kWordRangeCount - 1);
				5881	}
				5882
				5883
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5884	void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
				5885	ZoneList<CharacterRange>* ranges,
				5886	bool is_one_byte) {
Ben Murdoch	bcf72ee	2016-08-08 18:44:38 +0100	[diff] [blame]	5887	CharacterRange::Canonicalize(ranges);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5888	int range_count = ranges->length();
				5889	for (int i = 0; i < range_count; i++) {
				5890	CharacterRange range = ranges->at(i);
				5891	uc32 bottom = range.from();
				5892	if (bottom > String::kMaxUtf16CodeUnit) return;
				5893	uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
				5894	// Nothing to be done for surrogates.
				5895	if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
				5896	if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
				5897	if (bottom > String::kMaxOneByteCharCode) return;
				5898	if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5899	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5900	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5901	if (top == bottom) {
				5902	// If this is a singleton we just expand the one character.
				5903	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5904	for (int i = 0; i < length; i++) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5905	uc32 chr = chars[i];
				5906	if (chr != bottom) {
				5907	ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5908	}
				5909	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	5910	} else {
				5911	// If this is a range we expand the characters block by block, expanding
				5912	// contiguous subranges (blocks) one at a time. The approach is as
				5913	// follows. For a given start character we look up the remainder of the
				5914	// block that contains it (represented by the end point), for instance we
				5915	// find 'z' if the character is 'c'. A block is characterized by the
				5916	// property that all characters uncanonicalize in the same way, except
				5917	// that each entry in the result is incremented by the distance from the
				5918	// first element. So a-z is a block because 'a' uncanonicalizes to ['a',
				5919	// 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
				5920	// we've found the end point we look up its uncanonicalization and
				5921	// produce a range for each element. For instance for [c-f] we look up
				5922	// ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
				5923	// it is not already contained in the input, so [c-f] will be skipped but
				5924	// [C-F] will be added. If this range is not completely contained in a
				5925	// block we do this for all the blocks covered by the range (handling
				5926	// characters that is not in a block as a "singleton block").
				5927	unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5928	int pos = bottom;
				5929	while (pos <= top) {
				5930	int length =
				5931	isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
				5932	uc32 block_end;
				5933	if (length == 0) {
				5934	block_end = pos;
				5935	} else {
				5936	DCHECK_EQ(1, length);
				5937	block_end = equivalents[0];
				5938	}
				5939	int end = (block_end > top) ? top : block_end;
				5940	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
				5941	equivalents);
				5942	for (int i = 0; i < length; i++) {
				5943	uc32 c = equivalents[i];
				5944	uc32 range_from = c - (block_end - pos);
				5945	uc32 range_to = c - (block_end - end);
				5946	if (!(bottom <= range_from && range_to <= top)) {
				5947	ranges->Add(CharacterRange::Range(range_from, range_to), zone);
				5948	}
				5949	}
				5950	pos = end + 1;
				5951	}
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	5952	}
				5953	}
				5954	}
				5955
				5956
				5957	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				5958	DCHECK_NOT_NULL(ranges);
				5959	int n = ranges->length();
				5960	if (n <= 1) return true;
				5961	int max = ranges->at(0).to();
				5962	for (int i = 1; i < n; i++) {
				5963	CharacterRange next_range = ranges->at(i);
				5964	if (next_range.from() <= max + 1) return false;
				5965	max = next_range.to();
				5966	}
				5967	return true;
				5968	}
				5969
				5970
				5971	ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
				5972	if (ranges_ == NULL) {
				5973	ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
				5974	CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
				5975	}
				5976	return ranges_;
				5977	}
				5978
				5979
				5980	// Move a number of elements in a zonelist to another position
				5981	// in the same list. Handles overlapping source and target areas.
				5982	static void MoveRanges(ZoneList<CharacterRange>* list,
				5983	int from,
				5984	int to,
				5985	int count) {
				5986	// Ranges are potentially overlapping.
				5987	if (from < to) {
				5988	for (int i = count - 1; i >= 0; i--) {
				5989	list->at(to + i) = list->at(from + i);
				5990	}
				5991	} else {
				5992	for (int i = 0; i < count; i++) {
				5993	list->at(to + i) = list->at(from + i);
				5994	}
				5995	}
				5996	}
				5997
				5998
				5999	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				6000	int count,
				6001	CharacterRange insert) {
				6002	// Inserts a range into list[0..count[, which must be sorted
				6003	// by from value and non-overlapping and non-adjacent, using at most
				6004	// list[0..count] for the result. Returns the number of resulting
				6005	// canonicalized ranges. Inserting a range may collapse existing ranges into
				6006	// fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6007	uc32 from = insert.from();
				6008	uc32 to = insert.to();
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6009	int start_pos = 0;
				6010	int end_pos = count;
				6011	for (int i = count - 1; i >= 0; i--) {
				6012	CharacterRange current = list->at(i);
				6013	if (current.from() > to + 1) {
				6014	end_pos = i;
				6015	} else if (current.to() + 1 < from) {
				6016	start_pos = i + 1;
				6017	break;
				6018	}
				6019	}
				6020
				6021	// Inserted range overlaps, or is adjacent to, ranges at positions
				6022	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				6023	// not affected by the insertion.
				6024	// If start_pos == end_pos, the range must be inserted before start_pos.
				6025	// if start_pos < end_pos, the entire range from start_pos to end_pos
				6026	// must be merged with the insert range.
				6027
				6028	if (start_pos == end_pos) {
				6029	// Insert between existing ranges at position start_pos.
				6030	if (start_pos < count) {
				6031	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				6032	}
				6033	list->at(start_pos) = insert;
				6034	return count + 1;
				6035	}
				6036	if (start_pos + 1 == end_pos) {
				6037	// Replace single existing range at position start_pos.
				6038	CharacterRange to_replace = list->at(start_pos);
				6039	int new_from = Min(to_replace.from(), from);
				6040	int new_to = Max(to_replace.to(), to);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6041	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6042	return count;
				6043	}
				6044	// Replace a number of existing ranges from start_pos to end_pos - 1.
				6045	// Move the remaining ranges down.
				6046
				6047	int new_from = Min(list->at(start_pos).from(), from);
				6048	int new_to = Max(list->at(end_pos - 1).to(), to);
				6049	if (end_pos < count) {
				6050	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				6051	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6052	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6053	return count - (end_pos - start_pos) + 1;
				6054	}
				6055
				6056
				6057	void CharacterSet::Canonicalize() {
				6058	// Special/default classes are always considered canonical. The result
				6059	// of calling ranges() will be sorted.
				6060	if (ranges_ == NULL) return;
				6061	CharacterRange::Canonicalize(ranges_);
				6062	}
				6063
				6064
				6065	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				6066	if (character_ranges->length() <= 1) return;
				6067	// Check whether ranges are already canonical (increasing, non-overlapping,
				6068	// non-adjacent).
				6069	int n = character_ranges->length();
				6070	int max = character_ranges->at(0).to();
				6071	int i = 1;
				6072	while (i < n) {
				6073	CharacterRange current = character_ranges->at(i);
				6074	if (current.from() <= max + 1) {
				6075	break;
				6076	}
				6077	max = current.to();
				6078	i++;
				6079	}
				6080	// Canonical until the i'th range. If that's all of them, we are done.
				6081	if (i == n) return;
				6082
				6083	// The ranges at index i and forward are not canonicalized. Make them so by
				6084	// doing the equivalent of insertion sort (inserting each into the previous
				6085	// list, in order).
				6086	// Notice that inserting a range can reduce the number of ranges in the
				6087	// result due to combining of adjacent and overlapping ranges.
				6088	int read = i; // Range to insert.
				6089	int num_canonical = i; // Length of canonicalized part of list.
				6090	do {
				6091	num_canonical = InsertRangeInCanonicalList(character_ranges,
				6092	num_canonical,
				6093	character_ranges->at(read));
				6094	read++;
				6095	} while (read < n);
				6096	character_ranges->Rewind(num_canonical);
				6097
				6098	DCHECK(CharacterRange::IsCanonical(character_ranges));
				6099	}
				6100
				6101
				6102	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				6103	ZoneList<CharacterRange>* negated_ranges,
				6104	Zone* zone) {
				6105	DCHECK(CharacterRange::IsCanonical(ranges));
				6106	DCHECK_EQ(0, negated_ranges->length());
				6107	int range_count = ranges->length();
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6108	uc32 from = 0;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6109	int i = 0;
				6110	if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6111	from = ranges->at(0).to() + 1;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6112	i = 1;
				6113	}
				6114	while (i < range_count) {
				6115	CharacterRange range = ranges->at(i);
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6116	negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
				6117	from = range.to() + 1;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6118	i++;
				6119	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6120	if (from < String::kMaxCodePoint) {
				6121	negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6122	zone);
				6123	}
				6124	}
				6125
				6126
				6127	// -------------------------------------------------------------------
				6128	// Splay tree
				6129
				6130
				6131	OutSet* OutSet::Extend(unsigned value, Zone* zone) {
				6132	if (Get(value))
				6133	return this;
				6134	if (successors(zone) != NULL) {
				6135	for (int i = 0; i < successors(zone)->length(); i++) {
				6136	OutSet* successor = successors(zone)->at(i);
				6137	if (successor->Get(value))
				6138	return successor;
				6139	}
				6140	} else {
				6141	successors_ = new(zone) ZoneList<OutSet*>(2, zone);
				6142	}
				6143	OutSet* result = new(zone) OutSet(first_, remaining_);
				6144	result->Set(value, zone);
				6145	successors(zone)->Add(result, zone);
				6146	return result;
				6147	}
				6148
				6149
				6150	void OutSet::Set(unsigned value, Zone *zone) {
				6151	if (value < kFirstLimit) {
				6152	first_ \|= (1 << value);
				6153	} else {
				6154	if (remaining_ == NULL)
				6155	remaining_ = new(zone) ZoneList<unsigned>(1, zone);
				6156	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				6157	remaining_->Add(value, zone);
				6158	}
				6159	}
				6160
				6161
				6162	bool OutSet::Get(unsigned value) const {
				6163	if (value < kFirstLimit) {
				6164	return (first_ & (1 << value)) != 0;
				6165	} else if (remaining_ == NULL) {
				6166	return false;
				6167	} else {
				6168	return remaining_->Contains(value);
				6169	}
				6170	}
				6171
				6172
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6173	const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6174
				6175
				6176	void DispatchTable::AddRange(CharacterRange full_range, int value,
				6177	Zone* zone) {
				6178	CharacterRange current = full_range;
				6179	if (tree()->is_empty()) {
				6180	// If this is the first range we just insert into the table.
				6181	ZoneSplayTree<Config>::Locator loc;
				6182	bool inserted = tree()->Insert(current.from(), &loc);
				6183	DCHECK(inserted);
				6184	USE(inserted);
				6185	loc.set_value(Entry(current.from(), current.to(),
				6186	empty()->Extend(value, zone)));
				6187	return;
				6188	}
				6189	// First see if there is a range to the left of this one that
				6190	// overlaps.
				6191	ZoneSplayTree<Config>::Locator loc;
				6192	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				6193	Entry* entry = &loc.value();
				6194	// If we've found a range that overlaps with this one, and it
				6195	// starts strictly to the left of this one, we have to fix it
				6196	// because the following code only handles ranges that start on
				6197	// or after the start point of the range we're adding.
				6198	if (entry->from() < current.from() && entry->to() >= current.from()) {
				6199	// Snap the overlapping range in half around the start point of
				6200	// the range we're adding.
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6201	CharacterRange left =
				6202	CharacterRange::Range(entry->from(), current.from() - 1);
				6203	CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6204	// The left part of the overlapping range doesn't overlap.
				6205	// Truncate the whole entry to be just the left part.
				6206	entry->set_to(left.to());
				6207	// The right part is the one that overlaps. We add this part
				6208	// to the map and let the next step deal with merging it with
				6209	// the range we're adding.
				6210	ZoneSplayTree<Config>::Locator loc;
				6211	bool inserted = tree()->Insert(right.from(), &loc);
				6212	DCHECK(inserted);
				6213	USE(inserted);
				6214	loc.set_value(Entry(right.from(),
				6215	right.to(),
				6216	entry->out_set()));
				6217	}
				6218	}
				6219	while (current.is_valid()) {
				6220	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				6221	(loc.value().from() <= current.to()) &&
				6222	(loc.value().to() >= current.from())) {
				6223	Entry* entry = &loc.value();
				6224	// We have overlap. If there is space between the start point of
				6225	// the range we're adding and where the overlapping range starts
				6226	// then we have to add a range covering just that space.
				6227	if (current.from() < entry->from()) {
				6228	ZoneSplayTree<Config>::Locator ins;
				6229	bool inserted = tree()->Insert(current.from(), &ins);
				6230	DCHECK(inserted);
				6231	USE(inserted);
				6232	ins.set_value(Entry(current.from(),
				6233	entry->from() - 1,
				6234	empty()->Extend(value, zone)));
				6235	current.set_from(entry->from());
				6236	}
				6237	DCHECK_EQ(current.from(), entry->from());
				6238	// If the overlapping range extends beyond the one we want to add
				6239	// we have to snap the right part off and add it separately.
				6240	if (entry->to() > current.to()) {
				6241	ZoneSplayTree<Config>::Locator ins;
				6242	bool inserted = tree()->Insert(current.to() + 1, &ins);
				6243	DCHECK(inserted);
				6244	USE(inserted);
				6245	ins.set_value(Entry(current.to() + 1,
				6246	entry->to(),
				6247	entry->out_set()));
				6248	entry->set_to(current.to());
				6249	}
				6250	DCHECK(entry->to() <= current.to());
				6251	// The overlapping range is now completely contained by the range
				6252	// we're adding so we can just update it and move the start point
				6253	// of the range we're adding just past it.
				6254	entry->AddValue(value, zone);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6255	DCHECK(entry->to() + 1 > current.from());
				6256	current.set_from(entry->to() + 1);
				6257	} else {
				6258	// There is no overlap so we can just add the range
				6259	ZoneSplayTree<Config>::Locator ins;
				6260	bool inserted = tree()->Insert(current.from(), &ins);
				6261	DCHECK(inserted);
				6262	USE(inserted);
				6263	ins.set_value(Entry(current.from(),
				6264	current.to(),
				6265	empty()->Extend(value, zone)));
				6266	break;
				6267	}
				6268	}
				6269	}
				6270
				6271
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6272	OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6273	ZoneSplayTree<Config>::Locator loc;
				6274	if (!tree()->FindGreatestLessThan(value, &loc))
				6275	return empty();
				6276	Entry* entry = &loc.value();
				6277	if (value <= entry->to())
				6278	return entry->out_set();
				6279	else
				6280	return empty();
				6281	}
				6282
				6283
				6284	// -------------------------------------------------------------------
				6285	// Analysis
				6286
				6287
				6288	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				6289	StackLimitCheck check(isolate());
				6290	if (check.HasOverflowed()) {
				6291	fail("Stack overflow");
				6292	return;
				6293	}
				6294	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				6295	return;
				6296	that->info()->being_analyzed = true;
				6297	that->Accept(this);
				6298	that->info()->being_analyzed = false;
				6299	that->info()->been_analyzed = true;
				6300	}
				6301
				6302
				6303	void Analysis::VisitEnd(EndNode* that) {
				6304	// nothing to do
				6305	}
				6306
				6307
				6308	void TextNode::CalculateOffsets() {
				6309	int element_count = elements()->length();
				6310	// Set up the offsets of the elements relative to the start. This is a fixed
				6311	// quantity since a TextNode can only contain fixed-width things.
				6312	int cp_offset = 0;
				6313	for (int i = 0; i < element_count; i++) {
				6314	TextElement& elm = elements()->at(i);
				6315	elm.set_cp_offset(cp_offset);
				6316	cp_offset += elm.length();
				6317	}
				6318	}
				6319
				6320
				6321	void Analysis::VisitText(TextNode* that) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6322	if (ignore_case()) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6323	that->MakeCaseIndependent(isolate(), is_one_byte_);
				6324	}
				6325	EnsureAnalyzed(that->on_success());
				6326	if (!has_failed()) {
				6327	that->CalculateOffsets();
				6328	}
				6329	}
				6330
				6331
				6332	void Analysis::VisitAction(ActionNode* that) {
				6333	RegExpNode* target = that->on_success();
				6334	EnsureAnalyzed(target);
				6335	if (!has_failed()) {
				6336	// If the next node is interested in what it follows then this node
				6337	// has to be interested too so it can pass the information on.
				6338	that->info()->AddFromFollowing(target->info());
				6339	}
				6340	}
				6341
				6342
				6343	void Analysis::VisitChoice(ChoiceNode* that) {
				6344	NodeInfo* info = that->info();
				6345	for (int i = 0; i < that->alternatives()->length(); i++) {
				6346	RegExpNode* node = that->alternatives()->at(i).node();
				6347	EnsureAnalyzed(node);
				6348	if (has_failed()) return;
				6349	// Anything the following nodes need to know has to be known by
				6350	// this node also, so it can pass it on.
				6351	info->AddFromFollowing(node->info());
				6352	}
				6353	}
				6354
				6355
				6356	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				6357	NodeInfo* info = that->info();
				6358	for (int i = 0; i < that->alternatives()->length(); i++) {
				6359	RegExpNode* node = that->alternatives()->at(i).node();
				6360	if (node != that->loop_node()) {
				6361	EnsureAnalyzed(node);
				6362	if (has_failed()) return;
				6363	info->AddFromFollowing(node->info());
				6364	}
				6365	}
				6366	// Check the loop last since it may need the value of this node
				6367	// to get a correct result.
				6368	EnsureAnalyzed(that->loop_node());
				6369	if (!has_failed()) {
				6370	info->AddFromFollowing(that->loop_node()->info());
				6371	}
				6372	}
				6373
				6374
				6375	void Analysis::VisitBackReference(BackReferenceNode* that) {
				6376	EnsureAnalyzed(that->on_success());
				6377	}
				6378
				6379
				6380	void Analysis::VisitAssertion(AssertionNode* that) {
				6381	EnsureAnalyzed(that->on_success());
				6382	}
				6383
				6384
				6385	void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6386	BoyerMooreLookahead* bm,
				6387	bool not_at_start) {
				6388	// Working out the set of characters that a backreference can match is too
				6389	// hard, so we just say that any character can match.
				6390	bm->SetRest(offset);
				6391	SaveBMInfo(bm, not_at_start, offset);
				6392	}
				6393
				6394
				6395	STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
				6396	RegExpMacroAssembler::kTableSize);
				6397
				6398
				6399	void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6400	BoyerMooreLookahead* bm, bool not_at_start) {
				6401	ZoneList<GuardedAlternative>* alts = alternatives();
				6402	budget = (budget - 1) / alts->length();
				6403	for (int i = 0; i < alts->length(); i++) {
				6404	GuardedAlternative& alt = alts->at(i);
				6405	if (alt.guards() != NULL && alt.guards()->length() != 0) {
				6406	bm->SetRest(offset); // Give up trying to fill in info.
				6407	SaveBMInfo(bm, not_at_start, offset);
				6408	return;
				6409	}
				6410	alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
				6411	}
				6412	SaveBMInfo(bm, not_at_start, offset);
				6413	}
				6414
				6415
				6416	void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
				6417	BoyerMooreLookahead* bm, bool not_at_start) {
				6418	if (initial_offset >= bm->length()) return;
				6419	int offset = initial_offset;
				6420	int max_char = bm->max_char();
				6421	for (int i = 0; i < elements()->length(); i++) {
				6422	if (offset >= bm->length()) {
				6423	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6424	return;
				6425	}
				6426	TextElement text = elements()->at(i);
				6427	if (text.text_type() == TextElement::ATOM) {
				6428	RegExpAtom* atom = text.atom();
				6429	for (int j = 0; j < atom->length(); j++, offset++) {
				6430	if (offset >= bm->length()) {
				6431	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6432	return;
				6433	}
				6434	uc16 character = atom->data()[j];
				6435	if (bm->compiler()->ignore_case()) {
				6436	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				6437	int length = GetCaseIndependentLetters(
				6438	isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
				6439	chars);
				6440	for (int j = 0; j < length; j++) {
				6441	bm->Set(offset, chars[j]);
				6442	}
				6443	} else {
				6444	if (character <= max_char) bm->Set(offset, character);
				6445	}
				6446	}
				6447	} else {
				6448	DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
				6449	RegExpCharacterClass* char_class = text.char_class();
				6450	ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
				6451	if (char_class->is_negated()) {
				6452	bm->SetAll(offset);
				6453	} else {
				6454	for (int k = 0; k < ranges->length(); k++) {
				6455	CharacterRange& range = ranges->at(k);
				6456	if (range.from() > max_char) continue;
				6457	int to = Min(max_char, static_cast<int>(range.to()));
				6458	bm->SetInterval(offset, Interval(range.from(), to));
				6459	}
				6460	}
				6461	offset++;
				6462	}
				6463	}
				6464	if (offset >= bm->length()) {
				6465	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6466	return;
				6467	}
				6468	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
				6469	true); // Not at start after a text node.
				6470	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6471	}
				6472
				6473
				6474	// -------------------------------------------------------------------
				6475	// Dispatch table construction
				6476
				6477
				6478	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				6479	AddRange(CharacterRange::Everything());
				6480	}
				6481
				6482
				6483	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				6484	node->set_being_calculated(true);
				6485	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				6486	for (int i = 0; i < alternatives->length(); i++) {
				6487	set_choice_index(i);
				6488	alternatives->at(i).node()->Accept(this);
				6489	}
				6490	node->set_being_calculated(false);
				6491	}
				6492
				6493
				6494	class AddDispatchRange {
				6495	public:
				6496	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				6497	: constructor_(constructor) { }
				6498	void Call(uc32 from, DispatchTable::Entry entry);
				6499	private:
				6500	DispatchTableConstructor* constructor_;
				6501	};
				6502
				6503
				6504	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6505	constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6506	}
				6507
				6508
				6509	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				6510	if (node->being_calculated())
				6511	return;
				6512	DispatchTable* table = node->GetTable(ignore_case_);
				6513	AddDispatchRange adder(this);
				6514	table->ForEach(&adder);
				6515	}
				6516
				6517
				6518	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				6519	// TODO(160): Find the node that we refer back to and propagate its start
				6520	// set back to here. For now we just accept anything.
				6521	AddRange(CharacterRange::Everything());
				6522	}
				6523
				6524
				6525	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				6526	RegExpNode* target = that->on_success();
				6527	target->Accept(this);
				6528	}
				6529
				6530
				6531	static int CompareRangeByFrom(const CharacterRange* a,
				6532	const CharacterRange* b) {
				6533	return Compare<uc16>(a->from(), b->from());
				6534	}
				6535
				6536
				6537	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				6538	ranges->Sort(CompareRangeByFrom);
				6539	uc16 last = 0;
				6540	for (int i = 0; i < ranges->length(); i++) {
				6541	CharacterRange range = ranges->at(i);
				6542	if (last < range.from())
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6543	AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6544	if (range.to() >= last) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6545	if (range.to() == String::kMaxCodePoint) {
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6546	return;
				6547	} else {
				6548	last = range.to() + 1;
				6549	}
				6550	}
				6551	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6552	AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6553	}
				6554
				6555
				6556	void DispatchTableConstructor::VisitText(TextNode* that) {
				6557	TextElement elm = that->elements()->at(0);
				6558	switch (elm.text_type()) {
				6559	case TextElement::ATOM: {
				6560	uc16 c = elm.atom()->data()[0];
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6561	AddRange(CharacterRange::Range(c, c));
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6562	break;
				6563	}
				6564	case TextElement::CHAR_CLASS: {
				6565	RegExpCharacterClass* tree = elm.char_class();
				6566	ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
				6567	if (tree->is_negated()) {
				6568	AddInverse(ranges);
				6569	} else {
				6570	for (int i = 0; i < ranges->length(); i++)
				6571	AddRange(ranges->at(i));
				6572	}
				6573	break;
				6574	}
				6575	default: {
				6576	UNIMPLEMENTED();
				6577	}
				6578	}
				6579	}
				6580
				6581
				6582	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				6583	RegExpNode* target = that->on_success();
				6584	target->Accept(this);
				6585	}
				6586
				6587
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6588	RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
				6589	RegExpNode* on_success) {
				6590	// If the regexp matching starts within a surrogate pair, step back
				6591	// to the lead surrogate and start matching from there.
				6592	DCHECK(!compiler->read_backward());
				6593	Zone* zone = compiler->zone();
				6594	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				6595	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				6596	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				6597	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				6598
				6599	ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
				6600
				6601	int stack_register = compiler->UnicodeLookaroundStackRegister();
				6602	int position_register = compiler->UnicodeLookaroundPositionRegister();
				6603	RegExpNode* step_back = TextNode::CreateForCharacterRanges(
				6604	zone, lead_surrogates, true, on_success);
				6605	RegExpLookaround::Builder builder(true, step_back, stack_register,
				6606	position_register);
				6607	RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
				6608	zone, trail_surrogates, false, builder.on_match_success());
				6609
				6610	optional_step_back->AddAlternative(
				6611	GuardedAlternative(builder.ForMatch(match_trail)));
				6612	optional_step_back->AddAlternative(GuardedAlternative(on_success));
				6613
				6614	return optional_step_back;
				6615	}
				6616
				6617
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6618	RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6619	Isolate* isolate, Zone* zone, RegExpCompileData* data,
				6620	JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6621	Handle<String> sample_subject, bool is_one_byte) {
				6622	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				6623	return IrregexpRegExpTooBig(isolate);
				6624	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6625	bool ignore_case = flags & JSRegExp::kIgnoreCase;
				6626	bool is_sticky = flags & JSRegExp::kSticky;
				6627	bool is_global = flags & JSRegExp::kGlobal;
				6628	bool is_unicode = flags & JSRegExp::kUnicode;
				6629	RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6630	is_one_byte);
				6631
				6632	if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
				6633
				6634	// Sample some characters from the middle of the string.
				6635	static const int kSampleSize = 128;
				6636
				6637	sample_subject = String::Flatten(sample_subject);
				6638	int chars_sampled = 0;
				6639	int half_way = (sample_subject->length() - kSampleSize) / 2;
				6640	for (int i = Max(0, half_way);
				6641	i < sample_subject->length() && chars_sampled < kSampleSize;
				6642	i++, chars_sampled++) {
				6643	compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
				6644	}
				6645
				6646	// Wrap the body of the regexp in capture #0.
				6647	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				6648	0,
				6649	&compiler,
				6650	compiler.accept());
				6651	RegExpNode* node = captured_body;
				6652	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				6653	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				6654	int max_length = data->tree->max_match();
				6655	if (!is_start_anchored && !is_sticky) {
				6656	// Add a .*? at the beginning, outside the body capture, unless
				6657	// this expression is anchored at the beginning or sticky.
				6658	RegExpNode* loop_node = RegExpQuantifier::ToNode(
				6659	0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
				6660	&compiler, captured_body, data->contains_anchor);
				6661
				6662	if (data->contains_anchor) {
				6663	// Unroll loop once, to take care of the case that might start
				6664	// at the start of input.
				6665	ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
				6666	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				6667	first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
				6668	new (zone) RegExpCharacterClass('*'), false, loop_node)));
				6669	node = first_step_node;
				6670	} else {
				6671	node = loop_node;
				6672	}
				6673	}
				6674	if (is_one_byte) {
				6675	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6676	// Do it again to propagate the new nodes to places where they were not
				6677	// put because they had not been calculated yet.
				6678	if (node != NULL) {
				6679	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6680	}
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6681	} else if (compiler.unicode() && (is_global \|\| is_sticky)) {
				6682	node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6683	}
				6684
				6685	if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
				6686	data->node = node;
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6687	Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6688	analysis.EnsureAnalyzed(node);
				6689	if (analysis.has_failed()) {
				6690	const char* error_message = analysis.error_message();
				6691	return CompilationResult(isolate, error_message);
				6692	}
				6693
				6694	// Create the correct assembler for the architecture.
				6695	#ifndef V8_INTERPRETED_REGEXP
				6696	// Native regexp implementation.
				6697
				6698	NativeRegExpMacroAssembler::Mode mode =
				6699	is_one_byte ? NativeRegExpMacroAssembler::LATIN1
				6700	: NativeRegExpMacroAssembler::UC16;
				6701
				6702	#if V8_TARGET_ARCH_IA32
				6703	RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
				6704	(data->capture_count + 1) * 2);
				6705	#elif V8_TARGET_ARCH_X64
				6706	RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
				6707	(data->capture_count + 1) * 2);
				6708	#elif V8_TARGET_ARCH_ARM
				6709	RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
				6710	(data->capture_count + 1) * 2);
				6711	#elif V8_TARGET_ARCH_ARM64
				6712	RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
				6713	(data->capture_count + 1) * 2);
Ben Murdoch	3b9bc31	2016-06-02 14:46:10 +0100	[diff] [blame]	6714	#elif V8_TARGET_ARCH_S390
				6715	RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
				6716	(data->capture_count + 1) * 2);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6717	#elif V8_TARGET_ARCH_PPC
				6718	RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
				6719	(data->capture_count + 1) * 2);
				6720	#elif V8_TARGET_ARCH_MIPS
				6721	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6722	(data->capture_count + 1) * 2);
				6723	#elif V8_TARGET_ARCH_MIPS64
				6724	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6725	(data->capture_count + 1) * 2);
				6726	#elif V8_TARGET_ARCH_X87
				6727	RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
				6728	(data->capture_count + 1) * 2);
				6729	#else
				6730	#error "Unsupported architecture"
				6731	#endif
				6732
				6733	#else // V8_INTERPRETED_REGEXP
				6734	// Interpreted regexp implementation.
				6735	EmbeddedVector<byte, 1024> codes;
				6736	RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
				6737	#endif // V8_INTERPRETED_REGEXP
				6738
				6739	macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
				6740
				6741	// Inserted here, instead of in Assembler, because it depends on information
				6742	// in the AST that isn't replicated in the Node structure.
				6743	static const int kMaxBacksearchLimit = 1024;
				6744	if (is_end_anchored &&
				6745	!is_start_anchored &&
				6746	max_length < kMaxBacksearchLimit) {
				6747	macro_assembler.SetCurrentPositionFromEnd(max_length);
				6748	}
				6749
				6750	if (is_global) {
Ben Murdoch	109988c	2016-05-18 11:27:45 +0100	[diff] [blame]	6751	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
				6752	if (data->tree->min_match() > 0) {
				6753	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
				6754	} else if (is_unicode) {
				6755	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
				6756	}
				6757	macro_assembler.set_global_mode(mode);
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6758	}
				6759
				6760	return compiler.Assemble(&macro_assembler,
				6761	node,
				6762	data->capture_count,
				6763	pattern);
				6764	}
				6765
				6766
				6767	bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
				6768	Heap* heap = pattern->GetHeap();
				6769	bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
				6770	if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
Ben Murdoch	bcf72ee	2016-08-08 18:44:38 +0100	[diff] [blame]	6771	heap->memory_allocator()->SizeExecutable() >
Ben Murdoch	014dc51	2016-03-22 12:00:34 +0000	[diff] [blame]	6772	RegExpImpl::kRegExpExecutableMemoryLimit) {
				6773	too_much = true;
				6774	}
				6775	return too_much;
				6776	}
				6777
				6778
				6779	Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
				6780	Object* key_pattern,
				6781	FixedArray** last_match_cache,
				6782	ResultsCacheType type) {
				6783	FixedArray* cache;
				6784	if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
				6785	if (type == STRING_SPLIT_SUBSTRINGS) {
				6786	DCHECK(key_pattern->IsString());
				6787	if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
				6788	cache = heap->string_split_cache();
				6789	} else {
				6790	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6791	DCHECK(key_pattern->IsFixedArray());
				6792	cache = heap->regexp_multiple_cache();
				6793	}
				6794
				6795	uint32_t hash = key_string->Hash();
				6796	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6797	~(kArrayEntriesPerCacheEntry - 1));
				6798	if (cache->get(index + kStringOffset) != key_string \|\|
				6799	cache->get(index + kPatternOffset) != key_pattern) {
				6800	index =
				6801	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6802	if (cache->get(index + kStringOffset) != key_string \|\|
				6803	cache->get(index + kPatternOffset) != key_pattern) {
				6804	return Smi::FromInt(0);
				6805	}
				6806	}
				6807
				6808	*last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
				6809	return cache->get(index + kArrayOffset);
				6810	}
				6811
				6812
				6813	void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
				6814	Handle<Object> key_pattern,
				6815	Handle<FixedArray> value_array,
				6816	Handle<FixedArray> last_match_cache,
				6817	ResultsCacheType type) {
				6818	Factory* factory = isolate->factory();
				6819	Handle<FixedArray> cache;
				6820	if (!key_string->IsInternalizedString()) return;
				6821	if (type == STRING_SPLIT_SUBSTRINGS) {
				6822	DCHECK(key_pattern->IsString());
				6823	if (!key_pattern->IsInternalizedString()) return;
				6824	cache = factory->string_split_cache();
				6825	} else {
				6826	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6827	DCHECK(key_pattern->IsFixedArray());
				6828	cache = factory->regexp_multiple_cache();
				6829	}
				6830
				6831	uint32_t hash = key_string->Hash();
				6832	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6833	~(kArrayEntriesPerCacheEntry - 1));
				6834	if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
				6835	cache->set(index + kStringOffset, *key_string);
				6836	cache->set(index + kPatternOffset, *key_pattern);
				6837	cache->set(index + kArrayOffset, *value_array);
				6838	cache->set(index + kLastMatchOffset, *last_match_cache);
				6839	} else {
				6840	uint32_t index2 =
				6841	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6842	if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
				6843	cache->set(index2 + kStringOffset, *key_string);
				6844	cache->set(index2 + kPatternOffset, *key_pattern);
				6845	cache->set(index2 + kArrayOffset, *value_array);
				6846	cache->set(index2 + kLastMatchOffset, *last_match_cache);
				6847	} else {
				6848	cache->set(index2 + kStringOffset, Smi::FromInt(0));
				6849	cache->set(index2 + kPatternOffset, Smi::FromInt(0));
				6850	cache->set(index2 + kArrayOffset, Smi::FromInt(0));
				6851	cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
				6852	cache->set(index + kStringOffset, *key_string);
				6853	cache->set(index + kPatternOffset, *key_pattern);
				6854	cache->set(index + kArrayOffset, *value_array);
				6855	cache->set(index + kLastMatchOffset, *last_match_cache);
				6856	}
				6857	}
				6858	// If the array is a reasonably short list of substrings, convert it into a
				6859	// list of internalized strings.
				6860	if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
				6861	for (int i = 0; i < value_array->length(); i++) {
				6862	Handle<String> str(String::cast(value_array->get(i)), isolate);
				6863	Handle<String> internalized_str = factory->InternalizeString(str);
				6864	value_array->set(i, *internalized_str);
				6865	}
				6866	}
				6867	// Convert backing store to a copy-on-write array.
				6868	value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
				6869	}
				6870
				6871
				6872	void RegExpResultsCache::Clear(FixedArray* cache) {
				6873	for (int i = 0; i < kRegExpResultsCacheSize; i++) {
				6874	cache->set(i, Smi::FromInt(0));
				6875	}
				6876	}
				6877
				6878	} // namespace internal
				6879	} // namespace v8