Blame - src/regexp/jsregexp.cc - fp2-dev/platform/external/v8

blob: 80f48ca1a9fd34318de2c638889b58fed4f3dc7d [file] [log] [blame]

Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2012 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/jsregexp.h"
				6
				7	#include "src/ast/ast.h"
				8	#include "src/base/platform/platform.h"
				9	#include "src/compilation-cache.h"
				10	#include "src/compiler.h"
				11	#include "src/execution.h"
				12	#include "src/factory.h"
				13	#include "src/isolate-inl.h"
				14	#include "src/messages.h"
				15	#include "src/ostreams.h"
				16	#include "src/regexp/interpreter-irregexp.h"
				17	#include "src/regexp/jsregexp-inl.h"
				18	#include "src/regexp/regexp-macro-assembler.h"
				19	#include "src/regexp/regexp-macro-assembler-irregexp.h"
				20	#include "src/regexp/regexp-macro-assembler-tracer.h"
				21	#include "src/regexp/regexp-parser.h"
				22	#include "src/regexp/regexp-stack.h"
				23	#include "src/runtime/runtime.h"
				24	#include "src/splay-tree-inl.h"
				25	#include "src/string-search.h"
				26	#include "src/unicode-decoder.h"
				27
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	28	#ifdef V8_I18N_SUPPORT
				29	#include "unicode/uset.h"
				30	#include "unicode/utypes.h"
				31	#endif // V8_I18N_SUPPORT
				32
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	33	#ifndef V8_INTERPRETED_REGEXP
				34	#if V8_TARGET_ARCH_IA32
				35	#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
				36	#elif V8_TARGET_ARCH_X64
				37	#include "src/regexp/x64/regexp-macro-assembler-x64.h"
				38	#elif V8_TARGET_ARCH_ARM64
				39	#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
				40	#elif V8_TARGET_ARCH_ARM
				41	#include "src/regexp/arm/regexp-macro-assembler-arm.h"
				42	#elif V8_TARGET_ARCH_PPC
				43	#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
				44	#elif V8_TARGET_ARCH_MIPS
				45	#include "src/regexp/mips/regexp-macro-assembler-mips.h"
				46	#elif V8_TARGET_ARCH_MIPS64
				47	#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
				48	#elif V8_TARGET_ARCH_X87
				49	#include "src/regexp/x87/regexp-macro-assembler-x87.h"
				50	#else
				51	#error Unsupported target architecture.
				52	#endif
				53	#endif
				54
				55
				56	namespace v8 {
				57	namespace internal {
				58
				59	MUST_USE_RESULT
				60	static inline MaybeHandle<Object> ThrowRegExpException(
				61	Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
				62	Isolate* isolate = re->GetIsolate();
				63	THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
				64	pattern, error_text),
				65	Object);
				66	}
				67
				68
				69	inline void ThrowRegExpException(Handle<JSRegExp> re,
				70	Handle<String> error_text) {
				71	USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
				72	}
				73
				74
				75	ContainedInLattice AddRange(ContainedInLattice containment,
				76	const int* ranges,
				77	int ranges_length,
				78	Interval new_range) {
				79	DCHECK((ranges_length & 1) == 1);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	80	DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	81	if (containment == kLatticeUnknown) return containment;
				82	bool inside = false;
				83	int last = 0;
				84	for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
				85	// Consider the range from last to ranges[i].
				86	// We haven't got to the new range yet.
				87	if (ranges[i] <= new_range.from()) continue;
				88	// New range is wholly inside last-ranges[i]. Note that new_range.to() is
				89	// inclusive, but the values in ranges are not.
				90	if (last <= new_range.from() && new_range.to() < ranges[i]) {
				91	return Combine(containment, inside ? kLatticeIn : kLatticeOut);
				92	}
				93	return kLatticeUnknown;
				94	}
				95	return containment;
				96	}
				97
				98
				99	// More makes code generation slower, less makes V8 benchmark score lower.
				100	const int kMaxLookaheadForBoyerMoore = 8;
				101	// In a 3-character pattern you can maximally step forwards 3 characters
				102	// at a time, which is not always enough to pay for the extra logic.
				103	const int kPatternTooShortForBoyerMoore = 2;
				104
				105
				106	// Identifies the sort of regexps where the regexp engine is faster
				107	// than the code used for atom matches.
				108	static bool HasFewDifferentCharacters(Handle<String> pattern) {
				109	int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
				110	if (length <= kPatternTooShortForBoyerMoore) return false;
				111	const int kMod = 128;
				112	bool character_found[kMod];
				113	int different = 0;
				114	memset(&character_found[0], 0, sizeof(character_found));
				115	for (int i = 0; i < length; i++) {
				116	int ch = (pattern->Get(i) & (kMod - 1));
				117	if (!character_found[ch]) {
				118	character_found[ch] = true;
				119	different++;
				120	// We declare a regexp low-alphabet if it has at least 3 times as many
				121	// characters as it has different characters.
				122	if (different * 3 > length) return false;
				123	}
				124	}
				125	return true;
				126	}
				127
				128
				129	// Generic RegExp methods. Dispatches to implementation specific methods.
				130
				131
				132	MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				133	Handle<String> pattern,
				134	JSRegExp::Flags flags) {
				135	Isolate* isolate = re->GetIsolate();
				136	Zone zone;
				137	CompilationCache* compilation_cache = isolate->compilation_cache();
				138	MaybeHandle<FixedArray> maybe_cached =
				139	compilation_cache->LookupRegExp(pattern, flags);
				140	Handle<FixedArray> cached;
				141	bool in_cache = maybe_cached.ToHandle(&cached);
				142	LOG(isolate, RegExpCompileEvent(re, in_cache));
				143
				144	Handle<Object> result;
				145	if (in_cache) {
				146	re->set_data(*cached);
				147	return re;
				148	}
				149	pattern = String::Flatten(pattern);
				150	PostponeInterruptsScope postpone(isolate);
				151	RegExpCompileData parse_result;
				152	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	153	if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
				154	&parse_result)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	155	// Throw an exception if we fail to parse the pattern.
				156	return ThrowRegExpException(re, pattern, parse_result.error);
				157	}
				158
				159	bool has_been_compiled = false;
				160
				161	if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
				162	!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
				163	// Parse-tree is a single atom that is equal to the pattern.
				164	AtomCompile(re, pattern, flags, pattern);
				165	has_been_compiled = true;
				166	} else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
				167	!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
				168	RegExpAtom* atom = parse_result.tree->AsAtom();
				169	Vector<const uc16> atom_pattern = atom->data();
				170	Handle<String> atom_string;
				171	ASSIGN_RETURN_ON_EXCEPTION(
				172	isolate, atom_string,
				173	isolate->factory()->NewStringFromTwoByte(atom_pattern),
				174	Object);
				175	if (!HasFewDifferentCharacters(atom_string)) {
				176	AtomCompile(re, pattern, flags, atom_string);
				177	has_been_compiled = true;
				178	}
				179	}
				180	if (!has_been_compiled) {
				181	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
				182	}
				183	DCHECK(re->data()->IsFixedArray());
				184	// Compilation succeeded so the data is set on the regexp
				185	// and we can store it in the cache.
				186	Handle<FixedArray> data(FixedArray::cast(re->data()));
				187	compilation_cache->PutRegExp(pattern, flags, data);
				188
				189	return re;
				190	}
				191
				192
				193	MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				194	Handle<String> subject,
				195	int index,
				196	Handle<JSArray> last_match_info) {
				197	switch (regexp->TypeTag()) {
				198	case JSRegExp::ATOM:
				199	return AtomExec(regexp, subject, index, last_match_info);
				200	case JSRegExp::IRREGEXP: {
				201	return IrregexpExec(regexp, subject, index, last_match_info);
				202	}
				203	default:
				204	UNREACHABLE();
				205	return MaybeHandle<Object>();
				206	}
				207	}
				208
				209
				210	// RegExp Atom implementation: Simple string search using indexOf.
				211
				212
				213	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				214	Handle<String> pattern,
				215	JSRegExp::Flags flags,
				216	Handle<String> match_pattern) {
				217	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				218	JSRegExp::ATOM,
				219	pattern,
				220	flags,
				221	match_pattern);
				222	}
				223
				224
				225	static void SetAtomLastCapture(FixedArray* array,
				226	String* subject,
				227	int from,
				228	int to) {
				229	SealHandleScope shs(array->GetIsolate());
				230	RegExpImpl::SetLastCaptureCount(array, 2);
				231	RegExpImpl::SetLastSubject(array, subject);
				232	RegExpImpl::SetLastInput(array, subject);
				233	RegExpImpl::SetCapture(array, 0, from);
				234	RegExpImpl::SetCapture(array, 1, to);
				235	}
				236
				237
				238	int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
				239	Handle<String> subject,
				240	int index,
				241	int32_t* output,
				242	int output_size) {
				243	Isolate* isolate = regexp->GetIsolate();
				244
				245	DCHECK(0 <= index);
				246	DCHECK(index <= subject->length());
				247
				248	subject = String::Flatten(subject);
				249	DisallowHeapAllocation no_gc; // ensure vectors stay valid
				250
				251	String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
				252	int needle_len = needle->length();
				253	DCHECK(needle->IsFlat());
				254	DCHECK_LT(0, needle_len);
				255
				256	if (index + needle_len > subject->length()) {
				257	return RegExpImpl::RE_FAILURE;
				258	}
				259
				260	for (int i = 0; i < output_size; i += 2) {
				261	String::FlatContent needle_content = needle->GetFlatContent();
				262	String::FlatContent subject_content = subject->GetFlatContent();
				263	DCHECK(needle_content.IsFlat());
				264	DCHECK(subject_content.IsFlat());
				265	// dispatch on type of strings
				266	index =
				267	(needle_content.IsOneByte()
				268	? (subject_content.IsOneByte()
				269	? SearchString(isolate, subject_content.ToOneByteVector(),
				270	needle_content.ToOneByteVector(), index)
				271	: SearchString(isolate, subject_content.ToUC16Vector(),
				272	needle_content.ToOneByteVector(), index))
				273	: (subject_content.IsOneByte()
				274	? SearchString(isolate, subject_content.ToOneByteVector(),
				275	needle_content.ToUC16Vector(), index)
				276	: SearchString(isolate, subject_content.ToUC16Vector(),
				277	needle_content.ToUC16Vector(), index)));
				278	if (index == -1) {
				279	return i / 2; // Return number of matches.
				280	} else {
				281	output[i] = index;
				282	output[i+1] = index + needle_len;
				283	index += needle_len;
				284	}
				285	}
				286	return output_size / 2;
				287	}
				288
				289
				290	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				291	Handle<String> subject,
				292	int index,
				293	Handle<JSArray> last_match_info) {
				294	Isolate* isolate = re->GetIsolate();
				295
				296	static const int kNumRegisters = 2;
				297	STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
				298	int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
				299
				300	int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
				301
				302	if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
				303
				304	DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
				305	SealHandleScope shs(isolate);
				306	FixedArray* array = FixedArray::cast(last_match_info->elements());
				307	SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
				308	return last_match_info;
				309	}
				310
				311
				312	// Irregexp implementation.
				313
				314	// Ensures that the regexp object contains a compiled version of the
				315	// source for either one-byte or two-byte subject strings.
				316	// If the compiled version doesn't already exist, it is compiled
				317	// from the source pattern.
				318	// If compilation fails, an exception is thrown and this function
				319	// returns false.
				320	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
				321	Handle<String> sample_subject,
				322	bool is_one_byte) {
				323	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
				324	#ifdef V8_INTERPRETED_REGEXP
				325	if (compiled_code->IsByteArray()) return true;
				326	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				327	if (compiled_code->IsCode()) return true;
				328	#endif
				329	// We could potentially have marked this as flushable, but have kept
				330	// a saved version if we did not flush it yet.
				331	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				332	if (saved_code->IsCode()) {
				333	// Reinstate the code in the original place.
				334	re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
				335	DCHECK(compiled_code->IsSmi());
				336	return true;
				337	}
				338	return CompileIrregexp(re, sample_subject, is_one_byte);
				339	}
				340
				341
				342	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
				343	Handle<String> sample_subject,
				344	bool is_one_byte) {
				345	// Compile the RegExp.
				346	Isolate* isolate = re->GetIsolate();
				347	Zone zone;
				348	PostponeInterruptsScope postpone(isolate);
				349	// If we had a compilation error the last time this is saved at the
				350	// saved code index.
				351	Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
				352	// When arriving here entry can only be a smi, either representing an
				353	// uncompiled regexp, a previous compilation error, or code that has
				354	// been flushed.
				355	DCHECK(entry->IsSmi());
				356	int entry_value = Smi::cast(entry)->value();
				357	DCHECK(entry_value == JSRegExp::kUninitializedValue \|\|
				358	entry_value == JSRegExp::kCompilationErrorValue \|\|
				359	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				360
				361	if (entry_value == JSRegExp::kCompilationErrorValue) {
				362	// A previous compilation failed and threw an error which we store in
				363	// the saved code index (we store the error message, not the actual
				364	// error). Recreate the error object and throw it.
				365	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				366	DCHECK(error_string->IsString());
				367	Handle<String> error_message(String::cast(error_string));
				368	ThrowRegExpException(re, error_message);
				369	return false;
				370	}
				371
				372	JSRegExp::Flags flags = re->GetFlags();
				373
				374	Handle<String> pattern(re->Pattern());
				375	pattern = String::Flatten(pattern);
				376	RegExpCompileData compile_data;
				377	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	378	if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
				379	&compile_data)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	380	// Throw an exception if we fail to parse the pattern.
				381	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				382	USE(ThrowRegExpException(re, pattern, compile_data.error));
				383	return false;
				384	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	385	RegExpEngine::CompilationResult result =
				386	RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
				387	sample_subject, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	388	if (result.error_message != NULL) {
				389	// Unable to compile regexp.
				390	Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
				391	CStrVector(result.error_message)).ToHandleChecked();
				392	ThrowRegExpException(re, error_message);
				393	return false;
				394	}
				395
				396	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				397	data->set(JSRegExp::code_index(is_one_byte), result.code);
				398	int register_max = IrregexpMaxRegisterCount(*data);
				399	if (result.num_registers > register_max) {
				400	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				401	}
				402
				403	return true;
				404	}
				405
				406
				407	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				408	return Smi::cast(
				409	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				410	}
				411
				412
				413	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				414	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				415	}
				416
				417
				418	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				419	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				420	}
				421
				422
				423	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				424	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				425	}
				426
				427
				428	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
				429	return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
				430	}
				431
				432
				433	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
				434	return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
				435	}
				436
				437
				438	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				439	Handle<String> pattern,
				440	JSRegExp::Flags flags,
				441	int capture_count) {
				442	// Initialize compiled code entries to null.
				443	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				444	JSRegExp::IRREGEXP,
				445	pattern,
				446	flags,
				447	capture_count);
				448	}
				449
				450
				451	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				452	Handle<String> subject) {
				453	subject = String::Flatten(subject);
				454
				455	// Check representation of the underlying storage.
				456	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				457	if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
				458
				459	#ifdef V8_INTERPRETED_REGEXP
				460	// Byte-code regexp needs space allocated for all its registers.
				461	// The result captures are copied to the start of the registers array
				462	// if the match succeeds. This way those registers are not clobbered
				463	// when we set the last match info from last successful match.
				464	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
				465	(IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				466	#else // V8_INTERPRETED_REGEXP
				467	// Native regexp only needs room to output captures. Registers are handled
				468	// internally.
				469	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				470	#endif // V8_INTERPRETED_REGEXP
				471	}
				472
				473
				474	int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
				475	Handle<String> subject,
				476	int index,
				477	int32_t* output,
				478	int output_size) {
				479	Isolate* isolate = regexp->GetIsolate();
				480
				481	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
				482
				483	DCHECK(index >= 0);
				484	DCHECK(index <= subject->length());
				485	DCHECK(subject->IsFlat());
				486
				487	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				488
				489	#ifndef V8_INTERPRETED_REGEXP
				490	DCHECK(output_size >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
				491	do {
				492	EnsureCompiledIrregexp(regexp, subject, is_one_byte);
				493	Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
				494	// The stack is used to allocate registers for the compiled regexp code.
				495	// This means that in case of failure, the output registers array is left
				496	// untouched and contains the capture results from the previous successful
				497	// match. We can use that to set the last match info lazily.
				498	NativeRegExpMacroAssembler::Result res =
				499	NativeRegExpMacroAssembler::Match(code,
				500	subject,
				501	output,
				502	output_size,
				503	index,
				504	isolate);
				505	if (res != NativeRegExpMacroAssembler::RETRY) {
				506	DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
				507	isolate->has_pending_exception());
				508	STATIC_ASSERT(
				509	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				510	STATIC_ASSERT(
				511	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				512	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				513	== RE_EXCEPTION);
				514	return static_cast<IrregexpResult>(res);
				515	}
				516	// If result is RETRY, the string has changed representation, and we
				517	// must restart from scratch.
				518	// In this case, it means we must make sure we are prepared to handle
				519	// the, potentially, different subject (the string can switch between
				520	// being internal and external, and even between being Latin1 and UC16,
				521	// but the characters are always the same).
				522	IrregexpPrepare(regexp, subject);
				523	is_one_byte = subject->IsOneByteRepresentationUnderneath();
				524	} while (true);
				525	UNREACHABLE();
				526	return RE_EXCEPTION;
				527	#else // V8_INTERPRETED_REGEXP
				528
				529	DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
				530	// We must have done EnsureCompiledIrregexp, so we can get the number of
				531	// registers.
				532	int number_of_capture_registers =
				533	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				534	int32_t* raw_output = &output[number_of_capture_registers];
				535	// We do not touch the actual capture result registers until we know there
				536	// has been a match so that we can use those capture results to set the
				537	// last match info.
				538	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				539	raw_output[i] = -1;
				540	}
				541	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
				542	isolate);
				543
				544	IrregexpResult result = IrregexpInterpreter::Match(isolate,
				545	byte_codes,
				546	subject,
				547	raw_output,
				548	index);
				549	if (result == RE_SUCCESS) {
				550	// Copy capture results to the start of the registers array.
				551	MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
				552	}
				553	if (result == RE_EXCEPTION) {
				554	DCHECK(!isolate->has_pending_exception());
				555	isolate->StackOverflow();
				556	}
				557	return result;
				558	#endif // V8_INTERPRETED_REGEXP
				559	}
				560
				561
				562	MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
				563	Handle<String> subject,
				564	int previous_index,
				565	Handle<JSArray> last_match_info) {
				566	Isolate* isolate = regexp->GetIsolate();
				567	DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
				568
				569	// Prepare space for the return values.
				570	#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
				571	if (FLAG_trace_regexp_bytecodes) {
				572	String* pattern = regexp->Pattern();
				573	PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
				574	PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
				575	}
				576	#endif
				577	int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
				578	if (required_registers < 0) {
				579	// Compiling failed with an exception.
				580	DCHECK(isolate->has_pending_exception());
				581	return MaybeHandle<Object>();
				582	}
				583
				584	int32_t* output_registers = NULL;
				585	if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				586	output_registers = NewArray<int32_t>(required_registers);
				587	}
				588	base::SmartArrayPointer<int32_t> auto_release(output_registers);
				589	if (output_registers == NULL) {
				590	output_registers = isolate->jsregexp_static_offsets_vector();
				591	}
				592
				593	int res = RegExpImpl::IrregexpExecRaw(
				594	regexp, subject, previous_index, output_registers, required_registers);
				595	if (res == RE_SUCCESS) {
				596	int capture_count =
				597	IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
				598	return SetLastMatchInfo(
				599	last_match_info, subject, capture_count, output_registers);
				600	}
				601	if (res == RE_EXCEPTION) {
				602	DCHECK(isolate->has_pending_exception());
				603	return MaybeHandle<Object>();
				604	}
				605	DCHECK(res == RE_FAILURE);
				606	return isolate->factory()->null_value();
				607	}
				608
				609
				610	static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
				611	if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
				612	JSArray::SetLength(array, minimum_size);
				613	}
				614	}
				615
				616
				617	Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
				618	Handle<String> subject,
				619	int capture_count,
				620	int32_t* match) {
				621	DCHECK(last_match_info->HasFastObjectElements());
				622	int capture_register_count = (capture_count + 1) * 2;
				623	EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
				624	DisallowHeapAllocation no_allocation;
				625	FixedArray* array = FixedArray::cast(last_match_info->elements());
				626	if (match != NULL) {
				627	for (int i = 0; i < capture_register_count; i += 2) {
				628	SetCapture(array, i, match[i]);
				629	SetCapture(array, i + 1, match[i + 1]);
				630	}
				631	}
				632	SetLastCaptureCount(array, capture_register_count);
				633	SetLastSubject(array, *subject);
				634	SetLastInput(array, *subject);
				635	return last_match_info;
				636	}
				637
				638
				639	RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
				640	Handle<String> subject,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	641	Isolate* isolate)
				642	: register_array_(NULL),
				643	register_array_size_(0),
				644	regexp_(regexp),
				645	subject_(subject) {
				646	#ifdef V8_INTERPRETED_REGEXP
				647	bool interpreted = true;
				648	#else
				649	bool interpreted = false;
				650	#endif // V8_INTERPRETED_REGEXP
				651
				652	if (regexp_->TypeTag() == JSRegExp::ATOM) {
				653	static const int kAtomRegistersPerMatch = 2;
				654	registers_per_match_ = kAtomRegistersPerMatch;
				655	// There is no distinction between interpreted and native for atom regexps.
				656	interpreted = false;
				657	} else {
				658	registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
				659	if (registers_per_match_ < 0) {
				660	num_matches_ = -1; // Signal exception.
				661	return;
				662	}
				663	}
				664
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	665	DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
				666	if (!interpreted) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	667	register_array_size_ =
				668	Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
				669	max_matches_ = register_array_size_ / registers_per_match_;
				670	} else {
				671	// Global loop in interpreted regexp is not implemented. We choose
				672	// the size of the offsets vector so that it can only store one match.
				673	register_array_size_ = registers_per_match_;
				674	max_matches_ = 1;
				675	}
				676
				677	if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				678	register_array_ = NewArray<int32_t>(register_array_size_);
				679	} else {
				680	register_array_ = isolate->jsregexp_static_offsets_vector();
				681	}
				682
				683	// Set state so that fetching the results the first time triggers a call
				684	// to the compiled regexp.
				685	current_match_index_ = max_matches_ - 1;
				686	num_matches_ = max_matches_;
				687	DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
				688	DCHECK_GE(register_array_size_, registers_per_match_);
				689	int32_t* last_match =
				690	&register_array_[current_match_index_ * registers_per_match_];
				691	last_match[0] = -1;
				692	last_match[1] = 0;
				693	}
				694
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	695	int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
				696	if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
				697	last_index + 1 < subject_->length() &&
				698	unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
				699	unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
				700	// Advance over the surrogate pair.
				701	return last_index + 2;
				702	}
				703	return last_index + 1;
				704	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	705
				706	// -------------------------------------------------------------------
				707	// Implementation of the Irregexp regular expression engine.
				708	//
				709	// The Irregexp regular expression engine is intended to be a complete
				710	// implementation of ECMAScript regular expressions. It generates either
				711	// bytecodes or native code.
				712
				713	// The Irregexp regexp engine is structured in three steps.
				714	// 1) The parser generates an abstract syntax tree. See ast.cc.
				715	// 2) From the AST a node network is created. The nodes are all
				716	// subclasses of RegExpNode. The nodes represent states when
				717	// executing a regular expression. Several optimizations are
				718	// performed on the node network.
				719	// 3) From the nodes we generate either byte codes or native code
				720	// that can actually execute the regular expression (perform
				721	// the search). The code generation step is described in more
				722	// detail below.
				723
				724	// Code generation.
				725	//
				726	// The nodes are divided into four main categories.
				727	// * Choice nodes
				728	// These represent places where the regular expression can
				729	// match in more than one way. For example on entry to an
				730	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				731	// * Action nodes
				732	// These represent places where some action should be
				733	// performed. Examples include recording the current position
				734	// in the input string to a register (in order to implement
				735	// captures) or other actions on register for example in order
				736	// to implement the counters needed for {} repetitions.
				737	// * Matching nodes
				738	// These attempt to match some element part of the input string.
				739	// Examples of elements include character classes, plain strings
				740	// or back references.
				741	// * End nodes
				742	// These are used to implement the actions required on finding
				743	// a successful match or failing to find a match.
				744	//
				745	// The code generated (whether as byte codes or native code) maintains
				746	// some state as it runs. This consists of the following elements:
				747	//
				748	// * The capture registers. Used for string captures.
				749	// * Other registers. Used for counters etc.
				750	// * The current position.
				751	// * The stack of backtracking information. Used when a matching node
				752	// fails to find a match and needs to try an alternative.
				753	//
				754	// Conceptual regular expression execution model:
				755	//
				756	// There is a simple conceptual model of regular expression execution
				757	// which will be presented first. The actual code generated is a more
				758	// efficient simulation of the simple conceptual model:
				759	//
				760	// * Choice nodes are implemented as follows:
				761	// For each choice except the last {
				762	// push current position
				763	// push backtrack code location
				764	// <generate code to test for choice>
				765	// backtrack code location:
				766	// pop current position
				767	// }
				768	// <generate code to test for last choice>
				769	//
				770	// * Actions nodes are generated as follows
				771	// <push affected registers on backtrack stack>
				772	// <generate code to perform action>
				773	// push backtrack code location
				774	// <generate code to test for following nodes>
				775	// backtrack code location:
				776	// <pop affected registers to restore their state>
				777	// <pop backtrack location from stack and go to it>
				778	//
				779	// * Matching nodes are generated as follows:
				780	// if input string matches at current position
				781	// update current position
				782	// <generate code to test for following nodes>
				783	// else
				784	// <pop backtrack location from stack and go to it>
				785	//
				786	// Thus it can be seen that the current position is saved and restored
				787	// by the choice nodes, whereas the registers are saved and restored by
				788	// by the action nodes that manipulate them.
				789	//
				790	// The other interesting aspect of this model is that nodes are generated
				791	// at the point where they are needed by a recursive call to Emit(). If
				792	// the node has already been code generated then the Emit() call will
				793	// generate a jump to the previously generated code instead. In order to
				794	// limit recursion it is possible for the Emit() function to put the node
				795	// on a work list for later generation and instead generate a jump. The
				796	// destination of the jump is resolved later when the code is generated.
				797	//
				798	// Actual regular expression code generation.
				799	//
				800	// Code generation is actually more complicated than the above. In order
				801	// to improve the efficiency of the generated code some optimizations are
				802	// performed
				803	//
				804	// * Choice nodes have 1-character lookahead.
				805	// A choice node looks at the following character and eliminates some of
				806	// the choices immediately based on that character. This is not yet
				807	// implemented.
				808	// * Simple greedy loops store reduced backtracking information.
				809	// A quantifier like /.*foo/m will greedily match the whole input. It will
				810	// then need to backtrack to a point where it can match "foo". The naive
				811	// implementation of this would push each character position onto the
				812	// backtracking stack, then pop them off one by one. This would use space
				813	// proportional to the length of the input string. However since the "."
				814	// can only match in one way and always has a constant length (in this case
				815	// of 1) it suffices to store the current position on the top of the stack
				816	// once. Matching now becomes merely incrementing the current position and
				817	// backtracking becomes decrementing the current position and checking the
				818	// result against the stored current position. This is faster and saves
				819	// space.
				820	// * The current state is virtualized.
				821	// This is used to defer expensive operations until it is clear that they
				822	// are needed and to generate code for a node more than once, allowing
				823	// specialized an efficient versions of the code to be created. This is
				824	// explained in the section below.
				825	//
				826	// Execution state virtualization.
				827	//
				828	// Instead of emitting code, nodes that manipulate the state can record their
				829	// manipulation in an object called the Trace. The Trace object can record a
				830	// current position offset, an optional backtrack code location on the top of
				831	// the virtualized backtrack stack and some register changes. When a node is
				832	// to be emitted it can flush the Trace or update it. Flushing the Trace
				833	// will emit code to bring the actual state into line with the virtual state.
				834	// Avoiding flushing the state can postpone some work (e.g. updates of capture
				835	// registers). Postponing work can save time when executing the regular
				836	// expression since it may be found that the work never has to be done as a
				837	// failure to match can occur. In addition it is much faster to jump to a
				838	// known backtrack code location than it is to pop an unknown backtrack
				839	// location from the stack and jump there.
				840	//
				841	// The virtual state found in the Trace affects code generation. For example
				842	// the virtual state contains the difference between the actual current
				843	// position and the virtual current position, and matching code needs to use
				844	// this offset to attempt a match in the correct location of the input
				845	// string. Therefore code generated for a non-trivial trace is specialized
				846	// to that trace. The code generator therefore has the ability to generate
				847	// code for each node several times. In order to limit the size of the
				848	// generated code there is an arbitrary limit on how many specialized sets of
				849	// code may be generated for a given node. If the limit is reached, the
				850	// trace is flushed and a generic version of the code for a node is emitted.
				851	// This is subsequently used for that node. The code emitted for non-generic
				852	// trace is not recorded in the node and so it cannot currently be reused in
				853	// the event that code generation is requested for an identical trace.
				854
				855
				856	void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
				857	UNREACHABLE();
				858	}
				859
				860
				861	void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
				862	text->AddElement(TextElement::Atom(this), zone);
				863	}
				864
				865
				866	void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
				867	text->AddElement(TextElement::CharClass(this), zone);
				868	}
				869
				870
				871	void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
				872	for (int i = 0; i < elements()->length(); i++)
				873	text->AddElement(elements()->at(i), zone);
				874	}
				875
				876
				877	TextElement TextElement::Atom(RegExpAtom* atom) {
				878	return TextElement(ATOM, atom);
				879	}
				880
				881
				882	TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
				883	return TextElement(CHAR_CLASS, char_class);
				884	}
				885
				886
				887	int TextElement::length() const {
				888	switch (text_type()) {
				889	case ATOM:
				890	return atom()->length();
				891
				892	case CHAR_CLASS:
				893	return 1;
				894	}
				895	UNREACHABLE();
				896	return 0;
				897	}
				898
				899
				900	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				901	if (table_ == NULL) {
				902	table_ = new(zone()) DispatchTable(zone());
				903	DispatchTableConstructor cons(table_, ignore_case, zone());
				904	cons.BuildTable(this);
				905	}
				906	return table_;
				907	}
				908
				909
				910	class FrequencyCollator {
				911	public:
				912	FrequencyCollator() : total_samples_(0) {
				913	for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
				914	frequencies_[i] = CharacterFrequency(i);
				915	}
				916	}
				917
				918	void CountCharacter(int character) {
				919	int index = (character & RegExpMacroAssembler::kTableMask);
				920	frequencies_[index].Increment();
				921	total_samples_++;
				922	}
				923
				924	// Does not measure in percent, but rather per-128 (the table size from the
				925	// regexp macro assembler).
				926	int Frequency(int in_character) {
				927	DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
				928	if (total_samples_ < 1) return 1; // Division by zero.
				929	int freq_in_per128 =
				930	(frequencies_[in_character].counter() * 128) / total_samples_;
				931	return freq_in_per128;
				932	}
				933
				934	private:
				935	class CharacterFrequency {
				936	public:
				937	CharacterFrequency() : counter_(0), character_(-1) { }
				938	explicit CharacterFrequency(int character)
				939	: counter_(0), character_(character) { }
				940
				941	void Increment() { counter_++; }
				942	int counter() { return counter_; }
				943	int character() { return character_; }
				944
				945	private:
				946	int counter_;
				947	int character_;
				948	};
				949
				950
				951	private:
				952	CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
				953	int total_samples_;
				954	};
				955
				956
				957	class RegExpCompiler {
				958	public:
				959	RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	960	JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	961
				962	int AllocateRegister() {
				963	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				964	reg_exp_too_big_ = true;
				965	return next_register_;
				966	}
				967	return next_register_++;
				968	}
				969
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	970	// Lookarounds to match lone surrogates for unicode character class matches
				971	// are never nested. We can therefore reuse registers.
				972	int UnicodeLookaroundStackRegister() {
				973	if (unicode_lookaround_stack_register_ == kNoRegister) {
				974	unicode_lookaround_stack_register_ = AllocateRegister();
				975	}
				976	return unicode_lookaround_stack_register_;
				977	}
				978
				979	int UnicodeLookaroundPositionRegister() {
				980	if (unicode_lookaround_position_register_ == kNoRegister) {
				981	unicode_lookaround_position_register_ = AllocateRegister();
				982	}
				983	return unicode_lookaround_position_register_;
				984	}
				985
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	986	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				987	RegExpNode* start,
				988	int capture_count,
				989	Handle<String> pattern);
				990
				991	inline void AddWork(RegExpNode* node) {
				992	if (!node->on_work_list() && !node->label()->is_bound()) {
				993	node->set_on_work_list(true);
				994	work_list_->Add(node);
				995	}
				996	}
				997
				998	static const int kImplementationOffset = 0;
				999	static const int kNumberOfRegistersOffset = 0;
				1000	static const int kCodeOffset = 1;
				1001
				1002	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				1003	EndNode* accept() { return accept_; }
				1004
				1005	static const int kMaxRecursion = 100;
				1006	inline int recursion_depth() { return recursion_depth_; }
				1007	inline void IncrementRecursionDepth() { recursion_depth_++; }
				1008	inline void DecrementRecursionDepth() { recursion_depth_--; }
				1009
				1010	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				1011
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1012	inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
				1013	inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1014	inline bool one_byte() { return one_byte_; }
				1015	inline bool optimize() { return optimize_; }
				1016	inline void set_optimize(bool value) { optimize_ = value; }
				1017	inline bool limiting_recursion() { return limiting_recursion_; }
				1018	inline void set_limiting_recursion(bool value) {
				1019	limiting_recursion_ = value;
				1020	}
				1021	bool read_backward() { return read_backward_; }
				1022	void set_read_backward(bool value) { read_backward_ = value; }
				1023	FrequencyCollator* frequency_collator() { return &frequency_collator_; }
				1024
				1025	int current_expansion_factor() { return current_expansion_factor_; }
				1026	void set_current_expansion_factor(int value) {
				1027	current_expansion_factor_ = value;
				1028	}
				1029
				1030	Isolate* isolate() const { return isolate_; }
				1031	Zone* zone() const { return zone_; }
				1032
				1033	static const int kNoRegister = -1;
				1034
				1035	private:
				1036	EndNode* accept_;
				1037	int next_register_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1038	int unicode_lookaround_stack_register_;
				1039	int unicode_lookaround_position_register_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1040	List<RegExpNode> work_list_;
				1041	int recursion_depth_;
				1042	RegExpMacroAssembler* macro_assembler_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1043	JSRegExp::Flags flags_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1044	bool one_byte_;
				1045	bool reg_exp_too_big_;
				1046	bool limiting_recursion_;
				1047	bool optimize_;
				1048	bool read_backward_;
				1049	int current_expansion_factor_;
				1050	FrequencyCollator frequency_collator_;
				1051	Isolate* isolate_;
				1052	Zone* zone_;
				1053	};
				1054
				1055
				1056	class RecursionCheck {
				1057	public:
				1058	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				1059	compiler->IncrementRecursionDepth();
				1060	}
				1061	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				1062	private:
				1063	RegExpCompiler* compiler_;
				1064	};
				1065
				1066
				1067	static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
				1068	return RegExpEngine::CompilationResult(isolate, "RegExp too big");
				1069	}
				1070
				1071
				1072	// Attempts to compile the regexp using an Irregexp code generator. Returns
				1073	// a fixed array or a null handle depending on whether it succeeded.
				1074	RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1075	JSRegExp::Flags flags, bool one_byte)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1076	: next_register_(2 * (capture_count + 1)),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1077	unicode_lookaround_stack_register_(kNoRegister),
				1078	unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1079	work_list_(NULL),
				1080	recursion_depth_(0),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1081	flags_(flags),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1082	one_byte_(one_byte),
				1083	reg_exp_too_big_(false),
				1084	limiting_recursion_(false),
				1085	optimize_(FLAG_regexp_optimization),
				1086	read_backward_(false),
				1087	current_expansion_factor_(1),
				1088	frequency_collator_(),
				1089	isolate_(isolate),
				1090	zone_(zone) {
				1091	accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
				1092	DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				1093	}
				1094
				1095
				1096	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				1097	RegExpMacroAssembler* macro_assembler,
				1098	RegExpNode* start,
				1099	int capture_count,
				1100	Handle<String> pattern) {
				1101	Heap* heap = pattern->GetHeap();
				1102
				1103	#ifdef DEBUG
				1104	if (FLAG_trace_regexp_assembler)
				1105	macro_assembler_ =
				1106	new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
				1107	else
				1108	#endif
				1109	macro_assembler_ = macro_assembler;
				1110
				1111	List <RegExpNode*> work_list(0);
				1112	work_list_ = &work_list;
				1113	Label fail;
				1114	macro_assembler_->PushBacktrack(&fail);
				1115	Trace new_trace;
				1116	start->Emit(this, &new_trace);
				1117	macro_assembler_->Bind(&fail);
				1118	macro_assembler_->Fail();
				1119	while (!work_list.is_empty()) {
				1120	RegExpNode* node = work_list.RemoveLast();
				1121	node->set_on_work_list(false);
				1122	if (!node->label()->is_bound()) node->Emit(this, &new_trace);
				1123	}
				1124	if (reg_exp_too_big_) {
				1125	macro_assembler_->AbortedCodeGeneration();
				1126	return IrregexpRegExpTooBig(isolate_);
				1127	}
				1128
				1129	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				1130	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
				1131	work_list_ = NULL;
				1132	#ifdef ENABLE_DISASSEMBLER
				1133	if (FLAG_print_code) {
				1134	CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
				1135	OFStream os(trace_scope.file());
				1136	Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
				1137	}
				1138	#endif
				1139	#ifdef DEBUG
				1140	if (FLAG_trace_regexp_assembler) {
				1141	delete macro_assembler_;
				1142	}
				1143	#endif
				1144	return RegExpEngine::CompilationResult(*code, next_register_);
				1145	}
				1146
				1147
				1148	bool Trace::DeferredAction::Mentions(int that) {
				1149	if (action_type() == ActionNode::CLEAR_CAPTURES) {
				1150	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				1151	return range.Contains(that);
				1152	} else {
				1153	return reg() == that;
				1154	}
				1155	}
				1156
				1157
				1158	bool Trace::mentions_reg(int reg) {
				1159	for (DeferredAction* action = actions_;
				1160	action != NULL;
				1161	action = action->next()) {
				1162	if (action->Mentions(reg))
				1163	return true;
				1164	}
				1165	return false;
				1166	}
				1167
				1168
				1169	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				1170	DCHECK_EQ(0, *cp_offset);
				1171	for (DeferredAction* action = actions_;
				1172	action != NULL;
				1173	action = action->next()) {
				1174	if (action->Mentions(reg)) {
				1175	if (action->action_type() == ActionNode::STORE_POSITION) {
				1176	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				1177	return true;
				1178	} else {
				1179	return false;
				1180	}
				1181	}
				1182	}
				1183	return false;
				1184	}
				1185
				1186
				1187	int Trace::FindAffectedRegisters(OutSet* affected_registers,
				1188	Zone* zone) {
				1189	int max_register = RegExpCompiler::kNoRegister;
				1190	for (DeferredAction* action = actions_;
				1191	action != NULL;
				1192	action = action->next()) {
				1193	if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
				1194	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				1195	for (int i = range.from(); i <= range.to(); i++)
				1196	affected_registers->Set(i, zone);
				1197	if (range.to() > max_register) max_register = range.to();
				1198	} else {
				1199	affected_registers->Set(action->reg(), zone);
				1200	if (action->reg() > max_register) max_register = action->reg();
				1201	}
				1202	}
				1203	return max_register;
				1204	}
				1205
				1206
				1207	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				1208	int max_register,
				1209	const OutSet& registers_to_pop,
				1210	const OutSet& registers_to_clear) {
				1211	for (int reg = max_register; reg >= 0; reg--) {
				1212	if (registers_to_pop.Get(reg)) {
				1213	assembler->PopRegister(reg);
				1214	} else if (registers_to_clear.Get(reg)) {
				1215	int clear_to = reg;
				1216	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				1217	reg--;
				1218	}
				1219	assembler->ClearRegisters(reg, clear_to);
				1220	}
				1221	}
				1222	}
				1223
				1224
				1225	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1226	int max_register,
				1227	const OutSet& affected_registers,
				1228	OutSet* registers_to_pop,
				1229	OutSet* registers_to_clear,
				1230	Zone* zone) {
				1231	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1232	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1233
				1234	// Count pushes performed to force a stack limit check occasionally.
				1235	int pushes = 0;
				1236
				1237	for (int reg = 0; reg <= max_register; reg++) {
				1238	if (!affected_registers.Get(reg)) {
				1239	continue;
				1240	}
				1241
				1242	// The chronologically first deferred action in the trace
				1243	// is used to infer the action needed to restore a register
				1244	// to its previous state (or not, if it's safe to ignore it).
				1245	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1246	DeferredActionUndoType undo_action = IGNORE;
				1247
				1248	int value = 0;
				1249	bool absolute = false;
				1250	bool clear = false;
				1251	static const int kNoStore = kMinInt;
				1252	int store_position = kNoStore;
				1253	// This is a little tricky because we are scanning the actions in reverse
				1254	// historical order (newest first).
				1255	for (DeferredAction* action = actions_;
				1256	action != NULL;
				1257	action = action->next()) {
				1258	if (action->Mentions(reg)) {
				1259	switch (action->action_type()) {
				1260	case ActionNode::SET_REGISTER: {
				1261	Trace::DeferredSetRegister* psr =
				1262	static_cast<Trace::DeferredSetRegister*>(action);
				1263	if (!absolute) {
				1264	value += psr->value();
				1265	absolute = true;
				1266	}
				1267	// SET_REGISTER is currently only used for newly introduced loop
				1268	// counters. They can have a significant previous value if they
				1269	// occour in a loop. TODO(lrn): Propagate this information, so
				1270	// we can set undo_action to IGNORE if we know there is no value to
				1271	// restore.
				1272	undo_action = RESTORE;
				1273	DCHECK_EQ(store_position, kNoStore);
				1274	DCHECK(!clear);
				1275	break;
				1276	}
				1277	case ActionNode::INCREMENT_REGISTER:
				1278	if (!absolute) {
				1279	value++;
				1280	}
				1281	DCHECK_EQ(store_position, kNoStore);
				1282	DCHECK(!clear);
				1283	undo_action = RESTORE;
				1284	break;
				1285	case ActionNode::STORE_POSITION: {
				1286	Trace::DeferredCapture* pc =
				1287	static_cast<Trace::DeferredCapture*>(action);
				1288	if (!clear && store_position == kNoStore) {
				1289	store_position = pc->cp_offset();
				1290	}
				1291
				1292	// For captures we know that stores and clears alternate.
				1293	// Other register, are never cleared, and if the occur
				1294	// inside a loop, they might be assigned more than once.
				1295	if (reg <= 1) {
				1296	// Registers zero and one, aka "capture zero", is
				1297	// always set correctly if we succeed. There is no
				1298	// need to undo a setting on backtrack, because we
				1299	// will set it again or fail.
				1300	undo_action = IGNORE;
				1301	} else {
				1302	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1303	}
				1304	DCHECK(!absolute);
				1305	DCHECK_EQ(value, 0);
				1306	break;
				1307	}
				1308	case ActionNode::CLEAR_CAPTURES: {
				1309	// Since we're scanning in reverse order, if we've already
				1310	// set the position we have to ignore historically earlier
				1311	// clearing operations.
				1312	if (store_position == kNoStore) {
				1313	clear = true;
				1314	}
				1315	undo_action = RESTORE;
				1316	DCHECK(!absolute);
				1317	DCHECK_EQ(value, 0);
				1318	break;
				1319	}
				1320	default:
				1321	UNREACHABLE();
				1322	break;
				1323	}
				1324	}
				1325	}
				1326	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1327	if (undo_action == RESTORE) {
				1328	pushes++;
				1329	RegExpMacroAssembler::StackCheckFlag stack_check =
				1330	RegExpMacroAssembler::kNoStackLimitCheck;
				1331	if (pushes == push_limit) {
				1332	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1333	pushes = 0;
				1334	}
				1335
				1336	assembler->PushRegister(reg, stack_check);
				1337	registers_to_pop->Set(reg, zone);
				1338	} else if (undo_action == CLEAR) {
				1339	registers_to_clear->Set(reg, zone);
				1340	}
				1341	// Perform the chronologically last action (or accumulated increment)
				1342	// for the register.
				1343	if (store_position != kNoStore) {
				1344	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1345	} else if (clear) {
				1346	assembler->ClearRegisters(reg, reg);
				1347	} else if (absolute) {
				1348	assembler->SetRegister(reg, value);
				1349	} else if (value != 0) {
				1350	assembler->AdvanceRegister(reg, value);
				1351	}
				1352	}
				1353	}
				1354
				1355
				1356	// This is called as we come into a loop choice node and some other tricky
				1357	// nodes. It normalizes the state of the code generator to ensure we can
				1358	// generate generic code.
				1359	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1360	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1361
				1362	DCHECK(!is_trivial());
				1363
				1364	if (actions_ == NULL && backtrack() == NULL) {
				1365	// Here we just have some deferred cp advances to fix and we are back to
				1366	// a normal situation. We may also have to forget some information gained
				1367	// through a quick check that was already performed.
				1368	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1369	// Create a new trivial state and generate the node with that.
				1370	Trace new_state;
				1371	successor->Emit(compiler, &new_state);
				1372	return;
				1373	}
				1374
				1375	// Generate deferred actions here along with code to undo them again.
				1376	OutSet affected_registers;
				1377
				1378	if (backtrack() != NULL) {
				1379	// Here we have a concrete backtrack location. These are set up by choice
				1380	// nodes and so they indicate that we have a deferred save of the current
				1381	// position which we may need to emit here.
				1382	assembler->PushCurrentPosition();
				1383	}
				1384
				1385	int max_register = FindAffectedRegisters(&affected_registers,
				1386	compiler->zone());
				1387	OutSet registers_to_pop;
				1388	OutSet registers_to_clear;
				1389	PerformDeferredActions(assembler,
				1390	max_register,
				1391	affected_registers,
				1392	&registers_to_pop,
				1393	&registers_to_clear,
				1394	compiler->zone());
				1395	if (cp_offset_ != 0) {
				1396	assembler->AdvanceCurrentPosition(cp_offset_);
				1397	}
				1398
				1399	// Create a new trivial state and generate the node with that.
				1400	Label undo;
				1401	assembler->PushBacktrack(&undo);
				1402	if (successor->KeepRecursing(compiler)) {
				1403	Trace new_state;
				1404	successor->Emit(compiler, &new_state);
				1405	} else {
				1406	compiler->AddWork(successor);
				1407	assembler->GoTo(successor->label());
				1408	}
				1409
				1410	// On backtrack we need to restore state.
				1411	assembler->Bind(&undo);
				1412	RestoreAffectedRegisters(assembler,
				1413	max_register,
				1414	registers_to_pop,
				1415	registers_to_clear);
				1416	if (backtrack() == NULL) {
				1417	assembler->Backtrack();
				1418	} else {
				1419	assembler->PopCurrentPosition();
				1420	assembler->GoTo(backtrack());
				1421	}
				1422	}
				1423
				1424
				1425	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1426	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1427
				1428	// Omit flushing the trace. We discard the entire stack frame anyway.
				1429
				1430	if (!label()->is_bound()) {
				1431	// We are completely independent of the trace, since we ignore it,
				1432	// so this code can be used as the generic version.
				1433	assembler->Bind(label());
				1434	}
				1435
				1436	// Throw away everything on the backtrack stack since the start
				1437	// of the negative submatch and restore the character position.
				1438	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1439	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1440	if (clear_capture_count_ > 0) {
				1441	// Clear any captures that might have been performed during the success
				1442	// of the body of the negative look-ahead.
				1443	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1444	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1445	}
				1446	// Now that we have unwound the stack we find at the top of the stack the
				1447	// backtrack that the BeginSubmatch node got.
				1448	assembler->Backtrack();
				1449	}
				1450
				1451
				1452	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1453	if (!trace->is_trivial()) {
				1454	trace->Flush(compiler, this);
				1455	return;
				1456	}
				1457	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1458	if (!label()->is_bound()) {
				1459	assembler->Bind(label());
				1460	}
				1461	switch (action_) {
				1462	case ACCEPT:
				1463	assembler->Succeed();
				1464	return;
				1465	case BACKTRACK:
				1466	assembler->GoTo(trace->backtrack());
				1467	return;
				1468	case NEGATIVE_SUBMATCH_SUCCESS:
				1469	// This case is handled in a different virtual method.
				1470	UNREACHABLE();
				1471	}
				1472	UNIMPLEMENTED();
				1473	}
				1474
				1475
				1476	void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
				1477	if (guards_ == NULL)
				1478	guards_ = new(zone) ZoneList<Guard*>(1, zone);
				1479	guards_->Add(guard, zone);
				1480	}
				1481
				1482
				1483	ActionNode* ActionNode::SetRegister(int reg,
				1484	int val,
				1485	RegExpNode* on_success) {
				1486	ActionNode* result =
				1487	new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
				1488	result->data_.u_store_register.reg = reg;
				1489	result->data_.u_store_register.value = val;
				1490	return result;
				1491	}
				1492
				1493
				1494	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1495	ActionNode* result =
				1496	new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
				1497	result->data_.u_increment_register.reg = reg;
				1498	return result;
				1499	}
				1500
				1501
				1502	ActionNode* ActionNode::StorePosition(int reg,
				1503	bool is_capture,
				1504	RegExpNode* on_success) {
				1505	ActionNode* result =
				1506	new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
				1507	result->data_.u_position_register.reg = reg;
				1508	result->data_.u_position_register.is_capture = is_capture;
				1509	return result;
				1510	}
				1511
				1512
				1513	ActionNode* ActionNode::ClearCaptures(Interval range,
				1514	RegExpNode* on_success) {
				1515	ActionNode* result =
				1516	new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
				1517	result->data_.u_clear_captures.range_from = range.from();
				1518	result->data_.u_clear_captures.range_to = range.to();
				1519	return result;
				1520	}
				1521
				1522
				1523	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1524	int position_reg,
				1525	RegExpNode* on_success) {
				1526	ActionNode* result =
				1527	new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
				1528	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1529	result->data_.u_submatch.current_position_register = position_reg;
				1530	return result;
				1531	}
				1532
				1533
				1534	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1535	int position_reg,
				1536	int clear_register_count,
				1537	int clear_register_from,
				1538	RegExpNode* on_success) {
				1539	ActionNode* result =
				1540	new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1541	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1542	result->data_.u_submatch.current_position_register = position_reg;
				1543	result->data_.u_submatch.clear_register_count = clear_register_count;
				1544	result->data_.u_submatch.clear_register_from = clear_register_from;
				1545	return result;
				1546	}
				1547
				1548
				1549	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1550	int repetition_register,
				1551	int repetition_limit,
				1552	RegExpNode* on_success) {
				1553	ActionNode* result =
				1554	new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
				1555	result->data_.u_empty_match_check.start_register = start_register;
				1556	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1557	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1558	return result;
				1559	}
				1560
				1561
				1562	#define DEFINE_ACCEPT(Type) \
				1563	void Type##Node::Accept(NodeVisitor* visitor) { \
				1564	visitor->Visit##Type(this); \
				1565	}
				1566	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1567	#undef DEFINE_ACCEPT
				1568
				1569
				1570	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1571	visitor->VisitLoopChoice(this);
				1572	}
				1573
				1574
				1575	// -------------------------------------------------------------------
				1576	// Emit code.
				1577
				1578
				1579	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1580	Guard* guard,
				1581	Trace* trace) {
				1582	switch (guard->op()) {
				1583	case Guard::LT:
				1584	DCHECK(!trace->mentions_reg(guard->reg()));
				1585	macro_assembler->IfRegisterGE(guard->reg(),
				1586	guard->value(),
				1587	trace->backtrack());
				1588	break;
				1589	case Guard::GEQ:
				1590	DCHECK(!trace->mentions_reg(guard->reg()));
				1591	macro_assembler->IfRegisterLT(guard->reg(),
				1592	guard->value(),
				1593	trace->backtrack());
				1594	break;
				1595	}
				1596	}
				1597
				1598
				1599	// Returns the number of characters in the equivalence class, omitting those
				1600	// that cannot occur in the source string because it is Latin1.
				1601	static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
				1602	bool one_byte_subject,
				1603	unibrow::uchar* letters) {
				1604	int length =
				1605	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
				1606	// Unibrow returns 0 or 1 for characters where case independence is
				1607	// trivial.
				1608	if (length == 0) {
				1609	letters[0] = character;
				1610	length = 1;
				1611	}
				1612
				1613	if (one_byte_subject) {
				1614	int new_length = 0;
				1615	for (int i = 0; i < length; i++) {
				1616	if (letters[i] <= String::kMaxOneByteCharCode) {
				1617	letters[new_length++] = letters[i];
				1618	}
				1619	}
				1620	length = new_length;
				1621	}
				1622
				1623	return length;
				1624	}
				1625
				1626
				1627	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1628	RegExpCompiler* compiler,
				1629	uc16 c,
				1630	Label* on_failure,
				1631	int cp_offset,
				1632	bool check,
				1633	bool preloaded) {
				1634	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1635	bool bound_checked = false;
				1636	if (!preloaded) {
				1637	assembler->LoadCurrentCharacter(
				1638	cp_offset,
				1639	on_failure,
				1640	check);
				1641	bound_checked = true;
				1642	}
				1643	assembler->CheckNotCharacter(c, on_failure);
				1644	return bound_checked;
				1645	}
				1646
				1647
				1648	// Only emits non-letters (things that don't have case). Only used for case
				1649	// independent matches.
				1650	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1651	RegExpCompiler* compiler,
				1652	uc16 c,
				1653	Label* on_failure,
				1654	int cp_offset,
				1655	bool check,
				1656	bool preloaded) {
				1657	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1658	bool one_byte = compiler->one_byte();
				1659	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1660	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1661	if (length < 1) {
				1662	// This can't match. Must be an one-byte subject and a non-one-byte
				1663	// character. We do not need to do anything since the one-byte pass
				1664	// already handled this.
				1665	return false; // Bounds not checked.
				1666	}
				1667	bool checked = false;
				1668	// We handle the length > 1 case in a later pass.
				1669	if (length == 1) {
				1670	if (one_byte && c > String::kMaxOneByteCharCodeU) {
				1671	// Can't match - see above.
				1672	return false; // Bounds not checked.
				1673	}
				1674	if (!preloaded) {
				1675	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1676	checked = check;
				1677	}
				1678	macro_assembler->CheckNotCharacter(c, on_failure);
				1679	}
				1680	return checked;
				1681	}
				1682
				1683
				1684	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1685	bool one_byte, uc16 c1, uc16 c2,
				1686	Label* on_failure) {
				1687	uc16 char_mask;
				1688	if (one_byte) {
				1689	char_mask = String::kMaxOneByteCharCode;
				1690	} else {
				1691	char_mask = String::kMaxUtf16CodeUnit;
				1692	}
				1693	uc16 exor = c1 ^ c2;
				1694	// Check whether exor has only one bit set.
				1695	if (((exor - 1) & exor) == 0) {
				1696	// If c1 and c2 differ only by one bit.
				1697	// Ecma262UnCanonicalize always gives the highest number last.
				1698	DCHECK(c2 > c1);
				1699	uc16 mask = char_mask ^ exor;
				1700	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1701	return true;
				1702	}
				1703	DCHECK(c2 > c1);
				1704	uc16 diff = c2 - c1;
				1705	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1706	// If the characters differ by 2^n but don't differ by one bit then
				1707	// subtract the difference from the found character, then do the or
				1708	// trick. We avoid the theoretical case where negative numbers are
				1709	// involved in order to simplify code generation.
				1710	uc16 mask = char_mask ^ diff;
				1711	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1712	diff,
				1713	mask,
				1714	on_failure);
				1715	return true;
				1716	}
				1717	return false;
				1718	}
				1719
				1720
				1721	typedef bool EmitCharacterFunction(Isolate* isolate,
				1722	RegExpCompiler* compiler,
				1723	uc16 c,
				1724	Label* on_failure,
				1725	int cp_offset,
				1726	bool check,
				1727	bool preloaded);
				1728
				1729	// Only emits letters (things that have case). Only used for case independent
				1730	// matches.
				1731	static inline bool EmitAtomLetter(Isolate* isolate,
				1732	RegExpCompiler* compiler,
				1733	uc16 c,
				1734	Label* on_failure,
				1735	int cp_offset,
				1736	bool check,
				1737	bool preloaded) {
				1738	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1739	bool one_byte = compiler->one_byte();
				1740	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1741	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1742	if (length <= 1) return false;
				1743	// We may not need to check against the end of the input string
				1744	// if this character lies before a character that matched.
				1745	if (!preloaded) {
				1746	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1747	}
				1748	Label ok;
				1749	DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1750	switch (length) {
				1751	case 2: {
				1752	if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
				1753	chars[1], on_failure)) {
				1754	} else {
				1755	macro_assembler->CheckCharacter(chars[0], &ok);
				1756	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1757	macro_assembler->Bind(&ok);
				1758	}
				1759	break;
				1760	}
				1761	case 4:
				1762	macro_assembler->CheckCharacter(chars[3], &ok);
				1763	// Fall through!
				1764	case 3:
				1765	macro_assembler->CheckCharacter(chars[0], &ok);
				1766	macro_assembler->CheckCharacter(chars[1], &ok);
				1767	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1768	macro_assembler->Bind(&ok);
				1769	break;
				1770	default:
				1771	UNREACHABLE();
				1772	break;
				1773	}
				1774	return true;
				1775	}
				1776
				1777
				1778	static void EmitBoundaryTest(RegExpMacroAssembler* masm,
				1779	int border,
				1780	Label* fall_through,
				1781	Label* above_or_equal,
				1782	Label* below) {
				1783	if (below != fall_through) {
				1784	masm->CheckCharacterLT(border, below);
				1785	if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
				1786	} else {
				1787	masm->CheckCharacterGT(border - 1, above_or_equal);
				1788	}
				1789	}
				1790
				1791
				1792	static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
				1793	int first,
				1794	int last,
				1795	Label* fall_through,
				1796	Label* in_range,
				1797	Label* out_of_range) {
				1798	if (in_range == fall_through) {
				1799	if (first == last) {
				1800	masm->CheckNotCharacter(first, out_of_range);
				1801	} else {
				1802	masm->CheckCharacterNotInRange(first, last, out_of_range);
				1803	}
				1804	} else {
				1805	if (first == last) {
				1806	masm->CheckCharacter(first, in_range);
				1807	} else {
				1808	masm->CheckCharacterInRange(first, last, in_range);
				1809	}
				1810	if (out_of_range != fall_through) masm->GoTo(out_of_range);
				1811	}
				1812	}
				1813
				1814
				1815	// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
				1816	// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
				1817	static void EmitUseLookupTable(
				1818	RegExpMacroAssembler* masm,
				1819	ZoneList<int>* ranges,
				1820	int start_index,
				1821	int end_index,
				1822	int min_char,
				1823	Label* fall_through,
				1824	Label* even_label,
				1825	Label* odd_label) {
				1826	static const int kSize = RegExpMacroAssembler::kTableSize;
				1827	static const int kMask = RegExpMacroAssembler::kTableMask;
				1828
				1829	int base = (min_char & ~kMask);
				1830	USE(base);
				1831
				1832	// Assert that everything is on one kTableSize page.
				1833	for (int i = start_index; i <= end_index; i++) {
				1834	DCHECK_EQ(ranges->at(i) & ~kMask, base);
				1835	}
				1836	DCHECK(start_index == 0 \|\| (ranges->at(start_index - 1) & ~kMask) <= base);
				1837
				1838	char templ[kSize];
				1839	Label* on_bit_set;
				1840	Label* on_bit_clear;
				1841	int bit;
				1842	if (even_label == fall_through) {
				1843	on_bit_set = odd_label;
				1844	on_bit_clear = even_label;
				1845	bit = 1;
				1846	} else {
				1847	on_bit_set = even_label;
				1848	on_bit_clear = odd_label;
				1849	bit = 0;
				1850	}
				1851	for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
				1852	templ[i] = bit;
				1853	}
				1854	int j = 0;
				1855	bit ^= 1;
				1856	for (int i = start_index; i < end_index; i++) {
				1857	for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
				1858	templ[j] = bit;
				1859	}
				1860	bit ^= 1;
				1861	}
				1862	for (int i = j; i < kSize; i++) {
				1863	templ[i] = bit;
				1864	}
				1865	Factory* factory = masm->isolate()->factory();
				1866	// TODO(erikcorry): Cache these.
				1867	Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
				1868	for (int i = 0; i < kSize; i++) {
				1869	ba->set(i, templ[i]);
				1870	}
				1871	masm->CheckBitInTable(ba, on_bit_set);
				1872	if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
				1873	}
				1874
				1875
				1876	static void CutOutRange(RegExpMacroAssembler* masm,
				1877	ZoneList<int>* ranges,
				1878	int start_index,
				1879	int end_index,
				1880	int cut_index,
				1881	Label* even_label,
				1882	Label* odd_label) {
				1883	bool odd = (((cut_index - start_index) & 1) == 1);
				1884	Label* in_range_label = odd ? odd_label : even_label;
				1885	Label dummy;
				1886	EmitDoubleBoundaryTest(masm,
				1887	ranges->at(cut_index),
				1888	ranges->at(cut_index + 1) - 1,
				1889	&dummy,
				1890	in_range_label,
				1891	&dummy);
				1892	DCHECK(!dummy.is_linked());
				1893	// Cut out the single range by rewriting the array. This creates a new
				1894	// range that is a merger of the two ranges on either side of the one we
				1895	// are cutting out. The oddity of the labels is preserved.
				1896	for (int j = cut_index; j > start_index; j--) {
				1897	ranges->at(j) = ranges->at(j - 1);
				1898	}
				1899	for (int j = cut_index + 1; j < end_index; j++) {
				1900	ranges->at(j) = ranges->at(j + 1);
				1901	}
				1902	}
				1903
				1904
				1905	// Unicode case. Split the search space into kSize spaces that are handled
				1906	// with recursion.
				1907	static void SplitSearchSpace(ZoneList<int>* ranges,
				1908	int start_index,
				1909	int end_index,
				1910	int* new_start_index,
				1911	int* new_end_index,
				1912	int* border) {
				1913	static const int kSize = RegExpMacroAssembler::kTableSize;
				1914	static const int kMask = RegExpMacroAssembler::kTableMask;
				1915
				1916	int first = ranges->at(start_index);
				1917	int last = ranges->at(end_index) - 1;
				1918
				1919	*new_start_index = start_index;
				1920	*border = (ranges->at(start_index) & ~kMask) + kSize;
				1921	while (*new_start_index < end_index) {
				1922	if (ranges->at(new_start_index) > border) break;
				1923	(*new_start_index)++;
				1924	}
				1925	// new_start_index is the index of the first edge that is beyond the
				1926	// current kSize space.
				1927
				1928	// For very large search spaces we do a binary chop search of the non-Latin1
				1929	// space instead of just going to the end of the current kSize space. The
				1930	// heuristics are complicated a little by the fact that any 128-character
				1931	// encoding space can be quickly tested with a table lookup, so we don't
				1932	// wish to do binary chop search at a smaller granularity than that. A
				1933	// 128-character space can take up a lot of space in the ranges array if,
				1934	// for example, we only want to match every second character (eg. the lower
				1935	// case characters on some Unicode pages).
				1936	int binary_chop_index = (end_index + start_index) / 2;
				1937	// The first test ensures that we get to the code that handles the Latin1
				1938	// range with a single not-taken branch, speeding up this important
				1939	// character range (even non-Latin1 charset-based text has spaces and
				1940	// punctuation).
				1941	if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
				1942	end_index - start_index > (new_start_index - start_index) 2 &&
				1943	last - first > kSize * 2 && binary_chop_index > *new_start_index &&
				1944	ranges->at(binary_chop_index) >= first + 2 * kSize) {
				1945	int scan_forward_for_section_border = binary_chop_index;;
				1946	int new_border = (ranges->at(binary_chop_index) \| kMask) + 1;
				1947
				1948	while (scan_forward_for_section_border < end_index) {
				1949	if (ranges->at(scan_forward_for_section_border) > new_border) {
				1950	*new_start_index = scan_forward_for_section_border;
				1951	*border = new_border;
				1952	break;
				1953	}
				1954	scan_forward_for_section_border++;
				1955	}
				1956	}
				1957
				1958	DCHECK(*new_start_index > start_index);
				1959	new_end_index = new_start_index - 1;
				1960	if (ranges->at(new_end_index) == border) {
				1961	(*new_end_index)--;
				1962	}
				1963	if (*border >= ranges->at(end_index)) {
				1964	*border = ranges->at(end_index);
				1965	*new_start_index = end_index; // Won't be used.
				1966	*new_end_index = end_index - 1;
				1967	}
				1968	}
				1969
				1970
				1971	// Gets a series of segment boundaries representing a character class. If the
				1972	// character is in the range between an even and an odd boundary (counting from
				1973	// start_index) then go to even_label, otherwise go to odd_label. We already
				1974	// know that the character is in the range of min_char to max_char inclusive.
				1975	// Either label can be NULL indicating backtracking. Either label can also be
				1976	// equal to the fall_through label.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	1977	static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
				1978	int start_index, int end_index, uc32 min_char,
				1979	uc32 max_char, Label* fall_through,
				1980	Label* even_label, Label* odd_label) {
				1981	DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
				1982	DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
				1983
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1984	int first = ranges->at(start_index);
				1985	int last = ranges->at(end_index) - 1;
				1986
				1987	DCHECK_LT(min_char, first);
				1988
				1989	// Just need to test if the character is before or on-or-after
				1990	// a particular character.
				1991	if (start_index == end_index) {
				1992	EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
				1993	return;
				1994	}
				1995
				1996	// Another almost trivial case: There is one interval in the middle that is
				1997	// different from the end intervals.
				1998	if (start_index + 1 == end_index) {
				1999	EmitDoubleBoundaryTest(
				2000	masm, first, last, fall_through, even_label, odd_label);
				2001	return;
				2002	}
				2003
				2004	// It's not worth using table lookup if there are very few intervals in the
				2005	// character class.
				2006	if (end_index - start_index <= 6) {
				2007	// It is faster to test for individual characters, so we look for those
				2008	// first, then try arbitrary ranges in the second round.
				2009	static int kNoCutIndex = -1;
				2010	int cut = kNoCutIndex;
				2011	for (int i = start_index; i < end_index; i++) {
				2012	if (ranges->at(i) == ranges->at(i + 1) - 1) {
				2013	cut = i;
				2014	break;
				2015	}
				2016	}
				2017	if (cut == kNoCutIndex) cut = start_index;
				2018	CutOutRange(
				2019	masm, ranges, start_index, end_index, cut, even_label, odd_label);
				2020	DCHECK_GE(end_index - start_index, 2);
				2021	GenerateBranches(masm,
				2022	ranges,
				2023	start_index + 1,
				2024	end_index - 1,
				2025	min_char,
				2026	max_char,
				2027	fall_through,
				2028	even_label,
				2029	odd_label);
				2030	return;
				2031	}
				2032
				2033	// If there are a lot of intervals in the regexp, then we will use tables to
				2034	// determine whether the character is inside or outside the character class.
				2035	static const int kBits = RegExpMacroAssembler::kTableSizeBits;
				2036
				2037	if ((max_char >> kBits) == (min_char >> kBits)) {
				2038	EmitUseLookupTable(masm,
				2039	ranges,
				2040	start_index,
				2041	end_index,
				2042	min_char,
				2043	fall_through,
				2044	even_label,
				2045	odd_label);
				2046	return;
				2047	}
				2048
				2049	if ((min_char >> kBits) != (first >> kBits)) {
				2050	masm->CheckCharacterLT(first, odd_label);
				2051	GenerateBranches(masm,
				2052	ranges,
				2053	start_index + 1,
				2054	end_index,
				2055	first,
				2056	max_char,
				2057	fall_through,
				2058	odd_label,
				2059	even_label);
				2060	return;
				2061	}
				2062
				2063	int new_start_index = 0;
				2064	int new_end_index = 0;
				2065	int border = 0;
				2066
				2067	SplitSearchSpace(ranges,
				2068	start_index,
				2069	end_index,
				2070	&new_start_index,
				2071	&new_end_index,
				2072	&border);
				2073
				2074	Label handle_rest;
				2075	Label* above = &handle_rest;
				2076	if (border == last + 1) {
				2077	// We didn't find any section that started after the limit, so everything
				2078	// above the border is one of the terminal labels.
				2079	above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
				2080	DCHECK(new_end_index == end_index - 1);
				2081	}
				2082
				2083	DCHECK_LE(start_index, new_end_index);
				2084	DCHECK_LE(new_start_index, end_index);
				2085	DCHECK_LT(start_index, new_start_index);
				2086	DCHECK_LT(new_end_index, end_index);
				2087	DCHECK(new_end_index + 1 == new_start_index \|\|
				2088	(new_end_index + 2 == new_start_index &&
				2089	border == ranges->at(new_end_index + 1)));
				2090	DCHECK_LT(min_char, border - 1);
				2091	DCHECK_LT(border, max_char);
				2092	DCHECK_LT(ranges->at(new_end_index), border);
				2093	DCHECK(border < ranges->at(new_start_index) \|\|
				2094	(border == ranges->at(new_start_index) &&
				2095	new_start_index == end_index &&
				2096	new_end_index == end_index - 1 &&
				2097	border == last + 1));
				2098	DCHECK(new_start_index == 0 \|\| border >= ranges->at(new_start_index - 1));
				2099
				2100	masm->CheckCharacterGT(border - 1, above);
				2101	Label dummy;
				2102	GenerateBranches(masm,
				2103	ranges,
				2104	start_index,
				2105	new_end_index,
				2106	min_char,
				2107	border - 1,
				2108	&dummy,
				2109	even_label,
				2110	odd_label);
				2111	if (handle_rest.is_linked()) {
				2112	masm->Bind(&handle_rest);
				2113	bool flip = (new_start_index & 1) != (start_index & 1);
				2114	GenerateBranches(masm,
				2115	ranges,
				2116	new_start_index,
				2117	end_index,
				2118	border,
				2119	max_char,
				2120	&dummy,
				2121	flip ? odd_label : even_label,
				2122	flip ? even_label : odd_label);
				2123	}
				2124	}
				2125
				2126
				2127	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				2128	RegExpCharacterClass* cc, bool one_byte,
				2129	Label* on_failure, int cp_offset, bool check_offset,
				2130	bool preloaded, Zone* zone) {
				2131	ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2132	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2133
				2134	int max_char;
				2135	if (one_byte) {
				2136	max_char = String::kMaxOneByteCharCode;
				2137	} else {
				2138	max_char = String::kMaxUtf16CodeUnit;
				2139	}
				2140
				2141	int range_count = ranges->length();
				2142
				2143	int last_valid_range = range_count - 1;
				2144	while (last_valid_range >= 0) {
				2145	CharacterRange& range = ranges->at(last_valid_range);
				2146	if (range.from() <= max_char) {
				2147	break;
				2148	}
				2149	last_valid_range--;
				2150	}
				2151
				2152	if (last_valid_range < 0) {
				2153	if (!cc->is_negated()) {
				2154	macro_assembler->GoTo(on_failure);
				2155	}
				2156	if (check_offset) {
				2157	macro_assembler->CheckPosition(cp_offset, on_failure);
				2158	}
				2159	return;
				2160	}
				2161
				2162	if (last_valid_range == 0 &&
				2163	ranges->at(0).IsEverything(max_char)) {
				2164	if (cc->is_negated()) {
				2165	macro_assembler->GoTo(on_failure);
				2166	} else {
				2167	// This is a common case hit by non-anchored expressions.
				2168	if (check_offset) {
				2169	macro_assembler->CheckPosition(cp_offset, on_failure);
				2170	}
				2171	}
				2172	return;
				2173	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2174
				2175	if (!preloaded) {
				2176	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				2177	}
				2178
				2179	if (cc->is_standard(zone) &&
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2180	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				2181	on_failure)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2182	return;
				2183	}
				2184
				2185
				2186	// A new list with ascending entries. Each entry is a code unit
				2187	// where there is a boundary between code units that are part of
				2188	// the class and code units that are not. Normally we insert an
				2189	// entry at zero which goes to the failure label, but if there
				2190	// was already one there we fall through for success on that entry.
				2191	// Subsequent entries have alternating meaning (success/failure).
				2192	ZoneList<int>* range_boundaries =
				2193	new(zone) ZoneList<int>(last_valid_range, zone);
				2194
				2195	bool zeroth_entry_is_failure = !cc->is_negated();
				2196
				2197	for (int i = 0; i <= last_valid_range; i++) {
				2198	CharacterRange& range = ranges->at(i);
				2199	if (range.from() == 0) {
				2200	DCHECK_EQ(i, 0);
				2201	zeroth_entry_is_failure = !zeroth_entry_is_failure;
				2202	} else {
				2203	range_boundaries->Add(range.from(), zone);
				2204	}
				2205	range_boundaries->Add(range.to() + 1, zone);
				2206	}
				2207	int end_index = range_boundaries->length() - 1;
				2208	if (range_boundaries->at(end_index) > max_char) {
				2209	end_index--;
				2210	}
				2211
				2212	Label fall_through;
				2213	GenerateBranches(macro_assembler,
				2214	range_boundaries,
				2215	0, // start_index.
				2216	end_index,
				2217	0, // min_char.
				2218	max_char,
				2219	&fall_through,
				2220	zeroth_entry_is_failure ? &fall_through : on_failure,
				2221	zeroth_entry_is_failure ? on_failure : &fall_through);
				2222	macro_assembler->Bind(&fall_through);
				2223	}
				2224
				2225
				2226	RegExpNode::~RegExpNode() {
				2227	}
				2228
				2229
				2230	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				2231	Trace* trace) {
				2232	// If we are generating a greedy loop then don't stop and don't reuse code.
				2233	if (trace->stop_node() != NULL) {
				2234	return CONTINUE;
				2235	}
				2236
				2237	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2238	if (trace->is_trivial()) {
				2239	if (label_.is_bound() \|\| on_work_list() \|\| !KeepRecursing(compiler)) {
				2240	// If a generic version is already scheduled to be generated or we have
				2241	// recursed too deeply then just generate a jump to that code.
				2242	macro_assembler->GoTo(&label_);
				2243	// This will queue it up for generation of a generic version if it hasn't
				2244	// already been queued.
				2245	compiler->AddWork(this);
				2246	return DONE;
				2247	}
				2248	// Generate generic version of the node and bind the label for later use.
				2249	macro_assembler->Bind(&label_);
				2250	return CONTINUE;
				2251	}
				2252
				2253	// We are being asked to make a non-generic version. Keep track of how many
				2254	// non-generic versions we generate so as not to overdo it.
				2255	trace_count_++;
				2256	if (KeepRecursing(compiler) && compiler->optimize() &&
				2257	trace_count_ < kMaxCopiesCodeGenerated) {
				2258	return CONTINUE;
				2259	}
				2260
				2261	// If we get here code has been generated for this node too many times or
				2262	// recursion is too deep. Time to switch to a generic version. The code for
				2263	// generic versions above can handle deep recursion properly.
				2264	bool was_limiting = compiler->limiting_recursion();
				2265	compiler->set_limiting_recursion(true);
				2266	trace->Flush(compiler, this);
				2267	compiler->set_limiting_recursion(was_limiting);
				2268	return DONE;
				2269	}
				2270
				2271
				2272	bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
				2273	return !compiler->limiting_recursion() &&
				2274	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
				2275	}
				2276
				2277
				2278	int ActionNode::EatsAtLeast(int still_to_find,
				2279	int budget,
				2280	bool not_at_start) {
				2281	if (budget <= 0) return 0;
				2282	if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				2283	return on_success()->EatsAtLeast(still_to_find,
				2284	budget - 1,
				2285	not_at_start);
				2286	}
				2287
				2288
				2289	void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2290	BoyerMooreLookahead* bm, bool not_at_start) {
				2291	if (action_type_ == BEGIN_SUBMATCH) {
				2292	bm->SetRest(offset);
				2293	} else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
				2294	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2295	}
				2296	SaveBMInfo(bm, not_at_start, offset);
				2297	}
				2298
				2299
				2300	int AssertionNode::EatsAtLeast(int still_to_find,
				2301	int budget,
				2302	bool not_at_start) {
				2303	if (budget <= 0) return 0;
				2304	// If we know we are not at the start and we are asked "how many characters
				2305	// will you match if you succeed?" then we can answer anything since false
				2306	// implies false. So lets just return the max answer (still_to_find) since
				2307	// that won't prevent us from preloading a lot of characters for the other
				2308	// branches in the node graph.
				2309	if (assertion_type() == AT_START && not_at_start) return still_to_find;
				2310	return on_success()->EatsAtLeast(still_to_find,
				2311	budget - 1,
				2312	not_at_start);
				2313	}
				2314
				2315
				2316	void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2317	BoyerMooreLookahead* bm, bool not_at_start) {
				2318	// Match the behaviour of EatsAtLeast on this node.
				2319	if (assertion_type() == AT_START && not_at_start) return;
				2320	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2321	SaveBMInfo(bm, not_at_start, offset);
				2322	}
				2323
				2324
				2325	int BackReferenceNode::EatsAtLeast(int still_to_find,
				2326	int budget,
				2327	bool not_at_start) {
				2328	if (read_backward()) return 0;
				2329	if (budget <= 0) return 0;
				2330	return on_success()->EatsAtLeast(still_to_find,
				2331	budget - 1,
				2332	not_at_start);
				2333	}
				2334
				2335
				2336	int TextNode::EatsAtLeast(int still_to_find,
				2337	int budget,
				2338	bool not_at_start) {
				2339	if (read_backward()) return 0;
				2340	int answer = Length();
				2341	if (answer >= still_to_find) return answer;
				2342	if (budget <= 0) return answer;
				2343	// We are not at start after this node so we set the last argument to 'true'.
				2344	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				2345	budget - 1,
				2346	true);
				2347	}
				2348
				2349
				2350	int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
				2351	bool not_at_start) {
				2352	if (budget <= 0) return 0;
				2353	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2354	// afterwards.
				2355	RegExpNode* node = alternatives_->at(1).node();
				2356	return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
				2357	}
				2358
				2359
				2360	void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
				2361	QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
				2362	bool not_at_start) {
				2363	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2364	// afterwards.
				2365	RegExpNode* node = alternatives_->at(1).node();
				2366	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				2367	}
				2368
				2369
				2370	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				2371	int budget,
				2372	RegExpNode* ignore_this_node,
				2373	bool not_at_start) {
				2374	if (budget <= 0) return 0;
				2375	int min = 100;
				2376	int choice_count = alternatives_->length();
				2377	budget = (budget - 1) / choice_count;
				2378	for (int i = 0; i < choice_count; i++) {
				2379	RegExpNode* node = alternatives_->at(i).node();
				2380	if (node == ignore_this_node) continue;
				2381	int node_eats_at_least =
				2382	node->EatsAtLeast(still_to_find, budget, not_at_start);
				2383	if (node_eats_at_least < min) min = node_eats_at_least;
				2384	if (min == 0) return 0;
				2385	}
				2386	return min;
				2387	}
				2388
				2389
				2390	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				2391	int budget,
				2392	bool not_at_start) {
				2393	return EatsAtLeastHelper(still_to_find,
				2394	budget - 1,
				2395	loop_node_,
				2396	not_at_start);
				2397	}
				2398
				2399
				2400	int ChoiceNode::EatsAtLeast(int still_to_find,
				2401	int budget,
				2402	bool not_at_start) {
				2403	return EatsAtLeastHelper(still_to_find,
				2404	budget,
				2405	NULL,
				2406	not_at_start);
				2407	}
				2408
				2409
				2410	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				2411	static inline uint32_t SmearBitsRight(uint32_t v) {
				2412	v \|= v >> 1;
				2413	v \|= v >> 2;
				2414	v \|= v >> 4;
				2415	v \|= v >> 8;
				2416	v \|= v >> 16;
				2417	return v;
				2418	}
				2419
				2420
				2421	bool QuickCheckDetails::Rationalize(bool asc) {
				2422	bool found_useful_op = false;
				2423	uint32_t char_mask;
				2424	if (asc) {
				2425	char_mask = String::kMaxOneByteCharCode;
				2426	} else {
				2427	char_mask = String::kMaxUtf16CodeUnit;
				2428	}
				2429	mask_ = 0;
				2430	value_ = 0;
				2431	int char_shift = 0;
				2432	for (int i = 0; i < characters_; i++) {
				2433	Position* pos = &positions_[i];
				2434	if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
				2435	found_useful_op = true;
				2436	}
				2437	mask_ \|= (pos->mask & char_mask) << char_shift;
				2438	value_ \|= (pos->value & char_mask) << char_shift;
				2439	char_shift += asc ? 8 : 16;
				2440	}
				2441	return found_useful_op;
				2442	}
				2443
				2444
				2445	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				2446	Trace* bounds_check_trace,
				2447	Trace* trace,
				2448	bool preload_has_checked_bounds,
				2449	Label* on_possible_success,
				2450	QuickCheckDetails* details,
				2451	bool fall_through_on_failure) {
				2452	if (details->characters() == 0) return false;
				2453	GetQuickCheckDetails(
				2454	details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
				2455	if (details->cannot_match()) return false;
				2456	if (!details->Rationalize(compiler->one_byte())) return false;
				2457	DCHECK(details->characters() == 1 \|\|
				2458	compiler->macro_assembler()->CanReadUnaligned());
				2459	uint32_t mask = details->mask();
				2460	uint32_t value = details->value();
				2461
				2462	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2463
				2464	if (trace->characters_preloaded() != details->characters()) {
				2465	DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
				2466	// We are attempting to preload the minimum number of characters
				2467	// any choice would eat, so if the bounds check fails, then none of the
				2468	// choices can succeed, so we can just immediately backtrack, rather
				2469	// than go to the next choice.
				2470	assembler->LoadCurrentCharacter(trace->cp_offset(),
				2471	bounds_check_trace->backtrack(),
				2472	!preload_has_checked_bounds,
				2473	details->characters());
				2474	}
				2475
				2476
				2477	bool need_mask = true;
				2478
				2479	if (details->characters() == 1) {
				2480	// If number of characters preloaded is 1 then we used a byte or 16 bit
				2481	// load so the value is already masked down.
				2482	uint32_t char_mask;
				2483	if (compiler->one_byte()) {
				2484	char_mask = String::kMaxOneByteCharCode;
				2485	} else {
				2486	char_mask = String::kMaxUtf16CodeUnit;
				2487	}
				2488	if ((mask & char_mask) == char_mask) need_mask = false;
				2489	mask &= char_mask;
				2490	} else {
				2491	// For 2-character preloads in one-byte mode or 1-character preloads in
				2492	// two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2493	static const uint32_t kTwoByteMask = 0xffff;
				2494	static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2495	if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2496	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2497	} else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2498	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2499	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2500	if (mask == kFourByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2501	}
				2502	}
				2503
				2504	if (fall_through_on_failure) {
				2505	if (need_mask) {
				2506	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				2507	} else {
				2508	assembler->CheckCharacter(value, on_possible_success);
				2509	}
				2510	} else {
				2511	if (need_mask) {
				2512	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				2513	} else {
				2514	assembler->CheckNotCharacter(value, trace->backtrack());
				2515	}
				2516	}
				2517	return true;
				2518	}
				2519
				2520
				2521	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				2522	// super-class in the .h file).
				2523	//
				2524	// We iterate along the text object, building up for each character a
				2525	// mask and value that can be used to test for a quick failure to match.
				2526	// The masks and values for the positions will be combined into a single
				2527	// machine word for the current character width in order to be used in
				2528	// generating a quick check.
				2529	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2530	RegExpCompiler* compiler,
				2531	int characters_filled_in,
				2532	bool not_at_start) {
				2533	// Do not collect any quick check details if the text node reads backward,
				2534	// since it reads in the opposite direction than we use for quick checks.
				2535	if (read_backward()) return;
				2536	Isolate* isolate = compiler->macro_assembler()->isolate();
				2537	DCHECK(characters_filled_in < details->characters());
				2538	int characters = details->characters();
				2539	int char_mask;
				2540	if (compiler->one_byte()) {
				2541	char_mask = String::kMaxOneByteCharCode;
				2542	} else {
				2543	char_mask = String::kMaxUtf16CodeUnit;
				2544	}
				2545	for (int k = 0; k < elements()->length(); k++) {
				2546	TextElement elm = elements()->at(k);
				2547	if (elm.text_type() == TextElement::ATOM) {
				2548	Vector<const uc16> quarks = elm.atom()->data();
				2549	for (int i = 0; i < characters && i < quarks.length(); i++) {
				2550	QuickCheckDetails::Position* pos =
				2551	details->positions(characters_filled_in);
				2552	uc16 c = quarks[i];
				2553	if (compiler->ignore_case()) {
				2554	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				2555	int length = GetCaseIndependentLetters(isolate, c,
				2556	compiler->one_byte(), chars);
				2557	if (length == 0) {
				2558	// This can happen because all case variants are non-Latin1, but we
				2559	// know the input is Latin1.
				2560	details->set_cannot_match();
				2561	pos->determines_perfectly = false;
				2562	return;
				2563	}
				2564	if (length == 1) {
				2565	// This letter has no case equivalents, so it's nice and simple
				2566	// and the mask-compare will determine definitely whether we have
				2567	// a match at this character position.
				2568	pos->mask = char_mask;
				2569	pos->value = c;
				2570	pos->determines_perfectly = true;
				2571	} else {
				2572	uint32_t common_bits = char_mask;
				2573	uint32_t bits = chars[0];
				2574	for (int j = 1; j < length; j++) {
				2575	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				2576	common_bits ^= differing_bits;
				2577	bits &= common_bits;
				2578	}
				2579	// If length is 2 and common bits has only one zero in it then
				2580	// our mask and compare instruction will determine definitely
				2581	// whether we have a match at this character position. Otherwise
				2582	// it can only be an approximate check.
				2583	uint32_t one_zero = (common_bits \| ~char_mask);
				2584	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				2585	pos->determines_perfectly = true;
				2586	}
				2587	pos->mask = common_bits;
				2588	pos->value = bits;
				2589	}
				2590	} else {
				2591	// Don't ignore case. Nice simple case where the mask-compare will
				2592	// determine definitely whether we have a match at this character
				2593	// position.
				2594	if (c > char_mask) {
				2595	details->set_cannot_match();
				2596	pos->determines_perfectly = false;
				2597	return;
				2598	}
				2599	pos->mask = char_mask;
				2600	pos->value = c;
				2601	pos->determines_perfectly = true;
				2602	}
				2603	characters_filled_in++;
				2604	DCHECK(characters_filled_in <= details->characters());
				2605	if (characters_filled_in == details->characters()) {
				2606	return;
				2607	}
				2608	}
				2609	} else {
				2610	QuickCheckDetails::Position* pos =
				2611	details->positions(characters_filled_in);
				2612	RegExpCharacterClass* tree = elm.char_class();
				2613	ZoneList<CharacterRange>* ranges = tree->ranges(zone());
				2614	if (tree->is_negated()) {
				2615	// A quick check uses multi-character mask and compare. There is no
				2616	// useful way to incorporate a negative char class into this scheme
				2617	// so we just conservatively create a mask and value that will always
				2618	// succeed.
				2619	pos->mask = 0;
				2620	pos->value = 0;
				2621	} else {
				2622	int first_range = 0;
				2623	while (ranges->at(first_range).from() > char_mask) {
				2624	first_range++;
				2625	if (first_range == ranges->length()) {
				2626	details->set_cannot_match();
				2627	pos->determines_perfectly = false;
				2628	return;
				2629	}
				2630	}
				2631	CharacterRange range = ranges->at(first_range);
				2632	uc16 from = range.from();
				2633	uc16 to = range.to();
				2634	if (to > char_mask) {
				2635	to = char_mask;
				2636	}
				2637	uint32_t differing_bits = (from ^ to);
				2638	// A mask and compare is only perfect if the differing bits form a
				2639	// number like 00011111 with one single block of trailing 1s.
				2640	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2641	from + differing_bits == to) {
				2642	pos->determines_perfectly = true;
				2643	}
				2644	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2645	uint32_t bits = (from & common_bits);
				2646	for (int i = first_range + 1; i < ranges->length(); i++) {
				2647	CharacterRange range = ranges->at(i);
				2648	uc16 from = range.from();
				2649	uc16 to = range.to();
				2650	if (from > char_mask) continue;
				2651	if (to > char_mask) to = char_mask;
				2652	// Here we are combining more ranges into the mask and compare
				2653	// value. With each new range the mask becomes more sparse and
				2654	// so the chances of a false positive rise. A character class
				2655	// with multiple ranges is assumed never to be equivalent to a
				2656	// mask and compare operation.
				2657	pos->determines_perfectly = false;
				2658	uint32_t new_common_bits = (from ^ to);
				2659	new_common_bits = ~SmearBitsRight(new_common_bits);
				2660	common_bits &= new_common_bits;
				2661	bits &= new_common_bits;
				2662	uint32_t differing_bits = (from & common_bits) ^ bits;
				2663	common_bits ^= differing_bits;
				2664	bits &= common_bits;
				2665	}
				2666	pos->mask = common_bits;
				2667	pos->value = bits;
				2668	}
				2669	characters_filled_in++;
				2670	DCHECK(characters_filled_in <= details->characters());
				2671	if (characters_filled_in == details->characters()) {
				2672	return;
				2673	}
				2674	}
				2675	}
				2676	DCHECK(characters_filled_in != details->characters());
				2677	if (!details->cannot_match()) {
				2678	on_success()-> GetQuickCheckDetails(details,
				2679	compiler,
				2680	characters_filled_in,
				2681	true);
				2682	}
				2683	}
				2684
				2685
				2686	void QuickCheckDetails::Clear() {
				2687	for (int i = 0; i < characters_; i++) {
				2688	positions_[i].mask = 0;
				2689	positions_[i].value = 0;
				2690	positions_[i].determines_perfectly = false;
				2691	}
				2692	characters_ = 0;
				2693	}
				2694
				2695
				2696	void QuickCheckDetails::Advance(int by, bool one_byte) {
				2697	if (by >= characters_ \|\| by < 0) {
				2698	DCHECK_IMPLIES(by < 0, characters_ == 0);
				2699	Clear();
				2700	return;
				2701	}
				2702	DCHECK_LE(characters_ - by, 4);
				2703	DCHECK_LE(characters_, 4);
				2704	for (int i = 0; i < characters_ - by; i++) {
				2705	positions_[i] = positions_[by + i];
				2706	}
				2707	for (int i = characters_ - by; i < characters_; i++) {
				2708	positions_[i].mask = 0;
				2709	positions_[i].value = 0;
				2710	positions_[i].determines_perfectly = false;
				2711	}
				2712	characters_ -= by;
				2713	// We could change mask_ and value_ here but we would never advance unless
				2714	// they had already been used in a check and they won't be used again because
				2715	// it would gain us nothing. So there's no point.
				2716	}
				2717
				2718
				2719	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2720	DCHECK(characters_ == other->characters_);
				2721	if (other->cannot_match_) {
				2722	return;
				2723	}
				2724	if (cannot_match_) {
				2725	this = other;
				2726	return;
				2727	}
				2728	for (int i = from_index; i < characters_; i++) {
				2729	QuickCheckDetails::Position* pos = positions(i);
				2730	QuickCheckDetails::Position* other_pos = other->positions(i);
				2731	if (pos->mask != other_pos->mask \|\|
				2732	pos->value != other_pos->value \|\|
				2733	!other_pos->determines_perfectly) {
				2734	// Our mask-compare operation will be approximate unless we have the
				2735	// exact same operation on both sides of the alternation.
				2736	pos->determines_perfectly = false;
				2737	}
				2738	pos->mask &= other_pos->mask;
				2739	pos->value &= pos->mask;
				2740	other_pos->value &= pos->mask;
				2741	uc16 differing_bits = (pos->value ^ other_pos->value);
				2742	pos->mask &= ~differing_bits;
				2743	pos->value &= pos->mask;
				2744	}
				2745	}
				2746
				2747
				2748	class VisitMarker {
				2749	public:
				2750	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2751	DCHECK(!info->visited);
				2752	info->visited = true;
				2753	}
				2754	~VisitMarker() {
				2755	info_->visited = false;
				2756	}
				2757	private:
				2758	NodeInfo* info_;
				2759	};
				2760
				2761
				2762	RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
				2763	if (info()->replacement_calculated) return replacement();
				2764	if (depth < 0) return this;
				2765	DCHECK(!info()->visited);
				2766	VisitMarker marker(info());
				2767	return FilterSuccessor(depth - 1, ignore_case);
				2768	}
				2769
				2770
				2771	RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
				2772	RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
				2773	if (next == NULL) return set_replacement(NULL);
				2774	on_success_ = next;
				2775	return set_replacement(this);
				2776	}
				2777
				2778
				2779	// We need to check for the following characters: 0x39c 0x3bc 0x178.
				2780	static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
				2781	// TODO(dcarney): this could be a lot more efficient.
				2782	return range.Contains(0x39c) \|\|
				2783	range.Contains(0x3bc) \|\| range.Contains(0x178);
				2784	}
				2785
				2786
				2787	static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
				2788	for (int i = 0; i < ranges->length(); i++) {
				2789	// TODO(dcarney): this could be a lot more efficient.
				2790	if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
				2791	}
				2792	return false;
				2793	}
				2794
				2795
				2796	RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
				2797	if (info()->replacement_calculated) return replacement();
				2798	if (depth < 0) return this;
				2799	DCHECK(!info()->visited);
				2800	VisitMarker marker(info());
				2801	int element_count = elements()->length();
				2802	for (int i = 0; i < element_count; i++) {
				2803	TextElement elm = elements()->at(i);
				2804	if (elm.text_type() == TextElement::ATOM) {
				2805	Vector<const uc16> quarks = elm.atom()->data();
				2806	for (int j = 0; j < quarks.length(); j++) {
				2807	uint16_t c = quarks[j];
				2808	if (c <= String::kMaxOneByteCharCode) continue;
				2809	if (!ignore_case) return set_replacement(NULL);
				2810	// Here, we need to check for characters whose upper and lower cases
				2811	// are outside the Latin-1 range.
				2812	uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
				2813	// Character is outside Latin-1 completely
				2814	if (converted == 0) return set_replacement(NULL);
				2815	// Convert quark to Latin-1 in place.
				2816	uint16_t* copy = const_cast<uint16_t*>(quarks.start());
				2817	copy[j] = converted;
				2818	}
				2819	} else {
				2820	DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
				2821	RegExpCharacterClass* cc = elm.char_class();
				2822	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	2823	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2824	// Now they are in order so we only need to look at the first.
				2825	int range_count = ranges->length();
				2826	if (cc->is_negated()) {
				2827	if (range_count != 0 &&
				2828	ranges->at(0).from() == 0 &&
				2829	ranges->at(0).to() >= String::kMaxOneByteCharCode) {
				2830	// This will be handled in a later filter.
				2831	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2832	return set_replacement(NULL);
				2833	}
				2834	} else {
				2835	if (range_count == 0 \|\|
				2836	ranges->at(0).from() > String::kMaxOneByteCharCode) {
				2837	// This will be handled in a later filter.
				2838	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2839	return set_replacement(NULL);
				2840	}
				2841	}
				2842	}
				2843	}
				2844	return FilterSuccessor(depth - 1, ignore_case);
				2845	}
				2846
				2847
				2848	RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2849	if (info()->replacement_calculated) return replacement();
				2850	if (depth < 0) return this;
				2851	if (info()->visited) return this;
				2852	{
				2853	VisitMarker marker(info());
				2854
				2855	RegExpNode* continue_replacement =
				2856	continue_node_->FilterOneByte(depth - 1, ignore_case);
				2857	// If we can't continue after the loop then there is no sense in doing the
				2858	// loop.
				2859	if (continue_replacement == NULL) return set_replacement(NULL);
				2860	}
				2861
				2862	return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
				2863	}
				2864
				2865
				2866	RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2867	if (info()->replacement_calculated) return replacement();
				2868	if (depth < 0) return this;
				2869	if (info()->visited) return this;
				2870	VisitMarker marker(info());
				2871	int choice_count = alternatives_->length();
				2872
				2873	for (int i = 0; i < choice_count; i++) {
				2874	GuardedAlternative alternative = alternatives_->at(i);
				2875	if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
				2876	set_replacement(this);
				2877	return this;
				2878	}
				2879	}
				2880
				2881	int surviving = 0;
				2882	RegExpNode* survivor = NULL;
				2883	for (int i = 0; i < choice_count; i++) {
				2884	GuardedAlternative alternative = alternatives_->at(i);
				2885	RegExpNode* replacement =
				2886	alternative.node()->FilterOneByte(depth - 1, ignore_case);
				2887	DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
				2888	if (replacement != NULL) {
				2889	alternatives_->at(i).set_node(replacement);
				2890	surviving++;
				2891	survivor = replacement;
				2892	}
				2893	}
				2894	if (surviving < 2) return set_replacement(survivor);
				2895
				2896	set_replacement(this);
				2897	if (surviving == choice_count) {
				2898	return this;
				2899	}
				2900	// Only some of the nodes survived the filtering. We need to rebuild the
				2901	// alternatives list.
				2902	ZoneList<GuardedAlternative>* new_alternatives =
				2903	new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
				2904	for (int i = 0; i < choice_count; i++) {
				2905	RegExpNode* replacement =
				2906	alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
				2907	if (replacement != NULL) {
				2908	alternatives_->at(i).set_node(replacement);
				2909	new_alternatives->Add(alternatives_->at(i), zone());
				2910	}
				2911	}
				2912	alternatives_ = new_alternatives;
				2913	return this;
				2914	}
				2915
				2916
				2917	RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
				2918	bool ignore_case) {
				2919	if (info()->replacement_calculated) return replacement();
				2920	if (depth < 0) return this;
				2921	if (info()->visited) return this;
				2922	VisitMarker marker(info());
				2923	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2924	// afterwards.
				2925	RegExpNode* node = alternatives_->at(1).node();
				2926	RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
				2927	if (replacement == NULL) return set_replacement(NULL);
				2928	alternatives_->at(1).set_node(replacement);
				2929
				2930	RegExpNode* neg_node = alternatives_->at(0).node();
				2931	RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
				2932	// If the negative lookahead is always going to fail then
				2933	// we don't need to check it.
				2934	if (neg_replacement == NULL) return set_replacement(replacement);
				2935	alternatives_->at(0).set_node(neg_replacement);
				2936	return set_replacement(this);
				2937	}
				2938
				2939
				2940	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2941	RegExpCompiler* compiler,
				2942	int characters_filled_in,
				2943	bool not_at_start) {
				2944	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2945	VisitMarker marker(info());
				2946	return ChoiceNode::GetQuickCheckDetails(details,
				2947	compiler,
				2948	characters_filled_in,
				2949	not_at_start);
				2950	}
				2951
				2952
				2953	void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2954	BoyerMooreLookahead* bm, bool not_at_start) {
				2955	if (body_can_be_zero_length_ \|\| budget <= 0) {
				2956	bm->SetRest(offset);
				2957	SaveBMInfo(bm, not_at_start, offset);
				2958	return;
				2959	}
				2960	ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2961	SaveBMInfo(bm, not_at_start, offset);
				2962	}
				2963
				2964
				2965	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2966	RegExpCompiler* compiler,
				2967	int characters_filled_in,
				2968	bool not_at_start) {
				2969	not_at_start = (not_at_start \|\| not_at_start_);
				2970	int choice_count = alternatives_->length();
				2971	DCHECK(choice_count > 0);
				2972	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2973	compiler,
				2974	characters_filled_in,
				2975	not_at_start);
				2976	for (int i = 1; i < choice_count; i++) {
				2977	QuickCheckDetails new_details(details->characters());
				2978	RegExpNode* node = alternatives_->at(i).node();
				2979	node->GetQuickCheckDetails(&new_details, compiler,
				2980	characters_filled_in,
				2981	not_at_start);
				2982	// Here we merge the quick match details of the two branches.
				2983	details->Merge(&new_details, characters_filled_in);
				2984	}
				2985	}
				2986
				2987
				2988	// Check for [0-9A-Z_a-z].
				2989	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2990	Label* word,
				2991	Label* non_word,
				2992	bool fall_through_on_word) {
				2993	if (assembler->CheckSpecialCharacterClass(
				2994	fall_through_on_word ? 'w' : 'W',
				2995	fall_through_on_word ? non_word : word)) {
				2996	// Optimized implementation available.
				2997	return;
				2998	}
				2999	assembler->CheckCharacterGT('z', non_word);
				3000	assembler->CheckCharacterLT('0', non_word);
				3001	assembler->CheckCharacterGT('a' - 1, word);
				3002	assembler->CheckCharacterLT('9' + 1, word);
				3003	assembler->CheckCharacterLT('A', non_word);
				3004	assembler->CheckCharacterLT('Z' + 1, word);
				3005	if (fall_through_on_word) {
				3006	assembler->CheckNotCharacter('_', non_word);
				3007	} else {
				3008	assembler->CheckCharacter('_', word);
				3009	}
				3010	}
				3011
				3012
				3013	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				3014	// that matches newline or the start of input).
				3015	static void EmitHat(RegExpCompiler* compiler,
				3016	RegExpNode* on_success,
				3017	Trace* trace) {
				3018	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3019	// We will be loading the previous character into the current character
				3020	// register.
				3021	Trace new_trace(*trace);
				3022	new_trace.InvalidateCurrentCharacter();
				3023
				3024	Label ok;
				3025	if (new_trace.cp_offset() == 0) {
				3026	// The start of input counts as a newline in this context, so skip to
				3027	// ok if we are at the start.
				3028	assembler->CheckAtStart(&ok);
				3029	}
				3030	// We already checked that we are not at the start of input so it must be
				3031	// OK to load the previous character.
				3032	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				3033	new_trace.backtrack(),
				3034	false);
				3035	if (!assembler->CheckSpecialCharacterClass('n',
				3036	new_trace.backtrack())) {
				3037	// Newline means \n, \r, 0x2028 or 0x2029.
				3038	if (!compiler->one_byte()) {
				3039	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				3040	}
				3041	assembler->CheckCharacter('\n', &ok);
				3042	assembler->CheckNotCharacter('\r', new_trace.backtrack());
				3043	}
				3044	assembler->Bind(&ok);
				3045	on_success->Emit(compiler, &new_trace);
				3046	}
				3047
				3048
				3049	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				3050	void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
				3051	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3052	Isolate* isolate = assembler->isolate();
				3053	Trace::TriBool next_is_word_character = Trace::UNKNOWN;
				3054	bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
				3055	BoyerMooreLookahead* lookahead = bm_info(not_at_start);
				3056	if (lookahead == NULL) {
				3057	int eats_at_least =
				3058	Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
				3059	kRecursionBudget,
				3060	not_at_start));
				3061	if (eats_at_least >= 1) {
				3062	BoyerMooreLookahead* bm =
				3063	new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
				3064	FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
				3065	if (bm->at(0)->is_non_word())
				3066	next_is_word_character = Trace::FALSE_VALUE;
				3067	if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
				3068	}
				3069	} else {
				3070	if (lookahead->at(0)->is_non_word())
				3071	next_is_word_character = Trace::FALSE_VALUE;
				3072	if (lookahead->at(0)->is_word())
				3073	next_is_word_character = Trace::TRUE_VALUE;
				3074	}
				3075	bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
				3076	if (next_is_word_character == Trace::UNKNOWN) {
				3077	Label before_non_word;
				3078	Label before_word;
				3079	if (trace->characters_preloaded() != 1) {
				3080	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				3081	}
				3082	// Fall through on non-word.
				3083	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				3084	// Next character is not a word character.
				3085	assembler->Bind(&before_non_word);
				3086	Label ok;
				3087	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3088	assembler->GoTo(&ok);
				3089
				3090	assembler->Bind(&before_word);
				3091	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3092	assembler->Bind(&ok);
				3093	} else if (next_is_word_character == Trace::TRUE_VALUE) {
				3094	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3095	} else {
				3096	DCHECK(next_is_word_character == Trace::FALSE_VALUE);
				3097	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3098	}
				3099	}
				3100
				3101
				3102	void AssertionNode::BacktrackIfPrevious(
				3103	RegExpCompiler* compiler,
				3104	Trace* trace,
				3105	AssertionNode::IfPrevious backtrack_if_previous) {
				3106	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3107	Trace new_trace(*trace);
				3108	new_trace.InvalidateCurrentCharacter();
				3109
				3110	Label fall_through, dummy;
				3111
				3112	Label* non_word = backtrack_if_previous == kIsNonWord ?
				3113	new_trace.backtrack() :
				3114	&fall_through;
				3115	Label* word = backtrack_if_previous == kIsNonWord ?
				3116	&fall_through :
				3117	new_trace.backtrack();
				3118
				3119	if (new_trace.cp_offset() == 0) {
				3120	// The start of input counts as a non-word character, so the question is
				3121	// decided if we are at the start.
				3122	assembler->CheckAtStart(non_word);
				3123	}
				3124	// We already checked that we are not at the start of input so it must be
				3125	// OK to load the previous character.
				3126	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
				3127	EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
				3128
				3129	assembler->Bind(&fall_through);
				3130	on_success()->Emit(compiler, &new_trace);
				3131	}
				3132
				3133
				3134	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				3135	RegExpCompiler* compiler,
				3136	int filled_in,
				3137	bool not_at_start) {
				3138	if (assertion_type_ == AT_START && not_at_start) {
				3139	details->set_cannot_match();
				3140	return;
				3141	}
				3142	return on_success()->GetQuickCheckDetails(details,
				3143	compiler,
				3144	filled_in,
				3145	not_at_start);
				3146	}
				3147
				3148
				3149	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3150	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3151	switch (assertion_type_) {
				3152	case AT_END: {
				3153	Label ok;
				3154	assembler->CheckPosition(trace->cp_offset(), &ok);
				3155	assembler->GoTo(trace->backtrack());
				3156	assembler->Bind(&ok);
				3157	break;
				3158	}
				3159	case AT_START: {
				3160	if (trace->at_start() == Trace::FALSE_VALUE) {
				3161	assembler->GoTo(trace->backtrack());
				3162	return;
				3163	}
				3164	if (trace->at_start() == Trace::UNKNOWN) {
				3165	assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
				3166	Trace at_start_trace = *trace;
				3167	at_start_trace.set_at_start(Trace::TRUE_VALUE);
				3168	on_success()->Emit(compiler, &at_start_trace);
				3169	return;
				3170	}
				3171	}
				3172	break;
				3173	case AFTER_NEWLINE:
				3174	EmitHat(compiler, on_success(), trace);
				3175	return;
				3176	case AT_BOUNDARY:
				3177	case AT_NON_BOUNDARY: {
				3178	EmitBoundaryCheck(compiler, trace);
				3179	return;
				3180	}
				3181	}
				3182	on_success()->Emit(compiler, trace);
				3183	}
				3184
				3185
				3186	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				3187	if (quick_check == NULL) return false;
				3188	if (offset >= quick_check->characters()) return false;
				3189	return quick_check->positions(offset)->determines_perfectly;
				3190	}
				3191
				3192
				3193	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				3194	if (index > *checked_up_to) {
				3195	*checked_up_to = index;
				3196	}
				3197	}
				3198
				3199
				3200	// We call this repeatedly to generate code for each pass over the text node.
				3201	// The passes are in increasing order of difficulty because we hope one
				3202	// of the first passes will fail in which case we are saved the work of the
				3203	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				3204	// we will check the '%' in the first pass, the case independent 'a' in the
				3205	// second pass and the character class in the last pass.
				3206	//
				3207	// The passes are done from right to left, so for example to test for /bar/
				3208	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				3209	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				3210	// bounds check most of the time. In the example we only need to check for
				3211	// end-of-input when loading the putative 'r'.
				3212	//
				3213	// A slight complication involves the fact that the first character may already
				3214	// be fetched into a register by the previous node. In this case we want to
				3215	// do the test for that character first. We do this in separate passes. The
				3216	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				3217	// pass has been performed then subsequent passes will have true in
				3218	// first_element_checked to indicate that that character does not need to be
				3219	// checked again.
				3220	//
				3221	// In addition to all this we are passed a Trace, which can
				3222	// contain an AlternativeGeneration object. In this AlternativeGeneration
				3223	// object we can see details of any quick check that was already passed in
				3224	// order to get to the code we are now generating. The quick check can involve
				3225	// loading characters, which means we do not need to recheck the bounds
				3226	// up to the limit the quick check already checked. In addition the quick
				3227	// check can have involved a mask and compare operation which may simplify
				3228	// or obviate the need for further checks at some character positions.
				3229	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				3230	TextEmitPassType pass,
				3231	bool preloaded,
				3232	Trace* trace,
				3233	bool first_element_checked,
				3234	int* checked_up_to) {
				3235	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3236	Isolate* isolate = assembler->isolate();
				3237	bool one_byte = compiler->one_byte();
				3238	Label* backtrack = trace->backtrack();
				3239	QuickCheckDetails* quick_check = trace->quick_check_performed();
				3240	int element_count = elements()->length();
				3241	int backward_offset = read_backward() ? -Length() : 0;
				3242	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				3243	TextElement elm = elements()->at(i);
				3244	int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
				3245	if (elm.text_type() == TextElement::ATOM) {
				3246	Vector<const uc16> quarks = elm.atom()->data();
				3247	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				3248	if (first_element_checked && i == 0 && j == 0) continue;
				3249	if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
				3250	EmitCharacterFunction* emit_function = NULL;
				3251	switch (pass) {
				3252	case NON_LATIN1_MATCH:
				3253	DCHECK(one_byte);
				3254	if (quarks[j] > String::kMaxOneByteCharCode) {
				3255	assembler->GoTo(backtrack);
				3256	return;
				3257	}
				3258	break;
				3259	case NON_LETTER_CHARACTER_MATCH:
				3260	emit_function = &EmitAtomNonLetter;
				3261	break;
				3262	case SIMPLE_CHARACTER_MATCH:
				3263	emit_function = &EmitSimpleCharacter;
				3264	break;
				3265	case CASE_CHARACTER_MATCH:
				3266	emit_function = &EmitAtomLetter;
				3267	break;
				3268	default:
				3269	break;
				3270	}
				3271	if (emit_function != NULL) {
				3272	bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();
				3273	bool bound_checked =
				3274	emit_function(isolate, compiler, quarks[j], backtrack,
				3275	cp_offset + j, bounds_check, preloaded);
				3276	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				3277	}
				3278	}
				3279	} else {
				3280	DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
				3281	if (pass == CHARACTER_CLASS_MATCH) {
				3282	if (first_element_checked && i == 0) continue;
				3283	if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
				3284	RegExpCharacterClass* cc = elm.char_class();
				3285	bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();
				3286	EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
				3287	bounds_check, preloaded, zone());
				3288	UpdateBoundsCheck(cp_offset, checked_up_to);
				3289	}
				3290	}
				3291	}
				3292	}
				3293
				3294
				3295	int TextNode::Length() {
				3296	TextElement elm = elements()->last();
				3297	DCHECK(elm.cp_offset() >= 0);
				3298	return elm.cp_offset() + elm.length();
				3299	}
				3300
				3301
				3302	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				3303	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				3304	if (ignore_case) {
				3305	return pass == SIMPLE_CHARACTER_MATCH;
				3306	} else {
				3307	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				3308	}
				3309	}
				3310
				3311
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3312	TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
				3313	ZoneList<CharacterRange>* ranges,
				3314	bool read_backward,
				3315	RegExpNode* on_success) {
				3316	DCHECK_NOT_NULL(ranges);
				3317	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
				3318	elms->Add(
				3319	TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
				3320	zone);
				3321	return new (zone) TextNode(elms, read_backward, on_success);
				3322	}
				3323
				3324
				3325	TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
				3326	CharacterRange trail,
				3327	bool read_backward,
				3328	RegExpNode* on_success) {
				3329	ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
				3330	ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
				3331	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
				3332	elms->Add(TextElement::CharClass(
				3333	new (zone) RegExpCharacterClass(lead_ranges, false)),
				3334	zone);
				3335	elms->Add(TextElement::CharClass(
				3336	new (zone) RegExpCharacterClass(trail_ranges, false)),
				3337	zone);
				3338	return new (zone) TextNode(elms, read_backward, on_success);
				3339	}
				3340
				3341
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3342	// This generates the code to match a text node. A text node can contain
				3343	// straight character sequences (possibly to be matched in a case-independent
				3344	// way) and character classes. For efficiency we do not do this in a single
				3345	// pass from left to right. Instead we pass over the text node several times,
				3346	// emitting code for some character positions every time. See the comment on
				3347	// TextEmitPass for details.
				3348	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3349	LimitResult limit_result = LimitVersions(compiler, trace);
				3350	if (limit_result == DONE) return;
				3351	DCHECK(limit_result == CONTINUE);
				3352
				3353	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				3354	compiler->SetRegExpTooBig();
				3355	return;
				3356	}
				3357
				3358	if (compiler->one_byte()) {
				3359	int dummy = 0;
				3360	TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
				3361	}
				3362
				3363	bool first_elt_done = false;
				3364	int bound_checked_to = trace->cp_offset() - 1;
				3365	bound_checked_to += trace->bound_checked_up_to();
				3366
				3367	// If a character is preloaded into the current character register then
				3368	// check that now.
				3369	if (trace->characters_preloaded() == 1) {
				3370	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3371	if (!SkipPass(pass, compiler->ignore_case())) {
				3372	TextEmitPass(compiler,
				3373	static_cast<TextEmitPassType>(pass),
				3374	true,
				3375	trace,
				3376	false,
				3377	&bound_checked_to);
				3378	}
				3379	}
				3380	first_elt_done = true;
				3381	}
				3382
				3383	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3384	if (!SkipPass(pass, compiler->ignore_case())) {
				3385	TextEmitPass(compiler,
				3386	static_cast<TextEmitPassType>(pass),
				3387	false,
				3388	trace,
				3389	first_elt_done,
				3390	&bound_checked_to);
				3391	}
				3392	}
				3393
				3394	Trace successor_trace(*trace);
				3395	// If we advance backward, we may end up at the start.
				3396	successor_trace.AdvanceCurrentPositionInTrace(
				3397	read_backward() ? -Length() : Length(), compiler);
				3398	successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
				3399	: Trace::FALSE_VALUE);
				3400	RecursionCheck rc(compiler);
				3401	on_success()->Emit(compiler, &successor_trace);
				3402	}
				3403
				3404
				3405	void Trace::InvalidateCurrentCharacter() {
				3406	characters_preloaded_ = 0;
				3407	}
				3408
				3409
				3410	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				3411	// We don't have an instruction for shifting the current character register
				3412	// down or for using a shifted value for anything so lets just forget that
				3413	// we preloaded any characters into it.
				3414	characters_preloaded_ = 0;
				3415	// Adjust the offsets of the quick check performed information. This
				3416	// information is used to find out what we already determined about the
				3417	// characters by means of mask and compare.
				3418	quick_check_performed_.Advance(by, compiler->one_byte());
				3419	cp_offset_ += by;
				3420	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				3421	compiler->SetRegExpTooBig();
				3422	cp_offset_ = 0;
				3423	}
				3424	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				3425	}
				3426
				3427
				3428	void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
				3429	int element_count = elements()->length();
				3430	for (int i = 0; i < element_count; i++) {
				3431	TextElement elm = elements()->at(i);
				3432	if (elm.text_type() == TextElement::CHAR_CLASS) {
				3433	RegExpCharacterClass* cc = elm.char_class();
				3434	// None of the standard character classes is different in the case
				3435	// independent case and it slows us down if we don't know that.
				3436	if (cc->is_standard(zone())) continue;
				3437	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3438	CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3439	}
				3440	}
				3441	}
				3442
				3443
				3444	int TextNode::GreedyLoopTextLength() { return Length(); }
				3445
				3446
				3447	RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
				3448	RegExpCompiler* compiler) {
				3449	if (read_backward()) return NULL;
				3450	if (elements()->length() != 1) return NULL;
				3451	TextElement elm = elements()->at(0);
				3452	if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
				3453	RegExpCharacterClass* node = elm.char_class();
				3454	ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3455	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3456	if (node->is_negated()) {
				3457	return ranges->length() == 0 ? on_success() : NULL;
				3458	}
				3459	if (ranges->length() != 1) return NULL;
				3460	uint32_t max_char;
				3461	if (compiler->one_byte()) {
				3462	max_char = String::kMaxOneByteCharCode;
				3463	} else {
				3464	max_char = String::kMaxUtf16CodeUnit;
				3465	}
				3466	return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
				3467	}
				3468
				3469
				3470	// Finds the fixed match length of a sequence of nodes that goes from
				3471	// this alternative and back to this choice node. If there are variable
				3472	// length nodes or other complications in the way then return a sentinel
				3473	// value indicating that a greedy loop cannot be constructed.
				3474	int ChoiceNode::GreedyLoopTextLengthForAlternative(
				3475	GuardedAlternative* alternative) {
				3476	int length = 0;
				3477	RegExpNode* node = alternative->node();
				3478	// Later we will generate code for all these text nodes using recursion
				3479	// so we have to limit the max number.
				3480	int recursion_depth = 0;
				3481	while (node != this) {
				3482	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				3483	return kNodeIsTooComplexForGreedyLoops;
				3484	}
				3485	int node_length = node->GreedyLoopTextLength();
				3486	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				3487	return kNodeIsTooComplexForGreedyLoops;
				3488	}
				3489	length += node_length;
				3490	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				3491	node = seq_node->on_success();
				3492	}
				3493	return read_backward() ? -length : length;
				3494	}
				3495
				3496
				3497	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				3498	DCHECK_NULL(loop_node_);
				3499	AddAlternative(alt);
				3500	loop_node_ = alt.node();
				3501	}
				3502
				3503
				3504	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				3505	DCHECK_NULL(continue_node_);
				3506	AddAlternative(alt);
				3507	continue_node_ = alt.node();
				3508	}
				3509
				3510
				3511	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3512	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3513	if (trace->stop_node() == this) {
				3514	// Back edge of greedy optimized loop node graph.
				3515	int text_length =
				3516	GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
				3517	DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
				3518	// Update the counter-based backtracking info on the stack. This is an
				3519	// optimization for greedy loops (see below).
				3520	DCHECK(trace->cp_offset() == text_length);
				3521	macro_assembler->AdvanceCurrentPosition(text_length);
				3522	macro_assembler->GoTo(trace->loop_label());
				3523	return;
				3524	}
				3525	DCHECK_NULL(trace->stop_node());
				3526	if (!trace->is_trivial()) {
				3527	trace->Flush(compiler, this);
				3528	return;
				3529	}
				3530	ChoiceNode::Emit(compiler, trace);
				3531	}
				3532
				3533
				3534	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				3535	int eats_at_least) {
				3536	int preload_characters = Min(4, eats_at_least);
				3537	if (compiler->macro_assembler()->CanReadUnaligned()) {
				3538	bool one_byte = compiler->one_byte();
				3539	if (one_byte) {
				3540	if (preload_characters > 4) preload_characters = 4;
				3541	// We can't preload 3 characters because there is no machine instruction
				3542	// to do that. We can't just load 4 because we could be reading
				3543	// beyond the end of the string, which could cause a memory fault.
				3544	if (preload_characters == 3) preload_characters = 2;
				3545	} else {
				3546	if (preload_characters > 2) preload_characters = 2;
				3547	}
				3548	} else {
				3549	if (preload_characters > 1) preload_characters = 1;
				3550	}
				3551	return preload_characters;
				3552	}
				3553
				3554
				3555	// This class is used when generating the alternatives in a choice node. It
				3556	// records the way the alternative is being code generated.
				3557	class AlternativeGeneration: public Malloced {
				3558	public:
				3559	AlternativeGeneration()
				3560	: possible_success(),
				3561	expects_preload(false),
				3562	after(),
				3563	quick_check_details() { }
				3564	Label possible_success;
				3565	bool expects_preload;
				3566	Label after;
				3567	QuickCheckDetails quick_check_details;
				3568	};
				3569
				3570
				3571	// Creates a list of AlternativeGenerations. If the list has a reasonable
				3572	// size then it is on the stack, otherwise the excess is on the heap.
				3573	class AlternativeGenerationList {
				3574	public:
				3575	AlternativeGenerationList(int count, Zone* zone)
				3576	: alt_gens_(count, zone) {
				3577	for (int i = 0; i < count && i < kAFew; i++) {
				3578	alt_gens_.Add(a_few_alt_gens_ + i, zone);
				3579	}
				3580	for (int i = kAFew; i < count; i++) {
				3581	alt_gens_.Add(new AlternativeGeneration(), zone);
				3582	}
				3583	}
				3584	~AlternativeGenerationList() {
				3585	for (int i = kAFew; i < alt_gens_.length(); i++) {
				3586	delete alt_gens_[i];
				3587	alt_gens_[i] = NULL;
				3588	}
				3589	}
				3590
				3591	AlternativeGeneration* at(int i) {
				3592	return alt_gens_[i];
				3593	}
				3594
				3595	private:
				3596	static const int kAFew = 10;
				3597	ZoneList<AlternativeGeneration*> alt_gens_;
				3598	AlternativeGeneration a_few_alt_gens_[kAFew];
				3599	};
				3600
				3601
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3602	static const uc32 kRangeEndMarker = 0x110000;
				3603
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3604	// The '2' variant is has inclusive from and exclusive to.
				3605	// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
				3606	// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3607	static const int kSpaceRanges[] = {
				3608	'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
				3609	0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
				3610	0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3611	static const int kSpaceRangeCount = arraysize(kSpaceRanges);
				3612
				3613	static const int kWordRanges[] = {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3614	'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3615	static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3616	static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3617	static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3618	static const int kSurrogateRanges[] = {
				3619	kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3620	static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3621	static const int kLineTerminatorRanges[] = {
				3622	0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3623	static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
				3624
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3625	void BoyerMoorePositionInfo::Set(int character) {
				3626	SetInterval(Interval(character, character));
				3627	}
				3628
				3629
				3630	void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
				3631	s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
				3632	w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
				3633	d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
				3634	surrogate_ =
				3635	AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
				3636	if (interval.to() - interval.from() >= kMapSize - 1) {
				3637	if (map_count_ != kMapSize) {
				3638	map_count_ = kMapSize;
				3639	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3640	}
				3641	return;
				3642	}
				3643	for (int i = interval.from(); i <= interval.to(); i++) {
				3644	int mod_character = (i & kMask);
				3645	if (!map_->at(mod_character)) {
				3646	map_count_++;
				3647	map_->at(mod_character) = true;
				3648	}
				3649	if (map_count_ == kMapSize) return;
				3650	}
				3651	}
				3652
				3653
				3654	void BoyerMoorePositionInfo::SetAll() {
				3655	s_ = w_ = d_ = kLatticeUnknown;
				3656	if (map_count_ != kMapSize) {
				3657	map_count_ = kMapSize;
				3658	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3659	}
				3660	}
				3661
				3662
				3663	BoyerMooreLookahead::BoyerMooreLookahead(
				3664	int length, RegExpCompiler* compiler, Zone* zone)
				3665	: length_(length),
				3666	compiler_(compiler) {
				3667	if (compiler->one_byte()) {
				3668	max_char_ = String::kMaxOneByteCharCode;
				3669	} else {
				3670	max_char_ = String::kMaxUtf16CodeUnit;
				3671	}
				3672	bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
				3673	for (int i = 0; i < length; i++) {
				3674	bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
				3675	}
				3676	}
				3677
				3678
				3679	// Find the longest range of lookahead that has the fewest number of different
				3680	// characters that can occur at a given position. Since we are optimizing two
				3681	// different parameters at once this is a tradeoff.
				3682	bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
				3683	int biggest_points = 0;
				3684	// If more than 32 characters out of 128 can occur it is unlikely that we can
				3685	// be lucky enough to step forwards much of the time.
				3686	const int kMaxMax = 32;
				3687	for (int max_number_of_chars = 4;
				3688	max_number_of_chars < kMaxMax;
				3689	max_number_of_chars *= 2) {
				3690	biggest_points =
				3691	FindBestInterval(max_number_of_chars, biggest_points, from, to);
				3692	}
				3693	if (biggest_points == 0) return false;
				3694	return true;
				3695	}
				3696
				3697
				3698	// Find the highest-points range between 0 and length_ where the character
				3699	// information is not too vague. 'Too vague' means that there are more than
				3700	// max_number_of_chars that can occur at this position. Calculates the number
				3701	// of points as the product of width-of-the-range and
				3702	// probability-of-finding-one-of-the-characters, where the probability is
				3703	// calculated using the frequency distribution of the sample subject string.
				3704	int BoyerMooreLookahead::FindBestInterval(
				3705	int max_number_of_chars, int old_biggest_points, int* from, int* to) {
				3706	int biggest_points = old_biggest_points;
				3707	static const int kSize = RegExpMacroAssembler::kTableSize;
				3708	for (int i = 0; i < length_; ) {
				3709	while (i < length_ && Count(i) > max_number_of_chars) i++;
				3710	if (i == length_) break;
				3711	int remembered_from = i;
				3712	bool union_map[kSize];
				3713	for (int j = 0; j < kSize; j++) union_map[j] = false;
				3714	while (i < length_ && Count(i) <= max_number_of_chars) {
				3715	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3716	for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);
				3717	i++;
				3718	}
				3719	int frequency = 0;
				3720	for (int j = 0; j < kSize; j++) {
				3721	if (union_map[j]) {
				3722	// Add 1 to the frequency to give a small per-character boost for
				3723	// the cases where our sampling is not good enough and many
				3724	// characters have a frequency of zero. This means the frequency
				3725	// can theoretically be up to 2*kSize though we treat it mostly as
				3726	// a fraction of kSize.
				3727	frequency += compiler_->frequency_collator()->Frequency(j) + 1;
				3728	}
				3729	}
				3730	// We use the probability of skipping times the distance we are skipping to
				3731	// judge the effectiveness of this. Actually we have a cut-off: By
				3732	// dividing by 2 we switch off the skipping if the probability of skipping
				3733	// is less than 50%. This is because the multibyte mask-and-compare
				3734	// skipping in quickcheck is more likely to do well on this case.
				3735	bool in_quickcheck_range =
				3736	((i - remembered_from < 4) \|\|
				3737	(compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
				3738	// Called 'probability' but it is only a rough estimate and can actually
				3739	// be outside the 0-kSize range.
				3740	int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
				3741	int points = (i - remembered_from) * probability;
				3742	if (points > biggest_points) {
				3743	*from = remembered_from;
				3744	*to = i - 1;
				3745	biggest_points = points;
				3746	}
				3747	}
				3748	return biggest_points;
				3749	}
				3750
				3751
				3752	// Take all the characters that will not prevent a successful match if they
				3753	// occur in the subject string in the range between min_lookahead and
				3754	// max_lookahead (inclusive) measured from the current position. If the
				3755	// character at max_lookahead offset is not one of these characters, then we
				3756	// can safely skip forwards by the number of characters in the range.
				3757	int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
				3758	int max_lookahead,
				3759	Handle<ByteArray> boolean_skip_table) {
				3760	const int kSize = RegExpMacroAssembler::kTableSize;
				3761
				3762	const int kSkipArrayEntry = 0;
				3763	const int kDontSkipArrayEntry = 1;
				3764
				3765	for (int i = 0; i < kSize; i++) {
				3766	boolean_skip_table->set(i, kSkipArrayEntry);
				3767	}
				3768	int skip = max_lookahead + 1 - min_lookahead;
				3769
				3770	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3771	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3772	for (int j = 0; j < kSize; j++) {
				3773	if (map->at(j)) {
				3774	boolean_skip_table->set(j, kDontSkipArrayEntry);
				3775	}
				3776	}
				3777	}
				3778
				3779	return skip;
				3780	}
				3781
				3782
				3783	// See comment above on the implementation of GetSkipTable.
				3784	void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
				3785	const int kSize = RegExpMacroAssembler::kTableSize;
				3786
				3787	int min_lookahead = 0;
				3788	int max_lookahead = 0;
				3789
				3790	if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
				3791
				3792	bool found_single_character = false;
				3793	int single_character = 0;
				3794	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3795	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3796	if (map->map_count() > 1 \|\|
				3797	(found_single_character && map->map_count() != 0)) {
				3798	found_single_character = false;
				3799	break;
				3800	}
				3801	for (int j = 0; j < kSize; j++) {
				3802	if (map->at(j)) {
				3803	found_single_character = true;
				3804	single_character = j;
				3805	break;
				3806	}
				3807	}
				3808	}
				3809
				3810	int lookahead_width = max_lookahead + 1 - min_lookahead;
				3811
				3812	if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
				3813	// The mask-compare can probably handle this better.
				3814	return;
				3815	}
				3816
				3817	if (found_single_character) {
				3818	Label cont, again;
				3819	masm->Bind(&again);
				3820	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3821	if (max_char_ > kSize) {
				3822	masm->CheckCharacterAfterAnd(single_character,
				3823	RegExpMacroAssembler::kTableMask,
				3824	&cont);
				3825	} else {
				3826	masm->CheckCharacter(single_character, &cont);
				3827	}
				3828	masm->AdvanceCurrentPosition(lookahead_width);
				3829	masm->GoTo(&again);
				3830	masm->Bind(&cont);
				3831	return;
				3832	}
				3833
				3834	Factory* factory = masm->isolate()->factory();
				3835	Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
				3836	int skip_distance = GetSkipTable(
				3837	min_lookahead, max_lookahead, boolean_skip_table);
				3838	DCHECK(skip_distance != 0);
				3839
				3840	Label cont, again;
				3841	masm->Bind(&again);
				3842	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3843	masm->CheckBitInTable(boolean_skip_table, &cont);
				3844	masm->AdvanceCurrentPosition(skip_distance);
				3845	masm->GoTo(&again);
				3846	masm->Bind(&cont);
				3847	}
				3848
				3849
				3850	/* Code generation for choice nodes.
				3851	*
				3852	* We generate quick checks that do a mask and compare to eliminate a
				3853	* choice. If the quick check succeeds then it jumps to the continuation to
				3854	* do slow checks and check subsequent nodes. If it fails (the common case)
				3855	* it falls through to the next choice.
				3856	*
				3857	* Here is the desired flow graph. Nodes directly below each other imply
				3858	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				3859	* 3 doesn't have a quick check so we have to call the slow check.
				3860	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				3861	* regexp continuation is generated directly after the Sn node, up to the
				3862	* next GoTo if we decide to reuse some already generated code. Some
				3863	* nodes expect preload_characters to be preloaded into the current
				3864	* character register. R nodes do this preloading. Vertices are marked
				3865	* F for failures and S for success (possible success in the case of quick
				3866	* nodes). L, V, < and > are used as arrow heads.
				3867	*
				3868	* ----------> R
				3869	* \|
				3870	* V
				3871	* Q1 -----> S1
				3872	* \| S /
				3873	* F\| /
				3874	* \| F/
				3875	* \| /
				3876	* \| R
				3877	* \| /
				3878	* V L
				3879	* Q2 -----> S2
				3880	* \| S /
				3881	* F\| /
				3882	* \| F/
				3883	* \| /
				3884	* \| R
				3885	* \| /
				3886	* V L
				3887	* S3
				3888	* \|
				3889	* F\|
				3890	* \|
				3891	* R
				3892	* \|
				3893	* backtrack V
				3894	* <----------Q4
				3895	* \ F \|
				3896	* \ \|S
				3897	* \ F V
				3898	* \-----S4
				3899	*
				3900	* For greedy loops we push the current position, then generate the code that
				3901	* eats the input specially in EmitGreedyLoop. The other choice (the
				3902	* continuation) is generated by the normal code in EmitChoices, and steps back
				3903	* in the input to the starting position when it fails to match. The loop code
				3904	* looks like this (U is the unwind code that steps back in the greedy loop).
				3905	*
				3906	* _____
				3907	* / \
				3908	* V \|
				3909	* ----------> S1 \|
				3910	* /\| \|
				3911	* / \|S \|
				3912	* F/ \_____/
				3913	* /
				3914	* \|<-----
				3915	* \| \
				3916	* V \|S
				3917	* Q2 ---> U----->backtrack
				3918	* \| F /
				3919	* S\| /
				3920	* V F /
				3921	* S2--/
				3922	*/
				3923
				3924	GreedyLoopState::GreedyLoopState(bool not_at_start) {
				3925	counter_backtrack_trace_.set_backtrack(&label_);
				3926	if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
				3927	}
				3928
				3929
				3930	void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
				3931	#ifdef DEBUG
				3932	int choice_count = alternatives_->length();
				3933	for (int i = 0; i < choice_count - 1; i++) {
				3934	GuardedAlternative alternative = alternatives_->at(i);
				3935	ZoneList<Guard> guards = alternative.guards();
				3936	int guard_count = (guards == NULL) ? 0 : guards->length();
				3937	for (int j = 0; j < guard_count; j++) {
				3938	DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
				3939	}
				3940	}
				3941	#endif
				3942	}
				3943
				3944
				3945	void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
				3946	Trace* current_trace,
				3947	PreloadState* state) {
				3948	if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
				3949	// Save some time by looking at most one machine word ahead.
				3950	state->eats_at_least_ =
				3951	EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
				3952	current_trace->at_start() == Trace::FALSE_VALUE);
				3953	}
				3954	state->preload_characters_ =
				3955	CalculatePreloadCharacters(compiler, state->eats_at_least_);
				3956
				3957	state->preload_is_current_ =
				3958	(current_trace->characters_preloaded() == state->preload_characters_);
				3959	state->preload_has_checked_bounds_ = state->preload_is_current_;
				3960	}
				3961
				3962
				3963	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3964	int choice_count = alternatives_->length();
				3965
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	3966	if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
				3967	alternatives_->at(0).node()->Emit(compiler, trace);
				3968	return;
				3969	}
				3970
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3971	AssertGuardsMentionRegisters(trace);
				3972
				3973	LimitResult limit_result = LimitVersions(compiler, trace);
				3974	if (limit_result == DONE) return;
				3975	DCHECK(limit_result == CONTINUE);
				3976
				3977	// For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
				3978	// other choice nodes we only flush if we are out of code size budget.
				3979	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				3980	trace->Flush(compiler, this);
				3981	return;
				3982	}
				3983
				3984	RecursionCheck rc(compiler);
				3985
				3986	PreloadState preload;
				3987	preload.init();
				3988	GreedyLoopState greedy_loop_state(not_at_start());
				3989
				3990	int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
				3991	AlternativeGenerationList alt_gens(choice_count, zone());
				3992
				3993	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				3994	trace = EmitGreedyLoop(compiler,
				3995	trace,
				3996	&alt_gens,
				3997	&preload,
				3998	&greedy_loop_state,
				3999	text_length);
				4000	} else {
				4001	// TODO(erikcorry): Delete this. We don't need this label, but it makes us
				4002	// match the traces produced pre-cleanup.
				4003	Label second_choice;
				4004	compiler->macro_assembler()->Bind(&second_choice);
				4005
				4006	preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
				4007
				4008	EmitChoices(compiler,
				4009	&alt_gens,
				4010	0,
				4011	trace,
				4012	&preload);
				4013	}
				4014
				4015	// At this point we need to generate slow checks for the alternatives where
				4016	// the quick check was inlined. We can recognize these because the associated
				4017	// label was bound.
				4018	int new_flush_budget = trace->flush_budget() / choice_count;
				4019	for (int i = 0; i < choice_count; i++) {
				4020	AlternativeGeneration* alt_gen = alt_gens.at(i);
				4021	Trace new_trace(*trace);
				4022	// If there are actions to be flushed we have to limit how many times
				4023	// they are flushed. Take the budget of the parent trace and distribute
				4024	// it fairly amongst the children.
				4025	if (new_trace.actions() != NULL) {
				4026	new_trace.set_flush_budget(new_flush_budget);
				4027	}
				4028	bool next_expects_preload =
				4029	i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
				4030	EmitOutOfLineContinuation(compiler,
				4031	&new_trace,
				4032	alternatives_->at(i),
				4033	alt_gen,
				4034	preload.preload_characters_,
				4035	next_expects_preload);
				4036	}
				4037	}
				4038
				4039
				4040	Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
				4041	Trace* trace,
				4042	AlternativeGenerationList* alt_gens,
				4043	PreloadState* preload,
				4044	GreedyLoopState* greedy_loop_state,
				4045	int text_length) {
				4046	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4047	// Here we have special handling for greedy loops containing only text nodes
				4048	// and other simple nodes. These are handled by pushing the current
				4049	// position on the stack and then incrementing the current position each
				4050	// time around the switch. On backtrack we decrement the current position
				4051	// and check it against the pushed value. This avoids pushing backtrack
				4052	// information for each iteration of the loop, which could take up a lot of
				4053	// space.
				4054	DCHECK(trace->stop_node() == NULL);
				4055	macro_assembler->PushCurrentPosition();
				4056	Label greedy_match_failed;
				4057	Trace greedy_match_trace;
				4058	if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
				4059	greedy_match_trace.set_backtrack(&greedy_match_failed);
				4060	Label loop_label;
				4061	macro_assembler->Bind(&loop_label);
				4062	greedy_match_trace.set_stop_node(this);
				4063	greedy_match_trace.set_loop_label(&loop_label);
				4064	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				4065	macro_assembler->Bind(&greedy_match_failed);
				4066
				4067	Label second_choice; // For use in greedy matches.
				4068	macro_assembler->Bind(&second_choice);
				4069
				4070	Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
				4071
				4072	EmitChoices(compiler,
				4073	alt_gens,
				4074	1,
				4075	new_trace,
				4076	preload);
				4077
				4078	macro_assembler->Bind(greedy_loop_state->label());
				4079	// If we have unwound to the bottom then backtrack.
				4080	macro_assembler->CheckGreedyLoop(trace->backtrack());
				4081	// Otherwise try the second priority at an earlier position.
				4082	macro_assembler->AdvanceCurrentPosition(-text_length);
				4083	macro_assembler->GoTo(&second_choice);
				4084	return new_trace;
				4085	}
				4086
				4087	int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
				4088	Trace* trace) {
				4089	int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
				4090	if (alternatives_->length() != 2) return eats_at_least;
				4091
				4092	GuardedAlternative alt1 = alternatives_->at(1);
				4093	if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
				4094	return eats_at_least;
				4095	}
				4096	RegExpNode* eats_anything_node = alt1.node();
				4097	if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
				4098	return eats_at_least;
				4099	}
				4100
				4101	// Really we should be creating a new trace when we execute this function,
				4102	// but there is no need, because the code it generates cannot backtrack, and
				4103	// we always arrive here with a trivial trace (since it's the entry to a
				4104	// loop. That also implies that there are no preloaded characters, which is
				4105	// good, because it means we won't be violating any assumptions by
				4106	// overwriting those characters with new load instructions.
				4107	DCHECK(trace->is_trivial());
				4108
				4109	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4110	Isolate* isolate = macro_assembler->isolate();
				4111	// At this point we know that we are at a non-greedy loop that will eat
				4112	// any character one at a time. Any non-anchored regexp has such a
				4113	// loop prepended to it in order to find where it starts. We look for
				4114	// a pattern of the form ...abc... where we can look 6 characters ahead
				4115	// and step forwards 3 if the character is not one of abc. Abc need
				4116	// not be atoms, they can be any reasonably limited character class or
				4117	// small alternation.
				4118	BoyerMooreLookahead* bm = bm_info(false);
				4119	if (bm == NULL) {
				4120	eats_at_least = Min(kMaxLookaheadForBoyerMoore,
				4121	EatsAtLeast(kMaxLookaheadForBoyerMoore,
				4122	kRecursionBudget,
				4123	false));
				4124	if (eats_at_least >= 1) {
				4125	bm = new(zone()) BoyerMooreLookahead(eats_at_least,
				4126	compiler,
				4127	zone());
				4128	GuardedAlternative alt0 = alternatives_->at(0);
				4129	alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
				4130	}
				4131	}
				4132	if (bm != NULL) {
				4133	bm->EmitSkipInstructions(macro_assembler);
				4134	}
				4135	return eats_at_least;
				4136	}
				4137
				4138
				4139	void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
				4140	AlternativeGenerationList* alt_gens,
				4141	int first_choice,
				4142	Trace* trace,
				4143	PreloadState* preload) {
				4144	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4145	SetUpPreLoad(compiler, trace, preload);
				4146
				4147	// For now we just call all choices one after the other. The idea ultimately
				4148	// is to use the Dispatch table to try only the relevant ones.
				4149	int choice_count = alternatives_->length();
				4150
				4151	int new_flush_budget = trace->flush_budget() / choice_count;
				4152
				4153	for (int i = first_choice; i < choice_count; i++) {
				4154	bool is_last = i == choice_count - 1;
				4155	bool fall_through_on_failure = !is_last;
				4156	GuardedAlternative alternative = alternatives_->at(i);
				4157	AlternativeGeneration* alt_gen = alt_gens->at(i);
				4158	alt_gen->quick_check_details.set_characters(preload->preload_characters_);
				4159	ZoneList<Guard> guards = alternative.guards();
				4160	int guard_count = (guards == NULL) ? 0 : guards->length();
				4161	Trace new_trace(*trace);
				4162	new_trace.set_characters_preloaded(preload->preload_is_current_ ?
				4163	preload->preload_characters_ :
				4164	0);
				4165	if (preload->preload_has_checked_bounds_) {
				4166	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4167	}
				4168	new_trace.quick_check_performed()->Clear();
				4169	if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
				4170	if (!is_last) {
				4171	new_trace.set_backtrack(&alt_gen->after);
				4172	}
				4173	alt_gen->expects_preload = preload->preload_is_current_;
				4174	bool generate_full_check_inline = false;
				4175	if (compiler->optimize() &&
				4176	try_to_emit_quick_check_for_alternative(i == 0) &&
				4177	alternative.node()->EmitQuickCheck(
				4178	compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
				4179	&alt_gen->possible_success, &alt_gen->quick_check_details,
				4180	fall_through_on_failure)) {
				4181	// Quick check was generated for this choice.
				4182	preload->preload_is_current_ = true;
				4183	preload->preload_has_checked_bounds_ = true;
				4184	// If we generated the quick check to fall through on possible success,
				4185	// we now need to generate the full check inline.
				4186	if (!fall_through_on_failure) {
				4187	macro_assembler->Bind(&alt_gen->possible_success);
				4188	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4189	new_trace.set_characters_preloaded(preload->preload_characters_);
				4190	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4191	generate_full_check_inline = true;
				4192	}
				4193	} else if (alt_gen->quick_check_details.cannot_match()) {
				4194	if (!fall_through_on_failure) {
				4195	macro_assembler->GoTo(trace->backtrack());
				4196	}
				4197	continue;
				4198	} else {
				4199	// No quick check was generated. Put the full code here.
				4200	// If this is not the first choice then there could be slow checks from
				4201	// previous cases that go here when they fail. There's no reason to
				4202	// insist that they preload characters since the slow check we are about
				4203	// to generate probably can't use it.
				4204	if (i != first_choice) {
				4205	alt_gen->expects_preload = false;
				4206	new_trace.InvalidateCurrentCharacter();
				4207	}
				4208	generate_full_check_inline = true;
				4209	}
				4210	if (generate_full_check_inline) {
				4211	if (new_trace.actions() != NULL) {
				4212	new_trace.set_flush_budget(new_flush_budget);
				4213	}
				4214	for (int j = 0; j < guard_count; j++) {
				4215	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				4216	}
				4217	alternative.node()->Emit(compiler, &new_trace);
				4218	preload->preload_is_current_ = false;
				4219	}
				4220	macro_assembler->Bind(&alt_gen->after);
				4221	}
				4222	}
				4223
				4224
				4225	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				4226	Trace* trace,
				4227	GuardedAlternative alternative,
				4228	AlternativeGeneration* alt_gen,
				4229	int preload_characters,
				4230	bool next_expects_preload) {
				4231	if (!alt_gen->possible_success.is_linked()) return;
				4232
				4233	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4234	macro_assembler->Bind(&alt_gen->possible_success);
				4235	Trace out_of_line_trace(*trace);
				4236	out_of_line_trace.set_characters_preloaded(preload_characters);
				4237	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4238	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
				4239	ZoneList<Guard> guards = alternative.guards();
				4240	int guard_count = (guards == NULL) ? 0 : guards->length();
				4241	if (next_expects_preload) {
				4242	Label reload_current_char;
				4243	out_of_line_trace.set_backtrack(&reload_current_char);
				4244	for (int j = 0; j < guard_count; j++) {
				4245	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4246	}
				4247	alternative.node()->Emit(compiler, &out_of_line_trace);
				4248	macro_assembler->Bind(&reload_current_char);
				4249	// Reload the current character, since the next quick check expects that.
				4250	// We don't need to check bounds here because we only get into this
				4251	// code through a quick check which already did the checked load.
				4252	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				4253	NULL,
				4254	false,
				4255	preload_characters);
				4256	macro_assembler->GoTo(&(alt_gen->after));
				4257	} else {
				4258	out_of_line_trace.set_backtrack(&(alt_gen->after));
				4259	for (int j = 0; j < guard_count; j++) {
				4260	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4261	}
				4262	alternative.node()->Emit(compiler, &out_of_line_trace);
				4263	}
				4264	}
				4265
				4266
				4267	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4268	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4269	LimitResult limit_result = LimitVersions(compiler, trace);
				4270	if (limit_result == DONE) return;
				4271	DCHECK(limit_result == CONTINUE);
				4272
				4273	RecursionCheck rc(compiler);
				4274
				4275	switch (action_type_) {
				4276	case STORE_POSITION: {
				4277	Trace::DeferredCapture
				4278	new_capture(data_.u_position_register.reg,
				4279	data_.u_position_register.is_capture,
				4280	trace);
				4281	Trace new_trace = *trace;
				4282	new_trace.add_action(&new_capture);
				4283	on_success()->Emit(compiler, &new_trace);
				4284	break;
				4285	}
				4286	case INCREMENT_REGISTER: {
				4287	Trace::DeferredIncrementRegister
				4288	new_increment(data_.u_increment_register.reg);
				4289	Trace new_trace = *trace;
				4290	new_trace.add_action(&new_increment);
				4291	on_success()->Emit(compiler, &new_trace);
				4292	break;
				4293	}
				4294	case SET_REGISTER: {
				4295	Trace::DeferredSetRegister
				4296	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				4297	Trace new_trace = *trace;
				4298	new_trace.add_action(&new_set);
				4299	on_success()->Emit(compiler, &new_trace);
				4300	break;
				4301	}
				4302	case CLEAR_CAPTURES: {
				4303	Trace::DeferredClearCaptures
				4304	new_capture(Interval(data_.u_clear_captures.range_from,
				4305	data_.u_clear_captures.range_to));
				4306	Trace new_trace = *trace;
				4307	new_trace.add_action(&new_capture);
				4308	on_success()->Emit(compiler, &new_trace);
				4309	break;
				4310	}
				4311	case BEGIN_SUBMATCH:
				4312	if (!trace->is_trivial()) {
				4313	trace->Flush(compiler, this);
				4314	} else {
				4315	assembler->WriteCurrentPositionToRegister(
				4316	data_.u_submatch.current_position_register, 0);
				4317	assembler->WriteStackPointerToRegister(
				4318	data_.u_submatch.stack_pointer_register);
				4319	on_success()->Emit(compiler, trace);
				4320	}
				4321	break;
				4322	case EMPTY_MATCH_CHECK: {
				4323	int start_pos_reg = data_.u_empty_match_check.start_register;
				4324	int stored_pos = 0;
				4325	int rep_reg = data_.u_empty_match_check.repetition_register;
				4326	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				4327	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				4328	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				4329	// If we know we haven't advanced and there is no minimum we
				4330	// can just backtrack immediately.
				4331	assembler->GoTo(trace->backtrack());
				4332	} else if (know_dist && stored_pos < trace->cp_offset()) {
				4333	// If we know we've advanced we can generate the continuation
				4334	// immediately.
				4335	on_success()->Emit(compiler, trace);
				4336	} else if (!trace->is_trivial()) {
				4337	trace->Flush(compiler, this);
				4338	} else {
				4339	Label skip_empty_check;
				4340	// If we have a minimum number of repetitions we check the current
				4341	// number first and skip the empty check if it's not enough.
				4342	if (has_minimum) {
				4343	int limit = data_.u_empty_match_check.repetition_limit;
				4344	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				4345	}
				4346	// If the match is empty we bail out, otherwise we fall through
				4347	// to the on-success continuation.
				4348	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				4349	trace->backtrack());
				4350	assembler->Bind(&skip_empty_check);
				4351	on_success()->Emit(compiler, trace);
				4352	}
				4353	break;
				4354	}
				4355	case POSITIVE_SUBMATCH_SUCCESS: {
				4356	if (!trace->is_trivial()) {
				4357	trace->Flush(compiler, this);
				4358	return;
				4359	}
				4360	assembler->ReadCurrentPositionFromRegister(
				4361	data_.u_submatch.current_position_register);
				4362	assembler->ReadStackPointerFromRegister(
				4363	data_.u_submatch.stack_pointer_register);
				4364	int clear_register_count = data_.u_submatch.clear_register_count;
				4365	if (clear_register_count == 0) {
				4366	on_success()->Emit(compiler, trace);
				4367	return;
				4368	}
				4369	int clear_registers_from = data_.u_submatch.clear_register_from;
				4370	Label clear_registers_backtrack;
				4371	Trace new_trace = *trace;
				4372	new_trace.set_backtrack(&clear_registers_backtrack);
				4373	on_success()->Emit(compiler, &new_trace);
				4374
				4375	assembler->Bind(&clear_registers_backtrack);
				4376	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				4377	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				4378
				4379	DCHECK(trace->backtrack() == NULL);
				4380	assembler->Backtrack();
				4381	return;
				4382	}
				4383	default:
				4384	UNREACHABLE();
				4385	}
				4386	}
				4387
				4388
				4389	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4390	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4391	if (!trace->is_trivial()) {
				4392	trace->Flush(compiler, this);
				4393	return;
				4394	}
				4395
				4396	LimitResult limit_result = LimitVersions(compiler, trace);
				4397	if (limit_result == DONE) return;
				4398	DCHECK(limit_result == CONTINUE);
				4399
				4400	RecursionCheck rc(compiler);
				4401
				4402	DCHECK_EQ(start_reg_ + 1, end_reg_);
				4403	if (compiler->ignore_case()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4404	assembler->CheckNotBackReferenceIgnoreCase(
				4405	start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4406	} else {
				4407	assembler->CheckNotBackReference(start_reg_, read_backward(),
				4408	trace->backtrack());
				4409	}
				4410	// We are going to advance backward, so we may end up at the start.
				4411	if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4412
				4413	// Check that the back reference does not end inside a surrogate pair.
				4414	if (compiler->unicode() && !compiler->one_byte()) {
				4415	assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
				4416	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4417	on_success()->Emit(compiler, trace);
				4418	}
				4419
				4420
				4421	// -------------------------------------------------------------------
				4422	// Dot/dotty output
				4423
				4424
				4425	#ifdef DEBUG
				4426
				4427
				4428	class DotPrinter: public NodeVisitor {
				4429	public:
				4430	DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
				4431	: os_(os),
				4432	ignore_case_(ignore_case) {}
				4433	void PrintNode(const char* label, RegExpNode* node);
				4434	void Visit(RegExpNode* node);
				4435	void PrintAttributes(RegExpNode* from);
				4436	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				4437	#define DECLARE_VISIT(Type) \
				4438	virtual void Visit##Type(Type##Node* that);
				4439	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				4440	#undef DECLARE_VISIT
				4441	private:
				4442	std::ostream& os_;
				4443	bool ignore_case_;
				4444	};
				4445
				4446
				4447	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				4448	os_ << "digraph G {\n graph [label=\"";
				4449	for (int i = 0; label[i]; i++) {
				4450	switch (label[i]) {
				4451	case '\\':
				4452	os_ << "\\\\";
				4453	break;
				4454	case '"':
				4455	os_ << "\"";
				4456	break;
				4457	default:
				4458	os_ << label[i];
				4459	break;
				4460	}
				4461	}
				4462	os_ << "\"];\n";
				4463	Visit(node);
				4464	os_ << "}" << std::endl;
				4465	}
				4466
				4467
				4468	void DotPrinter::Visit(RegExpNode* node) {
				4469	if (node->info()->visited) return;
				4470	node->info()->visited = true;
				4471	node->Accept(this);
				4472	}
				4473
				4474
				4475	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				4476	os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
				4477	Visit(on_failure);
				4478	}
				4479
				4480
				4481	class TableEntryBodyPrinter {
				4482	public:
				4483	TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
				4484	: os_(os),
				4485	choice_(choice) {}
				4486	void Call(uc16 from, DispatchTable::Entry entry) {
				4487	OutSet* out_set = entry.out_set();
				4488	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4489	if (out_set->Get(i)) {
				4490	os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
				4491	<< choice()->alternatives()->at(i).node() << ";\n";
				4492	}
				4493	}
				4494	}
				4495	private:
				4496	ChoiceNode* choice() { return choice_; }
				4497	std::ostream& os_;
				4498	ChoiceNode* choice_;
				4499	};
				4500
				4501
				4502	class TableEntryHeaderPrinter {
				4503	public:
				4504	explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
				4505	: first_(true),
				4506	os_(os) {}
				4507	void Call(uc16 from, DispatchTable::Entry entry) {
				4508	if (first_) {
				4509	first_ = false;
				4510	} else {
				4511	os_ << "\|";
				4512	}
				4513	os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "\|{";
				4514	OutSet* out_set = entry.out_set();
				4515	int priority = 0;
				4516	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4517	if (out_set->Get(i)) {
				4518	if (priority > 0) os_ << "\|";
				4519	os_ << "<s" << from << "o" << i << "> " << priority;
				4520	priority++;
				4521	}
				4522	}
				4523	os_ << "}}";
				4524	}
				4525
				4526	private:
				4527	bool first_;
				4528	std::ostream& os_;
				4529	};
				4530
				4531
				4532	class AttributePrinter {
				4533	public:
				4534	explicit AttributePrinter(std::ostream& os) // NOLINT
				4535	: os_(os),
				4536	first_(true) {}
				4537	void PrintSeparator() {
				4538	if (first_) {
				4539	first_ = false;
				4540	} else {
				4541	os_ << "\|";
				4542	}
				4543	}
				4544	void PrintBit(const char* name, bool value) {
				4545	if (!value) return;
				4546	PrintSeparator();
				4547	os_ << "{" << name << "}";
				4548	}
				4549	void PrintPositive(const char* name, int value) {
				4550	if (value < 0) return;
				4551	PrintSeparator();
				4552	os_ << "{" << name << "\|" << value << "}";
				4553	}
				4554
				4555	private:
				4556	std::ostream& os_;
				4557	bool first_;
				4558	};
				4559
				4560
				4561	void DotPrinter::PrintAttributes(RegExpNode* that) {
				4562	os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
				4563	<< "margin=0.1, fontsize=10, label=\"{";
				4564	AttributePrinter printer(os_);
				4565	NodeInfo* info = that->info();
				4566	printer.PrintBit("NI", info->follows_newline_interest);
				4567	printer.PrintBit("WI", info->follows_word_interest);
				4568	printer.PrintBit("SI", info->follows_start_interest);
				4569	Label* label = that->label();
				4570	if (label->is_bound())
				4571	printer.PrintPositive("@", label->pos());
				4572	os_ << "}\"];\n"
				4573	<< " a" << that << " -> n" << that
				4574	<< " [style=dashed, color=grey, arrowhead=none];\n";
				4575	}
				4576
				4577
				4578	static const bool kPrintDispatchTable = false;
				4579	void DotPrinter::VisitChoice(ChoiceNode* that) {
				4580	if (kPrintDispatchTable) {
				4581	os_ << " n" << that << " [shape=Mrecord, label=\"";
				4582	TableEntryHeaderPrinter header_printer(os_);
				4583	that->GetTable(ignore_case_)->ForEach(&header_printer);
				4584	os_ << "\"]\n";
				4585	PrintAttributes(that);
				4586	TableEntryBodyPrinter body_printer(os_, that);
				4587	that->GetTable(ignore_case_)->ForEach(&body_printer);
				4588	} else {
				4589	os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
				4590	for (int i = 0; i < that->alternatives()->length(); i++) {
				4591	GuardedAlternative alt = that->alternatives()->at(i);
				4592	os_ << " n" << that << " -> n" << alt.node();
				4593	}
				4594	}
				4595	for (int i = 0; i < that->alternatives()->length(); i++) {
				4596	GuardedAlternative alt = that->alternatives()->at(i);
				4597	alt.node()->Accept(this);
				4598	}
				4599	}
				4600
				4601
				4602	void DotPrinter::VisitText(TextNode* that) {
				4603	Zone* zone = that->zone();
				4604	os_ << " n" << that << " [label=\"";
				4605	for (int i = 0; i < that->elements()->length(); i++) {
				4606	if (i > 0) os_ << " ";
				4607	TextElement elm = that->elements()->at(i);
				4608	switch (elm.text_type()) {
				4609	case TextElement::ATOM: {
				4610	Vector<const uc16> data = elm.atom()->data();
				4611	for (int i = 0; i < data.length(); i++) {
				4612	os_ << static_cast<char>(data[i]);
				4613	}
				4614	break;
				4615	}
				4616	case TextElement::CHAR_CLASS: {
				4617	RegExpCharacterClass* node = elm.char_class();
				4618	os_ << "[";
				4619	if (node->is_negated()) os_ << "^";
				4620	for (int j = 0; j < node->ranges(zone)->length(); j++) {
				4621	CharacterRange range = node->ranges(zone)->at(j);
				4622	os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
				4623	}
				4624	os_ << "]";
				4625	break;
				4626	}
				4627	default:
				4628	UNREACHABLE();
				4629	}
				4630	}
				4631	os_ << "\", shape=box, peripheries=2];\n";
				4632	PrintAttributes(that);
				4633	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4634	Visit(that->on_success());
				4635	}
				4636
				4637
				4638	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				4639	os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
				4640	<< that->end_register() << "\", shape=doubleoctagon];\n";
				4641	PrintAttributes(that);
				4642	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4643	Visit(that->on_success());
				4644	}
				4645
				4646
				4647	void DotPrinter::VisitEnd(EndNode* that) {
				4648	os_ << " n" << that << " [style=bold, shape=point];\n";
				4649	PrintAttributes(that);
				4650	}
				4651
				4652
				4653	void DotPrinter::VisitAssertion(AssertionNode* that) {
				4654	os_ << " n" << that << " [";
				4655	switch (that->assertion_type()) {
				4656	case AssertionNode::AT_END:
				4657	os_ << "label=\"$\", shape=septagon";
				4658	break;
				4659	case AssertionNode::AT_START:
				4660	os_ << "label=\"^\", shape=septagon";
				4661	break;
				4662	case AssertionNode::AT_BOUNDARY:
				4663	os_ << "label=\"\\b\", shape=septagon";
				4664	break;
				4665	case AssertionNode::AT_NON_BOUNDARY:
				4666	os_ << "label=\"\\B\", shape=septagon";
				4667	break;
				4668	case AssertionNode::AFTER_NEWLINE:
				4669	os_ << "label=\"(?<=\\n)\", shape=septagon";
				4670	break;
				4671	}
				4672	os_ << "];\n";
				4673	PrintAttributes(that);
				4674	RegExpNode* successor = that->on_success();
				4675	os_ << " n" << that << " -> n" << successor << ";\n";
				4676	Visit(successor);
				4677	}
				4678
				4679
				4680	void DotPrinter::VisitAction(ActionNode* that) {
				4681	os_ << " n" << that << " [";
				4682	switch (that->action_type_) {
				4683	case ActionNode::SET_REGISTER:
				4684	os_ << "label=\"$" << that->data_.u_store_register.reg
				4685	<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
				4686	break;
				4687	case ActionNode::INCREMENT_REGISTER:
				4688	os_ << "label=\"$" << that->data_.u_increment_register.reg
				4689	<< "++\", shape=octagon";
				4690	break;
				4691	case ActionNode::STORE_POSITION:
				4692	os_ << "label=\"$" << that->data_.u_position_register.reg
				4693	<< ":=$pos\", shape=octagon";
				4694	break;
				4695	case ActionNode::BEGIN_SUBMATCH:
				4696	os_ << "label=\"$" << that->data_.u_submatch.current_position_register
				4697	<< ":=$pos,begin\", shape=septagon";
				4698	break;
				4699	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				4700	os_ << "label=\"escape\", shape=septagon";
				4701	break;
				4702	case ActionNode::EMPTY_MATCH_CHECK:
				4703	os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
				4704	<< "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
				4705	<< "<" << that->data_.u_empty_match_check.repetition_limit
				4706	<< "?\", shape=septagon";
				4707	break;
				4708	case ActionNode::CLEAR_CAPTURES: {
				4709	os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
				4710	<< " to $" << that->data_.u_clear_captures.range_to
				4711	<< "\", shape=septagon";
				4712	break;
				4713	}
				4714	}
				4715	os_ << "];\n";
				4716	PrintAttributes(that);
				4717	RegExpNode* successor = that->on_success();
				4718	os_ << " n" << that << " -> n" << successor << ";\n";
				4719	Visit(successor);
				4720	}
				4721
				4722
				4723	class DispatchTableDumper {
				4724	public:
				4725	explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
				4726	void Call(uc16 key, DispatchTable::Entry entry);
				4727	private:
				4728	std::ostream& os_;
				4729	};
				4730
				4731
				4732	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				4733	os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
				4734	OutSet* set = entry.out_set();
				4735	bool first = true;
				4736	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4737	if (set->Get(i)) {
				4738	if (first) {
				4739	first = false;
				4740	} else {
				4741	os_ << ", ";
				4742	}
				4743	os_ << i;
				4744	}
				4745	}
				4746	os_ << "}\n";
				4747	}
				4748
				4749
				4750	void DispatchTable::Dump() {
				4751	OFStream os(stderr);
				4752	DispatchTableDumper dumper(os);
				4753	tree()->ForEach(&dumper);
				4754	}
				4755
				4756
				4757	void RegExpEngine::DotPrint(const char* label,
				4758	RegExpNode* node,
				4759	bool ignore_case) {
				4760	OFStream os(stdout);
				4761	DotPrinter printer(os, ignore_case);
				4762	printer.PrintNode(label, node);
				4763	}
				4764
				4765
				4766	#endif // DEBUG
				4767
				4768
				4769	// -------------------------------------------------------------------
				4770	// Tree to graph conversion
				4771
				4772	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				4773	RegExpNode* on_success) {
				4774	ZoneList<TextElement>* elms =
				4775	new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
				4776	elms->Add(TextElement::Atom(this), compiler->zone());
				4777	return new (compiler->zone())
				4778	TextNode(elms, compiler->read_backward(), on_success);
				4779	}
				4780
				4781
				4782	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				4783	RegExpNode* on_success) {
				4784	return new (compiler->zone())
				4785	TextNode(elements(), compiler->read_backward(), on_success);
				4786	}
				4787
				4788
				4789	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				4790	const int* special_class,
				4791	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4792	length--; // Remove final marker.
				4793	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4794	DCHECK(ranges->length() != 0);
				4795	DCHECK(length != 0);
				4796	DCHECK(special_class[0] != 0);
				4797	if (ranges->length() != (length >> 1) + 1) {
				4798	return false;
				4799	}
				4800	CharacterRange range = ranges->at(0);
				4801	if (range.from() != 0) {
				4802	return false;
				4803	}
				4804	for (int i = 0; i < length; i += 2) {
				4805	if (special_class[i] != (range.to() + 1)) {
				4806	return false;
				4807	}
				4808	range = ranges->at((i >> 1) + 1);
				4809	if (special_class[i+1] != range.from()) {
				4810	return false;
				4811	}
				4812	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4813	if (range.to() != String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4814	return false;
				4815	}
				4816	return true;
				4817	}
				4818
				4819
				4820	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				4821	const int* special_class,
				4822	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4823	length--; // Remove final marker.
				4824	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4825	if (ranges->length() * 2 != length) {
				4826	return false;
				4827	}
				4828	for (int i = 0; i < length; i += 2) {
				4829	CharacterRange range = ranges->at(i >> 1);
				4830	if (range.from() != special_class[i] \|\|
				4831	range.to() != special_class[i + 1] - 1) {
				4832	return false;
				4833	}
				4834	}
				4835	return true;
				4836	}
				4837
				4838
				4839	bool RegExpCharacterClass::is_standard(Zone* zone) {
				4840	// TODO(lrn): Remove need for this function, by not throwing away information
				4841	// along the way.
				4842	if (is_negated_) {
				4843	return false;
				4844	}
				4845	if (set_.is_standard()) {
				4846	return true;
				4847	}
				4848	if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4849	set_.set_standard_set_type('s');
				4850	return true;
				4851	}
				4852	if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4853	set_.set_standard_set_type('S');
				4854	return true;
				4855	}
				4856	if (CompareInverseRanges(set_.ranges(zone),
				4857	kLineTerminatorRanges,
				4858	kLineTerminatorRangeCount)) {
				4859	set_.set_standard_set_type('.');
				4860	return true;
				4861	}
				4862	if (CompareRanges(set_.ranges(zone),
				4863	kLineTerminatorRanges,
				4864	kLineTerminatorRangeCount)) {
				4865	set_.set_standard_set_type('n');
				4866	return true;
				4867	}
				4868	if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4869	set_.set_standard_set_type('w');
				4870	return true;
				4871	}
				4872	if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4873	set_.set_standard_set_type('W');
				4874	return true;
				4875	}
				4876	return false;
				4877	}
				4878
				4879
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	4880	UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
				4881	ZoneList<CharacterRange>* base)
				4882	: zone_(zone),
				4883	table_(zone),
				4884	bmp_(nullptr),
				4885	lead_surrogates_(nullptr),
				4886	trail_surrogates_(nullptr),
				4887	non_bmp_(nullptr) {
				4888	// The unicode range splitter categorizes given character ranges into:
				4889	// - Code points from the BMP representable by one code unit.
				4890	// - Code points outside the BMP that need to be split into surrogate pairs.
				4891	// - Lone lead surrogates.
				4892	// - Lone trail surrogates.
				4893	// Lone surrogates are valid code points, even though no actual characters.
				4894	// They require special matching to make sure we do not split surrogate pairs.
				4895	// We use the dispatch table to accomplish this. The base range is split up
				4896	// by the table by the overlay ranges, and the Call callback is used to
				4897	// filter and collect ranges for each category.
				4898	for (int i = 0; i < base->length(); i++) {
				4899	table_.AddRange(base->at(i), kBase, zone_);
				4900	}
				4901	// Add overlay ranges.
				4902	table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
				4903	kBmpCodePoints, zone_);
				4904	table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
				4905	kLeadSurrogates, zone_);
				4906	table_.AddRange(
				4907	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				4908	kTrailSurrogates, zone_);
				4909	table_.AddRange(
				4910	CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
				4911	kBmpCodePoints, zone_);
				4912	table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
				4913	kNonBmpCodePoints, zone_);
				4914	table_.ForEach(this);
				4915	}
				4916
				4917
				4918	void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
				4919	OutSet* outset = entry.out_set();
				4920	if (!outset->Get(kBase)) return;
				4921	ZoneList<CharacterRange>** target = NULL;
				4922	if (outset->Get(kBmpCodePoints)) {
				4923	target = &bmp_;
				4924	} else if (outset->Get(kLeadSurrogates)) {
				4925	target = &lead_surrogates_;
				4926	} else if (outset->Get(kTrailSurrogates)) {
				4927	target = &trail_surrogates_;
				4928	} else {
				4929	DCHECK(outset->Get(kNonBmpCodePoints));
				4930	target = &non_bmp_;
				4931	}
				4932	if (target == NULL) target = new (zone_) ZoneList<CharacterRange>(2, zone_);
				4933	(*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
				4934	}
				4935
				4936
				4937	void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
				4938	RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
				4939	ZoneList<CharacterRange>* bmp = splitter->bmp();
				4940	if (bmp == nullptr) return;
				4941	result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
				4942	compiler->zone(), bmp, compiler->read_backward(), on_success)));
				4943	}
				4944
				4945
				4946	void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
				4947	RegExpNode* on_success,
				4948	UnicodeRangeSplitter* splitter) {
				4949	ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
				4950	if (non_bmp == nullptr) return;
				4951	DCHECK(compiler->unicode());
				4952	DCHECK(!compiler->one_byte());
				4953	Zone* zone = compiler->zone();
				4954	CharacterRange::Canonicalize(non_bmp);
				4955	for (int i = 0; i < non_bmp->length(); i++) {
				4956	// Match surrogate pair.
				4957	// E.g. [\u10005-\u11005] becomes
				4958	// \ud800[\udc05-\udfff]\|
				4959	// [\ud801-\ud803][\udc00-\udfff]\|
				4960	// \ud804[\udc00-\udc05]
				4961	uc32 from = non_bmp->at(i).from();
				4962	uc32 to = non_bmp->at(i).to();
				4963	uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
				4964	uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
				4965	uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
				4966	uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
				4967	if (from_l == to_l) {
				4968	// The lead surrogate is the same.
				4969	result->AddAlternative(
				4970	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4971	zone, CharacterRange::Singleton(from_l),
				4972	CharacterRange::Range(from_t, to_t), compiler->read_backward(),
				4973	on_success)));
				4974	} else {
				4975	if (from_t != kTrailSurrogateStart) {
				4976	// Add [from_l][from_t-\udfff]
				4977	result->AddAlternative(
				4978	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4979	zone, CharacterRange::Singleton(from_l),
				4980	CharacterRange::Range(from_t, kTrailSurrogateEnd),
				4981	compiler->read_backward(), on_success)));
				4982	from_l++;
				4983	}
				4984	if (to_t != kTrailSurrogateEnd) {
				4985	// Add [to_l][\udc00-to_t]
				4986	result->AddAlternative(
				4987	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4988	zone, CharacterRange::Singleton(to_l),
				4989	CharacterRange::Range(kTrailSurrogateStart, to_t),
				4990	compiler->read_backward(), on_success)));
				4991	to_l--;
				4992	}
				4993	if (from_l <= to_l) {
				4994	// Add [from_l-to_l][\udc00-\udfff]
				4995	result->AddAlternative(
				4996	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4997	zone, CharacterRange::Range(from_l, to_l),
				4998	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				4999	compiler->read_backward(), on_success)));
				5000	}
				5001	}
				5002	}
				5003	}
				5004
				5005
				5006	RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
				5007	RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
				5008	ZoneList<CharacterRange>* match, RegExpNode* on_success,
				5009	bool read_backward) {
				5010	Zone* zone = compiler->zone();
				5011	RegExpNode* match_node = TextNode::CreateForCharacterRanges(
				5012	zone, match, read_backward, on_success);
				5013	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5014	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5015	RegExpLookaround::Builder lookaround(false, match_node, stack_register,
				5016	position_register);
				5017	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5018	zone, lookbehind, !read_backward, lookaround.on_match_success());
				5019	return lookaround.ForMatch(negative_match);
				5020	}
				5021
				5022
				5023	RegExpNode* MatchAndNegativeLookaroundInReadDirection(
				5024	RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
				5025	ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
				5026	bool read_backward) {
				5027	Zone* zone = compiler->zone();
				5028	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5029	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5030	RegExpLookaround::Builder lookaround(false, on_success, stack_register,
				5031	position_register);
				5032	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5033	zone, lookahead, read_backward, lookaround.on_match_success());
				5034	return TextNode::CreateForCharacterRanges(
				5035	zone, match, read_backward, lookaround.ForMatch(negative_match));
				5036	}
				5037
				5038
				5039	void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5040	RegExpNode* on_success,
				5041	UnicodeRangeSplitter* splitter) {
				5042	ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
				5043	if (lead_surrogates == nullptr) return;
				5044	Zone* zone = compiler->zone();
				5045	// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
				5046	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				5047	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				5048
				5049	RegExpNode* match;
				5050	if (compiler->read_backward()) {
				5051	// Reading backward. Assert that reading forward, there is no trail
				5052	// surrogate, and then backward match the lead surrogate.
				5053	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5054	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5055	} else {
				5056	// Reading forward. Forward match the lead surrogate and assert that
				5057	// no trail surrogate follows.
				5058	match = MatchAndNegativeLookaroundInReadDirection(
				5059	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5060	}
				5061	result->AddAlternative(GuardedAlternative(match));
				5062	}
				5063
				5064
				5065	void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5066	RegExpNode* on_success,
				5067	UnicodeRangeSplitter* splitter) {
				5068	ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
				5069	if (trail_surrogates == nullptr) return;
				5070	Zone* zone = compiler->zone();
				5071	// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
				5072	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				5073	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				5074
				5075	RegExpNode* match;
				5076	if (compiler->read_backward()) {
				5077	// Reading backward. Backward match the trail surrogate and assert that no
				5078	// lead surrogate precedes it.
				5079	match = MatchAndNegativeLookaroundInReadDirection(
				5080	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5081	} else {
				5082	// Reading forward. Assert that reading backward, there is no lead
				5083	// surrogate, and then forward match the trail surrogate.
				5084	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5085	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5086	}
				5087	result->AddAlternative(GuardedAlternative(match));
				5088	}
				5089
				5090	RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
				5091	RegExpNode* on_success) {
				5092	// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
				5093	DCHECK(!compiler->read_backward());
				5094	Zone* zone = compiler->zone();
				5095	// Advance any character. If the character happens to be a lead surrogate and
				5096	// we advanced into the middle of a surrogate pair, it will work out, as
				5097	// nothing will match from there. We will have to advance again, consuming
				5098	// the associated trail surrogate.
				5099	ZoneList<CharacterRange>* range = CharacterRange::List(
				5100	zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
				5101	return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
				5102	}
				5103
				5104
				5105	void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
				5106	ZoneList<CharacterRange>* ranges) {
				5107	#ifdef V8_I18N_SUPPORT
				5108	// Use ICU to compute the case fold closure over the ranges.
				5109	DCHECK(compiler->unicode());
				5110	DCHECK(compiler->ignore_case());
				5111	USet* set = uset_openEmpty();
				5112	for (int i = 0; i < ranges->length(); i++) {
				5113	uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
				5114	}
				5115	ranges->Clear();
				5116	uset_closeOver(set, USET_CASE_INSENSITIVE);
				5117	// Full case mapping map single characters to multiple characters.
				5118	// Those are represented as strings in the set. Remove them so that
				5119	// we end up with only simple and common case mappings.
				5120	uset_removeAllStrings(set);
				5121	int item_count = uset_getItemCount(set);
				5122	int item_result = 0;
				5123	UErrorCode ec = U_ZERO_ERROR;
				5124	Zone* zone = compiler->zone();
				5125	for (int i = 0; i < item_count; i++) {
				5126	uc32 start = 0;
				5127	uc32 end = 0;
				5128	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
				5129	ranges->Add(CharacterRange::Range(start, end), zone);
				5130	}
				5131	// No errors and everything we collected have been ranges.
				5132	DCHECK_EQ(U_ZERO_ERROR, ec);
				5133	DCHECK_EQ(0, item_result);
				5134	uset_close(set);
				5135	#else
				5136	// Fallback if ICU is not included.
				5137	CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
				5138	ranges, compiler->one_byte());
				5139	#endif // V8_I18N_SUPPORT
				5140	CharacterRange::Canonicalize(ranges);
				5141	}
				5142
				5143
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5144	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				5145	RegExpNode* on_success) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5146	set_.Canonicalize();
				5147	Zone* zone = compiler->zone();
				5148	ZoneList<CharacterRange>* ranges = this->ranges(zone);
				5149	if (compiler->unicode() && compiler->ignore_case()) {
				5150	AddUnicodeCaseEquivalents(compiler, ranges);
				5151	}
				5152	if (compiler->unicode() && !compiler->one_byte()) {
				5153	if (is_negated()) {
				5154	ZoneList<CharacterRange>* negated =
				5155	new (zone) ZoneList<CharacterRange>(2, zone);
				5156	CharacterRange::Negate(ranges, negated, zone);
				5157	ranges = negated;
				5158	}
				5159	if (ranges->length() == 0) {
				5160	// No matches possible.
				5161	return new (zone) EndNode(EndNode::BACKTRACK, zone);
				5162	}
				5163	if (standard_type() == '*') {
				5164	return UnanchoredAdvance(compiler, on_success);
				5165	} else {
				5166	ChoiceNode* result = new (zone) ChoiceNode(2, zone);
				5167	UnicodeRangeSplitter splitter(zone, ranges);
				5168	AddBmpCharacters(compiler, result, on_success, &splitter);
				5169	AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
				5170	AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
				5171	AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
				5172	return result;
				5173	}
				5174	} else {
				5175	return new (zone) TextNode(this, compiler->read_backward(), on_success);
				5176	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5177	}
				5178
				5179
				5180	int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
				5181	RegExpAtom* atom1 = (*a)->AsAtom();
				5182	RegExpAtom* atom2 = (*b)->AsAtom();
				5183	uc16 character1 = atom1->data().at(0);
				5184	uc16 character2 = atom2->data().at(0);
				5185	if (character1 < character2) return -1;
				5186	if (character1 > character2) return 1;
				5187	return 0;
				5188	}
				5189
				5190
				5191	static unibrow::uchar Canonical(
				5192	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5193	unibrow::uchar c) {
				5194	unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
				5195	int length = canonicalize->get(c, '\0', chars);
				5196	DCHECK_LE(length, 1);
				5197	unibrow::uchar canonical = c;
				5198	if (length == 1) canonical = chars[0];
				5199	return canonical;
				5200	}
				5201
				5202
				5203	int CompareFirstCharCaseIndependent(
				5204	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5205	RegExpTree* const* a, RegExpTree* const* b) {
				5206	RegExpAtom* atom1 = (*a)->AsAtom();
				5207	RegExpAtom* atom2 = (*b)->AsAtom();
				5208	unibrow::uchar character1 = atom1->data().at(0);
				5209	unibrow::uchar character2 = atom2->data().at(0);
				5210	if (character1 == character2) return 0;
				5211	if (character1 >= 'a' \|\| character2 >= 'a') {
				5212	character1 = Canonical(canonicalize, character1);
				5213	character2 = Canonical(canonicalize, character2);
				5214	}
				5215	return static_cast<int>(character1) - static_cast<int>(character2);
				5216	}
				5217
				5218
				5219	// We can stable sort runs of atoms, since the order does not matter if they
				5220	// start with different characters.
				5221	// Returns true if any consecutive atoms were found.
				5222	bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
				5223	ZoneList<RegExpTree> alternatives = this->alternatives();
				5224	int length = alternatives->length();
				5225	bool found_consecutive_atoms = false;
				5226	for (int i = 0; i < length; i++) {
				5227	while (i < length) {
				5228	RegExpTree* alternative = alternatives->at(i);
				5229	if (alternative->IsAtom()) break;
				5230	i++;
				5231	}
				5232	// i is length or it is the index of an atom.
				5233	if (i == length) break;
				5234	int first_atom = i;
				5235	i++;
				5236	while (i < length) {
				5237	RegExpTree* alternative = alternatives->at(i);
				5238	if (!alternative->IsAtom()) break;
				5239	i++;
				5240	}
				5241	// Sort atoms to get ones with common prefixes together.
				5242	// This step is more tricky if we are in a case-independent regexp,
				5243	// because it would change /is\|I/ to /I\|is/, and order matters when
				5244	// the regexp parts don't match only disjoint starting points. To fix
				5245	// this we have a version of CompareFirstChar that uses case-
				5246	// independent character classes for comparison.
				5247	DCHECK_LT(first_atom, alternatives->length());
				5248	DCHECK_LE(i, alternatives->length());
				5249	DCHECK_LE(first_atom, i);
				5250	if (compiler->ignore_case()) {
				5251	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5252	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5253	auto compare_closure =
				5254	[canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
				5255	return CompareFirstCharCaseIndependent(canonicalize, a, b);
				5256	};
				5257	alternatives->StableSort(compare_closure, first_atom, i - first_atom);
				5258	} else {
				5259	alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
				5260	}
				5261	if (i - first_atom > 1) found_consecutive_atoms = true;
				5262	}
				5263	return found_consecutive_atoms;
				5264	}
				5265
				5266
				5267	// Optimizes ab\|ac\|az to a(?:b\|c\|d).
				5268	void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
				5269	Zone* zone = compiler->zone();
				5270	ZoneList<RegExpTree> alternatives = this->alternatives();
				5271	int length = alternatives->length();
				5272
				5273	int write_posn = 0;
				5274	int i = 0;
				5275	while (i < length) {
				5276	RegExpTree* alternative = alternatives->at(i);
				5277	if (!alternative->IsAtom()) {
				5278	alternatives->at(write_posn++) = alternatives->at(i);
				5279	i++;
				5280	continue;
				5281	}
				5282	RegExpAtom* atom = alternative->AsAtom();
				5283	unibrow::uchar common_prefix = atom->data().at(0);
				5284	int first_with_prefix = i;
				5285	int prefix_length = atom->length();
				5286	i++;
				5287	while (i < length) {
				5288	alternative = alternatives->at(i);
				5289	if (!alternative->IsAtom()) break;
				5290	atom = alternative->AsAtom();
				5291	unibrow::uchar new_prefix = atom->data().at(0);
				5292	if (new_prefix != common_prefix) {
				5293	if (!compiler->ignore_case()) break;
				5294	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5295	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5296	new_prefix = Canonical(canonicalize, new_prefix);
				5297	common_prefix = Canonical(canonicalize, common_prefix);
				5298	if (new_prefix != common_prefix) break;
				5299	}
				5300	prefix_length = Min(prefix_length, atom->length());
				5301	i++;
				5302	}
				5303	if (i > first_with_prefix + 2) {
				5304	// Found worthwhile run of alternatives with common prefix of at least one
				5305	// character. The sorting function above did not sort on more than one
				5306	// character for reasons of correctness, but there may still be a longer
				5307	// common prefix if the terms were similar or presorted in the input.
				5308	// Find out how long the common prefix is.
				5309	int run_length = i - first_with_prefix;
				5310	atom = alternatives->at(first_with_prefix)->AsAtom();
				5311	for (int j = 1; j < run_length && prefix_length > 1; j++) {
				5312	RegExpAtom* old_atom =
				5313	alternatives->at(j + first_with_prefix)->AsAtom();
				5314	for (int k = 1; k < prefix_length; k++) {
				5315	if (atom->data().at(k) != old_atom->data().at(k)) {
				5316	prefix_length = k;
				5317	break;
				5318	}
				5319	}
				5320	}
				5321	RegExpAtom* prefix =
				5322	new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
				5323	ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);
				5324	pair->Add(prefix, zone);
				5325	ZoneList<RegExpTree> suffixes =
				5326	new (zone) ZoneList<RegExpTree*>(run_length, zone);
				5327	for (int j = 0; j < run_length; j++) {
				5328	RegExpAtom* old_atom =
				5329	alternatives->at(j + first_with_prefix)->AsAtom();
				5330	int len = old_atom->length();
				5331	if (len == prefix_length) {
				5332	suffixes->Add(new (zone) RegExpEmpty(), zone);
				5333	} else {
				5334	RegExpTree* suffix = new (zone) RegExpAtom(
				5335	old_atom->data().SubVector(prefix_length, old_atom->length()));
				5336	suffixes->Add(suffix, zone);
				5337	}
				5338	}
				5339	pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
				5340	alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
				5341	} else {
				5342	// Just copy any non-worthwhile alternatives.
				5343	for (int j = first_with_prefix; j < i; j++) {
				5344	alternatives->at(write_posn++) = alternatives->at(j);
				5345	}
				5346	}
				5347	}
				5348	alternatives->Rewind(write_posn); // Trim end of array.
				5349	}
				5350
				5351
				5352	// Optimizes b\|c\|z to [bcz].
				5353	void RegExpDisjunction::FixSingleCharacterDisjunctions(
				5354	RegExpCompiler* compiler) {
				5355	Zone* zone = compiler->zone();
				5356	ZoneList<RegExpTree> alternatives = this->alternatives();
				5357	int length = alternatives->length();
				5358
				5359	int write_posn = 0;
				5360	int i = 0;
				5361	while (i < length) {
				5362	RegExpTree* alternative = alternatives->at(i);
				5363	if (!alternative->IsAtom()) {
				5364	alternatives->at(write_posn++) = alternatives->at(i);
				5365	i++;
				5366	continue;
				5367	}
				5368	RegExpAtom* atom = alternative->AsAtom();
				5369	if (atom->length() != 1) {
				5370	alternatives->at(write_posn++) = alternatives->at(i);
				5371	i++;
				5372	continue;
				5373	}
				5374	int first_in_run = i;
				5375	i++;
				5376	while (i < length) {
				5377	alternative = alternatives->at(i);
				5378	if (!alternative->IsAtom()) break;
				5379	atom = alternative->AsAtom();
				5380	if (atom->length() != 1) break;
				5381	i++;
				5382	}
				5383	if (i > first_in_run + 1) {
				5384	// Found non-trivial run of single-character alternatives.
				5385	int run_length = i - first_in_run;
				5386	ZoneList<CharacterRange>* ranges =
				5387	new (zone) ZoneList<CharacterRange>(2, zone);
				5388	for (int j = 0; j < run_length; j++) {
				5389	RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
				5390	DCHECK_EQ(old_atom->length(), 1);
				5391	ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
				5392	}
				5393	alternatives->at(write_posn++) =
				5394	new (zone) RegExpCharacterClass(ranges, false);
				5395	} else {
				5396	// Just copy any trivial alternatives.
				5397	for (int j = first_in_run; j < i; j++) {
				5398	alternatives->at(write_posn++) = alternatives->at(j);
				5399	}
				5400	}
				5401	}
				5402	alternatives->Rewind(write_posn); // Trim end of array.
				5403	}
				5404
				5405
				5406	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				5407	RegExpNode* on_success) {
				5408	ZoneList<RegExpTree> alternatives = this->alternatives();
				5409
				5410	if (alternatives->length() > 2) {
				5411	bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
				5412	if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
				5413	FixSingleCharacterDisjunctions(compiler);
				5414	if (alternatives->length() == 1) {
				5415	return alternatives->at(0)->ToNode(compiler, on_success);
				5416	}
				5417	}
				5418
				5419	int length = alternatives->length();
				5420
				5421	ChoiceNode* result =
				5422	new(compiler->zone()) ChoiceNode(length, compiler->zone());
				5423	for (int i = 0; i < length; i++) {
				5424	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				5425	on_success));
				5426	result->AddAlternative(alternative);
				5427	}
				5428	return result;
				5429	}
				5430
				5431
				5432	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				5433	RegExpNode* on_success) {
				5434	return ToNode(min(),
				5435	max(),
				5436	is_greedy(),
				5437	body(),
				5438	compiler,
				5439	on_success);
				5440	}
				5441
				5442
				5443	// Scoped object to keep track of how much we unroll quantifier loops in the
				5444	// regexp graph generator.
				5445	class RegExpExpansionLimiter {
				5446	public:
				5447	static const int kMaxExpansionFactor = 6;
				5448	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				5449	: compiler_(compiler),
				5450	saved_expansion_factor_(compiler->current_expansion_factor()),
				5451	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				5452	DCHECK(factor > 0);
				5453	if (ok_to_expand_) {
				5454	if (factor > kMaxExpansionFactor) {
				5455	// Avoid integer overflow of the current expansion factor.
				5456	ok_to_expand_ = false;
				5457	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				5458	} else {
				5459	int new_factor = saved_expansion_factor_ * factor;
				5460	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				5461	compiler->set_current_expansion_factor(new_factor);
				5462	}
				5463	}
				5464	}
				5465
				5466	~RegExpExpansionLimiter() {
				5467	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				5468	}
				5469
				5470	bool ok_to_expand() { return ok_to_expand_; }
				5471
				5472	private:
				5473	RegExpCompiler* compiler_;
				5474	int saved_expansion_factor_;
				5475	bool ok_to_expand_;
				5476
				5477	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				5478	};
				5479
				5480
				5481	RegExpNode* RegExpQuantifier::ToNode(int min,
				5482	int max,
				5483	bool is_greedy,
				5484	RegExpTree* body,
				5485	RegExpCompiler* compiler,
				5486	RegExpNode* on_success,
				5487	bool not_at_start) {
				5488	// x{f, t} becomes this:
				5489	//
				5490	// (r++)<-.
				5491	// \| `
				5492	// \| (x)
				5493	// v ^
				5494	// (r=0)-->(?)---/ [if r < t]
				5495	// \|
				5496	// [if r >= f] \----> ...
				5497	//
				5498
				5499	// 15.10.2.5 RepeatMatcher algorithm.
				5500	// The parser has already eliminated the case where max is 0. In the case
				5501	// where max_match is zero the parser has removed the quantifier if min was
				5502	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				5503
				5504	// If we know that we cannot match zero length then things are a little
				5505	// simpler since we don't need to make the special zero length match check
				5506	// from step 2.1. If the min and max are small we can unroll a little in
				5507	// this case.
				5508	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				5509	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				5510	if (max == 0) return on_success; // This can happen due to recursion.
				5511	bool body_can_be_empty = (body->min_match() == 0);
				5512	int body_start_reg = RegExpCompiler::kNoRegister;
				5513	Interval capture_registers = body->CaptureRegisters();
				5514	bool needs_capture_clearing = !capture_registers.is_empty();
				5515	Zone* zone = compiler->zone();
				5516
				5517	if (body_can_be_empty) {
				5518	body_start_reg = compiler->AllocateRegister();
				5519	} else if (compiler->optimize() && !needs_capture_clearing) {
				5520	// Only unroll if there are no captures and the body can't be
				5521	// empty.
				5522	{
				5523	RegExpExpansionLimiter limiter(
				5524	compiler, min + ((max != min) ? 1 : 0));
				5525	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				5526	int new_max = (max == kInfinity) ? max : max - min;
				5527	// Recurse once to get the loop or optional matches after the fixed
				5528	// ones.
				5529	RegExpNode* answer = ToNode(
				5530	0, new_max, is_greedy, body, compiler, on_success, true);
				5531	// Unroll the forced matches from 0 to min. This can cause chains of
				5532	// TextNodes (which the parser does not generate). These should be
				5533	// combined if it turns out they hinder good code generation.
				5534	for (int i = 0; i < min; i++) {
				5535	answer = body->ToNode(compiler, answer);
				5536	}
				5537	return answer;
				5538	}
				5539	}
				5540	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				5541	DCHECK(max > 0); // Due to the 'if' above.
				5542	RegExpExpansionLimiter limiter(compiler, max);
				5543	if (limiter.ok_to_expand()) {
				5544	// Unroll the optional matches up to max.
				5545	RegExpNode* answer = on_success;
				5546	for (int i = 0; i < max; i++) {
				5547	ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
				5548	if (is_greedy) {
				5549	alternation->AddAlternative(
				5550	GuardedAlternative(body->ToNode(compiler, answer)));
				5551	alternation->AddAlternative(GuardedAlternative(on_success));
				5552	} else {
				5553	alternation->AddAlternative(GuardedAlternative(on_success));
				5554	alternation->AddAlternative(
				5555	GuardedAlternative(body->ToNode(compiler, answer)));
				5556	}
				5557	answer = alternation;
				5558	if (not_at_start && !compiler->read_backward()) {
				5559	alternation->set_not_at_start();
				5560	}
				5561	}
				5562	return answer;
				5563	}
				5564	}
				5565	}
				5566	bool has_min = min > 0;
				5567	bool has_max = max < RegExpTree::kInfinity;
				5568	bool needs_counter = has_min \|\| has_max;
				5569	int reg_ctr = needs_counter
				5570	? compiler->AllocateRegister()
				5571	: RegExpCompiler::kNoRegister;
				5572	LoopChoiceNode* center = new (zone)
				5573	LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
				5574	if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
				5575	RegExpNode* loop_return = needs_counter
				5576	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				5577	: static_cast<RegExpNode*>(center);
				5578	if (body_can_be_empty) {
				5579	// If the body can be empty we need to check if it was and then
				5580	// backtrack.
				5581	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				5582	reg_ctr,
				5583	min,
				5584	loop_return);
				5585	}
				5586	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				5587	if (body_can_be_empty) {
				5588	// If the body can be empty we need to store the start position
				5589	// so we can bail out if it was empty.
				5590	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				5591	}
				5592	if (needs_capture_clearing) {
				5593	// Before entering the body of this loop we need to clear captures.
				5594	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				5595	}
				5596	GuardedAlternative body_alt(body_node);
				5597	if (has_max) {
				5598	Guard* body_guard =
				5599	new(zone) Guard(reg_ctr, Guard::LT, max);
				5600	body_alt.AddGuard(body_guard, zone);
				5601	}
				5602	GuardedAlternative rest_alt(on_success);
				5603	if (has_min) {
				5604	Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
				5605	rest_alt.AddGuard(rest_guard, zone);
				5606	}
				5607	if (is_greedy) {
				5608	center->AddLoopAlternative(body_alt);
				5609	center->AddContinueAlternative(rest_alt);
				5610	} else {
				5611	center->AddContinueAlternative(rest_alt);
				5612	center->AddLoopAlternative(body_alt);
				5613	}
				5614	if (needs_counter) {
				5615	return ActionNode::SetRegister(reg_ctr, 0, center);
				5616	} else {
				5617	return center;
				5618	}
				5619	}
				5620
				5621
				5622	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				5623	RegExpNode* on_success) {
				5624	NodeInfo info;
				5625	Zone* zone = compiler->zone();
				5626
				5627	switch (assertion_type()) {
				5628	case START_OF_LINE:
				5629	return AssertionNode::AfterNewline(on_success);
				5630	case START_OF_INPUT:
				5631	return AssertionNode::AtStart(on_success);
				5632	case BOUNDARY:
				5633	return AssertionNode::AtBoundary(on_success);
				5634	case NON_BOUNDARY:
				5635	return AssertionNode::AtNonBoundary(on_success);
				5636	case END_OF_INPUT:
				5637	return AssertionNode::AtEnd(on_success);
				5638	case END_OF_LINE: {
				5639	// Compile $ in multiline regexps as an alternation with a positive
				5640	// lookahead in one side and an end-of-input on the other side.
				5641	// We need two registers for the lookahead.
				5642	int stack_pointer_register = compiler->AllocateRegister();
				5643	int position_register = compiler->AllocateRegister();
				5644	// The ChoiceNode to distinguish between a newline and end-of-input.
				5645	ChoiceNode* result = new(zone) ChoiceNode(2, zone);
				5646	// Create a newline atom.
				5647	ZoneList<CharacterRange>* newline_ranges =
				5648	new(zone) ZoneList<CharacterRange>(3, zone);
				5649	CharacterRange::AddClassEscape('n', newline_ranges, zone);
				5650	RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
				5651	TextNode* newline_matcher = new (zone) TextNode(
				5652	newline_atom, false, ActionNode::PositiveSubmatchSuccess(
				5653	stack_pointer_register, position_register,
				5654	0, // No captures inside.
				5655	-1, // Ignored if no captures.
				5656	on_success));
				5657	// Create an end-of-input matcher.
				5658	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				5659	stack_pointer_register,
				5660	position_register,
				5661	newline_matcher);
				5662	// Add the two alternatives to the ChoiceNode.
				5663	GuardedAlternative eol_alternative(end_of_line);
				5664	result->AddAlternative(eol_alternative);
				5665	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				5666	result->AddAlternative(end_alternative);
				5667	return result;
				5668	}
				5669	default:
				5670	UNREACHABLE();
				5671	}
				5672	return on_success;
				5673	}
				5674
				5675
				5676	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				5677	RegExpNode* on_success) {
				5678	return new (compiler->zone())
				5679	BackReferenceNode(RegExpCapture::StartRegister(index()),
				5680	RegExpCapture::EndRegister(index()),
				5681	compiler->read_backward(), on_success);
				5682	}
				5683
				5684
				5685	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				5686	RegExpNode* on_success) {
				5687	return on_success;
				5688	}
				5689
				5690
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5691	RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
				5692	int stack_pointer_register,
				5693	int position_register,
				5694	int capture_register_count,
				5695	int capture_register_start)
				5696	: is_positive_(is_positive),
				5697	on_success_(on_success),
				5698	stack_pointer_register_(stack_pointer_register),
				5699	position_register_(position_register) {
				5700	if (is_positive_) {
				5701	on_match_success_ = ActionNode::PositiveSubmatchSuccess(
				5702	stack_pointer_register, position_register, capture_register_count,
				5703	capture_register_start, on_success_);
				5704	} else {
				5705	Zone* zone = on_success_->zone();
				5706	on_match_success_ = new (zone) NegativeSubmatchSuccess(
				5707	stack_pointer_register, position_register, capture_register_count,
				5708	capture_register_start, zone);
				5709	}
				5710	}
				5711
				5712
				5713	RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
				5714	if (is_positive_) {
				5715	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5716	position_register_, match);
				5717	} else {
				5718	Zone* zone = on_success_->zone();
				5719	// We use a ChoiceNode to represent the negative lookaround. The first
				5720	// alternative is the negative match. On success, the end node backtracks.
				5721	// On failure, the second alternative is tried and leads to success.
				5722	// NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
				5723	// first exit when calculating quick checks.
				5724	ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
				5725	GuardedAlternative(match), GuardedAlternative(on_success_), zone);
				5726	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5727	position_register_, choice_node);
				5728	}
				5729	}
				5730
				5731
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5732	RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
				5733	RegExpNode* on_success) {
				5734	int stack_pointer_register = compiler->AllocateRegister();
				5735	int position_register = compiler->AllocateRegister();
				5736
				5737	const int registers_per_capture = 2;
				5738	const int register_of_first_capture = 2;
				5739	int register_count = capture_count_ * registers_per_capture;
				5740	int register_start =
				5741	register_of_first_capture + capture_from_ * registers_per_capture;
				5742
				5743	RegExpNode* result;
				5744	bool was_reading_backward = compiler->read_backward();
				5745	compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5746	Builder builder(is_positive(), on_success, stack_pointer_register,
				5747	position_register, register_count, register_start);
				5748	RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
				5749	result = builder.ForMatch(match);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5750	compiler->set_read_backward(was_reading_backward);
				5751	return result;
				5752	}
				5753
				5754
				5755	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				5756	RegExpNode* on_success) {
				5757	return ToNode(body(), index(), compiler, on_success);
				5758	}
				5759
				5760
				5761	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				5762	int index,
				5763	RegExpCompiler* compiler,
				5764	RegExpNode* on_success) {
				5765	DCHECK_NOT_NULL(body);
				5766	int start_reg = RegExpCapture::StartRegister(index);
				5767	int end_reg = RegExpCapture::EndRegister(index);
				5768	if (compiler->read_backward()) std::swap(start_reg, end_reg);
				5769	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				5770	RegExpNode* body_node = body->ToNode(compiler, store_end);
				5771	return ActionNode::StorePosition(start_reg, true, body_node);
				5772	}
				5773
				5774
				5775	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				5776	RegExpNode* on_success) {
				5777	ZoneList<RegExpTree> children = nodes();
				5778	RegExpNode* current = on_success;
				5779	if (compiler->read_backward()) {
				5780	for (int i = 0; i < children->length(); i++) {
				5781	current = children->at(i)->ToNode(compiler, current);
				5782	}
				5783	} else {
				5784	for (int i = children->length() - 1; i >= 0; i--) {
				5785	current = children->at(i)->ToNode(compiler, current);
				5786	}
				5787	}
				5788	return current;
				5789	}
				5790
				5791
				5792	static void AddClass(const int* elmv,
				5793	int elmc,
				5794	ZoneList<CharacterRange>* ranges,
				5795	Zone* zone) {
				5796	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5797	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5798	for (int i = 0; i < elmc; i += 2) {
				5799	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5800	ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5801	}
				5802	}
				5803
				5804
				5805	static void AddClassNegated(const int *elmv,
				5806	int elmc,
				5807	ZoneList<CharacterRange>* ranges,
				5808	Zone* zone) {
				5809	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5810	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5811	DCHECK(elmv[0] != 0x0000);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5812	DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5813	uc16 last = 0x0000;
				5814	for (int i = 0; i < elmc; i += 2) {
				5815	DCHECK(last <= elmv[i] - 1);
				5816	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5817	ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5818	last = elmv[i + 1];
				5819	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5820	ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5821	}
				5822
				5823
				5824	void CharacterRange::AddClassEscape(uc16 type,
				5825	ZoneList<CharacterRange>* ranges,
				5826	Zone* zone) {
				5827	switch (type) {
				5828	case 's':
				5829	AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5830	break;
				5831	case 'S':
				5832	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5833	break;
				5834	case 'w':
				5835	AddClass(kWordRanges, kWordRangeCount, ranges, zone);
				5836	break;
				5837	case 'W':
				5838	AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
				5839	break;
				5840	case 'd':
				5841	AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
				5842	break;
				5843	case 'D':
				5844	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
				5845	break;
				5846	case '.':
				5847	AddClassNegated(kLineTerminatorRanges,
				5848	kLineTerminatorRangeCount,
				5849	ranges,
				5850	zone);
				5851	break;
				5852	// This is not a character range as defined by the spec but a
				5853	// convenient shorthand for a character class that matches any
				5854	// character.
				5855	case '*':
				5856	ranges->Add(CharacterRange::Everything(), zone);
				5857	break;
				5858	// This is the set of characters matched by the $ and ^ symbols
				5859	// in multiline mode.
				5860	case 'n':
				5861	AddClass(kLineTerminatorRanges,
				5862	kLineTerminatorRangeCount,
				5863	ranges,
				5864	zone);
				5865	break;
				5866	default:
				5867	UNREACHABLE();
				5868	}
				5869	}
				5870
				5871
				5872	Vector<const int> CharacterRange::GetWordBounds() {
				5873	return Vector<const int>(kWordRanges, kWordRangeCount - 1);
				5874	}
				5875
				5876
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5877	void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
				5878	ZoneList<CharacterRange>* ranges,
				5879	bool is_one_byte) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5880	int range_count = ranges->length();
				5881	for (int i = 0; i < range_count; i++) {
				5882	CharacterRange range = ranges->at(i);
				5883	uc32 bottom = range.from();
				5884	if (bottom > String::kMaxUtf16CodeUnit) return;
				5885	uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
				5886	// Nothing to be done for surrogates.
				5887	if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
				5888	if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
				5889	if (bottom > String::kMaxOneByteCharCode) return;
				5890	if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5891	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5892	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5893	if (top == bottom) {
				5894	// If this is a singleton we just expand the one character.
				5895	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5896	for (int i = 0; i < length; i++) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5897	uc32 chr = chars[i];
				5898	if (chr != bottom) {
				5899	ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5900	}
				5901	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5902	} else {
				5903	// If this is a range we expand the characters block by block, expanding
				5904	// contiguous subranges (blocks) one at a time. The approach is as
				5905	// follows. For a given start character we look up the remainder of the
				5906	// block that contains it (represented by the end point), for instance we
				5907	// find 'z' if the character is 'c'. A block is characterized by the
				5908	// property that all characters uncanonicalize in the same way, except
				5909	// that each entry in the result is incremented by the distance from the
				5910	// first element. So a-z is a block because 'a' uncanonicalizes to ['a',
				5911	// 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
				5912	// we've found the end point we look up its uncanonicalization and
				5913	// produce a range for each element. For instance for [c-f] we look up
				5914	// ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
				5915	// it is not already contained in the input, so [c-f] will be skipped but
				5916	// [C-F] will be added. If this range is not completely contained in a
				5917	// block we do this for all the blocks covered by the range (handling
				5918	// characters that is not in a block as a "singleton block").
				5919	unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5920	int pos = bottom;
				5921	while (pos <= top) {
				5922	int length =
				5923	isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
				5924	uc32 block_end;
				5925	if (length == 0) {
				5926	block_end = pos;
				5927	} else {
				5928	DCHECK_EQ(1, length);
				5929	block_end = equivalents[0];
				5930	}
				5931	int end = (block_end > top) ? top : block_end;
				5932	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
				5933	equivalents);
				5934	for (int i = 0; i < length; i++) {
				5935	uc32 c = equivalents[i];
				5936	uc32 range_from = c - (block_end - pos);
				5937	uc32 range_to = c - (block_end - end);
				5938	if (!(bottom <= range_from && range_to <= top)) {
				5939	ranges->Add(CharacterRange::Range(range_from, range_to), zone);
				5940	}
				5941	}
				5942	pos = end + 1;
				5943	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5944	}
				5945	}
				5946	}
				5947
				5948
				5949	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				5950	DCHECK_NOT_NULL(ranges);
				5951	int n = ranges->length();
				5952	if (n <= 1) return true;
				5953	int max = ranges->at(0).to();
				5954	for (int i = 1; i < n; i++) {
				5955	CharacterRange next_range = ranges->at(i);
				5956	if (next_range.from() <= max + 1) return false;
				5957	max = next_range.to();
				5958	}
				5959	return true;
				5960	}
				5961
				5962
				5963	ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
				5964	if (ranges_ == NULL) {
				5965	ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
				5966	CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
				5967	}
				5968	return ranges_;
				5969	}
				5970
				5971
				5972	// Move a number of elements in a zonelist to another position
				5973	// in the same list. Handles overlapping source and target areas.
				5974	static void MoveRanges(ZoneList<CharacterRange>* list,
				5975	int from,
				5976	int to,
				5977	int count) {
				5978	// Ranges are potentially overlapping.
				5979	if (from < to) {
				5980	for (int i = count - 1; i >= 0; i--) {
				5981	list->at(to + i) = list->at(from + i);
				5982	}
				5983	} else {
				5984	for (int i = 0; i < count; i++) {
				5985	list->at(to + i) = list->at(from + i);
				5986	}
				5987	}
				5988	}
				5989
				5990
				5991	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				5992	int count,
				5993	CharacterRange insert) {
				5994	// Inserts a range into list[0..count[, which must be sorted
				5995	// by from value and non-overlapping and non-adjacent, using at most
				5996	// list[0..count] for the result. Returns the number of resulting
				5997	// canonicalized ranges. Inserting a range may collapse existing ranges into
				5998	// fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	5999	uc32 from = insert.from();
				6000	uc32 to = insert.to();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6001	int start_pos = 0;
				6002	int end_pos = count;
				6003	for (int i = count - 1; i >= 0; i--) {
				6004	CharacterRange current = list->at(i);
				6005	if (current.from() > to + 1) {
				6006	end_pos = i;
				6007	} else if (current.to() + 1 < from) {
				6008	start_pos = i + 1;
				6009	break;
				6010	}
				6011	}
				6012
				6013	// Inserted range overlaps, or is adjacent to, ranges at positions
				6014	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				6015	// not affected by the insertion.
				6016	// If start_pos == end_pos, the range must be inserted before start_pos.
				6017	// if start_pos < end_pos, the entire range from start_pos to end_pos
				6018	// must be merged with the insert range.
				6019
				6020	if (start_pos == end_pos) {
				6021	// Insert between existing ranges at position start_pos.
				6022	if (start_pos < count) {
				6023	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				6024	}
				6025	list->at(start_pos) = insert;
				6026	return count + 1;
				6027	}
				6028	if (start_pos + 1 == end_pos) {
				6029	// Replace single existing range at position start_pos.
				6030	CharacterRange to_replace = list->at(start_pos);
				6031	int new_from = Min(to_replace.from(), from);
				6032	int new_to = Max(to_replace.to(), to);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6033	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6034	return count;
				6035	}
				6036	// Replace a number of existing ranges from start_pos to end_pos - 1.
				6037	// Move the remaining ranges down.
				6038
				6039	int new_from = Min(list->at(start_pos).from(), from);
				6040	int new_to = Max(list->at(end_pos - 1).to(), to);
				6041	if (end_pos < count) {
				6042	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				6043	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6044	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6045	return count - (end_pos - start_pos) + 1;
				6046	}
				6047
				6048
				6049	void CharacterSet::Canonicalize() {
				6050	// Special/default classes are always considered canonical. The result
				6051	// of calling ranges() will be sorted.
				6052	if (ranges_ == NULL) return;
				6053	CharacterRange::Canonicalize(ranges_);
				6054	}
				6055
				6056
				6057	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				6058	if (character_ranges->length() <= 1) return;
				6059	// Check whether ranges are already canonical (increasing, non-overlapping,
				6060	// non-adjacent).
				6061	int n = character_ranges->length();
				6062	int max = character_ranges->at(0).to();
				6063	int i = 1;
				6064	while (i < n) {
				6065	CharacterRange current = character_ranges->at(i);
				6066	if (current.from() <= max + 1) {
				6067	break;
				6068	}
				6069	max = current.to();
				6070	i++;
				6071	}
				6072	// Canonical until the i'th range. If that's all of them, we are done.
				6073	if (i == n) return;
				6074
				6075	// The ranges at index i and forward are not canonicalized. Make them so by
				6076	// doing the equivalent of insertion sort (inserting each into the previous
				6077	// list, in order).
				6078	// Notice that inserting a range can reduce the number of ranges in the
				6079	// result due to combining of adjacent and overlapping ranges.
				6080	int read = i; // Range to insert.
				6081	int num_canonical = i; // Length of canonicalized part of list.
				6082	do {
				6083	num_canonical = InsertRangeInCanonicalList(character_ranges,
				6084	num_canonical,
				6085	character_ranges->at(read));
				6086	read++;
				6087	} while (read < n);
				6088	character_ranges->Rewind(num_canonical);
				6089
				6090	DCHECK(CharacterRange::IsCanonical(character_ranges));
				6091	}
				6092
				6093
				6094	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				6095	ZoneList<CharacterRange>* negated_ranges,
				6096	Zone* zone) {
				6097	DCHECK(CharacterRange::IsCanonical(ranges));
				6098	DCHECK_EQ(0, negated_ranges->length());
				6099	int range_count = ranges->length();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6100	uc32 from = 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6101	int i = 0;
				6102	if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6103	from = ranges->at(0).to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6104	i = 1;
				6105	}
				6106	while (i < range_count) {
				6107	CharacterRange range = ranges->at(i);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6108	negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
				6109	from = range.to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6110	i++;
				6111	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6112	if (from < String::kMaxCodePoint) {
				6113	negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6114	zone);
				6115	}
				6116	}
				6117
				6118
				6119	// -------------------------------------------------------------------
				6120	// Splay tree
				6121
				6122
				6123	OutSet* OutSet::Extend(unsigned value, Zone* zone) {
				6124	if (Get(value))
				6125	return this;
				6126	if (successors(zone) != NULL) {
				6127	for (int i = 0; i < successors(zone)->length(); i++) {
				6128	OutSet* successor = successors(zone)->at(i);
				6129	if (successor->Get(value))
				6130	return successor;
				6131	}
				6132	} else {
				6133	successors_ = new(zone) ZoneList<OutSet*>(2, zone);
				6134	}
				6135	OutSet* result = new(zone) OutSet(first_, remaining_);
				6136	result->Set(value, zone);
				6137	successors(zone)->Add(result, zone);
				6138	return result;
				6139	}
				6140
				6141
				6142	void OutSet::Set(unsigned value, Zone *zone) {
				6143	if (value < kFirstLimit) {
				6144	first_ \|= (1 << value);
				6145	} else {
				6146	if (remaining_ == NULL)
				6147	remaining_ = new(zone) ZoneList<unsigned>(1, zone);
				6148	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				6149	remaining_->Add(value, zone);
				6150	}
				6151	}
				6152
				6153
				6154	bool OutSet::Get(unsigned value) const {
				6155	if (value < kFirstLimit) {
				6156	return (first_ & (1 << value)) != 0;
				6157	} else if (remaining_ == NULL) {
				6158	return false;
				6159	} else {
				6160	return remaining_->Contains(value);
				6161	}
				6162	}
				6163
				6164
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6165	const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6166
				6167
				6168	void DispatchTable::AddRange(CharacterRange full_range, int value,
				6169	Zone* zone) {
				6170	CharacterRange current = full_range;
				6171	if (tree()->is_empty()) {
				6172	// If this is the first range we just insert into the table.
				6173	ZoneSplayTree<Config>::Locator loc;
				6174	bool inserted = tree()->Insert(current.from(), &loc);
				6175	DCHECK(inserted);
				6176	USE(inserted);
				6177	loc.set_value(Entry(current.from(), current.to(),
				6178	empty()->Extend(value, zone)));
				6179	return;
				6180	}
				6181	// First see if there is a range to the left of this one that
				6182	// overlaps.
				6183	ZoneSplayTree<Config>::Locator loc;
				6184	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				6185	Entry* entry = &loc.value();
				6186	// If we've found a range that overlaps with this one, and it
				6187	// starts strictly to the left of this one, we have to fix it
				6188	// because the following code only handles ranges that start on
				6189	// or after the start point of the range we're adding.
				6190	if (entry->from() < current.from() && entry->to() >= current.from()) {
				6191	// Snap the overlapping range in half around the start point of
				6192	// the range we're adding.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6193	CharacterRange left =
				6194	CharacterRange::Range(entry->from(), current.from() - 1);
				6195	CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6196	// The left part of the overlapping range doesn't overlap.
				6197	// Truncate the whole entry to be just the left part.
				6198	entry->set_to(left.to());
				6199	// The right part is the one that overlaps. We add this part
				6200	// to the map and let the next step deal with merging it with
				6201	// the range we're adding.
				6202	ZoneSplayTree<Config>::Locator loc;
				6203	bool inserted = tree()->Insert(right.from(), &loc);
				6204	DCHECK(inserted);
				6205	USE(inserted);
				6206	loc.set_value(Entry(right.from(),
				6207	right.to(),
				6208	entry->out_set()));
				6209	}
				6210	}
				6211	while (current.is_valid()) {
				6212	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				6213	(loc.value().from() <= current.to()) &&
				6214	(loc.value().to() >= current.from())) {
				6215	Entry* entry = &loc.value();
				6216	// We have overlap. If there is space between the start point of
				6217	// the range we're adding and where the overlapping range starts
				6218	// then we have to add a range covering just that space.
				6219	if (current.from() < entry->from()) {
				6220	ZoneSplayTree<Config>::Locator ins;
				6221	bool inserted = tree()->Insert(current.from(), &ins);
				6222	DCHECK(inserted);
				6223	USE(inserted);
				6224	ins.set_value(Entry(current.from(),
				6225	entry->from() - 1,
				6226	empty()->Extend(value, zone)));
				6227	current.set_from(entry->from());
				6228	}
				6229	DCHECK_EQ(current.from(), entry->from());
				6230	// If the overlapping range extends beyond the one we want to add
				6231	// we have to snap the right part off and add it separately.
				6232	if (entry->to() > current.to()) {
				6233	ZoneSplayTree<Config>::Locator ins;
				6234	bool inserted = tree()->Insert(current.to() + 1, &ins);
				6235	DCHECK(inserted);
				6236	USE(inserted);
				6237	ins.set_value(Entry(current.to() + 1,
				6238	entry->to(),
				6239	entry->out_set()));
				6240	entry->set_to(current.to());
				6241	}
				6242	DCHECK(entry->to() <= current.to());
				6243	// The overlapping range is now completely contained by the range
				6244	// we're adding so we can just update it and move the start point
				6245	// of the range we're adding just past it.
				6246	entry->AddValue(value, zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6247	DCHECK(entry->to() + 1 > current.from());
				6248	current.set_from(entry->to() + 1);
				6249	} else {
				6250	// There is no overlap so we can just add the range
				6251	ZoneSplayTree<Config>::Locator ins;
				6252	bool inserted = tree()->Insert(current.from(), &ins);
				6253	DCHECK(inserted);
				6254	USE(inserted);
				6255	ins.set_value(Entry(current.from(),
				6256	current.to(),
				6257	empty()->Extend(value, zone)));
				6258	break;
				6259	}
				6260	}
				6261	}
				6262
				6263
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6264	OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6265	ZoneSplayTree<Config>::Locator loc;
				6266	if (!tree()->FindGreatestLessThan(value, &loc))
				6267	return empty();
				6268	Entry* entry = &loc.value();
				6269	if (value <= entry->to())
				6270	return entry->out_set();
				6271	else
				6272	return empty();
				6273	}
				6274
				6275
				6276	// -------------------------------------------------------------------
				6277	// Analysis
				6278
				6279
				6280	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				6281	StackLimitCheck check(isolate());
				6282	if (check.HasOverflowed()) {
				6283	fail("Stack overflow");
				6284	return;
				6285	}
				6286	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				6287	return;
				6288	that->info()->being_analyzed = true;
				6289	that->Accept(this);
				6290	that->info()->being_analyzed = false;
				6291	that->info()->been_analyzed = true;
				6292	}
				6293
				6294
				6295	void Analysis::VisitEnd(EndNode* that) {
				6296	// nothing to do
				6297	}
				6298
				6299
				6300	void TextNode::CalculateOffsets() {
				6301	int element_count = elements()->length();
				6302	// Set up the offsets of the elements relative to the start. This is a fixed
				6303	// quantity since a TextNode can only contain fixed-width things.
				6304	int cp_offset = 0;
				6305	for (int i = 0; i < element_count; i++) {
				6306	TextElement& elm = elements()->at(i);
				6307	elm.set_cp_offset(cp_offset);
				6308	cp_offset += elm.length();
				6309	}
				6310	}
				6311
				6312
				6313	void Analysis::VisitText(TextNode* that) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6314	if (ignore_case()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6315	that->MakeCaseIndependent(isolate(), is_one_byte_);
				6316	}
				6317	EnsureAnalyzed(that->on_success());
				6318	if (!has_failed()) {
				6319	that->CalculateOffsets();
				6320	}
				6321	}
				6322
				6323
				6324	void Analysis::VisitAction(ActionNode* that) {
				6325	RegExpNode* target = that->on_success();
				6326	EnsureAnalyzed(target);
				6327	if (!has_failed()) {
				6328	// If the next node is interested in what it follows then this node
				6329	// has to be interested too so it can pass the information on.
				6330	that->info()->AddFromFollowing(target->info());
				6331	}
				6332	}
				6333
				6334
				6335	void Analysis::VisitChoice(ChoiceNode* that) {
				6336	NodeInfo* info = that->info();
				6337	for (int i = 0; i < that->alternatives()->length(); i++) {
				6338	RegExpNode* node = that->alternatives()->at(i).node();
				6339	EnsureAnalyzed(node);
				6340	if (has_failed()) return;
				6341	// Anything the following nodes need to know has to be known by
				6342	// this node also, so it can pass it on.
				6343	info->AddFromFollowing(node->info());
				6344	}
				6345	}
				6346
				6347
				6348	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				6349	NodeInfo* info = that->info();
				6350	for (int i = 0; i < that->alternatives()->length(); i++) {
				6351	RegExpNode* node = that->alternatives()->at(i).node();
				6352	if (node != that->loop_node()) {
				6353	EnsureAnalyzed(node);
				6354	if (has_failed()) return;
				6355	info->AddFromFollowing(node->info());
				6356	}
				6357	}
				6358	// Check the loop last since it may need the value of this node
				6359	// to get a correct result.
				6360	EnsureAnalyzed(that->loop_node());
				6361	if (!has_failed()) {
				6362	info->AddFromFollowing(that->loop_node()->info());
				6363	}
				6364	}
				6365
				6366
				6367	void Analysis::VisitBackReference(BackReferenceNode* that) {
				6368	EnsureAnalyzed(that->on_success());
				6369	}
				6370
				6371
				6372	void Analysis::VisitAssertion(AssertionNode* that) {
				6373	EnsureAnalyzed(that->on_success());
				6374	}
				6375
				6376
				6377	void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6378	BoyerMooreLookahead* bm,
				6379	bool not_at_start) {
				6380	// Working out the set of characters that a backreference can match is too
				6381	// hard, so we just say that any character can match.
				6382	bm->SetRest(offset);
				6383	SaveBMInfo(bm, not_at_start, offset);
				6384	}
				6385
				6386
				6387	STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
				6388	RegExpMacroAssembler::kTableSize);
				6389
				6390
				6391	void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6392	BoyerMooreLookahead* bm, bool not_at_start) {
				6393	ZoneList<GuardedAlternative>* alts = alternatives();
				6394	budget = (budget - 1) / alts->length();
				6395	for (int i = 0; i < alts->length(); i++) {
				6396	GuardedAlternative& alt = alts->at(i);
				6397	if (alt.guards() != NULL && alt.guards()->length() != 0) {
				6398	bm->SetRest(offset); // Give up trying to fill in info.
				6399	SaveBMInfo(bm, not_at_start, offset);
				6400	return;
				6401	}
				6402	alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
				6403	}
				6404	SaveBMInfo(bm, not_at_start, offset);
				6405	}
				6406
				6407
				6408	void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
				6409	BoyerMooreLookahead* bm, bool not_at_start) {
				6410	if (initial_offset >= bm->length()) return;
				6411	int offset = initial_offset;
				6412	int max_char = bm->max_char();
				6413	for (int i = 0; i < elements()->length(); i++) {
				6414	if (offset >= bm->length()) {
				6415	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6416	return;
				6417	}
				6418	TextElement text = elements()->at(i);
				6419	if (text.text_type() == TextElement::ATOM) {
				6420	RegExpAtom* atom = text.atom();
				6421	for (int j = 0; j < atom->length(); j++, offset++) {
				6422	if (offset >= bm->length()) {
				6423	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6424	return;
				6425	}
				6426	uc16 character = atom->data()[j];
				6427	if (bm->compiler()->ignore_case()) {
				6428	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				6429	int length = GetCaseIndependentLetters(
				6430	isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
				6431	chars);
				6432	for (int j = 0; j < length; j++) {
				6433	bm->Set(offset, chars[j]);
				6434	}
				6435	} else {
				6436	if (character <= max_char) bm->Set(offset, character);
				6437	}
				6438	}
				6439	} else {
				6440	DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
				6441	RegExpCharacterClass* char_class = text.char_class();
				6442	ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
				6443	if (char_class->is_negated()) {
				6444	bm->SetAll(offset);
				6445	} else {
				6446	for (int k = 0; k < ranges->length(); k++) {
				6447	CharacterRange& range = ranges->at(k);
				6448	if (range.from() > max_char) continue;
				6449	int to = Min(max_char, static_cast<int>(range.to()));
				6450	bm->SetInterval(offset, Interval(range.from(), to));
				6451	}
				6452	}
				6453	offset++;
				6454	}
				6455	}
				6456	if (offset >= bm->length()) {
				6457	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6458	return;
				6459	}
				6460	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
				6461	true); // Not at start after a text node.
				6462	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6463	}
				6464
				6465
				6466	// -------------------------------------------------------------------
				6467	// Dispatch table construction
				6468
				6469
				6470	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				6471	AddRange(CharacterRange::Everything());
				6472	}
				6473
				6474
				6475	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				6476	node->set_being_calculated(true);
				6477	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				6478	for (int i = 0; i < alternatives->length(); i++) {
				6479	set_choice_index(i);
				6480	alternatives->at(i).node()->Accept(this);
				6481	}
				6482	node->set_being_calculated(false);
				6483	}
				6484
				6485
				6486	class AddDispatchRange {
				6487	public:
				6488	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				6489	: constructor_(constructor) { }
				6490	void Call(uc32 from, DispatchTable::Entry entry);
				6491	private:
				6492	DispatchTableConstructor* constructor_;
				6493	};
				6494
				6495
				6496	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6497	constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6498	}
				6499
				6500
				6501	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				6502	if (node->being_calculated())
				6503	return;
				6504	DispatchTable* table = node->GetTable(ignore_case_);
				6505	AddDispatchRange adder(this);
				6506	table->ForEach(&adder);
				6507	}
				6508
				6509
				6510	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				6511	// TODO(160): Find the node that we refer back to and propagate its start
				6512	// set back to here. For now we just accept anything.
				6513	AddRange(CharacterRange::Everything());
				6514	}
				6515
				6516
				6517	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				6518	RegExpNode* target = that->on_success();
				6519	target->Accept(this);
				6520	}
				6521
				6522
				6523	static int CompareRangeByFrom(const CharacterRange* a,
				6524	const CharacterRange* b) {
				6525	return Compare<uc16>(a->from(), b->from());
				6526	}
				6527
				6528
				6529	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				6530	ranges->Sort(CompareRangeByFrom);
				6531	uc16 last = 0;
				6532	for (int i = 0; i < ranges->length(); i++) {
				6533	CharacterRange range = ranges->at(i);
				6534	if (last < range.from())
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6535	AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6536	if (range.to() >= last) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6537	if (range.to() == String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6538	return;
				6539	} else {
				6540	last = range.to() + 1;
				6541	}
				6542	}
				6543	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6544	AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6545	}
				6546
				6547
				6548	void DispatchTableConstructor::VisitText(TextNode* that) {
				6549	TextElement elm = that->elements()->at(0);
				6550	switch (elm.text_type()) {
				6551	case TextElement::ATOM: {
				6552	uc16 c = elm.atom()->data()[0];
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6553	AddRange(CharacterRange::Range(c, c));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6554	break;
				6555	}
				6556	case TextElement::CHAR_CLASS: {
				6557	RegExpCharacterClass* tree = elm.char_class();
				6558	ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
				6559	if (tree->is_negated()) {
				6560	AddInverse(ranges);
				6561	} else {
				6562	for (int i = 0; i < ranges->length(); i++)
				6563	AddRange(ranges->at(i));
				6564	}
				6565	break;
				6566	}
				6567	default: {
				6568	UNIMPLEMENTED();
				6569	}
				6570	}
				6571	}
				6572
				6573
				6574	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				6575	RegExpNode* target = that->on_success();
				6576	target->Accept(this);
				6577	}
				6578
				6579
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6580	RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
				6581	RegExpNode* on_success) {
				6582	// If the regexp matching starts within a surrogate pair, step back
				6583	// to the lead surrogate and start matching from there.
				6584	DCHECK(!compiler->read_backward());
				6585	Zone* zone = compiler->zone();
				6586	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				6587	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				6588	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				6589	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				6590
				6591	ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
				6592
				6593	int stack_register = compiler->UnicodeLookaroundStackRegister();
				6594	int position_register = compiler->UnicodeLookaroundPositionRegister();
				6595	RegExpNode* step_back = TextNode::CreateForCharacterRanges(
				6596	zone, lead_surrogates, true, on_success);
				6597	RegExpLookaround::Builder builder(true, step_back, stack_register,
				6598	position_register);
				6599	RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
				6600	zone, trail_surrogates, false, builder.on_match_success());
				6601
				6602	optional_step_back->AddAlternative(
				6603	GuardedAlternative(builder.ForMatch(match_trail)));
				6604	optional_step_back->AddAlternative(GuardedAlternative(on_success));
				6605
				6606	return optional_step_back;
				6607	}
				6608
				6609
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6610	RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6611	Isolate* isolate, Zone* zone, RegExpCompileData* data,
				6612	JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6613	Handle<String> sample_subject, bool is_one_byte) {
				6614	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				6615	return IrregexpRegExpTooBig(isolate);
				6616	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6617	bool ignore_case = flags & JSRegExp::kIgnoreCase;
				6618	bool is_sticky = flags & JSRegExp::kSticky;
				6619	bool is_global = flags & JSRegExp::kGlobal;
				6620	bool is_unicode = flags & JSRegExp::kUnicode;
				6621	RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6622	is_one_byte);
				6623
				6624	if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
				6625
				6626	// Sample some characters from the middle of the string.
				6627	static const int kSampleSize = 128;
				6628
				6629	sample_subject = String::Flatten(sample_subject);
				6630	int chars_sampled = 0;
				6631	int half_way = (sample_subject->length() - kSampleSize) / 2;
				6632	for (int i = Max(0, half_way);
				6633	i < sample_subject->length() && chars_sampled < kSampleSize;
				6634	i++, chars_sampled++) {
				6635	compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
				6636	}
				6637
				6638	// Wrap the body of the regexp in capture #0.
				6639	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				6640	0,
				6641	&compiler,
				6642	compiler.accept());
				6643	RegExpNode* node = captured_body;
				6644	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				6645	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				6646	int max_length = data->tree->max_match();
				6647	if (!is_start_anchored && !is_sticky) {
				6648	// Add a .*? at the beginning, outside the body capture, unless
				6649	// this expression is anchored at the beginning or sticky.
				6650	RegExpNode* loop_node = RegExpQuantifier::ToNode(
				6651	0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
				6652	&compiler, captured_body, data->contains_anchor);
				6653
				6654	if (data->contains_anchor) {
				6655	// Unroll loop once, to take care of the case that might start
				6656	// at the start of input.
				6657	ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
				6658	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				6659	first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
				6660	new (zone) RegExpCharacterClass('*'), false, loop_node)));
				6661	node = first_step_node;
				6662	} else {
				6663	node = loop_node;
				6664	}
				6665	}
				6666	if (is_one_byte) {
				6667	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6668	// Do it again to propagate the new nodes to places where they were not
				6669	// put because they had not been calculated yet.
				6670	if (node != NULL) {
				6671	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6672	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6673	} else if (compiler.unicode() && (is_global \|\| is_sticky)) {
				6674	node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6675	}
				6676
				6677	if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
				6678	data->node = node;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6679	Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6680	analysis.EnsureAnalyzed(node);
				6681	if (analysis.has_failed()) {
				6682	const char* error_message = analysis.error_message();
				6683	return CompilationResult(isolate, error_message);
				6684	}
				6685
				6686	// Create the correct assembler for the architecture.
				6687	#ifndef V8_INTERPRETED_REGEXP
				6688	// Native regexp implementation.
				6689
				6690	NativeRegExpMacroAssembler::Mode mode =
				6691	is_one_byte ? NativeRegExpMacroAssembler::LATIN1
				6692	: NativeRegExpMacroAssembler::UC16;
				6693
				6694	#if V8_TARGET_ARCH_IA32
				6695	RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
				6696	(data->capture_count + 1) * 2);
				6697	#elif V8_TARGET_ARCH_X64
				6698	RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
				6699	(data->capture_count + 1) * 2);
				6700	#elif V8_TARGET_ARCH_ARM
				6701	RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
				6702	(data->capture_count + 1) * 2);
				6703	#elif V8_TARGET_ARCH_ARM64
				6704	RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
				6705	(data->capture_count + 1) * 2);
				6706	#elif V8_TARGET_ARCH_PPC
				6707	RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
				6708	(data->capture_count + 1) * 2);
				6709	#elif V8_TARGET_ARCH_MIPS
				6710	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6711	(data->capture_count + 1) * 2);
				6712	#elif V8_TARGET_ARCH_MIPS64
				6713	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6714	(data->capture_count + 1) * 2);
				6715	#elif V8_TARGET_ARCH_X87
				6716	RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
				6717	(data->capture_count + 1) * 2);
				6718	#else
				6719	#error "Unsupported architecture"
				6720	#endif
				6721
				6722	#else // V8_INTERPRETED_REGEXP
				6723	// Interpreted regexp implementation.
				6724	EmbeddedVector<byte, 1024> codes;
				6725	RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
				6726	#endif // V8_INTERPRETED_REGEXP
				6727
				6728	macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
				6729
				6730	// Inserted here, instead of in Assembler, because it depends on information
				6731	// in the AST that isn't replicated in the Node structure.
				6732	static const int kMaxBacksearchLimit = 1024;
				6733	if (is_end_anchored &&
				6734	!is_start_anchored &&
				6735	max_length < kMaxBacksearchLimit) {
				6736	macro_assembler.SetCurrentPositionFromEnd(max_length);
				6737	}
				6738
				6739	if (is_global) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame^]	6740	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
				6741	if (data->tree->min_match() > 0) {
				6742	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
				6743	} else if (is_unicode) {
				6744	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
				6745	}
				6746	macro_assembler.set_global_mode(mode);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6747	}
				6748
				6749	return compiler.Assemble(&macro_assembler,
				6750	node,
				6751	data->capture_count,
				6752	pattern);
				6753	}
				6754
				6755
				6756	bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
				6757	Heap* heap = pattern->GetHeap();
				6758	bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
				6759	if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
				6760	heap->isolate()->memory_allocator()->SizeExecutable() >
				6761	RegExpImpl::kRegExpExecutableMemoryLimit) {
				6762	too_much = true;
				6763	}
				6764	return too_much;
				6765	}
				6766
				6767
				6768	Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
				6769	Object* key_pattern,
				6770	FixedArray** last_match_cache,
				6771	ResultsCacheType type) {
				6772	FixedArray* cache;
				6773	if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
				6774	if (type == STRING_SPLIT_SUBSTRINGS) {
				6775	DCHECK(key_pattern->IsString());
				6776	if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
				6777	cache = heap->string_split_cache();
				6778	} else {
				6779	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6780	DCHECK(key_pattern->IsFixedArray());
				6781	cache = heap->regexp_multiple_cache();
				6782	}
				6783
				6784	uint32_t hash = key_string->Hash();
				6785	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6786	~(kArrayEntriesPerCacheEntry - 1));
				6787	if (cache->get(index + kStringOffset) != key_string \|\|
				6788	cache->get(index + kPatternOffset) != key_pattern) {
				6789	index =
				6790	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6791	if (cache->get(index + kStringOffset) != key_string \|\|
				6792	cache->get(index + kPatternOffset) != key_pattern) {
				6793	return Smi::FromInt(0);
				6794	}
				6795	}
				6796
				6797	*last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
				6798	return cache->get(index + kArrayOffset);
				6799	}
				6800
				6801
				6802	void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
				6803	Handle<Object> key_pattern,
				6804	Handle<FixedArray> value_array,
				6805	Handle<FixedArray> last_match_cache,
				6806	ResultsCacheType type) {
				6807	Factory* factory = isolate->factory();
				6808	Handle<FixedArray> cache;
				6809	if (!key_string->IsInternalizedString()) return;
				6810	if (type == STRING_SPLIT_SUBSTRINGS) {
				6811	DCHECK(key_pattern->IsString());
				6812	if (!key_pattern->IsInternalizedString()) return;
				6813	cache = factory->string_split_cache();
				6814	} else {
				6815	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6816	DCHECK(key_pattern->IsFixedArray());
				6817	cache = factory->regexp_multiple_cache();
				6818	}
				6819
				6820	uint32_t hash = key_string->Hash();
				6821	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6822	~(kArrayEntriesPerCacheEntry - 1));
				6823	if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
				6824	cache->set(index + kStringOffset, *key_string);
				6825	cache->set(index + kPatternOffset, *key_pattern);
				6826	cache->set(index + kArrayOffset, *value_array);
				6827	cache->set(index + kLastMatchOffset, *last_match_cache);
				6828	} else {
				6829	uint32_t index2 =
				6830	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6831	if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
				6832	cache->set(index2 + kStringOffset, *key_string);
				6833	cache->set(index2 + kPatternOffset, *key_pattern);
				6834	cache->set(index2 + kArrayOffset, *value_array);
				6835	cache->set(index2 + kLastMatchOffset, *last_match_cache);
				6836	} else {
				6837	cache->set(index2 + kStringOffset, Smi::FromInt(0));
				6838	cache->set(index2 + kPatternOffset, Smi::FromInt(0));
				6839	cache->set(index2 + kArrayOffset, Smi::FromInt(0));
				6840	cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
				6841	cache->set(index + kStringOffset, *key_string);
				6842	cache->set(index + kPatternOffset, *key_pattern);
				6843	cache->set(index + kArrayOffset, *value_array);
				6844	cache->set(index + kLastMatchOffset, *last_match_cache);
				6845	}
				6846	}
				6847	// If the array is a reasonably short list of substrings, convert it into a
				6848	// list of internalized strings.
				6849	if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
				6850	for (int i = 0; i < value_array->length(); i++) {
				6851	Handle<String> str(String::cast(value_array->get(i)), isolate);
				6852	Handle<String> internalized_str = factory->InternalizeString(str);
				6853	value_array->set(i, *internalized_str);
				6854	}
				6855	}
				6856	// Convert backing store to a copy-on-write array.
				6857	value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
				6858	}
				6859
				6860
				6861	void RegExpResultsCache::Clear(FixedArray* cache) {
				6862	for (int i = 0; i < kRegExpResultsCacheSize; i++) {
				6863	cache->set(i, Smi::FromInt(0));
				6864	}
				6865	}
				6866
				6867	} // namespace internal
				6868	} // namespace v8