Blame - src/regexp/jsregexp.cc - fp2-dev/platform/external/v8

blob: 6c50f4e72e6f9d543bd396687c1958c69ab63f1a [file] [log] [blame]

Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2012 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/jsregexp.h"
				6
				7	#include "src/ast/ast.h"
				8	#include "src/base/platform/platform.h"
				9	#include "src/compilation-cache.h"
				10	#include "src/compiler.h"
				11	#include "src/execution.h"
				12	#include "src/factory.h"
				13	#include "src/isolate-inl.h"
				14	#include "src/messages.h"
				15	#include "src/ostreams.h"
				16	#include "src/regexp/interpreter-irregexp.h"
				17	#include "src/regexp/jsregexp-inl.h"
				18	#include "src/regexp/regexp-macro-assembler.h"
				19	#include "src/regexp/regexp-macro-assembler-irregexp.h"
				20	#include "src/regexp/regexp-macro-assembler-tracer.h"
				21	#include "src/regexp/regexp-parser.h"
				22	#include "src/regexp/regexp-stack.h"
				23	#include "src/runtime/runtime.h"
				24	#include "src/splay-tree-inl.h"
				25	#include "src/string-search.h"
				26	#include "src/unicode-decoder.h"
				27
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	28	#ifdef V8_I18N_SUPPORT
				29	#include "unicode/uset.h"
				30	#include "unicode/utypes.h"
				31	#endif // V8_I18N_SUPPORT
				32
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	33	#ifndef V8_INTERPRETED_REGEXP
				34	#if V8_TARGET_ARCH_IA32
				35	#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
				36	#elif V8_TARGET_ARCH_X64
				37	#include "src/regexp/x64/regexp-macro-assembler-x64.h"
				38	#elif V8_TARGET_ARCH_ARM64
				39	#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
				40	#elif V8_TARGET_ARCH_ARM
				41	#include "src/regexp/arm/regexp-macro-assembler-arm.h"
				42	#elif V8_TARGET_ARCH_PPC
				43	#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	44	#elif V8_TARGET_ARCH_S390
				45	#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	46	#elif V8_TARGET_ARCH_MIPS
				47	#include "src/regexp/mips/regexp-macro-assembler-mips.h"
				48	#elif V8_TARGET_ARCH_MIPS64
				49	#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
				50	#elif V8_TARGET_ARCH_X87
				51	#include "src/regexp/x87/regexp-macro-assembler-x87.h"
				52	#else
				53	#error Unsupported target architecture.
				54	#endif
				55	#endif
				56
				57
				58	namespace v8 {
				59	namespace internal {
				60
				61	MUST_USE_RESULT
				62	static inline MaybeHandle<Object> ThrowRegExpException(
				63	Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
				64	Isolate* isolate = re->GetIsolate();
				65	THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
				66	pattern, error_text),
				67	Object);
				68	}
				69
				70
				71	inline void ThrowRegExpException(Handle<JSRegExp> re,
				72	Handle<String> error_text) {
				73	USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
				74	}
				75
				76
				77	ContainedInLattice AddRange(ContainedInLattice containment,
				78	const int* ranges,
				79	int ranges_length,
				80	Interval new_range) {
				81	DCHECK((ranges_length & 1) == 1);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	82	DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	83	if (containment == kLatticeUnknown) return containment;
				84	bool inside = false;
				85	int last = 0;
				86	for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
				87	// Consider the range from last to ranges[i].
				88	// We haven't got to the new range yet.
				89	if (ranges[i] <= new_range.from()) continue;
				90	// New range is wholly inside last-ranges[i]. Note that new_range.to() is
				91	// inclusive, but the values in ranges are not.
				92	if (last <= new_range.from() && new_range.to() < ranges[i]) {
				93	return Combine(containment, inside ? kLatticeIn : kLatticeOut);
				94	}
				95	return kLatticeUnknown;
				96	}
				97	return containment;
				98	}
				99
				100
				101	// More makes code generation slower, less makes V8 benchmark score lower.
				102	const int kMaxLookaheadForBoyerMoore = 8;
				103	// In a 3-character pattern you can maximally step forwards 3 characters
				104	// at a time, which is not always enough to pay for the extra logic.
				105	const int kPatternTooShortForBoyerMoore = 2;
				106
				107
				108	// Identifies the sort of regexps where the regexp engine is faster
				109	// than the code used for atom matches.
				110	static bool HasFewDifferentCharacters(Handle<String> pattern) {
				111	int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
				112	if (length <= kPatternTooShortForBoyerMoore) return false;
				113	const int kMod = 128;
				114	bool character_found[kMod];
				115	int different = 0;
				116	memset(&character_found[0], 0, sizeof(character_found));
				117	for (int i = 0; i < length; i++) {
				118	int ch = (pattern->Get(i) & (kMod - 1));
				119	if (!character_found[ch]) {
				120	character_found[ch] = true;
				121	different++;
				122	// We declare a regexp low-alphabet if it has at least 3 times as many
				123	// characters as it has different characters.
				124	if (different * 3 > length) return false;
				125	}
				126	}
				127	return true;
				128	}
				129
				130
				131	// Generic RegExp methods. Dispatches to implementation specific methods.
				132
				133
				134	MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				135	Handle<String> pattern,
				136	JSRegExp::Flags flags) {
				137	Isolate* isolate = re->GetIsolate();
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	138	Zone zone(isolate->allocator());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	139	CompilationCache* compilation_cache = isolate->compilation_cache();
				140	MaybeHandle<FixedArray> maybe_cached =
				141	compilation_cache->LookupRegExp(pattern, flags);
				142	Handle<FixedArray> cached;
				143	bool in_cache = maybe_cached.ToHandle(&cached);
				144	LOG(isolate, RegExpCompileEvent(re, in_cache));
				145
				146	Handle<Object> result;
				147	if (in_cache) {
				148	re->set_data(*cached);
				149	return re;
				150	}
				151	pattern = String::Flatten(pattern);
				152	PostponeInterruptsScope postpone(isolate);
				153	RegExpCompileData parse_result;
				154	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	155	if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
				156	&parse_result)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	157	// Throw an exception if we fail to parse the pattern.
				158	return ThrowRegExpException(re, pattern, parse_result.error);
				159	}
				160
				161	bool has_been_compiled = false;
				162
				163	if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
				164	!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
				165	// Parse-tree is a single atom that is equal to the pattern.
				166	AtomCompile(re, pattern, flags, pattern);
				167	has_been_compiled = true;
				168	} else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
				169	!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
				170	RegExpAtom* atom = parse_result.tree->AsAtom();
				171	Vector<const uc16> atom_pattern = atom->data();
				172	Handle<String> atom_string;
				173	ASSIGN_RETURN_ON_EXCEPTION(
				174	isolate, atom_string,
				175	isolate->factory()->NewStringFromTwoByte(atom_pattern),
				176	Object);
				177	if (!HasFewDifferentCharacters(atom_string)) {
				178	AtomCompile(re, pattern, flags, atom_string);
				179	has_been_compiled = true;
				180	}
				181	}
				182	if (!has_been_compiled) {
				183	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
				184	}
				185	DCHECK(re->data()->IsFixedArray());
				186	// Compilation succeeded so the data is set on the regexp
				187	// and we can store it in the cache.
				188	Handle<FixedArray> data(FixedArray::cast(re->data()));
				189	compilation_cache->PutRegExp(pattern, flags, data);
				190
				191	return re;
				192	}
				193
				194
				195	MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				196	Handle<String> subject,
				197	int index,
				198	Handle<JSArray> last_match_info) {
				199	switch (regexp->TypeTag()) {
				200	case JSRegExp::ATOM:
				201	return AtomExec(regexp, subject, index, last_match_info);
				202	case JSRegExp::IRREGEXP: {
				203	return IrregexpExec(regexp, subject, index, last_match_info);
				204	}
				205	default:
				206	UNREACHABLE();
				207	return MaybeHandle<Object>();
				208	}
				209	}
				210
				211
				212	// RegExp Atom implementation: Simple string search using indexOf.
				213
				214
				215	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				216	Handle<String> pattern,
				217	JSRegExp::Flags flags,
				218	Handle<String> match_pattern) {
				219	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				220	JSRegExp::ATOM,
				221	pattern,
				222	flags,
				223	match_pattern);
				224	}
				225
				226
				227	static void SetAtomLastCapture(FixedArray* array,
				228	String* subject,
				229	int from,
				230	int to) {
				231	SealHandleScope shs(array->GetIsolate());
				232	RegExpImpl::SetLastCaptureCount(array, 2);
				233	RegExpImpl::SetLastSubject(array, subject);
				234	RegExpImpl::SetLastInput(array, subject);
				235	RegExpImpl::SetCapture(array, 0, from);
				236	RegExpImpl::SetCapture(array, 1, to);
				237	}
				238
				239
				240	int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
				241	Handle<String> subject,
				242	int index,
				243	int32_t* output,
				244	int output_size) {
				245	Isolate* isolate = regexp->GetIsolate();
				246
				247	DCHECK(0 <= index);
				248	DCHECK(index <= subject->length());
				249
				250	subject = String::Flatten(subject);
				251	DisallowHeapAllocation no_gc; // ensure vectors stay valid
				252
				253	String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
				254	int needle_len = needle->length();
				255	DCHECK(needle->IsFlat());
				256	DCHECK_LT(0, needle_len);
				257
				258	if (index + needle_len > subject->length()) {
				259	return RegExpImpl::RE_FAILURE;
				260	}
				261
				262	for (int i = 0; i < output_size; i += 2) {
				263	String::FlatContent needle_content = needle->GetFlatContent();
				264	String::FlatContent subject_content = subject->GetFlatContent();
				265	DCHECK(needle_content.IsFlat());
				266	DCHECK(subject_content.IsFlat());
				267	// dispatch on type of strings
				268	index =
				269	(needle_content.IsOneByte()
				270	? (subject_content.IsOneByte()
				271	? SearchString(isolate, subject_content.ToOneByteVector(),
				272	needle_content.ToOneByteVector(), index)
				273	: SearchString(isolate, subject_content.ToUC16Vector(),
				274	needle_content.ToOneByteVector(), index))
				275	: (subject_content.IsOneByte()
				276	? SearchString(isolate, subject_content.ToOneByteVector(),
				277	needle_content.ToUC16Vector(), index)
				278	: SearchString(isolate, subject_content.ToUC16Vector(),
				279	needle_content.ToUC16Vector(), index)));
				280	if (index == -1) {
				281	return i / 2; // Return number of matches.
				282	} else {
				283	output[i] = index;
				284	output[i+1] = index + needle_len;
				285	index += needle_len;
				286	}
				287	}
				288	return output_size / 2;
				289	}
				290
				291
				292	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				293	Handle<String> subject,
				294	int index,
				295	Handle<JSArray> last_match_info) {
				296	Isolate* isolate = re->GetIsolate();
				297
				298	static const int kNumRegisters = 2;
				299	STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
				300	int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
				301
				302	int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
				303
				304	if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
				305
				306	DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
				307	SealHandleScope shs(isolate);
				308	FixedArray* array = FixedArray::cast(last_match_info->elements());
				309	SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
				310	return last_match_info;
				311	}
				312
				313
				314	// Irregexp implementation.
				315
				316	// Ensures that the regexp object contains a compiled version of the
				317	// source for either one-byte or two-byte subject strings.
				318	// If the compiled version doesn't already exist, it is compiled
				319	// from the source pattern.
				320	// If compilation fails, an exception is thrown and this function
				321	// returns false.
				322	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
				323	Handle<String> sample_subject,
				324	bool is_one_byte) {
				325	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
				326	#ifdef V8_INTERPRETED_REGEXP
				327	if (compiled_code->IsByteArray()) return true;
				328	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				329	if (compiled_code->IsCode()) return true;
				330	#endif
				331	// We could potentially have marked this as flushable, but have kept
				332	// a saved version if we did not flush it yet.
				333	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				334	if (saved_code->IsCode()) {
				335	// Reinstate the code in the original place.
				336	re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
				337	DCHECK(compiled_code->IsSmi());
				338	return true;
				339	}
				340	return CompileIrregexp(re, sample_subject, is_one_byte);
				341	}
				342
				343
				344	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
				345	Handle<String> sample_subject,
				346	bool is_one_byte) {
				347	// Compile the RegExp.
				348	Isolate* isolate = re->GetIsolate();
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	349	Zone zone(isolate->allocator());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	350	PostponeInterruptsScope postpone(isolate);
				351	// If we had a compilation error the last time this is saved at the
				352	// saved code index.
				353	Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
				354	// When arriving here entry can only be a smi, either representing an
				355	// uncompiled regexp, a previous compilation error, or code that has
				356	// been flushed.
				357	DCHECK(entry->IsSmi());
				358	int entry_value = Smi::cast(entry)->value();
				359	DCHECK(entry_value == JSRegExp::kUninitializedValue \|\|
				360	entry_value == JSRegExp::kCompilationErrorValue \|\|
				361	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				362
				363	if (entry_value == JSRegExp::kCompilationErrorValue) {
				364	// A previous compilation failed and threw an error which we store in
				365	// the saved code index (we store the error message, not the actual
				366	// error). Recreate the error object and throw it.
				367	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				368	DCHECK(error_string->IsString());
				369	Handle<String> error_message(String::cast(error_string));
				370	ThrowRegExpException(re, error_message);
				371	return false;
				372	}
				373
				374	JSRegExp::Flags flags = re->GetFlags();
				375
				376	Handle<String> pattern(re->Pattern());
				377	pattern = String::Flatten(pattern);
				378	RegExpCompileData compile_data;
				379	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	380	if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
				381	&compile_data)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	382	// Throw an exception if we fail to parse the pattern.
				383	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				384	USE(ThrowRegExpException(re, pattern, compile_data.error));
				385	return false;
				386	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	387	RegExpEngine::CompilationResult result =
				388	RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
				389	sample_subject, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	390	if (result.error_message != NULL) {
				391	// Unable to compile regexp.
				392	Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
				393	CStrVector(result.error_message)).ToHandleChecked();
				394	ThrowRegExpException(re, error_message);
				395	return false;
				396	}
				397
				398	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				399	data->set(JSRegExp::code_index(is_one_byte), result.code);
				400	int register_max = IrregexpMaxRegisterCount(*data);
				401	if (result.num_registers > register_max) {
				402	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				403	}
				404
				405	return true;
				406	}
				407
				408
				409	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				410	return Smi::cast(
				411	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				412	}
				413
				414
				415	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				416	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				417	}
				418
				419
				420	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				421	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				422	}
				423
				424
				425	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				426	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				427	}
				428
				429
				430	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
				431	return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
				432	}
				433
				434
				435	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
				436	return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
				437	}
				438
				439
				440	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				441	Handle<String> pattern,
				442	JSRegExp::Flags flags,
				443	int capture_count) {
				444	// Initialize compiled code entries to null.
				445	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				446	JSRegExp::IRREGEXP,
				447	pattern,
				448	flags,
				449	capture_count);
				450	}
				451
				452
				453	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				454	Handle<String> subject) {
				455	subject = String::Flatten(subject);
				456
				457	// Check representation of the underlying storage.
				458	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				459	if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
				460
				461	#ifdef V8_INTERPRETED_REGEXP
				462	// Byte-code regexp needs space allocated for all its registers.
				463	// The result captures are copied to the start of the registers array
				464	// if the match succeeds. This way those registers are not clobbered
				465	// when we set the last match info from last successful match.
				466	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
				467	(IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				468	#else // V8_INTERPRETED_REGEXP
				469	// Native regexp only needs room to output captures. Registers are handled
				470	// internally.
				471	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				472	#endif // V8_INTERPRETED_REGEXP
				473	}
				474
				475
				476	int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
				477	Handle<String> subject,
				478	int index,
				479	int32_t* output,
				480	int output_size) {
				481	Isolate* isolate = regexp->GetIsolate();
				482
				483	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
				484
				485	DCHECK(index >= 0);
				486	DCHECK(index <= subject->length());
				487	DCHECK(subject->IsFlat());
				488
				489	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				490
				491	#ifndef V8_INTERPRETED_REGEXP
				492	DCHECK(output_size >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
				493	do {
				494	EnsureCompiledIrregexp(regexp, subject, is_one_byte);
				495	Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
				496	// The stack is used to allocate registers for the compiled regexp code.
				497	// This means that in case of failure, the output registers array is left
				498	// untouched and contains the capture results from the previous successful
				499	// match. We can use that to set the last match info lazily.
				500	NativeRegExpMacroAssembler::Result res =
				501	NativeRegExpMacroAssembler::Match(code,
				502	subject,
				503	output,
				504	output_size,
				505	index,
				506	isolate);
				507	if (res != NativeRegExpMacroAssembler::RETRY) {
				508	DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
				509	isolate->has_pending_exception());
				510	STATIC_ASSERT(
				511	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				512	STATIC_ASSERT(
				513	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				514	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				515	== RE_EXCEPTION);
				516	return static_cast<IrregexpResult>(res);
				517	}
				518	// If result is RETRY, the string has changed representation, and we
				519	// must restart from scratch.
				520	// In this case, it means we must make sure we are prepared to handle
				521	// the, potentially, different subject (the string can switch between
				522	// being internal and external, and even between being Latin1 and UC16,
				523	// but the characters are always the same).
				524	IrregexpPrepare(regexp, subject);
				525	is_one_byte = subject->IsOneByteRepresentationUnderneath();
				526	} while (true);
				527	UNREACHABLE();
				528	return RE_EXCEPTION;
				529	#else // V8_INTERPRETED_REGEXP
				530
				531	DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
				532	// We must have done EnsureCompiledIrregexp, so we can get the number of
				533	// registers.
				534	int number_of_capture_registers =
				535	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				536	int32_t* raw_output = &output[number_of_capture_registers];
				537	// We do not touch the actual capture result registers until we know there
				538	// has been a match so that we can use those capture results to set the
				539	// last match info.
				540	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				541	raw_output[i] = -1;
				542	}
				543	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
				544	isolate);
				545
				546	IrregexpResult result = IrregexpInterpreter::Match(isolate,
				547	byte_codes,
				548	subject,
				549	raw_output,
				550	index);
				551	if (result == RE_SUCCESS) {
				552	// Copy capture results to the start of the registers array.
				553	MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
				554	}
				555	if (result == RE_EXCEPTION) {
				556	DCHECK(!isolate->has_pending_exception());
				557	isolate->StackOverflow();
				558	}
				559	return result;
				560	#endif // V8_INTERPRETED_REGEXP
				561	}
				562
				563
				564	MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
				565	Handle<String> subject,
				566	int previous_index,
				567	Handle<JSArray> last_match_info) {
				568	Isolate* isolate = regexp->GetIsolate();
				569	DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
				570
				571	// Prepare space for the return values.
				572	#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
				573	if (FLAG_trace_regexp_bytecodes) {
				574	String* pattern = regexp->Pattern();
				575	PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
				576	PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
				577	}
				578	#endif
				579	int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
				580	if (required_registers < 0) {
				581	// Compiling failed with an exception.
				582	DCHECK(isolate->has_pending_exception());
				583	return MaybeHandle<Object>();
				584	}
				585
				586	int32_t* output_registers = NULL;
				587	if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				588	output_registers = NewArray<int32_t>(required_registers);
				589	}
				590	base::SmartArrayPointer<int32_t> auto_release(output_registers);
				591	if (output_registers == NULL) {
				592	output_registers = isolate->jsregexp_static_offsets_vector();
				593	}
				594
				595	int res = RegExpImpl::IrregexpExecRaw(
				596	regexp, subject, previous_index, output_registers, required_registers);
				597	if (res == RE_SUCCESS) {
				598	int capture_count =
				599	IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
				600	return SetLastMatchInfo(
				601	last_match_info, subject, capture_count, output_registers);
				602	}
				603	if (res == RE_EXCEPTION) {
				604	DCHECK(isolate->has_pending_exception());
				605	return MaybeHandle<Object>();
				606	}
				607	DCHECK(res == RE_FAILURE);
				608	return isolate->factory()->null_value();
				609	}
				610
				611
				612	static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
				613	if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
				614	JSArray::SetLength(array, minimum_size);
				615	}
				616	}
				617
				618
				619	Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
				620	Handle<String> subject,
				621	int capture_count,
				622	int32_t* match) {
				623	DCHECK(last_match_info->HasFastObjectElements());
				624	int capture_register_count = (capture_count + 1) * 2;
				625	EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
				626	DisallowHeapAllocation no_allocation;
				627	FixedArray* array = FixedArray::cast(last_match_info->elements());
				628	if (match != NULL) {
				629	for (int i = 0; i < capture_register_count; i += 2) {
				630	SetCapture(array, i, match[i]);
				631	SetCapture(array, i + 1, match[i + 1]);
				632	}
				633	}
				634	SetLastCaptureCount(array, capture_register_count);
				635	SetLastSubject(array, *subject);
				636	SetLastInput(array, *subject);
				637	return last_match_info;
				638	}
				639
				640
				641	RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
				642	Handle<String> subject,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	643	Isolate* isolate)
				644	: register_array_(NULL),
				645	register_array_size_(0),
				646	regexp_(regexp),
				647	subject_(subject) {
				648	#ifdef V8_INTERPRETED_REGEXP
				649	bool interpreted = true;
				650	#else
				651	bool interpreted = false;
				652	#endif // V8_INTERPRETED_REGEXP
				653
				654	if (regexp_->TypeTag() == JSRegExp::ATOM) {
				655	static const int kAtomRegistersPerMatch = 2;
				656	registers_per_match_ = kAtomRegistersPerMatch;
				657	// There is no distinction between interpreted and native for atom regexps.
				658	interpreted = false;
				659	} else {
				660	registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
				661	if (registers_per_match_ < 0) {
				662	num_matches_ = -1; // Signal exception.
				663	return;
				664	}
				665	}
				666
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	667	DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
				668	if (!interpreted) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	669	register_array_size_ =
				670	Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
				671	max_matches_ = register_array_size_ / registers_per_match_;
				672	} else {
				673	// Global loop in interpreted regexp is not implemented. We choose
				674	// the size of the offsets vector so that it can only store one match.
				675	register_array_size_ = registers_per_match_;
				676	max_matches_ = 1;
				677	}
				678
				679	if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				680	register_array_ = NewArray<int32_t>(register_array_size_);
				681	} else {
				682	register_array_ = isolate->jsregexp_static_offsets_vector();
				683	}
				684
				685	// Set state so that fetching the results the first time triggers a call
				686	// to the compiled regexp.
				687	current_match_index_ = max_matches_ - 1;
				688	num_matches_ = max_matches_;
				689	DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
				690	DCHECK_GE(register_array_size_, registers_per_match_);
				691	int32_t* last_match =
				692	&register_array_[current_match_index_ * registers_per_match_];
				693	last_match[0] = -1;
				694	last_match[1] = 0;
				695	}
				696
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	697	int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
				698	if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
				699	last_index + 1 < subject_->length() &&
				700	unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
				701	unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
				702	// Advance over the surrogate pair.
				703	return last_index + 2;
				704	}
				705	return last_index + 1;
				706	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	707
				708	// -------------------------------------------------------------------
				709	// Implementation of the Irregexp regular expression engine.
				710	//
				711	// The Irregexp regular expression engine is intended to be a complete
				712	// implementation of ECMAScript regular expressions. It generates either
				713	// bytecodes or native code.
				714
				715	// The Irregexp regexp engine is structured in three steps.
				716	// 1) The parser generates an abstract syntax tree. See ast.cc.
				717	// 2) From the AST a node network is created. The nodes are all
				718	// subclasses of RegExpNode. The nodes represent states when
				719	// executing a regular expression. Several optimizations are
				720	// performed on the node network.
				721	// 3) From the nodes we generate either byte codes or native code
				722	// that can actually execute the regular expression (perform
				723	// the search). The code generation step is described in more
				724	// detail below.
				725
				726	// Code generation.
				727	//
				728	// The nodes are divided into four main categories.
				729	// * Choice nodes
				730	// These represent places where the regular expression can
				731	// match in more than one way. For example on entry to an
				732	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				733	// * Action nodes
				734	// These represent places where some action should be
				735	// performed. Examples include recording the current position
				736	// in the input string to a register (in order to implement
				737	// captures) or other actions on register for example in order
				738	// to implement the counters needed for {} repetitions.
				739	// * Matching nodes
				740	// These attempt to match some element part of the input string.
				741	// Examples of elements include character classes, plain strings
				742	// or back references.
				743	// * End nodes
				744	// These are used to implement the actions required on finding
				745	// a successful match or failing to find a match.
				746	//
				747	// The code generated (whether as byte codes or native code) maintains
				748	// some state as it runs. This consists of the following elements:
				749	//
				750	// * The capture registers. Used for string captures.
				751	// * Other registers. Used for counters etc.
				752	// * The current position.
				753	// * The stack of backtracking information. Used when a matching node
				754	// fails to find a match and needs to try an alternative.
				755	//
				756	// Conceptual regular expression execution model:
				757	//
				758	// There is a simple conceptual model of regular expression execution
				759	// which will be presented first. The actual code generated is a more
				760	// efficient simulation of the simple conceptual model:
				761	//
				762	// * Choice nodes are implemented as follows:
				763	// For each choice except the last {
				764	// push current position
				765	// push backtrack code location
				766	// <generate code to test for choice>
				767	// backtrack code location:
				768	// pop current position
				769	// }
				770	// <generate code to test for last choice>
				771	//
				772	// * Actions nodes are generated as follows
				773	// <push affected registers on backtrack stack>
				774	// <generate code to perform action>
				775	// push backtrack code location
				776	// <generate code to test for following nodes>
				777	// backtrack code location:
				778	// <pop affected registers to restore their state>
				779	// <pop backtrack location from stack and go to it>
				780	//
				781	// * Matching nodes are generated as follows:
				782	// if input string matches at current position
				783	// update current position
				784	// <generate code to test for following nodes>
				785	// else
				786	// <pop backtrack location from stack and go to it>
				787	//
				788	// Thus it can be seen that the current position is saved and restored
				789	// by the choice nodes, whereas the registers are saved and restored by
				790	// by the action nodes that manipulate them.
				791	//
				792	// The other interesting aspect of this model is that nodes are generated
				793	// at the point where they are needed by a recursive call to Emit(). If
				794	// the node has already been code generated then the Emit() call will
				795	// generate a jump to the previously generated code instead. In order to
				796	// limit recursion it is possible for the Emit() function to put the node
				797	// on a work list for later generation and instead generate a jump. The
				798	// destination of the jump is resolved later when the code is generated.
				799	//
				800	// Actual regular expression code generation.
				801	//
				802	// Code generation is actually more complicated than the above. In order
				803	// to improve the efficiency of the generated code some optimizations are
				804	// performed
				805	//
				806	// * Choice nodes have 1-character lookahead.
				807	// A choice node looks at the following character and eliminates some of
				808	// the choices immediately based on that character. This is not yet
				809	// implemented.
				810	// * Simple greedy loops store reduced backtracking information.
				811	// A quantifier like /.*foo/m will greedily match the whole input. It will
				812	// then need to backtrack to a point where it can match "foo". The naive
				813	// implementation of this would push each character position onto the
				814	// backtracking stack, then pop them off one by one. This would use space
				815	// proportional to the length of the input string. However since the "."
				816	// can only match in one way and always has a constant length (in this case
				817	// of 1) it suffices to store the current position on the top of the stack
				818	// once. Matching now becomes merely incrementing the current position and
				819	// backtracking becomes decrementing the current position and checking the
				820	// result against the stored current position. This is faster and saves
				821	// space.
				822	// * The current state is virtualized.
				823	// This is used to defer expensive operations until it is clear that they
				824	// are needed and to generate code for a node more than once, allowing
				825	// specialized an efficient versions of the code to be created. This is
				826	// explained in the section below.
				827	//
				828	// Execution state virtualization.
				829	//
				830	// Instead of emitting code, nodes that manipulate the state can record their
				831	// manipulation in an object called the Trace. The Trace object can record a
				832	// current position offset, an optional backtrack code location on the top of
				833	// the virtualized backtrack stack and some register changes. When a node is
				834	// to be emitted it can flush the Trace or update it. Flushing the Trace
				835	// will emit code to bring the actual state into line with the virtual state.
				836	// Avoiding flushing the state can postpone some work (e.g. updates of capture
				837	// registers). Postponing work can save time when executing the regular
				838	// expression since it may be found that the work never has to be done as a
				839	// failure to match can occur. In addition it is much faster to jump to a
				840	// known backtrack code location than it is to pop an unknown backtrack
				841	// location from the stack and jump there.
				842	//
				843	// The virtual state found in the Trace affects code generation. For example
				844	// the virtual state contains the difference between the actual current
				845	// position and the virtual current position, and matching code needs to use
				846	// this offset to attempt a match in the correct location of the input
				847	// string. Therefore code generated for a non-trivial trace is specialized
				848	// to that trace. The code generator therefore has the ability to generate
				849	// code for each node several times. In order to limit the size of the
				850	// generated code there is an arbitrary limit on how many specialized sets of
				851	// code may be generated for a given node. If the limit is reached, the
				852	// trace is flushed and a generic version of the code for a node is emitted.
				853	// This is subsequently used for that node. The code emitted for non-generic
				854	// trace is not recorded in the node and so it cannot currently be reused in
				855	// the event that code generation is requested for an identical trace.
				856
				857
				858	void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
				859	UNREACHABLE();
				860	}
				861
				862
				863	void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
				864	text->AddElement(TextElement::Atom(this), zone);
				865	}
				866
				867
				868	void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
				869	text->AddElement(TextElement::CharClass(this), zone);
				870	}
				871
				872
				873	void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
				874	for (int i = 0; i < elements()->length(); i++)
				875	text->AddElement(elements()->at(i), zone);
				876	}
				877
				878
				879	TextElement TextElement::Atom(RegExpAtom* atom) {
				880	return TextElement(ATOM, atom);
				881	}
				882
				883
				884	TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
				885	return TextElement(CHAR_CLASS, char_class);
				886	}
				887
				888
				889	int TextElement::length() const {
				890	switch (text_type()) {
				891	case ATOM:
				892	return atom()->length();
				893
				894	case CHAR_CLASS:
				895	return 1;
				896	}
				897	UNREACHABLE();
				898	return 0;
				899	}
				900
				901
				902	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				903	if (table_ == NULL) {
				904	table_ = new(zone()) DispatchTable(zone());
				905	DispatchTableConstructor cons(table_, ignore_case, zone());
				906	cons.BuildTable(this);
				907	}
				908	return table_;
				909	}
				910
				911
				912	class FrequencyCollator {
				913	public:
				914	FrequencyCollator() : total_samples_(0) {
				915	for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
				916	frequencies_[i] = CharacterFrequency(i);
				917	}
				918	}
				919
				920	void CountCharacter(int character) {
				921	int index = (character & RegExpMacroAssembler::kTableMask);
				922	frequencies_[index].Increment();
				923	total_samples_++;
				924	}
				925
				926	// Does not measure in percent, but rather per-128 (the table size from the
				927	// regexp macro assembler).
				928	int Frequency(int in_character) {
				929	DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
				930	if (total_samples_ < 1) return 1; // Division by zero.
				931	int freq_in_per128 =
				932	(frequencies_[in_character].counter() * 128) / total_samples_;
				933	return freq_in_per128;
				934	}
				935
				936	private:
				937	class CharacterFrequency {
				938	public:
				939	CharacterFrequency() : counter_(0), character_(-1) { }
				940	explicit CharacterFrequency(int character)
				941	: counter_(0), character_(character) { }
				942
				943	void Increment() { counter_++; }
				944	int counter() { return counter_; }
				945	int character() { return character_; }
				946
				947	private:
				948	int counter_;
				949	int character_;
				950	};
				951
				952
				953	private:
				954	CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
				955	int total_samples_;
				956	};
				957
				958
				959	class RegExpCompiler {
				960	public:
				961	RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	962	JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	963
				964	int AllocateRegister() {
				965	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				966	reg_exp_too_big_ = true;
				967	return next_register_;
				968	}
				969	return next_register_++;
				970	}
				971
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	972	// Lookarounds to match lone surrogates for unicode character class matches
				973	// are never nested. We can therefore reuse registers.
				974	int UnicodeLookaroundStackRegister() {
				975	if (unicode_lookaround_stack_register_ == kNoRegister) {
				976	unicode_lookaround_stack_register_ = AllocateRegister();
				977	}
				978	return unicode_lookaround_stack_register_;
				979	}
				980
				981	int UnicodeLookaroundPositionRegister() {
				982	if (unicode_lookaround_position_register_ == kNoRegister) {
				983	unicode_lookaround_position_register_ = AllocateRegister();
				984	}
				985	return unicode_lookaround_position_register_;
				986	}
				987
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	988	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				989	RegExpNode* start,
				990	int capture_count,
				991	Handle<String> pattern);
				992
				993	inline void AddWork(RegExpNode* node) {
				994	if (!node->on_work_list() && !node->label()->is_bound()) {
				995	node->set_on_work_list(true);
				996	work_list_->Add(node);
				997	}
				998	}
				999
				1000	static const int kImplementationOffset = 0;
				1001	static const int kNumberOfRegistersOffset = 0;
				1002	static const int kCodeOffset = 1;
				1003
				1004	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				1005	EndNode* accept() { return accept_; }
				1006
				1007	static const int kMaxRecursion = 100;
				1008	inline int recursion_depth() { return recursion_depth_; }
				1009	inline void IncrementRecursionDepth() { recursion_depth_++; }
				1010	inline void DecrementRecursionDepth() { recursion_depth_--; }
				1011
				1012	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				1013
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1014	inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
				1015	inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1016	inline bool one_byte() { return one_byte_; }
				1017	inline bool optimize() { return optimize_; }
				1018	inline void set_optimize(bool value) { optimize_ = value; }
				1019	inline bool limiting_recursion() { return limiting_recursion_; }
				1020	inline void set_limiting_recursion(bool value) {
				1021	limiting_recursion_ = value;
				1022	}
				1023	bool read_backward() { return read_backward_; }
				1024	void set_read_backward(bool value) { read_backward_ = value; }
				1025	FrequencyCollator* frequency_collator() { return &frequency_collator_; }
				1026
				1027	int current_expansion_factor() { return current_expansion_factor_; }
				1028	void set_current_expansion_factor(int value) {
				1029	current_expansion_factor_ = value;
				1030	}
				1031
				1032	Isolate* isolate() const { return isolate_; }
				1033	Zone* zone() const { return zone_; }
				1034
				1035	static const int kNoRegister = -1;
				1036
				1037	private:
				1038	EndNode* accept_;
				1039	int next_register_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1040	int unicode_lookaround_stack_register_;
				1041	int unicode_lookaround_position_register_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1042	List<RegExpNode> work_list_;
				1043	int recursion_depth_;
				1044	RegExpMacroAssembler* macro_assembler_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1045	JSRegExp::Flags flags_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1046	bool one_byte_;
				1047	bool reg_exp_too_big_;
				1048	bool limiting_recursion_;
				1049	bool optimize_;
				1050	bool read_backward_;
				1051	int current_expansion_factor_;
				1052	FrequencyCollator frequency_collator_;
				1053	Isolate* isolate_;
				1054	Zone* zone_;
				1055	};
				1056
				1057
				1058	class RecursionCheck {
				1059	public:
				1060	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				1061	compiler->IncrementRecursionDepth();
				1062	}
				1063	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				1064	private:
				1065	RegExpCompiler* compiler_;
				1066	};
				1067
				1068
				1069	static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
				1070	return RegExpEngine::CompilationResult(isolate, "RegExp too big");
				1071	}
				1072
				1073
				1074	// Attempts to compile the regexp using an Irregexp code generator. Returns
				1075	// a fixed array or a null handle depending on whether it succeeded.
				1076	RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1077	JSRegExp::Flags flags, bool one_byte)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1078	: next_register_(2 * (capture_count + 1)),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1079	unicode_lookaround_stack_register_(kNoRegister),
				1080	unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1081	work_list_(NULL),
				1082	recursion_depth_(0),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1083	flags_(flags),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1084	one_byte_(one_byte),
				1085	reg_exp_too_big_(false),
				1086	limiting_recursion_(false),
				1087	optimize_(FLAG_regexp_optimization),
				1088	read_backward_(false),
				1089	current_expansion_factor_(1),
				1090	frequency_collator_(),
				1091	isolate_(isolate),
				1092	zone_(zone) {
				1093	accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
				1094	DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				1095	}
				1096
				1097
				1098	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				1099	RegExpMacroAssembler* macro_assembler,
				1100	RegExpNode* start,
				1101	int capture_count,
				1102	Handle<String> pattern) {
				1103	Heap* heap = pattern->GetHeap();
				1104
				1105	#ifdef DEBUG
				1106	if (FLAG_trace_regexp_assembler)
				1107	macro_assembler_ =
				1108	new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
				1109	else
				1110	#endif
				1111	macro_assembler_ = macro_assembler;
				1112
				1113	List <RegExpNode*> work_list(0);
				1114	work_list_ = &work_list;
				1115	Label fail;
				1116	macro_assembler_->PushBacktrack(&fail);
				1117	Trace new_trace;
				1118	start->Emit(this, &new_trace);
				1119	macro_assembler_->Bind(&fail);
				1120	macro_assembler_->Fail();
				1121	while (!work_list.is_empty()) {
				1122	RegExpNode* node = work_list.RemoveLast();
				1123	node->set_on_work_list(false);
				1124	if (!node->label()->is_bound()) node->Emit(this, &new_trace);
				1125	}
				1126	if (reg_exp_too_big_) {
				1127	macro_assembler_->AbortedCodeGeneration();
				1128	return IrregexpRegExpTooBig(isolate_);
				1129	}
				1130
				1131	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				1132	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
				1133	work_list_ = NULL;
				1134	#ifdef ENABLE_DISASSEMBLER
				1135	if (FLAG_print_code) {
				1136	CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
				1137	OFStream os(trace_scope.file());
				1138	Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
				1139	}
				1140	#endif
				1141	#ifdef DEBUG
				1142	if (FLAG_trace_regexp_assembler) {
				1143	delete macro_assembler_;
				1144	}
				1145	#endif
				1146	return RegExpEngine::CompilationResult(*code, next_register_);
				1147	}
				1148
				1149
				1150	bool Trace::DeferredAction::Mentions(int that) {
				1151	if (action_type() == ActionNode::CLEAR_CAPTURES) {
				1152	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				1153	return range.Contains(that);
				1154	} else {
				1155	return reg() == that;
				1156	}
				1157	}
				1158
				1159
				1160	bool Trace::mentions_reg(int reg) {
				1161	for (DeferredAction* action = actions_;
				1162	action != NULL;
				1163	action = action->next()) {
				1164	if (action->Mentions(reg))
				1165	return true;
				1166	}
				1167	return false;
				1168	}
				1169
				1170
				1171	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				1172	DCHECK_EQ(0, *cp_offset);
				1173	for (DeferredAction* action = actions_;
				1174	action != NULL;
				1175	action = action->next()) {
				1176	if (action->Mentions(reg)) {
				1177	if (action->action_type() == ActionNode::STORE_POSITION) {
				1178	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				1179	return true;
				1180	} else {
				1181	return false;
				1182	}
				1183	}
				1184	}
				1185	return false;
				1186	}
				1187
				1188
				1189	int Trace::FindAffectedRegisters(OutSet* affected_registers,
				1190	Zone* zone) {
				1191	int max_register = RegExpCompiler::kNoRegister;
				1192	for (DeferredAction* action = actions_;
				1193	action != NULL;
				1194	action = action->next()) {
				1195	if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
				1196	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				1197	for (int i = range.from(); i <= range.to(); i++)
				1198	affected_registers->Set(i, zone);
				1199	if (range.to() > max_register) max_register = range.to();
				1200	} else {
				1201	affected_registers->Set(action->reg(), zone);
				1202	if (action->reg() > max_register) max_register = action->reg();
				1203	}
				1204	}
				1205	return max_register;
				1206	}
				1207
				1208
				1209	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				1210	int max_register,
				1211	const OutSet& registers_to_pop,
				1212	const OutSet& registers_to_clear) {
				1213	for (int reg = max_register; reg >= 0; reg--) {
				1214	if (registers_to_pop.Get(reg)) {
				1215	assembler->PopRegister(reg);
				1216	} else if (registers_to_clear.Get(reg)) {
				1217	int clear_to = reg;
				1218	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				1219	reg--;
				1220	}
				1221	assembler->ClearRegisters(reg, clear_to);
				1222	}
				1223	}
				1224	}
				1225
				1226
				1227	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1228	int max_register,
				1229	const OutSet& affected_registers,
				1230	OutSet* registers_to_pop,
				1231	OutSet* registers_to_clear,
				1232	Zone* zone) {
				1233	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1234	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1235
				1236	// Count pushes performed to force a stack limit check occasionally.
				1237	int pushes = 0;
				1238
				1239	for (int reg = 0; reg <= max_register; reg++) {
				1240	if (!affected_registers.Get(reg)) {
				1241	continue;
				1242	}
				1243
				1244	// The chronologically first deferred action in the trace
				1245	// is used to infer the action needed to restore a register
				1246	// to its previous state (or not, if it's safe to ignore it).
				1247	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1248	DeferredActionUndoType undo_action = IGNORE;
				1249
				1250	int value = 0;
				1251	bool absolute = false;
				1252	bool clear = false;
				1253	static const int kNoStore = kMinInt;
				1254	int store_position = kNoStore;
				1255	// This is a little tricky because we are scanning the actions in reverse
				1256	// historical order (newest first).
				1257	for (DeferredAction* action = actions_;
				1258	action != NULL;
				1259	action = action->next()) {
				1260	if (action->Mentions(reg)) {
				1261	switch (action->action_type()) {
				1262	case ActionNode::SET_REGISTER: {
				1263	Trace::DeferredSetRegister* psr =
				1264	static_cast<Trace::DeferredSetRegister*>(action);
				1265	if (!absolute) {
				1266	value += psr->value();
				1267	absolute = true;
				1268	}
				1269	// SET_REGISTER is currently only used for newly introduced loop
				1270	// counters. They can have a significant previous value if they
				1271	// occour in a loop. TODO(lrn): Propagate this information, so
				1272	// we can set undo_action to IGNORE if we know there is no value to
				1273	// restore.
				1274	undo_action = RESTORE;
				1275	DCHECK_EQ(store_position, kNoStore);
				1276	DCHECK(!clear);
				1277	break;
				1278	}
				1279	case ActionNode::INCREMENT_REGISTER:
				1280	if (!absolute) {
				1281	value++;
				1282	}
				1283	DCHECK_EQ(store_position, kNoStore);
				1284	DCHECK(!clear);
				1285	undo_action = RESTORE;
				1286	break;
				1287	case ActionNode::STORE_POSITION: {
				1288	Trace::DeferredCapture* pc =
				1289	static_cast<Trace::DeferredCapture*>(action);
				1290	if (!clear && store_position == kNoStore) {
				1291	store_position = pc->cp_offset();
				1292	}
				1293
				1294	// For captures we know that stores and clears alternate.
				1295	// Other register, are never cleared, and if the occur
				1296	// inside a loop, they might be assigned more than once.
				1297	if (reg <= 1) {
				1298	// Registers zero and one, aka "capture zero", is
				1299	// always set correctly if we succeed. There is no
				1300	// need to undo a setting on backtrack, because we
				1301	// will set it again or fail.
				1302	undo_action = IGNORE;
				1303	} else {
				1304	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1305	}
				1306	DCHECK(!absolute);
				1307	DCHECK_EQ(value, 0);
				1308	break;
				1309	}
				1310	case ActionNode::CLEAR_CAPTURES: {
				1311	// Since we're scanning in reverse order, if we've already
				1312	// set the position we have to ignore historically earlier
				1313	// clearing operations.
				1314	if (store_position == kNoStore) {
				1315	clear = true;
				1316	}
				1317	undo_action = RESTORE;
				1318	DCHECK(!absolute);
				1319	DCHECK_EQ(value, 0);
				1320	break;
				1321	}
				1322	default:
				1323	UNREACHABLE();
				1324	break;
				1325	}
				1326	}
				1327	}
				1328	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1329	if (undo_action == RESTORE) {
				1330	pushes++;
				1331	RegExpMacroAssembler::StackCheckFlag stack_check =
				1332	RegExpMacroAssembler::kNoStackLimitCheck;
				1333	if (pushes == push_limit) {
				1334	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1335	pushes = 0;
				1336	}
				1337
				1338	assembler->PushRegister(reg, stack_check);
				1339	registers_to_pop->Set(reg, zone);
				1340	} else if (undo_action == CLEAR) {
				1341	registers_to_clear->Set(reg, zone);
				1342	}
				1343	// Perform the chronologically last action (or accumulated increment)
				1344	// for the register.
				1345	if (store_position != kNoStore) {
				1346	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1347	} else if (clear) {
				1348	assembler->ClearRegisters(reg, reg);
				1349	} else if (absolute) {
				1350	assembler->SetRegister(reg, value);
				1351	} else if (value != 0) {
				1352	assembler->AdvanceRegister(reg, value);
				1353	}
				1354	}
				1355	}
				1356
				1357
				1358	// This is called as we come into a loop choice node and some other tricky
				1359	// nodes. It normalizes the state of the code generator to ensure we can
				1360	// generate generic code.
				1361	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1362	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1363
				1364	DCHECK(!is_trivial());
				1365
				1366	if (actions_ == NULL && backtrack() == NULL) {
				1367	// Here we just have some deferred cp advances to fix and we are back to
				1368	// a normal situation. We may also have to forget some information gained
				1369	// through a quick check that was already performed.
				1370	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1371	// Create a new trivial state and generate the node with that.
				1372	Trace new_state;
				1373	successor->Emit(compiler, &new_state);
				1374	return;
				1375	}
				1376
				1377	// Generate deferred actions here along with code to undo them again.
				1378	OutSet affected_registers;
				1379
				1380	if (backtrack() != NULL) {
				1381	// Here we have a concrete backtrack location. These are set up by choice
				1382	// nodes and so they indicate that we have a deferred save of the current
				1383	// position which we may need to emit here.
				1384	assembler->PushCurrentPosition();
				1385	}
				1386
				1387	int max_register = FindAffectedRegisters(&affected_registers,
				1388	compiler->zone());
				1389	OutSet registers_to_pop;
				1390	OutSet registers_to_clear;
				1391	PerformDeferredActions(assembler,
				1392	max_register,
				1393	affected_registers,
				1394	&registers_to_pop,
				1395	&registers_to_clear,
				1396	compiler->zone());
				1397	if (cp_offset_ != 0) {
				1398	assembler->AdvanceCurrentPosition(cp_offset_);
				1399	}
				1400
				1401	// Create a new trivial state and generate the node with that.
				1402	Label undo;
				1403	assembler->PushBacktrack(&undo);
				1404	if (successor->KeepRecursing(compiler)) {
				1405	Trace new_state;
				1406	successor->Emit(compiler, &new_state);
				1407	} else {
				1408	compiler->AddWork(successor);
				1409	assembler->GoTo(successor->label());
				1410	}
				1411
				1412	// On backtrack we need to restore state.
				1413	assembler->Bind(&undo);
				1414	RestoreAffectedRegisters(assembler,
				1415	max_register,
				1416	registers_to_pop,
				1417	registers_to_clear);
				1418	if (backtrack() == NULL) {
				1419	assembler->Backtrack();
				1420	} else {
				1421	assembler->PopCurrentPosition();
				1422	assembler->GoTo(backtrack());
				1423	}
				1424	}
				1425
				1426
				1427	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1428	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1429
				1430	// Omit flushing the trace. We discard the entire stack frame anyway.
				1431
				1432	if (!label()->is_bound()) {
				1433	// We are completely independent of the trace, since we ignore it,
				1434	// so this code can be used as the generic version.
				1435	assembler->Bind(label());
				1436	}
				1437
				1438	// Throw away everything on the backtrack stack since the start
				1439	// of the negative submatch and restore the character position.
				1440	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1441	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1442	if (clear_capture_count_ > 0) {
				1443	// Clear any captures that might have been performed during the success
				1444	// of the body of the negative look-ahead.
				1445	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1446	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1447	}
				1448	// Now that we have unwound the stack we find at the top of the stack the
				1449	// backtrack that the BeginSubmatch node got.
				1450	assembler->Backtrack();
				1451	}
				1452
				1453
				1454	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1455	if (!trace->is_trivial()) {
				1456	trace->Flush(compiler, this);
				1457	return;
				1458	}
				1459	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1460	if (!label()->is_bound()) {
				1461	assembler->Bind(label());
				1462	}
				1463	switch (action_) {
				1464	case ACCEPT:
				1465	assembler->Succeed();
				1466	return;
				1467	case BACKTRACK:
				1468	assembler->GoTo(trace->backtrack());
				1469	return;
				1470	case NEGATIVE_SUBMATCH_SUCCESS:
				1471	// This case is handled in a different virtual method.
				1472	UNREACHABLE();
				1473	}
				1474	UNIMPLEMENTED();
				1475	}
				1476
				1477
				1478	void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
				1479	if (guards_ == NULL)
				1480	guards_ = new(zone) ZoneList<Guard*>(1, zone);
				1481	guards_->Add(guard, zone);
				1482	}
				1483
				1484
				1485	ActionNode* ActionNode::SetRegister(int reg,
				1486	int val,
				1487	RegExpNode* on_success) {
				1488	ActionNode* result =
				1489	new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
				1490	result->data_.u_store_register.reg = reg;
				1491	result->data_.u_store_register.value = val;
				1492	return result;
				1493	}
				1494
				1495
				1496	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1497	ActionNode* result =
				1498	new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
				1499	result->data_.u_increment_register.reg = reg;
				1500	return result;
				1501	}
				1502
				1503
				1504	ActionNode* ActionNode::StorePosition(int reg,
				1505	bool is_capture,
				1506	RegExpNode* on_success) {
				1507	ActionNode* result =
				1508	new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
				1509	result->data_.u_position_register.reg = reg;
				1510	result->data_.u_position_register.is_capture = is_capture;
				1511	return result;
				1512	}
				1513
				1514
				1515	ActionNode* ActionNode::ClearCaptures(Interval range,
				1516	RegExpNode* on_success) {
				1517	ActionNode* result =
				1518	new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
				1519	result->data_.u_clear_captures.range_from = range.from();
				1520	result->data_.u_clear_captures.range_to = range.to();
				1521	return result;
				1522	}
				1523
				1524
				1525	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1526	int position_reg,
				1527	RegExpNode* on_success) {
				1528	ActionNode* result =
				1529	new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
				1530	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1531	result->data_.u_submatch.current_position_register = position_reg;
				1532	return result;
				1533	}
				1534
				1535
				1536	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1537	int position_reg,
				1538	int clear_register_count,
				1539	int clear_register_from,
				1540	RegExpNode* on_success) {
				1541	ActionNode* result =
				1542	new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1543	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1544	result->data_.u_submatch.current_position_register = position_reg;
				1545	result->data_.u_submatch.clear_register_count = clear_register_count;
				1546	result->data_.u_submatch.clear_register_from = clear_register_from;
				1547	return result;
				1548	}
				1549
				1550
				1551	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1552	int repetition_register,
				1553	int repetition_limit,
				1554	RegExpNode* on_success) {
				1555	ActionNode* result =
				1556	new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
				1557	result->data_.u_empty_match_check.start_register = start_register;
				1558	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1559	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1560	return result;
				1561	}
				1562
				1563
				1564	#define DEFINE_ACCEPT(Type) \
				1565	void Type##Node::Accept(NodeVisitor* visitor) { \
				1566	visitor->Visit##Type(this); \
				1567	}
				1568	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1569	#undef DEFINE_ACCEPT
				1570
				1571
				1572	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1573	visitor->VisitLoopChoice(this);
				1574	}
				1575
				1576
				1577	// -------------------------------------------------------------------
				1578	// Emit code.
				1579
				1580
				1581	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1582	Guard* guard,
				1583	Trace* trace) {
				1584	switch (guard->op()) {
				1585	case Guard::LT:
				1586	DCHECK(!trace->mentions_reg(guard->reg()));
				1587	macro_assembler->IfRegisterGE(guard->reg(),
				1588	guard->value(),
				1589	trace->backtrack());
				1590	break;
				1591	case Guard::GEQ:
				1592	DCHECK(!trace->mentions_reg(guard->reg()));
				1593	macro_assembler->IfRegisterLT(guard->reg(),
				1594	guard->value(),
				1595	trace->backtrack());
				1596	break;
				1597	}
				1598	}
				1599
				1600
				1601	// Returns the number of characters in the equivalence class, omitting those
				1602	// that cannot occur in the source string because it is Latin1.
				1603	static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
				1604	bool one_byte_subject,
				1605	unibrow::uchar* letters) {
				1606	int length =
				1607	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
				1608	// Unibrow returns 0 or 1 for characters where case independence is
				1609	// trivial.
				1610	if (length == 0) {
				1611	letters[0] = character;
				1612	length = 1;
				1613	}
				1614
				1615	if (one_byte_subject) {
				1616	int new_length = 0;
				1617	for (int i = 0; i < length; i++) {
				1618	if (letters[i] <= String::kMaxOneByteCharCode) {
				1619	letters[new_length++] = letters[i];
				1620	}
				1621	}
				1622	length = new_length;
				1623	}
				1624
				1625	return length;
				1626	}
				1627
				1628
				1629	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1630	RegExpCompiler* compiler,
				1631	uc16 c,
				1632	Label* on_failure,
				1633	int cp_offset,
				1634	bool check,
				1635	bool preloaded) {
				1636	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1637	bool bound_checked = false;
				1638	if (!preloaded) {
				1639	assembler->LoadCurrentCharacter(
				1640	cp_offset,
				1641	on_failure,
				1642	check);
				1643	bound_checked = true;
				1644	}
				1645	assembler->CheckNotCharacter(c, on_failure);
				1646	return bound_checked;
				1647	}
				1648
				1649
				1650	// Only emits non-letters (things that don't have case). Only used for case
				1651	// independent matches.
				1652	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1653	RegExpCompiler* compiler,
				1654	uc16 c,
				1655	Label* on_failure,
				1656	int cp_offset,
				1657	bool check,
				1658	bool preloaded) {
				1659	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1660	bool one_byte = compiler->one_byte();
				1661	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1662	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1663	if (length < 1) {
				1664	// This can't match. Must be an one-byte subject and a non-one-byte
				1665	// character. We do not need to do anything since the one-byte pass
				1666	// already handled this.
				1667	return false; // Bounds not checked.
				1668	}
				1669	bool checked = false;
				1670	// We handle the length > 1 case in a later pass.
				1671	if (length == 1) {
				1672	if (one_byte && c > String::kMaxOneByteCharCodeU) {
				1673	// Can't match - see above.
				1674	return false; // Bounds not checked.
				1675	}
				1676	if (!preloaded) {
				1677	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1678	checked = check;
				1679	}
				1680	macro_assembler->CheckNotCharacter(c, on_failure);
				1681	}
				1682	return checked;
				1683	}
				1684
				1685
				1686	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1687	bool one_byte, uc16 c1, uc16 c2,
				1688	Label* on_failure) {
				1689	uc16 char_mask;
				1690	if (one_byte) {
				1691	char_mask = String::kMaxOneByteCharCode;
				1692	} else {
				1693	char_mask = String::kMaxUtf16CodeUnit;
				1694	}
				1695	uc16 exor = c1 ^ c2;
				1696	// Check whether exor has only one bit set.
				1697	if (((exor - 1) & exor) == 0) {
				1698	// If c1 and c2 differ only by one bit.
				1699	// Ecma262UnCanonicalize always gives the highest number last.
				1700	DCHECK(c2 > c1);
				1701	uc16 mask = char_mask ^ exor;
				1702	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1703	return true;
				1704	}
				1705	DCHECK(c2 > c1);
				1706	uc16 diff = c2 - c1;
				1707	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1708	// If the characters differ by 2^n but don't differ by one bit then
				1709	// subtract the difference from the found character, then do the or
				1710	// trick. We avoid the theoretical case where negative numbers are
				1711	// involved in order to simplify code generation.
				1712	uc16 mask = char_mask ^ diff;
				1713	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1714	diff,
				1715	mask,
				1716	on_failure);
				1717	return true;
				1718	}
				1719	return false;
				1720	}
				1721
				1722
				1723	typedef bool EmitCharacterFunction(Isolate* isolate,
				1724	RegExpCompiler* compiler,
				1725	uc16 c,
				1726	Label* on_failure,
				1727	int cp_offset,
				1728	bool check,
				1729	bool preloaded);
				1730
				1731	// Only emits letters (things that have case). Only used for case independent
				1732	// matches.
				1733	static inline bool EmitAtomLetter(Isolate* isolate,
				1734	RegExpCompiler* compiler,
				1735	uc16 c,
				1736	Label* on_failure,
				1737	int cp_offset,
				1738	bool check,
				1739	bool preloaded) {
				1740	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1741	bool one_byte = compiler->one_byte();
				1742	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1743	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1744	if (length <= 1) return false;
				1745	// We may not need to check against the end of the input string
				1746	// if this character lies before a character that matched.
				1747	if (!preloaded) {
				1748	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1749	}
				1750	Label ok;
				1751	DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1752	switch (length) {
				1753	case 2: {
				1754	if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
				1755	chars[1], on_failure)) {
				1756	} else {
				1757	macro_assembler->CheckCharacter(chars[0], &ok);
				1758	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1759	macro_assembler->Bind(&ok);
				1760	}
				1761	break;
				1762	}
				1763	case 4:
				1764	macro_assembler->CheckCharacter(chars[3], &ok);
				1765	// Fall through!
				1766	case 3:
				1767	macro_assembler->CheckCharacter(chars[0], &ok);
				1768	macro_assembler->CheckCharacter(chars[1], &ok);
				1769	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1770	macro_assembler->Bind(&ok);
				1771	break;
				1772	default:
				1773	UNREACHABLE();
				1774	break;
				1775	}
				1776	return true;
				1777	}
				1778
				1779
				1780	static void EmitBoundaryTest(RegExpMacroAssembler* masm,
				1781	int border,
				1782	Label* fall_through,
				1783	Label* above_or_equal,
				1784	Label* below) {
				1785	if (below != fall_through) {
				1786	masm->CheckCharacterLT(border, below);
				1787	if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
				1788	} else {
				1789	masm->CheckCharacterGT(border - 1, above_or_equal);
				1790	}
				1791	}
				1792
				1793
				1794	static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
				1795	int first,
				1796	int last,
				1797	Label* fall_through,
				1798	Label* in_range,
				1799	Label* out_of_range) {
				1800	if (in_range == fall_through) {
				1801	if (first == last) {
				1802	masm->CheckNotCharacter(first, out_of_range);
				1803	} else {
				1804	masm->CheckCharacterNotInRange(first, last, out_of_range);
				1805	}
				1806	} else {
				1807	if (first == last) {
				1808	masm->CheckCharacter(first, in_range);
				1809	} else {
				1810	masm->CheckCharacterInRange(first, last, in_range);
				1811	}
				1812	if (out_of_range != fall_through) masm->GoTo(out_of_range);
				1813	}
				1814	}
				1815
				1816
				1817	// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
				1818	// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
				1819	static void EmitUseLookupTable(
				1820	RegExpMacroAssembler* masm,
				1821	ZoneList<int>* ranges,
				1822	int start_index,
				1823	int end_index,
				1824	int min_char,
				1825	Label* fall_through,
				1826	Label* even_label,
				1827	Label* odd_label) {
				1828	static const int kSize = RegExpMacroAssembler::kTableSize;
				1829	static const int kMask = RegExpMacroAssembler::kTableMask;
				1830
				1831	int base = (min_char & ~kMask);
				1832	USE(base);
				1833
				1834	// Assert that everything is on one kTableSize page.
				1835	for (int i = start_index; i <= end_index; i++) {
				1836	DCHECK_EQ(ranges->at(i) & ~kMask, base);
				1837	}
				1838	DCHECK(start_index == 0 \|\| (ranges->at(start_index - 1) & ~kMask) <= base);
				1839
				1840	char templ[kSize];
				1841	Label* on_bit_set;
				1842	Label* on_bit_clear;
				1843	int bit;
				1844	if (even_label == fall_through) {
				1845	on_bit_set = odd_label;
				1846	on_bit_clear = even_label;
				1847	bit = 1;
				1848	} else {
				1849	on_bit_set = even_label;
				1850	on_bit_clear = odd_label;
				1851	bit = 0;
				1852	}
				1853	for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
				1854	templ[i] = bit;
				1855	}
				1856	int j = 0;
				1857	bit ^= 1;
				1858	for (int i = start_index; i < end_index; i++) {
				1859	for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
				1860	templ[j] = bit;
				1861	}
				1862	bit ^= 1;
				1863	}
				1864	for (int i = j; i < kSize; i++) {
				1865	templ[i] = bit;
				1866	}
				1867	Factory* factory = masm->isolate()->factory();
				1868	// TODO(erikcorry): Cache these.
				1869	Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
				1870	for (int i = 0; i < kSize; i++) {
				1871	ba->set(i, templ[i]);
				1872	}
				1873	masm->CheckBitInTable(ba, on_bit_set);
				1874	if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
				1875	}
				1876
				1877
				1878	static void CutOutRange(RegExpMacroAssembler* masm,
				1879	ZoneList<int>* ranges,
				1880	int start_index,
				1881	int end_index,
				1882	int cut_index,
				1883	Label* even_label,
				1884	Label* odd_label) {
				1885	bool odd = (((cut_index - start_index) & 1) == 1);
				1886	Label* in_range_label = odd ? odd_label : even_label;
				1887	Label dummy;
				1888	EmitDoubleBoundaryTest(masm,
				1889	ranges->at(cut_index),
				1890	ranges->at(cut_index + 1) - 1,
				1891	&dummy,
				1892	in_range_label,
				1893	&dummy);
				1894	DCHECK(!dummy.is_linked());
				1895	// Cut out the single range by rewriting the array. This creates a new
				1896	// range that is a merger of the two ranges on either side of the one we
				1897	// are cutting out. The oddity of the labels is preserved.
				1898	for (int j = cut_index; j > start_index; j--) {
				1899	ranges->at(j) = ranges->at(j - 1);
				1900	}
				1901	for (int j = cut_index + 1; j < end_index; j++) {
				1902	ranges->at(j) = ranges->at(j + 1);
				1903	}
				1904	}
				1905
				1906
				1907	// Unicode case. Split the search space into kSize spaces that are handled
				1908	// with recursion.
				1909	static void SplitSearchSpace(ZoneList<int>* ranges,
				1910	int start_index,
				1911	int end_index,
				1912	int* new_start_index,
				1913	int* new_end_index,
				1914	int* border) {
				1915	static const int kSize = RegExpMacroAssembler::kTableSize;
				1916	static const int kMask = RegExpMacroAssembler::kTableMask;
				1917
				1918	int first = ranges->at(start_index);
				1919	int last = ranges->at(end_index) - 1;
				1920
				1921	*new_start_index = start_index;
				1922	*border = (ranges->at(start_index) & ~kMask) + kSize;
				1923	while (*new_start_index < end_index) {
				1924	if (ranges->at(new_start_index) > border) break;
				1925	(*new_start_index)++;
				1926	}
				1927	// new_start_index is the index of the first edge that is beyond the
				1928	// current kSize space.
				1929
				1930	// For very large search spaces we do a binary chop search of the non-Latin1
				1931	// space instead of just going to the end of the current kSize space. The
				1932	// heuristics are complicated a little by the fact that any 128-character
				1933	// encoding space can be quickly tested with a table lookup, so we don't
				1934	// wish to do binary chop search at a smaller granularity than that. A
				1935	// 128-character space can take up a lot of space in the ranges array if,
				1936	// for example, we only want to match every second character (eg. the lower
				1937	// case characters on some Unicode pages).
				1938	int binary_chop_index = (end_index + start_index) / 2;
				1939	// The first test ensures that we get to the code that handles the Latin1
				1940	// range with a single not-taken branch, speeding up this important
				1941	// character range (even non-Latin1 charset-based text has spaces and
				1942	// punctuation).
				1943	if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
				1944	end_index - start_index > (new_start_index - start_index) 2 &&
				1945	last - first > kSize * 2 && binary_chop_index > *new_start_index &&
				1946	ranges->at(binary_chop_index) >= first + 2 * kSize) {
				1947	int scan_forward_for_section_border = binary_chop_index;;
				1948	int new_border = (ranges->at(binary_chop_index) \| kMask) + 1;
				1949
				1950	while (scan_forward_for_section_border < end_index) {
				1951	if (ranges->at(scan_forward_for_section_border) > new_border) {
				1952	*new_start_index = scan_forward_for_section_border;
				1953	*border = new_border;
				1954	break;
				1955	}
				1956	scan_forward_for_section_border++;
				1957	}
				1958	}
				1959
				1960	DCHECK(*new_start_index > start_index);
				1961	new_end_index = new_start_index - 1;
				1962	if (ranges->at(new_end_index) == border) {
				1963	(*new_end_index)--;
				1964	}
				1965	if (*border >= ranges->at(end_index)) {
				1966	*border = ranges->at(end_index);
				1967	*new_start_index = end_index; // Won't be used.
				1968	*new_end_index = end_index - 1;
				1969	}
				1970	}
				1971
				1972
				1973	// Gets a series of segment boundaries representing a character class. If the
				1974	// character is in the range between an even and an odd boundary (counting from
				1975	// start_index) then go to even_label, otherwise go to odd_label. We already
				1976	// know that the character is in the range of min_char to max_char inclusive.
				1977	// Either label can be NULL indicating backtracking. Either label can also be
				1978	// equal to the fall_through label.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1979	static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
				1980	int start_index, int end_index, uc32 min_char,
				1981	uc32 max_char, Label* fall_through,
				1982	Label* even_label, Label* odd_label) {
				1983	DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
				1984	DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
				1985
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1986	int first = ranges->at(start_index);
				1987	int last = ranges->at(end_index) - 1;
				1988
				1989	DCHECK_LT(min_char, first);
				1990
				1991	// Just need to test if the character is before or on-or-after
				1992	// a particular character.
				1993	if (start_index == end_index) {
				1994	EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
				1995	return;
				1996	}
				1997
				1998	// Another almost trivial case: There is one interval in the middle that is
				1999	// different from the end intervals.
				2000	if (start_index + 1 == end_index) {
				2001	EmitDoubleBoundaryTest(
				2002	masm, first, last, fall_through, even_label, odd_label);
				2003	return;
				2004	}
				2005
				2006	// It's not worth using table lookup if there are very few intervals in the
				2007	// character class.
				2008	if (end_index - start_index <= 6) {
				2009	// It is faster to test for individual characters, so we look for those
				2010	// first, then try arbitrary ranges in the second round.
				2011	static int kNoCutIndex = -1;
				2012	int cut = kNoCutIndex;
				2013	for (int i = start_index; i < end_index; i++) {
				2014	if (ranges->at(i) == ranges->at(i + 1) - 1) {
				2015	cut = i;
				2016	break;
				2017	}
				2018	}
				2019	if (cut == kNoCutIndex) cut = start_index;
				2020	CutOutRange(
				2021	masm, ranges, start_index, end_index, cut, even_label, odd_label);
				2022	DCHECK_GE(end_index - start_index, 2);
				2023	GenerateBranches(masm,
				2024	ranges,
				2025	start_index + 1,
				2026	end_index - 1,
				2027	min_char,
				2028	max_char,
				2029	fall_through,
				2030	even_label,
				2031	odd_label);
				2032	return;
				2033	}
				2034
				2035	// If there are a lot of intervals in the regexp, then we will use tables to
				2036	// determine whether the character is inside or outside the character class.
				2037	static const int kBits = RegExpMacroAssembler::kTableSizeBits;
				2038
				2039	if ((max_char >> kBits) == (min_char >> kBits)) {
				2040	EmitUseLookupTable(masm,
				2041	ranges,
				2042	start_index,
				2043	end_index,
				2044	min_char,
				2045	fall_through,
				2046	even_label,
				2047	odd_label);
				2048	return;
				2049	}
				2050
				2051	if ((min_char >> kBits) != (first >> kBits)) {
				2052	masm->CheckCharacterLT(first, odd_label);
				2053	GenerateBranches(masm,
				2054	ranges,
				2055	start_index + 1,
				2056	end_index,
				2057	first,
				2058	max_char,
				2059	fall_through,
				2060	odd_label,
				2061	even_label);
				2062	return;
				2063	}
				2064
				2065	int new_start_index = 0;
				2066	int new_end_index = 0;
				2067	int border = 0;
				2068
				2069	SplitSearchSpace(ranges,
				2070	start_index,
				2071	end_index,
				2072	&new_start_index,
				2073	&new_end_index,
				2074	&border);
				2075
				2076	Label handle_rest;
				2077	Label* above = &handle_rest;
				2078	if (border == last + 1) {
				2079	// We didn't find any section that started after the limit, so everything
				2080	// above the border is one of the terminal labels.
				2081	above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
				2082	DCHECK(new_end_index == end_index - 1);
				2083	}
				2084
				2085	DCHECK_LE(start_index, new_end_index);
				2086	DCHECK_LE(new_start_index, end_index);
				2087	DCHECK_LT(start_index, new_start_index);
				2088	DCHECK_LT(new_end_index, end_index);
				2089	DCHECK(new_end_index + 1 == new_start_index \|\|
				2090	(new_end_index + 2 == new_start_index &&
				2091	border == ranges->at(new_end_index + 1)));
				2092	DCHECK_LT(min_char, border - 1);
				2093	DCHECK_LT(border, max_char);
				2094	DCHECK_LT(ranges->at(new_end_index), border);
				2095	DCHECK(border < ranges->at(new_start_index) \|\|
				2096	(border == ranges->at(new_start_index) &&
				2097	new_start_index == end_index &&
				2098	new_end_index == end_index - 1 &&
				2099	border == last + 1));
				2100	DCHECK(new_start_index == 0 \|\| border >= ranges->at(new_start_index - 1));
				2101
				2102	masm->CheckCharacterGT(border - 1, above);
				2103	Label dummy;
				2104	GenerateBranches(masm,
				2105	ranges,
				2106	start_index,
				2107	new_end_index,
				2108	min_char,
				2109	border - 1,
				2110	&dummy,
				2111	even_label,
				2112	odd_label);
				2113	if (handle_rest.is_linked()) {
				2114	masm->Bind(&handle_rest);
				2115	bool flip = (new_start_index & 1) != (start_index & 1);
				2116	GenerateBranches(masm,
				2117	ranges,
				2118	new_start_index,
				2119	end_index,
				2120	border,
				2121	max_char,
				2122	&dummy,
				2123	flip ? odd_label : even_label,
				2124	flip ? even_label : odd_label);
				2125	}
				2126	}
				2127
				2128
				2129	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				2130	RegExpCharacterClass* cc, bool one_byte,
				2131	Label* on_failure, int cp_offset, bool check_offset,
				2132	bool preloaded, Zone* zone) {
				2133	ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2134	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2135
				2136	int max_char;
				2137	if (one_byte) {
				2138	max_char = String::kMaxOneByteCharCode;
				2139	} else {
				2140	max_char = String::kMaxUtf16CodeUnit;
				2141	}
				2142
				2143	int range_count = ranges->length();
				2144
				2145	int last_valid_range = range_count - 1;
				2146	while (last_valid_range >= 0) {
				2147	CharacterRange& range = ranges->at(last_valid_range);
				2148	if (range.from() <= max_char) {
				2149	break;
				2150	}
				2151	last_valid_range--;
				2152	}
				2153
				2154	if (last_valid_range < 0) {
				2155	if (!cc->is_negated()) {
				2156	macro_assembler->GoTo(on_failure);
				2157	}
				2158	if (check_offset) {
				2159	macro_assembler->CheckPosition(cp_offset, on_failure);
				2160	}
				2161	return;
				2162	}
				2163
				2164	if (last_valid_range == 0 &&
				2165	ranges->at(0).IsEverything(max_char)) {
				2166	if (cc->is_negated()) {
				2167	macro_assembler->GoTo(on_failure);
				2168	} else {
				2169	// This is a common case hit by non-anchored expressions.
				2170	if (check_offset) {
				2171	macro_assembler->CheckPosition(cp_offset, on_failure);
				2172	}
				2173	}
				2174	return;
				2175	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2176
				2177	if (!preloaded) {
				2178	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				2179	}
				2180
				2181	if (cc->is_standard(zone) &&
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2182	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				2183	on_failure)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2184	return;
				2185	}
				2186
				2187
				2188	// A new list with ascending entries. Each entry is a code unit
				2189	// where there is a boundary between code units that are part of
				2190	// the class and code units that are not. Normally we insert an
				2191	// entry at zero which goes to the failure label, but if there
				2192	// was already one there we fall through for success on that entry.
				2193	// Subsequent entries have alternating meaning (success/failure).
				2194	ZoneList<int>* range_boundaries =
				2195	new(zone) ZoneList<int>(last_valid_range, zone);
				2196
				2197	bool zeroth_entry_is_failure = !cc->is_negated();
				2198
				2199	for (int i = 0; i <= last_valid_range; i++) {
				2200	CharacterRange& range = ranges->at(i);
				2201	if (range.from() == 0) {
				2202	DCHECK_EQ(i, 0);
				2203	zeroth_entry_is_failure = !zeroth_entry_is_failure;
				2204	} else {
				2205	range_boundaries->Add(range.from(), zone);
				2206	}
				2207	range_boundaries->Add(range.to() + 1, zone);
				2208	}
				2209	int end_index = range_boundaries->length() - 1;
				2210	if (range_boundaries->at(end_index) > max_char) {
				2211	end_index--;
				2212	}
				2213
				2214	Label fall_through;
				2215	GenerateBranches(macro_assembler,
				2216	range_boundaries,
				2217	0, // start_index.
				2218	end_index,
				2219	0, // min_char.
				2220	max_char,
				2221	&fall_through,
				2222	zeroth_entry_is_failure ? &fall_through : on_failure,
				2223	zeroth_entry_is_failure ? on_failure : &fall_through);
				2224	macro_assembler->Bind(&fall_through);
				2225	}
				2226
				2227
				2228	RegExpNode::~RegExpNode() {
				2229	}
				2230
				2231
				2232	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				2233	Trace* trace) {
				2234	// If we are generating a greedy loop then don't stop and don't reuse code.
				2235	if (trace->stop_node() != NULL) {
				2236	return CONTINUE;
				2237	}
				2238
				2239	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2240	if (trace->is_trivial()) {
				2241	if (label_.is_bound() \|\| on_work_list() \|\| !KeepRecursing(compiler)) {
				2242	// If a generic version is already scheduled to be generated or we have
				2243	// recursed too deeply then just generate a jump to that code.
				2244	macro_assembler->GoTo(&label_);
				2245	// This will queue it up for generation of a generic version if it hasn't
				2246	// already been queued.
				2247	compiler->AddWork(this);
				2248	return DONE;
				2249	}
				2250	// Generate generic version of the node and bind the label for later use.
				2251	macro_assembler->Bind(&label_);
				2252	return CONTINUE;
				2253	}
				2254
				2255	// We are being asked to make a non-generic version. Keep track of how many
				2256	// non-generic versions we generate so as not to overdo it.
				2257	trace_count_++;
				2258	if (KeepRecursing(compiler) && compiler->optimize() &&
				2259	trace_count_ < kMaxCopiesCodeGenerated) {
				2260	return CONTINUE;
				2261	}
				2262
				2263	// If we get here code has been generated for this node too many times or
				2264	// recursion is too deep. Time to switch to a generic version. The code for
				2265	// generic versions above can handle deep recursion properly.
				2266	bool was_limiting = compiler->limiting_recursion();
				2267	compiler->set_limiting_recursion(true);
				2268	trace->Flush(compiler, this);
				2269	compiler->set_limiting_recursion(was_limiting);
				2270	return DONE;
				2271	}
				2272
				2273
				2274	bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
				2275	return !compiler->limiting_recursion() &&
				2276	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
				2277	}
				2278
				2279
				2280	int ActionNode::EatsAtLeast(int still_to_find,
				2281	int budget,
				2282	bool not_at_start) {
				2283	if (budget <= 0) return 0;
				2284	if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				2285	return on_success()->EatsAtLeast(still_to_find,
				2286	budget - 1,
				2287	not_at_start);
				2288	}
				2289
				2290
				2291	void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2292	BoyerMooreLookahead* bm, bool not_at_start) {
				2293	if (action_type_ == BEGIN_SUBMATCH) {
				2294	bm->SetRest(offset);
				2295	} else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
				2296	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2297	}
				2298	SaveBMInfo(bm, not_at_start, offset);
				2299	}
				2300
				2301
				2302	int AssertionNode::EatsAtLeast(int still_to_find,
				2303	int budget,
				2304	bool not_at_start) {
				2305	if (budget <= 0) return 0;
				2306	// If we know we are not at the start and we are asked "how many characters
				2307	// will you match if you succeed?" then we can answer anything since false
				2308	// implies false. So lets just return the max answer (still_to_find) since
				2309	// that won't prevent us from preloading a lot of characters for the other
				2310	// branches in the node graph.
				2311	if (assertion_type() == AT_START && not_at_start) return still_to_find;
				2312	return on_success()->EatsAtLeast(still_to_find,
				2313	budget - 1,
				2314	not_at_start);
				2315	}
				2316
				2317
				2318	void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2319	BoyerMooreLookahead* bm, bool not_at_start) {
				2320	// Match the behaviour of EatsAtLeast on this node.
				2321	if (assertion_type() == AT_START && not_at_start) return;
				2322	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2323	SaveBMInfo(bm, not_at_start, offset);
				2324	}
				2325
				2326
				2327	int BackReferenceNode::EatsAtLeast(int still_to_find,
				2328	int budget,
				2329	bool not_at_start) {
				2330	if (read_backward()) return 0;
				2331	if (budget <= 0) return 0;
				2332	return on_success()->EatsAtLeast(still_to_find,
				2333	budget - 1,
				2334	not_at_start);
				2335	}
				2336
				2337
				2338	int TextNode::EatsAtLeast(int still_to_find,
				2339	int budget,
				2340	bool not_at_start) {
				2341	if (read_backward()) return 0;
				2342	int answer = Length();
				2343	if (answer >= still_to_find) return answer;
				2344	if (budget <= 0) return answer;
				2345	// We are not at start after this node so we set the last argument to 'true'.
				2346	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				2347	budget - 1,
				2348	true);
				2349	}
				2350
				2351
				2352	int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
				2353	bool not_at_start) {
				2354	if (budget <= 0) return 0;
				2355	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2356	// afterwards.
				2357	RegExpNode* node = alternatives_->at(1).node();
				2358	return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
				2359	}
				2360
				2361
				2362	void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
				2363	QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
				2364	bool not_at_start) {
				2365	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2366	// afterwards.
				2367	RegExpNode* node = alternatives_->at(1).node();
				2368	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				2369	}
				2370
				2371
				2372	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				2373	int budget,
				2374	RegExpNode* ignore_this_node,
				2375	bool not_at_start) {
				2376	if (budget <= 0) return 0;
				2377	int min = 100;
				2378	int choice_count = alternatives_->length();
				2379	budget = (budget - 1) / choice_count;
				2380	for (int i = 0; i < choice_count; i++) {
				2381	RegExpNode* node = alternatives_->at(i).node();
				2382	if (node == ignore_this_node) continue;
				2383	int node_eats_at_least =
				2384	node->EatsAtLeast(still_to_find, budget, not_at_start);
				2385	if (node_eats_at_least < min) min = node_eats_at_least;
				2386	if (min == 0) return 0;
				2387	}
				2388	return min;
				2389	}
				2390
				2391
				2392	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				2393	int budget,
				2394	bool not_at_start) {
				2395	return EatsAtLeastHelper(still_to_find,
				2396	budget - 1,
				2397	loop_node_,
				2398	not_at_start);
				2399	}
				2400
				2401
				2402	int ChoiceNode::EatsAtLeast(int still_to_find,
				2403	int budget,
				2404	bool not_at_start) {
				2405	return EatsAtLeastHelper(still_to_find,
				2406	budget,
				2407	NULL,
				2408	not_at_start);
				2409	}
				2410
				2411
				2412	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				2413	static inline uint32_t SmearBitsRight(uint32_t v) {
				2414	v \|= v >> 1;
				2415	v \|= v >> 2;
				2416	v \|= v >> 4;
				2417	v \|= v >> 8;
				2418	v \|= v >> 16;
				2419	return v;
				2420	}
				2421
				2422
				2423	bool QuickCheckDetails::Rationalize(bool asc) {
				2424	bool found_useful_op = false;
				2425	uint32_t char_mask;
				2426	if (asc) {
				2427	char_mask = String::kMaxOneByteCharCode;
				2428	} else {
				2429	char_mask = String::kMaxUtf16CodeUnit;
				2430	}
				2431	mask_ = 0;
				2432	value_ = 0;
				2433	int char_shift = 0;
				2434	for (int i = 0; i < characters_; i++) {
				2435	Position* pos = &positions_[i];
				2436	if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
				2437	found_useful_op = true;
				2438	}
				2439	mask_ \|= (pos->mask & char_mask) << char_shift;
				2440	value_ \|= (pos->value & char_mask) << char_shift;
				2441	char_shift += asc ? 8 : 16;
				2442	}
				2443	return found_useful_op;
				2444	}
				2445
				2446
				2447	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				2448	Trace* bounds_check_trace,
				2449	Trace* trace,
				2450	bool preload_has_checked_bounds,
				2451	Label* on_possible_success,
				2452	QuickCheckDetails* details,
				2453	bool fall_through_on_failure) {
				2454	if (details->characters() == 0) return false;
				2455	GetQuickCheckDetails(
				2456	details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
				2457	if (details->cannot_match()) return false;
				2458	if (!details->Rationalize(compiler->one_byte())) return false;
				2459	DCHECK(details->characters() == 1 \|\|
				2460	compiler->macro_assembler()->CanReadUnaligned());
				2461	uint32_t mask = details->mask();
				2462	uint32_t value = details->value();
				2463
				2464	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2465
				2466	if (trace->characters_preloaded() != details->characters()) {
				2467	DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
				2468	// We are attempting to preload the minimum number of characters
				2469	// any choice would eat, so if the bounds check fails, then none of the
				2470	// choices can succeed, so we can just immediately backtrack, rather
				2471	// than go to the next choice.
				2472	assembler->LoadCurrentCharacter(trace->cp_offset(),
				2473	bounds_check_trace->backtrack(),
				2474	!preload_has_checked_bounds,
				2475	details->characters());
				2476	}
				2477
				2478
				2479	bool need_mask = true;
				2480
				2481	if (details->characters() == 1) {
				2482	// If number of characters preloaded is 1 then we used a byte or 16 bit
				2483	// load so the value is already masked down.
				2484	uint32_t char_mask;
				2485	if (compiler->one_byte()) {
				2486	char_mask = String::kMaxOneByteCharCode;
				2487	} else {
				2488	char_mask = String::kMaxUtf16CodeUnit;
				2489	}
				2490	if ((mask & char_mask) == char_mask) need_mask = false;
				2491	mask &= char_mask;
				2492	} else {
				2493	// For 2-character preloads in one-byte mode or 1-character preloads in
				2494	// two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2495	static const uint32_t kTwoByteMask = 0xffff;
				2496	static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2497	if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2498	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2499	} else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2500	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2501	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2502	if (mask == kFourByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2503	}
				2504	}
				2505
				2506	if (fall_through_on_failure) {
				2507	if (need_mask) {
				2508	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				2509	} else {
				2510	assembler->CheckCharacter(value, on_possible_success);
				2511	}
				2512	} else {
				2513	if (need_mask) {
				2514	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				2515	} else {
				2516	assembler->CheckNotCharacter(value, trace->backtrack());
				2517	}
				2518	}
				2519	return true;
				2520	}
				2521
				2522
				2523	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				2524	// super-class in the .h file).
				2525	//
				2526	// We iterate along the text object, building up for each character a
				2527	// mask and value that can be used to test for a quick failure to match.
				2528	// The masks and values for the positions will be combined into a single
				2529	// machine word for the current character width in order to be used in
				2530	// generating a quick check.
				2531	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2532	RegExpCompiler* compiler,
				2533	int characters_filled_in,
				2534	bool not_at_start) {
				2535	// Do not collect any quick check details if the text node reads backward,
				2536	// since it reads in the opposite direction than we use for quick checks.
				2537	if (read_backward()) return;
				2538	Isolate* isolate = compiler->macro_assembler()->isolate();
				2539	DCHECK(characters_filled_in < details->characters());
				2540	int characters = details->characters();
				2541	int char_mask;
				2542	if (compiler->one_byte()) {
				2543	char_mask = String::kMaxOneByteCharCode;
				2544	} else {
				2545	char_mask = String::kMaxUtf16CodeUnit;
				2546	}
				2547	for (int k = 0; k < elements()->length(); k++) {
				2548	TextElement elm = elements()->at(k);
				2549	if (elm.text_type() == TextElement::ATOM) {
				2550	Vector<const uc16> quarks = elm.atom()->data();
				2551	for (int i = 0; i < characters && i < quarks.length(); i++) {
				2552	QuickCheckDetails::Position* pos =
				2553	details->positions(characters_filled_in);
				2554	uc16 c = quarks[i];
				2555	if (compiler->ignore_case()) {
				2556	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				2557	int length = GetCaseIndependentLetters(isolate, c,
				2558	compiler->one_byte(), chars);
				2559	if (length == 0) {
				2560	// This can happen because all case variants are non-Latin1, but we
				2561	// know the input is Latin1.
				2562	details->set_cannot_match();
				2563	pos->determines_perfectly = false;
				2564	return;
				2565	}
				2566	if (length == 1) {
				2567	// This letter has no case equivalents, so it's nice and simple
				2568	// and the mask-compare will determine definitely whether we have
				2569	// a match at this character position.
				2570	pos->mask = char_mask;
				2571	pos->value = c;
				2572	pos->determines_perfectly = true;
				2573	} else {
				2574	uint32_t common_bits = char_mask;
				2575	uint32_t bits = chars[0];
				2576	for (int j = 1; j < length; j++) {
				2577	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				2578	common_bits ^= differing_bits;
				2579	bits &= common_bits;
				2580	}
				2581	// If length is 2 and common bits has only one zero in it then
				2582	// our mask and compare instruction will determine definitely
				2583	// whether we have a match at this character position. Otherwise
				2584	// it can only be an approximate check.
				2585	uint32_t one_zero = (common_bits \| ~char_mask);
				2586	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				2587	pos->determines_perfectly = true;
				2588	}
				2589	pos->mask = common_bits;
				2590	pos->value = bits;
				2591	}
				2592	} else {
				2593	// Don't ignore case. Nice simple case where the mask-compare will
				2594	// determine definitely whether we have a match at this character
				2595	// position.
				2596	if (c > char_mask) {
				2597	details->set_cannot_match();
				2598	pos->determines_perfectly = false;
				2599	return;
				2600	}
				2601	pos->mask = char_mask;
				2602	pos->value = c;
				2603	pos->determines_perfectly = true;
				2604	}
				2605	characters_filled_in++;
				2606	DCHECK(characters_filled_in <= details->characters());
				2607	if (characters_filled_in == details->characters()) {
				2608	return;
				2609	}
				2610	}
				2611	} else {
				2612	QuickCheckDetails::Position* pos =
				2613	details->positions(characters_filled_in);
				2614	RegExpCharacterClass* tree = elm.char_class();
				2615	ZoneList<CharacterRange>* ranges = tree->ranges(zone());
				2616	if (tree->is_negated()) {
				2617	// A quick check uses multi-character mask and compare. There is no
				2618	// useful way to incorporate a negative char class into this scheme
				2619	// so we just conservatively create a mask and value that will always
				2620	// succeed.
				2621	pos->mask = 0;
				2622	pos->value = 0;
				2623	} else {
				2624	int first_range = 0;
				2625	while (ranges->at(first_range).from() > char_mask) {
				2626	first_range++;
				2627	if (first_range == ranges->length()) {
				2628	details->set_cannot_match();
				2629	pos->determines_perfectly = false;
				2630	return;
				2631	}
				2632	}
				2633	CharacterRange range = ranges->at(first_range);
				2634	uc16 from = range.from();
				2635	uc16 to = range.to();
				2636	if (to > char_mask) {
				2637	to = char_mask;
				2638	}
				2639	uint32_t differing_bits = (from ^ to);
				2640	// A mask and compare is only perfect if the differing bits form a
				2641	// number like 00011111 with one single block of trailing 1s.
				2642	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2643	from + differing_bits == to) {
				2644	pos->determines_perfectly = true;
				2645	}
				2646	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2647	uint32_t bits = (from & common_bits);
				2648	for (int i = first_range + 1; i < ranges->length(); i++) {
				2649	CharacterRange range = ranges->at(i);
				2650	uc16 from = range.from();
				2651	uc16 to = range.to();
				2652	if (from > char_mask) continue;
				2653	if (to > char_mask) to = char_mask;
				2654	// Here we are combining more ranges into the mask and compare
				2655	// value. With each new range the mask becomes more sparse and
				2656	// so the chances of a false positive rise. A character class
				2657	// with multiple ranges is assumed never to be equivalent to a
				2658	// mask and compare operation.
				2659	pos->determines_perfectly = false;
				2660	uint32_t new_common_bits = (from ^ to);
				2661	new_common_bits = ~SmearBitsRight(new_common_bits);
				2662	common_bits &= new_common_bits;
				2663	bits &= new_common_bits;
				2664	uint32_t differing_bits = (from & common_bits) ^ bits;
				2665	common_bits ^= differing_bits;
				2666	bits &= common_bits;
				2667	}
				2668	pos->mask = common_bits;
				2669	pos->value = bits;
				2670	}
				2671	characters_filled_in++;
				2672	DCHECK(characters_filled_in <= details->characters());
				2673	if (characters_filled_in == details->characters()) {
				2674	return;
				2675	}
				2676	}
				2677	}
				2678	DCHECK(characters_filled_in != details->characters());
				2679	if (!details->cannot_match()) {
				2680	on_success()-> GetQuickCheckDetails(details,
				2681	compiler,
				2682	characters_filled_in,
				2683	true);
				2684	}
				2685	}
				2686
				2687
				2688	void QuickCheckDetails::Clear() {
				2689	for (int i = 0; i < characters_; i++) {
				2690	positions_[i].mask = 0;
				2691	positions_[i].value = 0;
				2692	positions_[i].determines_perfectly = false;
				2693	}
				2694	characters_ = 0;
				2695	}
				2696
				2697
				2698	void QuickCheckDetails::Advance(int by, bool one_byte) {
				2699	if (by >= characters_ \|\| by < 0) {
				2700	DCHECK_IMPLIES(by < 0, characters_ == 0);
				2701	Clear();
				2702	return;
				2703	}
				2704	DCHECK_LE(characters_ - by, 4);
				2705	DCHECK_LE(characters_, 4);
				2706	for (int i = 0; i < characters_ - by; i++) {
				2707	positions_[i] = positions_[by + i];
				2708	}
				2709	for (int i = characters_ - by; i < characters_; i++) {
				2710	positions_[i].mask = 0;
				2711	positions_[i].value = 0;
				2712	positions_[i].determines_perfectly = false;
				2713	}
				2714	characters_ -= by;
				2715	// We could change mask_ and value_ here but we would never advance unless
				2716	// they had already been used in a check and they won't be used again because
				2717	// it would gain us nothing. So there's no point.
				2718	}
				2719
				2720
				2721	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2722	DCHECK(characters_ == other->characters_);
				2723	if (other->cannot_match_) {
				2724	return;
				2725	}
				2726	if (cannot_match_) {
				2727	this = other;
				2728	return;
				2729	}
				2730	for (int i = from_index; i < characters_; i++) {
				2731	QuickCheckDetails::Position* pos = positions(i);
				2732	QuickCheckDetails::Position* other_pos = other->positions(i);
				2733	if (pos->mask != other_pos->mask \|\|
				2734	pos->value != other_pos->value \|\|
				2735	!other_pos->determines_perfectly) {
				2736	// Our mask-compare operation will be approximate unless we have the
				2737	// exact same operation on both sides of the alternation.
				2738	pos->determines_perfectly = false;
				2739	}
				2740	pos->mask &= other_pos->mask;
				2741	pos->value &= pos->mask;
				2742	other_pos->value &= pos->mask;
				2743	uc16 differing_bits = (pos->value ^ other_pos->value);
				2744	pos->mask &= ~differing_bits;
				2745	pos->value &= pos->mask;
				2746	}
				2747	}
				2748
				2749
				2750	class VisitMarker {
				2751	public:
				2752	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2753	DCHECK(!info->visited);
				2754	info->visited = true;
				2755	}
				2756	~VisitMarker() {
				2757	info_->visited = false;
				2758	}
				2759	private:
				2760	NodeInfo* info_;
				2761	};
				2762
				2763
				2764	RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
				2765	if (info()->replacement_calculated) return replacement();
				2766	if (depth < 0) return this;
				2767	DCHECK(!info()->visited);
				2768	VisitMarker marker(info());
				2769	return FilterSuccessor(depth - 1, ignore_case);
				2770	}
				2771
				2772
				2773	RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
				2774	RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
				2775	if (next == NULL) return set_replacement(NULL);
				2776	on_success_ = next;
				2777	return set_replacement(this);
				2778	}
				2779
				2780
				2781	// We need to check for the following characters: 0x39c 0x3bc 0x178.
				2782	static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
				2783	// TODO(dcarney): this could be a lot more efficient.
				2784	return range.Contains(0x39c) \|\|
				2785	range.Contains(0x3bc) \|\| range.Contains(0x178);
				2786	}
				2787
				2788
				2789	static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
				2790	for (int i = 0; i < ranges->length(); i++) {
				2791	// TODO(dcarney): this could be a lot more efficient.
				2792	if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
				2793	}
				2794	return false;
				2795	}
				2796
				2797
				2798	RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
				2799	if (info()->replacement_calculated) return replacement();
				2800	if (depth < 0) return this;
				2801	DCHECK(!info()->visited);
				2802	VisitMarker marker(info());
				2803	int element_count = elements()->length();
				2804	for (int i = 0; i < element_count; i++) {
				2805	TextElement elm = elements()->at(i);
				2806	if (elm.text_type() == TextElement::ATOM) {
				2807	Vector<const uc16> quarks = elm.atom()->data();
				2808	for (int j = 0; j < quarks.length(); j++) {
				2809	uint16_t c = quarks[j];
				2810	if (c <= String::kMaxOneByteCharCode) continue;
				2811	if (!ignore_case) return set_replacement(NULL);
				2812	// Here, we need to check for characters whose upper and lower cases
				2813	// are outside the Latin-1 range.
				2814	uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
				2815	// Character is outside Latin-1 completely
				2816	if (converted == 0) return set_replacement(NULL);
				2817	// Convert quark to Latin-1 in place.
				2818	uint16_t* copy = const_cast<uint16_t*>(quarks.start());
				2819	copy[j] = converted;
				2820	}
				2821	} else {
				2822	DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
				2823	RegExpCharacterClass* cc = elm.char_class();
				2824	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2825	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2826	// Now they are in order so we only need to look at the first.
				2827	int range_count = ranges->length();
				2828	if (cc->is_negated()) {
				2829	if (range_count != 0 &&
				2830	ranges->at(0).from() == 0 &&
				2831	ranges->at(0).to() >= String::kMaxOneByteCharCode) {
				2832	// This will be handled in a later filter.
				2833	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2834	return set_replacement(NULL);
				2835	}
				2836	} else {
				2837	if (range_count == 0 \|\|
				2838	ranges->at(0).from() > String::kMaxOneByteCharCode) {
				2839	// This will be handled in a later filter.
				2840	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2841	return set_replacement(NULL);
				2842	}
				2843	}
				2844	}
				2845	}
				2846	return FilterSuccessor(depth - 1, ignore_case);
				2847	}
				2848
				2849
				2850	RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2851	if (info()->replacement_calculated) return replacement();
				2852	if (depth < 0) return this;
				2853	if (info()->visited) return this;
				2854	{
				2855	VisitMarker marker(info());
				2856
				2857	RegExpNode* continue_replacement =
				2858	continue_node_->FilterOneByte(depth - 1, ignore_case);
				2859	// If we can't continue after the loop then there is no sense in doing the
				2860	// loop.
				2861	if (continue_replacement == NULL) return set_replacement(NULL);
				2862	}
				2863
				2864	return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
				2865	}
				2866
				2867
				2868	RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2869	if (info()->replacement_calculated) return replacement();
				2870	if (depth < 0) return this;
				2871	if (info()->visited) return this;
				2872	VisitMarker marker(info());
				2873	int choice_count = alternatives_->length();
				2874
				2875	for (int i = 0; i < choice_count; i++) {
				2876	GuardedAlternative alternative = alternatives_->at(i);
				2877	if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
				2878	set_replacement(this);
				2879	return this;
				2880	}
				2881	}
				2882
				2883	int surviving = 0;
				2884	RegExpNode* survivor = NULL;
				2885	for (int i = 0; i < choice_count; i++) {
				2886	GuardedAlternative alternative = alternatives_->at(i);
				2887	RegExpNode* replacement =
				2888	alternative.node()->FilterOneByte(depth - 1, ignore_case);
				2889	DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
				2890	if (replacement != NULL) {
				2891	alternatives_->at(i).set_node(replacement);
				2892	surviving++;
				2893	survivor = replacement;
				2894	}
				2895	}
				2896	if (surviving < 2) return set_replacement(survivor);
				2897
				2898	set_replacement(this);
				2899	if (surviving == choice_count) {
				2900	return this;
				2901	}
				2902	// Only some of the nodes survived the filtering. We need to rebuild the
				2903	// alternatives list.
				2904	ZoneList<GuardedAlternative>* new_alternatives =
				2905	new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
				2906	for (int i = 0; i < choice_count; i++) {
				2907	RegExpNode* replacement =
				2908	alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
				2909	if (replacement != NULL) {
				2910	alternatives_->at(i).set_node(replacement);
				2911	new_alternatives->Add(alternatives_->at(i), zone());
				2912	}
				2913	}
				2914	alternatives_ = new_alternatives;
				2915	return this;
				2916	}
				2917
				2918
				2919	RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
				2920	bool ignore_case) {
				2921	if (info()->replacement_calculated) return replacement();
				2922	if (depth < 0) return this;
				2923	if (info()->visited) return this;
				2924	VisitMarker marker(info());
				2925	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2926	// afterwards.
				2927	RegExpNode* node = alternatives_->at(1).node();
				2928	RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
				2929	if (replacement == NULL) return set_replacement(NULL);
				2930	alternatives_->at(1).set_node(replacement);
				2931
				2932	RegExpNode* neg_node = alternatives_->at(0).node();
				2933	RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
				2934	// If the negative lookahead is always going to fail then
				2935	// we don't need to check it.
				2936	if (neg_replacement == NULL) return set_replacement(replacement);
				2937	alternatives_->at(0).set_node(neg_replacement);
				2938	return set_replacement(this);
				2939	}
				2940
				2941
				2942	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2943	RegExpCompiler* compiler,
				2944	int characters_filled_in,
				2945	bool not_at_start) {
				2946	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2947	VisitMarker marker(info());
				2948	return ChoiceNode::GetQuickCheckDetails(details,
				2949	compiler,
				2950	characters_filled_in,
				2951	not_at_start);
				2952	}
				2953
				2954
				2955	void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2956	BoyerMooreLookahead* bm, bool not_at_start) {
				2957	if (body_can_be_zero_length_ \|\| budget <= 0) {
				2958	bm->SetRest(offset);
				2959	SaveBMInfo(bm, not_at_start, offset);
				2960	return;
				2961	}
				2962	ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2963	SaveBMInfo(bm, not_at_start, offset);
				2964	}
				2965
				2966
				2967	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2968	RegExpCompiler* compiler,
				2969	int characters_filled_in,
				2970	bool not_at_start) {
				2971	not_at_start = (not_at_start \|\| not_at_start_);
				2972	int choice_count = alternatives_->length();
				2973	DCHECK(choice_count > 0);
				2974	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2975	compiler,
				2976	characters_filled_in,
				2977	not_at_start);
				2978	for (int i = 1; i < choice_count; i++) {
				2979	QuickCheckDetails new_details(details->characters());
				2980	RegExpNode* node = alternatives_->at(i).node();
				2981	node->GetQuickCheckDetails(&new_details, compiler,
				2982	characters_filled_in,
				2983	not_at_start);
				2984	// Here we merge the quick match details of the two branches.
				2985	details->Merge(&new_details, characters_filled_in);
				2986	}
				2987	}
				2988
				2989
				2990	// Check for [0-9A-Z_a-z].
				2991	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2992	Label* word,
				2993	Label* non_word,
				2994	bool fall_through_on_word) {
				2995	if (assembler->CheckSpecialCharacterClass(
				2996	fall_through_on_word ? 'w' : 'W',
				2997	fall_through_on_word ? non_word : word)) {
				2998	// Optimized implementation available.
				2999	return;
				3000	}
				3001	assembler->CheckCharacterGT('z', non_word);
				3002	assembler->CheckCharacterLT('0', non_word);
				3003	assembler->CheckCharacterGT('a' - 1, word);
				3004	assembler->CheckCharacterLT('9' + 1, word);
				3005	assembler->CheckCharacterLT('A', non_word);
				3006	assembler->CheckCharacterLT('Z' + 1, word);
				3007	if (fall_through_on_word) {
				3008	assembler->CheckNotCharacter('_', non_word);
				3009	} else {
				3010	assembler->CheckCharacter('_', word);
				3011	}
				3012	}
				3013
				3014
				3015	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				3016	// that matches newline or the start of input).
				3017	static void EmitHat(RegExpCompiler* compiler,
				3018	RegExpNode* on_success,
				3019	Trace* trace) {
				3020	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3021	// We will be loading the previous character into the current character
				3022	// register.
				3023	Trace new_trace(*trace);
				3024	new_trace.InvalidateCurrentCharacter();
				3025
				3026	Label ok;
				3027	if (new_trace.cp_offset() == 0) {
				3028	// The start of input counts as a newline in this context, so skip to
				3029	// ok if we are at the start.
				3030	assembler->CheckAtStart(&ok);
				3031	}
				3032	// We already checked that we are not at the start of input so it must be
				3033	// OK to load the previous character.
				3034	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				3035	new_trace.backtrack(),
				3036	false);
				3037	if (!assembler->CheckSpecialCharacterClass('n',
				3038	new_trace.backtrack())) {
				3039	// Newline means \n, \r, 0x2028 or 0x2029.
				3040	if (!compiler->one_byte()) {
				3041	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				3042	}
				3043	assembler->CheckCharacter('\n', &ok);
				3044	assembler->CheckNotCharacter('\r', new_trace.backtrack());
				3045	}
				3046	assembler->Bind(&ok);
				3047	on_success->Emit(compiler, &new_trace);
				3048	}
				3049
				3050
				3051	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				3052	void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
				3053	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3054	Isolate* isolate = assembler->isolate();
				3055	Trace::TriBool next_is_word_character = Trace::UNKNOWN;
				3056	bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
				3057	BoyerMooreLookahead* lookahead = bm_info(not_at_start);
				3058	if (lookahead == NULL) {
				3059	int eats_at_least =
				3060	Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
				3061	kRecursionBudget,
				3062	not_at_start));
				3063	if (eats_at_least >= 1) {
				3064	BoyerMooreLookahead* bm =
				3065	new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
				3066	FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
				3067	if (bm->at(0)->is_non_word())
				3068	next_is_word_character = Trace::FALSE_VALUE;
				3069	if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
				3070	}
				3071	} else {
				3072	if (lookahead->at(0)->is_non_word())
				3073	next_is_word_character = Trace::FALSE_VALUE;
				3074	if (lookahead->at(0)->is_word())
				3075	next_is_word_character = Trace::TRUE_VALUE;
				3076	}
				3077	bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
				3078	if (next_is_word_character == Trace::UNKNOWN) {
				3079	Label before_non_word;
				3080	Label before_word;
				3081	if (trace->characters_preloaded() != 1) {
				3082	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				3083	}
				3084	// Fall through on non-word.
				3085	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				3086	// Next character is not a word character.
				3087	assembler->Bind(&before_non_word);
				3088	Label ok;
				3089	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3090	assembler->GoTo(&ok);
				3091
				3092	assembler->Bind(&before_word);
				3093	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3094	assembler->Bind(&ok);
				3095	} else if (next_is_word_character == Trace::TRUE_VALUE) {
				3096	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3097	} else {
				3098	DCHECK(next_is_word_character == Trace::FALSE_VALUE);
				3099	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3100	}
				3101	}
				3102
				3103
				3104	void AssertionNode::BacktrackIfPrevious(
				3105	RegExpCompiler* compiler,
				3106	Trace* trace,
				3107	AssertionNode::IfPrevious backtrack_if_previous) {
				3108	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3109	Trace new_trace(*trace);
				3110	new_trace.InvalidateCurrentCharacter();
				3111
				3112	Label fall_through, dummy;
				3113
				3114	Label* non_word = backtrack_if_previous == kIsNonWord ?
				3115	new_trace.backtrack() :
				3116	&fall_through;
				3117	Label* word = backtrack_if_previous == kIsNonWord ?
				3118	&fall_through :
				3119	new_trace.backtrack();
				3120
				3121	if (new_trace.cp_offset() == 0) {
				3122	// The start of input counts as a non-word character, so the question is
				3123	// decided if we are at the start.
				3124	assembler->CheckAtStart(non_word);
				3125	}
				3126	// We already checked that we are not at the start of input so it must be
				3127	// OK to load the previous character.
				3128	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
				3129	EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
				3130
				3131	assembler->Bind(&fall_through);
				3132	on_success()->Emit(compiler, &new_trace);
				3133	}
				3134
				3135
				3136	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				3137	RegExpCompiler* compiler,
				3138	int filled_in,
				3139	bool not_at_start) {
				3140	if (assertion_type_ == AT_START && not_at_start) {
				3141	details->set_cannot_match();
				3142	return;
				3143	}
				3144	return on_success()->GetQuickCheckDetails(details,
				3145	compiler,
				3146	filled_in,
				3147	not_at_start);
				3148	}
				3149
				3150
				3151	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3152	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3153	switch (assertion_type_) {
				3154	case AT_END: {
				3155	Label ok;
				3156	assembler->CheckPosition(trace->cp_offset(), &ok);
				3157	assembler->GoTo(trace->backtrack());
				3158	assembler->Bind(&ok);
				3159	break;
				3160	}
				3161	case AT_START: {
				3162	if (trace->at_start() == Trace::FALSE_VALUE) {
				3163	assembler->GoTo(trace->backtrack());
				3164	return;
				3165	}
				3166	if (trace->at_start() == Trace::UNKNOWN) {
				3167	assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
				3168	Trace at_start_trace = *trace;
				3169	at_start_trace.set_at_start(Trace::TRUE_VALUE);
				3170	on_success()->Emit(compiler, &at_start_trace);
				3171	return;
				3172	}
				3173	}
				3174	break;
				3175	case AFTER_NEWLINE:
				3176	EmitHat(compiler, on_success(), trace);
				3177	return;
				3178	case AT_BOUNDARY:
				3179	case AT_NON_BOUNDARY: {
				3180	EmitBoundaryCheck(compiler, trace);
				3181	return;
				3182	}
				3183	}
				3184	on_success()->Emit(compiler, trace);
				3185	}
				3186
				3187
				3188	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				3189	if (quick_check == NULL) return false;
				3190	if (offset >= quick_check->characters()) return false;
				3191	return quick_check->positions(offset)->determines_perfectly;
				3192	}
				3193
				3194
				3195	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				3196	if (index > *checked_up_to) {
				3197	*checked_up_to = index;
				3198	}
				3199	}
				3200
				3201
				3202	// We call this repeatedly to generate code for each pass over the text node.
				3203	// The passes are in increasing order of difficulty because we hope one
				3204	// of the first passes will fail in which case we are saved the work of the
				3205	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				3206	// we will check the '%' in the first pass, the case independent 'a' in the
				3207	// second pass and the character class in the last pass.
				3208	//
				3209	// The passes are done from right to left, so for example to test for /bar/
				3210	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				3211	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				3212	// bounds check most of the time. In the example we only need to check for
				3213	// end-of-input when loading the putative 'r'.
				3214	//
				3215	// A slight complication involves the fact that the first character may already
				3216	// be fetched into a register by the previous node. In this case we want to
				3217	// do the test for that character first. We do this in separate passes. The
				3218	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				3219	// pass has been performed then subsequent passes will have true in
				3220	// first_element_checked to indicate that that character does not need to be
				3221	// checked again.
				3222	//
				3223	// In addition to all this we are passed a Trace, which can
				3224	// contain an AlternativeGeneration object. In this AlternativeGeneration
				3225	// object we can see details of any quick check that was already passed in
				3226	// order to get to the code we are now generating. The quick check can involve
				3227	// loading characters, which means we do not need to recheck the bounds
				3228	// up to the limit the quick check already checked. In addition the quick
				3229	// check can have involved a mask and compare operation which may simplify
				3230	// or obviate the need for further checks at some character positions.
				3231	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				3232	TextEmitPassType pass,
				3233	bool preloaded,
				3234	Trace* trace,
				3235	bool first_element_checked,
				3236	int* checked_up_to) {
				3237	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3238	Isolate* isolate = assembler->isolate();
				3239	bool one_byte = compiler->one_byte();
				3240	Label* backtrack = trace->backtrack();
				3241	QuickCheckDetails* quick_check = trace->quick_check_performed();
				3242	int element_count = elements()->length();
				3243	int backward_offset = read_backward() ? -Length() : 0;
				3244	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				3245	TextElement elm = elements()->at(i);
				3246	int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
				3247	if (elm.text_type() == TextElement::ATOM) {
				3248	Vector<const uc16> quarks = elm.atom()->data();
				3249	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				3250	if (first_element_checked && i == 0 && j == 0) continue;
				3251	if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
				3252	EmitCharacterFunction* emit_function = NULL;
				3253	switch (pass) {
				3254	case NON_LATIN1_MATCH:
				3255	DCHECK(one_byte);
				3256	if (quarks[j] > String::kMaxOneByteCharCode) {
				3257	assembler->GoTo(backtrack);
				3258	return;
				3259	}
				3260	break;
				3261	case NON_LETTER_CHARACTER_MATCH:
				3262	emit_function = &EmitAtomNonLetter;
				3263	break;
				3264	case SIMPLE_CHARACTER_MATCH:
				3265	emit_function = &EmitSimpleCharacter;
				3266	break;
				3267	case CASE_CHARACTER_MATCH:
				3268	emit_function = &EmitAtomLetter;
				3269	break;
				3270	default:
				3271	break;
				3272	}
				3273	if (emit_function != NULL) {
				3274	bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();
				3275	bool bound_checked =
				3276	emit_function(isolate, compiler, quarks[j], backtrack,
				3277	cp_offset + j, bounds_check, preloaded);
				3278	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				3279	}
				3280	}
				3281	} else {
				3282	DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
				3283	if (pass == CHARACTER_CLASS_MATCH) {
				3284	if (first_element_checked && i == 0) continue;
				3285	if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
				3286	RegExpCharacterClass* cc = elm.char_class();
				3287	bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();
				3288	EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
				3289	bounds_check, preloaded, zone());
				3290	UpdateBoundsCheck(cp_offset, checked_up_to);
				3291	}
				3292	}
				3293	}
				3294	}
				3295
				3296
				3297	int TextNode::Length() {
				3298	TextElement elm = elements()->last();
				3299	DCHECK(elm.cp_offset() >= 0);
				3300	return elm.cp_offset() + elm.length();
				3301	}
				3302
				3303
				3304	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				3305	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				3306	if (ignore_case) {
				3307	return pass == SIMPLE_CHARACTER_MATCH;
				3308	} else {
				3309	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				3310	}
				3311	}
				3312
				3313
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3314	TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
				3315	ZoneList<CharacterRange>* ranges,
				3316	bool read_backward,
				3317	RegExpNode* on_success) {
				3318	DCHECK_NOT_NULL(ranges);
				3319	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
				3320	elms->Add(
				3321	TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
				3322	zone);
				3323	return new (zone) TextNode(elms, read_backward, on_success);
				3324	}
				3325
				3326
				3327	TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
				3328	CharacterRange trail,
				3329	bool read_backward,
				3330	RegExpNode* on_success) {
				3331	ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
				3332	ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
				3333	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
				3334	elms->Add(TextElement::CharClass(
				3335	new (zone) RegExpCharacterClass(lead_ranges, false)),
				3336	zone);
				3337	elms->Add(TextElement::CharClass(
				3338	new (zone) RegExpCharacterClass(trail_ranges, false)),
				3339	zone);
				3340	return new (zone) TextNode(elms, read_backward, on_success);
				3341	}
				3342
				3343
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3344	// This generates the code to match a text node. A text node can contain
				3345	// straight character sequences (possibly to be matched in a case-independent
				3346	// way) and character classes. For efficiency we do not do this in a single
				3347	// pass from left to right. Instead we pass over the text node several times,
				3348	// emitting code for some character positions every time. See the comment on
				3349	// TextEmitPass for details.
				3350	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3351	LimitResult limit_result = LimitVersions(compiler, trace);
				3352	if (limit_result == DONE) return;
				3353	DCHECK(limit_result == CONTINUE);
				3354
				3355	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				3356	compiler->SetRegExpTooBig();
				3357	return;
				3358	}
				3359
				3360	if (compiler->one_byte()) {
				3361	int dummy = 0;
				3362	TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
				3363	}
				3364
				3365	bool first_elt_done = false;
				3366	int bound_checked_to = trace->cp_offset() - 1;
				3367	bound_checked_to += trace->bound_checked_up_to();
				3368
				3369	// If a character is preloaded into the current character register then
				3370	// check that now.
				3371	if (trace->characters_preloaded() == 1) {
				3372	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3373	if (!SkipPass(pass, compiler->ignore_case())) {
				3374	TextEmitPass(compiler,
				3375	static_cast<TextEmitPassType>(pass),
				3376	true,
				3377	trace,
				3378	false,
				3379	&bound_checked_to);
				3380	}
				3381	}
				3382	first_elt_done = true;
				3383	}
				3384
				3385	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3386	if (!SkipPass(pass, compiler->ignore_case())) {
				3387	TextEmitPass(compiler,
				3388	static_cast<TextEmitPassType>(pass),
				3389	false,
				3390	trace,
				3391	first_elt_done,
				3392	&bound_checked_to);
				3393	}
				3394	}
				3395
				3396	Trace successor_trace(*trace);
				3397	// If we advance backward, we may end up at the start.
				3398	successor_trace.AdvanceCurrentPositionInTrace(
				3399	read_backward() ? -Length() : Length(), compiler);
				3400	successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
				3401	: Trace::FALSE_VALUE);
				3402	RecursionCheck rc(compiler);
				3403	on_success()->Emit(compiler, &successor_trace);
				3404	}
				3405
				3406
				3407	void Trace::InvalidateCurrentCharacter() {
				3408	characters_preloaded_ = 0;
				3409	}
				3410
				3411
				3412	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				3413	// We don't have an instruction for shifting the current character register
				3414	// down or for using a shifted value for anything so lets just forget that
				3415	// we preloaded any characters into it.
				3416	characters_preloaded_ = 0;
				3417	// Adjust the offsets of the quick check performed information. This
				3418	// information is used to find out what we already determined about the
				3419	// characters by means of mask and compare.
				3420	quick_check_performed_.Advance(by, compiler->one_byte());
				3421	cp_offset_ += by;
				3422	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				3423	compiler->SetRegExpTooBig();
				3424	cp_offset_ = 0;
				3425	}
				3426	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				3427	}
				3428
				3429
				3430	void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
				3431	int element_count = elements()->length();
				3432	for (int i = 0; i < element_count; i++) {
				3433	TextElement elm = elements()->at(i);
				3434	if (elm.text_type() == TextElement::CHAR_CLASS) {
				3435	RegExpCharacterClass* cc = elm.char_class();
				3436	// None of the standard character classes is different in the case
				3437	// independent case and it slows us down if we don't know that.
				3438	if (cc->is_standard(zone())) continue;
				3439	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3440	CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3441	}
				3442	}
				3443	}
				3444
				3445
				3446	int TextNode::GreedyLoopTextLength() { return Length(); }
				3447
				3448
				3449	RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
				3450	RegExpCompiler* compiler) {
				3451	if (read_backward()) return NULL;
				3452	if (elements()->length() != 1) return NULL;
				3453	TextElement elm = elements()->at(0);
				3454	if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
				3455	RegExpCharacterClass* node = elm.char_class();
				3456	ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3457	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3458	if (node->is_negated()) {
				3459	return ranges->length() == 0 ? on_success() : NULL;
				3460	}
				3461	if (ranges->length() != 1) return NULL;
				3462	uint32_t max_char;
				3463	if (compiler->one_byte()) {
				3464	max_char = String::kMaxOneByteCharCode;
				3465	} else {
				3466	max_char = String::kMaxUtf16CodeUnit;
				3467	}
				3468	return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
				3469	}
				3470
				3471
				3472	// Finds the fixed match length of a sequence of nodes that goes from
				3473	// this alternative and back to this choice node. If there are variable
				3474	// length nodes or other complications in the way then return a sentinel
				3475	// value indicating that a greedy loop cannot be constructed.
				3476	int ChoiceNode::GreedyLoopTextLengthForAlternative(
				3477	GuardedAlternative* alternative) {
				3478	int length = 0;
				3479	RegExpNode* node = alternative->node();
				3480	// Later we will generate code for all these text nodes using recursion
				3481	// so we have to limit the max number.
				3482	int recursion_depth = 0;
				3483	while (node != this) {
				3484	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				3485	return kNodeIsTooComplexForGreedyLoops;
				3486	}
				3487	int node_length = node->GreedyLoopTextLength();
				3488	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				3489	return kNodeIsTooComplexForGreedyLoops;
				3490	}
				3491	length += node_length;
				3492	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				3493	node = seq_node->on_success();
				3494	}
				3495	return read_backward() ? -length : length;
				3496	}
				3497
				3498
				3499	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				3500	DCHECK_NULL(loop_node_);
				3501	AddAlternative(alt);
				3502	loop_node_ = alt.node();
				3503	}
				3504
				3505
				3506	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				3507	DCHECK_NULL(continue_node_);
				3508	AddAlternative(alt);
				3509	continue_node_ = alt.node();
				3510	}
				3511
				3512
				3513	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3514	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3515	if (trace->stop_node() == this) {
				3516	// Back edge of greedy optimized loop node graph.
				3517	int text_length =
				3518	GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
				3519	DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
				3520	// Update the counter-based backtracking info on the stack. This is an
				3521	// optimization for greedy loops (see below).
				3522	DCHECK(trace->cp_offset() == text_length);
				3523	macro_assembler->AdvanceCurrentPosition(text_length);
				3524	macro_assembler->GoTo(trace->loop_label());
				3525	return;
				3526	}
				3527	DCHECK_NULL(trace->stop_node());
				3528	if (!trace->is_trivial()) {
				3529	trace->Flush(compiler, this);
				3530	return;
				3531	}
				3532	ChoiceNode::Emit(compiler, trace);
				3533	}
				3534
				3535
				3536	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				3537	int eats_at_least) {
				3538	int preload_characters = Min(4, eats_at_least);
				3539	if (compiler->macro_assembler()->CanReadUnaligned()) {
				3540	bool one_byte = compiler->one_byte();
				3541	if (one_byte) {
				3542	if (preload_characters > 4) preload_characters = 4;
				3543	// We can't preload 3 characters because there is no machine instruction
				3544	// to do that. We can't just load 4 because we could be reading
				3545	// beyond the end of the string, which could cause a memory fault.
				3546	if (preload_characters == 3) preload_characters = 2;
				3547	} else {
				3548	if (preload_characters > 2) preload_characters = 2;
				3549	}
				3550	} else {
				3551	if (preload_characters > 1) preload_characters = 1;
				3552	}
				3553	return preload_characters;
				3554	}
				3555
				3556
				3557	// This class is used when generating the alternatives in a choice node. It
				3558	// records the way the alternative is being code generated.
				3559	class AlternativeGeneration: public Malloced {
				3560	public:
				3561	AlternativeGeneration()
				3562	: possible_success(),
				3563	expects_preload(false),
				3564	after(),
				3565	quick_check_details() { }
				3566	Label possible_success;
				3567	bool expects_preload;
				3568	Label after;
				3569	QuickCheckDetails quick_check_details;
				3570	};
				3571
				3572
				3573	// Creates a list of AlternativeGenerations. If the list has a reasonable
				3574	// size then it is on the stack, otherwise the excess is on the heap.
				3575	class AlternativeGenerationList {
				3576	public:
				3577	AlternativeGenerationList(int count, Zone* zone)
				3578	: alt_gens_(count, zone) {
				3579	for (int i = 0; i < count && i < kAFew; i++) {
				3580	alt_gens_.Add(a_few_alt_gens_ + i, zone);
				3581	}
				3582	for (int i = kAFew; i < count; i++) {
				3583	alt_gens_.Add(new AlternativeGeneration(), zone);
				3584	}
				3585	}
				3586	~AlternativeGenerationList() {
				3587	for (int i = kAFew; i < alt_gens_.length(); i++) {
				3588	delete alt_gens_[i];
				3589	alt_gens_[i] = NULL;
				3590	}
				3591	}
				3592
				3593	AlternativeGeneration* at(int i) {
				3594	return alt_gens_[i];
				3595	}
				3596
				3597	private:
				3598	static const int kAFew = 10;
				3599	ZoneList<AlternativeGeneration*> alt_gens_;
				3600	AlternativeGeneration a_few_alt_gens_[kAFew];
				3601	};
				3602
				3603
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3604	static const uc32 kRangeEndMarker = 0x110000;
				3605
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3606	// The '2' variant is has inclusive from and exclusive to.
				3607	// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
				3608	// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3609	static const int kSpaceRanges[] = {
				3610	'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
				3611	0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
				3612	0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3613	static const int kSpaceRangeCount = arraysize(kSpaceRanges);
				3614
				3615	static const int kWordRanges[] = {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3616	'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3617	static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3618	static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3619	static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3620	static const int kSurrogateRanges[] = {
				3621	kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3622	static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3623	static const int kLineTerminatorRanges[] = {
				3624	0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3625	static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
				3626
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3627	void BoyerMoorePositionInfo::Set(int character) {
				3628	SetInterval(Interval(character, character));
				3629	}
				3630
				3631
				3632	void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
				3633	s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
				3634	w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
				3635	d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
				3636	surrogate_ =
				3637	AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
				3638	if (interval.to() - interval.from() >= kMapSize - 1) {
				3639	if (map_count_ != kMapSize) {
				3640	map_count_ = kMapSize;
				3641	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3642	}
				3643	return;
				3644	}
				3645	for (int i = interval.from(); i <= interval.to(); i++) {
				3646	int mod_character = (i & kMask);
				3647	if (!map_->at(mod_character)) {
				3648	map_count_++;
				3649	map_->at(mod_character) = true;
				3650	}
				3651	if (map_count_ == kMapSize) return;
				3652	}
				3653	}
				3654
				3655
				3656	void BoyerMoorePositionInfo::SetAll() {
				3657	s_ = w_ = d_ = kLatticeUnknown;
				3658	if (map_count_ != kMapSize) {
				3659	map_count_ = kMapSize;
				3660	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3661	}
				3662	}
				3663
				3664
				3665	BoyerMooreLookahead::BoyerMooreLookahead(
				3666	int length, RegExpCompiler* compiler, Zone* zone)
				3667	: length_(length),
				3668	compiler_(compiler) {
				3669	if (compiler->one_byte()) {
				3670	max_char_ = String::kMaxOneByteCharCode;
				3671	} else {
				3672	max_char_ = String::kMaxUtf16CodeUnit;
				3673	}
				3674	bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
				3675	for (int i = 0; i < length; i++) {
				3676	bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
				3677	}
				3678	}
				3679
				3680
				3681	// Find the longest range of lookahead that has the fewest number of different
				3682	// characters that can occur at a given position. Since we are optimizing two
				3683	// different parameters at once this is a tradeoff.
				3684	bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
				3685	int biggest_points = 0;
				3686	// If more than 32 characters out of 128 can occur it is unlikely that we can
				3687	// be lucky enough to step forwards much of the time.
				3688	const int kMaxMax = 32;
				3689	for (int max_number_of_chars = 4;
				3690	max_number_of_chars < kMaxMax;
				3691	max_number_of_chars *= 2) {
				3692	biggest_points =
				3693	FindBestInterval(max_number_of_chars, biggest_points, from, to);
				3694	}
				3695	if (biggest_points == 0) return false;
				3696	return true;
				3697	}
				3698
				3699
				3700	// Find the highest-points range between 0 and length_ where the character
				3701	// information is not too vague. 'Too vague' means that there are more than
				3702	// max_number_of_chars that can occur at this position. Calculates the number
				3703	// of points as the product of width-of-the-range and
				3704	// probability-of-finding-one-of-the-characters, where the probability is
				3705	// calculated using the frequency distribution of the sample subject string.
				3706	int BoyerMooreLookahead::FindBestInterval(
				3707	int max_number_of_chars, int old_biggest_points, int* from, int* to) {
				3708	int biggest_points = old_biggest_points;
				3709	static const int kSize = RegExpMacroAssembler::kTableSize;
				3710	for (int i = 0; i < length_; ) {
				3711	while (i < length_ && Count(i) > max_number_of_chars) i++;
				3712	if (i == length_) break;
				3713	int remembered_from = i;
				3714	bool union_map[kSize];
				3715	for (int j = 0; j < kSize; j++) union_map[j] = false;
				3716	while (i < length_ && Count(i) <= max_number_of_chars) {
				3717	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3718	for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);
				3719	i++;
				3720	}
				3721	int frequency = 0;
				3722	for (int j = 0; j < kSize; j++) {
				3723	if (union_map[j]) {
				3724	// Add 1 to the frequency to give a small per-character boost for
				3725	// the cases where our sampling is not good enough and many
				3726	// characters have a frequency of zero. This means the frequency
				3727	// can theoretically be up to 2*kSize though we treat it mostly as
				3728	// a fraction of kSize.
				3729	frequency += compiler_->frequency_collator()->Frequency(j) + 1;
				3730	}
				3731	}
				3732	// We use the probability of skipping times the distance we are skipping to
				3733	// judge the effectiveness of this. Actually we have a cut-off: By
				3734	// dividing by 2 we switch off the skipping if the probability of skipping
				3735	// is less than 50%. This is because the multibyte mask-and-compare
				3736	// skipping in quickcheck is more likely to do well on this case.
				3737	bool in_quickcheck_range =
				3738	((i - remembered_from < 4) \|\|
				3739	(compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
				3740	// Called 'probability' but it is only a rough estimate and can actually
				3741	// be outside the 0-kSize range.
				3742	int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
				3743	int points = (i - remembered_from) * probability;
				3744	if (points > biggest_points) {
				3745	*from = remembered_from;
				3746	*to = i - 1;
				3747	biggest_points = points;
				3748	}
				3749	}
				3750	return biggest_points;
				3751	}
				3752
				3753
				3754	// Take all the characters that will not prevent a successful match if they
				3755	// occur in the subject string in the range between min_lookahead and
				3756	// max_lookahead (inclusive) measured from the current position. If the
				3757	// character at max_lookahead offset is not one of these characters, then we
				3758	// can safely skip forwards by the number of characters in the range.
				3759	int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
				3760	int max_lookahead,
				3761	Handle<ByteArray> boolean_skip_table) {
				3762	const int kSize = RegExpMacroAssembler::kTableSize;
				3763
				3764	const int kSkipArrayEntry = 0;
				3765	const int kDontSkipArrayEntry = 1;
				3766
				3767	for (int i = 0; i < kSize; i++) {
				3768	boolean_skip_table->set(i, kSkipArrayEntry);
				3769	}
				3770	int skip = max_lookahead + 1 - min_lookahead;
				3771
				3772	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3773	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3774	for (int j = 0; j < kSize; j++) {
				3775	if (map->at(j)) {
				3776	boolean_skip_table->set(j, kDontSkipArrayEntry);
				3777	}
				3778	}
				3779	}
				3780
				3781	return skip;
				3782	}
				3783
				3784
				3785	// See comment above on the implementation of GetSkipTable.
				3786	void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
				3787	const int kSize = RegExpMacroAssembler::kTableSize;
				3788
				3789	int min_lookahead = 0;
				3790	int max_lookahead = 0;
				3791
				3792	if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
				3793
				3794	bool found_single_character = false;
				3795	int single_character = 0;
				3796	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3797	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3798	if (map->map_count() > 1 \|\|
				3799	(found_single_character && map->map_count() != 0)) {
				3800	found_single_character = false;
				3801	break;
				3802	}
				3803	for (int j = 0; j < kSize; j++) {
				3804	if (map->at(j)) {
				3805	found_single_character = true;
				3806	single_character = j;
				3807	break;
				3808	}
				3809	}
				3810	}
				3811
				3812	int lookahead_width = max_lookahead + 1 - min_lookahead;
				3813
				3814	if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
				3815	// The mask-compare can probably handle this better.
				3816	return;
				3817	}
				3818
				3819	if (found_single_character) {
				3820	Label cont, again;
				3821	masm->Bind(&again);
				3822	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3823	if (max_char_ > kSize) {
				3824	masm->CheckCharacterAfterAnd(single_character,
				3825	RegExpMacroAssembler::kTableMask,
				3826	&cont);
				3827	} else {
				3828	masm->CheckCharacter(single_character, &cont);
				3829	}
				3830	masm->AdvanceCurrentPosition(lookahead_width);
				3831	masm->GoTo(&again);
				3832	masm->Bind(&cont);
				3833	return;
				3834	}
				3835
				3836	Factory* factory = masm->isolate()->factory();
				3837	Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
				3838	int skip_distance = GetSkipTable(
				3839	min_lookahead, max_lookahead, boolean_skip_table);
				3840	DCHECK(skip_distance != 0);
				3841
				3842	Label cont, again;
				3843	masm->Bind(&again);
				3844	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3845	masm->CheckBitInTable(boolean_skip_table, &cont);
				3846	masm->AdvanceCurrentPosition(skip_distance);
				3847	masm->GoTo(&again);
				3848	masm->Bind(&cont);
				3849	}
				3850
				3851
				3852	/* Code generation for choice nodes.
				3853	*
				3854	* We generate quick checks that do a mask and compare to eliminate a
				3855	* choice. If the quick check succeeds then it jumps to the continuation to
				3856	* do slow checks and check subsequent nodes. If it fails (the common case)
				3857	* it falls through to the next choice.
				3858	*
				3859	* Here is the desired flow graph. Nodes directly below each other imply
				3860	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				3861	* 3 doesn't have a quick check so we have to call the slow check.
				3862	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				3863	* regexp continuation is generated directly after the Sn node, up to the
				3864	* next GoTo if we decide to reuse some already generated code. Some
				3865	* nodes expect preload_characters to be preloaded into the current
				3866	* character register. R nodes do this preloading. Vertices are marked
				3867	* F for failures and S for success (possible success in the case of quick
				3868	* nodes). L, V, < and > are used as arrow heads.
				3869	*
				3870	* ----------> R
				3871	* \|
				3872	* V
				3873	* Q1 -----> S1
				3874	* \| S /
				3875	* F\| /
				3876	* \| F/
				3877	* \| /
				3878	* \| R
				3879	* \| /
				3880	* V L
				3881	* Q2 -----> S2
				3882	* \| S /
				3883	* F\| /
				3884	* \| F/
				3885	* \| /
				3886	* \| R
				3887	* \| /
				3888	* V L
				3889	* S3
				3890	* \|
				3891	* F\|
				3892	* \|
				3893	* R
				3894	* \|
				3895	* backtrack V
				3896	* <----------Q4
				3897	* \ F \|
				3898	* \ \|S
				3899	* \ F V
				3900	* \-----S4
				3901	*
				3902	* For greedy loops we push the current position, then generate the code that
				3903	* eats the input specially in EmitGreedyLoop. The other choice (the
				3904	* continuation) is generated by the normal code in EmitChoices, and steps back
				3905	* in the input to the starting position when it fails to match. The loop code
				3906	* looks like this (U is the unwind code that steps back in the greedy loop).
				3907	*
				3908	* _____
				3909	* / \
				3910	* V \|
				3911	* ----------> S1 \|
				3912	* /\| \|
				3913	* / \|S \|
				3914	* F/ \_____/
				3915	* /
				3916	* \|<-----
				3917	* \| \
				3918	* V \|S
				3919	* Q2 ---> U----->backtrack
				3920	* \| F /
				3921	* S\| /
				3922	* V F /
				3923	* S2--/
				3924	*/
				3925
				3926	GreedyLoopState::GreedyLoopState(bool not_at_start) {
				3927	counter_backtrack_trace_.set_backtrack(&label_);
				3928	if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
				3929	}
				3930
				3931
				3932	void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
				3933	#ifdef DEBUG
				3934	int choice_count = alternatives_->length();
				3935	for (int i = 0; i < choice_count - 1; i++) {
				3936	GuardedAlternative alternative = alternatives_->at(i);
				3937	ZoneList<Guard> guards = alternative.guards();
				3938	int guard_count = (guards == NULL) ? 0 : guards->length();
				3939	for (int j = 0; j < guard_count; j++) {
				3940	DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
				3941	}
				3942	}
				3943	#endif
				3944	}
				3945
				3946
				3947	void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
				3948	Trace* current_trace,
				3949	PreloadState* state) {
				3950	if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
				3951	// Save some time by looking at most one machine word ahead.
				3952	state->eats_at_least_ =
				3953	EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
				3954	current_trace->at_start() == Trace::FALSE_VALUE);
				3955	}
				3956	state->preload_characters_ =
				3957	CalculatePreloadCharacters(compiler, state->eats_at_least_);
				3958
				3959	state->preload_is_current_ =
				3960	(current_trace->characters_preloaded() == state->preload_characters_);
				3961	state->preload_has_checked_bounds_ = state->preload_is_current_;
				3962	}
				3963
				3964
				3965	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3966	int choice_count = alternatives_->length();
				3967
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3968	if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
				3969	alternatives_->at(0).node()->Emit(compiler, trace);
				3970	return;
				3971	}
				3972
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3973	AssertGuardsMentionRegisters(trace);
				3974
				3975	LimitResult limit_result = LimitVersions(compiler, trace);
				3976	if (limit_result == DONE) return;
				3977	DCHECK(limit_result == CONTINUE);
				3978
				3979	// For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
				3980	// other choice nodes we only flush if we are out of code size budget.
				3981	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				3982	trace->Flush(compiler, this);
				3983	return;
				3984	}
				3985
				3986	RecursionCheck rc(compiler);
				3987
				3988	PreloadState preload;
				3989	preload.init();
				3990	GreedyLoopState greedy_loop_state(not_at_start());
				3991
				3992	int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
				3993	AlternativeGenerationList alt_gens(choice_count, zone());
				3994
				3995	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				3996	trace = EmitGreedyLoop(compiler,
				3997	trace,
				3998	&alt_gens,
				3999	&preload,
				4000	&greedy_loop_state,
				4001	text_length);
				4002	} else {
				4003	// TODO(erikcorry): Delete this. We don't need this label, but it makes us
				4004	// match the traces produced pre-cleanup.
				4005	Label second_choice;
				4006	compiler->macro_assembler()->Bind(&second_choice);
				4007
				4008	preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
				4009
				4010	EmitChoices(compiler,
				4011	&alt_gens,
				4012	0,
				4013	trace,
				4014	&preload);
				4015	}
				4016
				4017	// At this point we need to generate slow checks for the alternatives where
				4018	// the quick check was inlined. We can recognize these because the associated
				4019	// label was bound.
				4020	int new_flush_budget = trace->flush_budget() / choice_count;
				4021	for (int i = 0; i < choice_count; i++) {
				4022	AlternativeGeneration* alt_gen = alt_gens.at(i);
				4023	Trace new_trace(*trace);
				4024	// If there are actions to be flushed we have to limit how many times
				4025	// they are flushed. Take the budget of the parent trace and distribute
				4026	// it fairly amongst the children.
				4027	if (new_trace.actions() != NULL) {
				4028	new_trace.set_flush_budget(new_flush_budget);
				4029	}
				4030	bool next_expects_preload =
				4031	i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
				4032	EmitOutOfLineContinuation(compiler,
				4033	&new_trace,
				4034	alternatives_->at(i),
				4035	alt_gen,
				4036	preload.preload_characters_,
				4037	next_expects_preload);
				4038	}
				4039	}
				4040
				4041
				4042	Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
				4043	Trace* trace,
				4044	AlternativeGenerationList* alt_gens,
				4045	PreloadState* preload,
				4046	GreedyLoopState* greedy_loop_state,
				4047	int text_length) {
				4048	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4049	// Here we have special handling for greedy loops containing only text nodes
				4050	// and other simple nodes. These are handled by pushing the current
				4051	// position on the stack and then incrementing the current position each
				4052	// time around the switch. On backtrack we decrement the current position
				4053	// and check it against the pushed value. This avoids pushing backtrack
				4054	// information for each iteration of the loop, which could take up a lot of
				4055	// space.
				4056	DCHECK(trace->stop_node() == NULL);
				4057	macro_assembler->PushCurrentPosition();
				4058	Label greedy_match_failed;
				4059	Trace greedy_match_trace;
				4060	if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
				4061	greedy_match_trace.set_backtrack(&greedy_match_failed);
				4062	Label loop_label;
				4063	macro_assembler->Bind(&loop_label);
				4064	greedy_match_trace.set_stop_node(this);
				4065	greedy_match_trace.set_loop_label(&loop_label);
				4066	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				4067	macro_assembler->Bind(&greedy_match_failed);
				4068
				4069	Label second_choice; // For use in greedy matches.
				4070	macro_assembler->Bind(&second_choice);
				4071
				4072	Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
				4073
				4074	EmitChoices(compiler,
				4075	alt_gens,
				4076	1,
				4077	new_trace,
				4078	preload);
				4079
				4080	macro_assembler->Bind(greedy_loop_state->label());
				4081	// If we have unwound to the bottom then backtrack.
				4082	macro_assembler->CheckGreedyLoop(trace->backtrack());
				4083	// Otherwise try the second priority at an earlier position.
				4084	macro_assembler->AdvanceCurrentPosition(-text_length);
				4085	macro_assembler->GoTo(&second_choice);
				4086	return new_trace;
				4087	}
				4088
				4089	int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
				4090	Trace* trace) {
				4091	int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
				4092	if (alternatives_->length() != 2) return eats_at_least;
				4093
				4094	GuardedAlternative alt1 = alternatives_->at(1);
				4095	if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
				4096	return eats_at_least;
				4097	}
				4098	RegExpNode* eats_anything_node = alt1.node();
				4099	if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
				4100	return eats_at_least;
				4101	}
				4102
				4103	// Really we should be creating a new trace when we execute this function,
				4104	// but there is no need, because the code it generates cannot backtrack, and
				4105	// we always arrive here with a trivial trace (since it's the entry to a
				4106	// loop. That also implies that there are no preloaded characters, which is
				4107	// good, because it means we won't be violating any assumptions by
				4108	// overwriting those characters with new load instructions.
				4109	DCHECK(trace->is_trivial());
				4110
				4111	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4112	Isolate* isolate = macro_assembler->isolate();
				4113	// At this point we know that we are at a non-greedy loop that will eat
				4114	// any character one at a time. Any non-anchored regexp has such a
				4115	// loop prepended to it in order to find where it starts. We look for
				4116	// a pattern of the form ...abc... where we can look 6 characters ahead
				4117	// and step forwards 3 if the character is not one of abc. Abc need
				4118	// not be atoms, they can be any reasonably limited character class or
				4119	// small alternation.
				4120	BoyerMooreLookahead* bm = bm_info(false);
				4121	if (bm == NULL) {
				4122	eats_at_least = Min(kMaxLookaheadForBoyerMoore,
				4123	EatsAtLeast(kMaxLookaheadForBoyerMoore,
				4124	kRecursionBudget,
				4125	false));
				4126	if (eats_at_least >= 1) {
				4127	bm = new(zone()) BoyerMooreLookahead(eats_at_least,
				4128	compiler,
				4129	zone());
				4130	GuardedAlternative alt0 = alternatives_->at(0);
				4131	alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
				4132	}
				4133	}
				4134	if (bm != NULL) {
				4135	bm->EmitSkipInstructions(macro_assembler);
				4136	}
				4137	return eats_at_least;
				4138	}
				4139
				4140
				4141	void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
				4142	AlternativeGenerationList* alt_gens,
				4143	int first_choice,
				4144	Trace* trace,
				4145	PreloadState* preload) {
				4146	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4147	SetUpPreLoad(compiler, trace, preload);
				4148
				4149	// For now we just call all choices one after the other. The idea ultimately
				4150	// is to use the Dispatch table to try only the relevant ones.
				4151	int choice_count = alternatives_->length();
				4152
				4153	int new_flush_budget = trace->flush_budget() / choice_count;
				4154
				4155	for (int i = first_choice; i < choice_count; i++) {
				4156	bool is_last = i == choice_count - 1;
				4157	bool fall_through_on_failure = !is_last;
				4158	GuardedAlternative alternative = alternatives_->at(i);
				4159	AlternativeGeneration* alt_gen = alt_gens->at(i);
				4160	alt_gen->quick_check_details.set_characters(preload->preload_characters_);
				4161	ZoneList<Guard> guards = alternative.guards();
				4162	int guard_count = (guards == NULL) ? 0 : guards->length();
				4163	Trace new_trace(*trace);
				4164	new_trace.set_characters_preloaded(preload->preload_is_current_ ?
				4165	preload->preload_characters_ :
				4166	0);
				4167	if (preload->preload_has_checked_bounds_) {
				4168	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4169	}
				4170	new_trace.quick_check_performed()->Clear();
				4171	if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
				4172	if (!is_last) {
				4173	new_trace.set_backtrack(&alt_gen->after);
				4174	}
				4175	alt_gen->expects_preload = preload->preload_is_current_;
				4176	bool generate_full_check_inline = false;
				4177	if (compiler->optimize() &&
				4178	try_to_emit_quick_check_for_alternative(i == 0) &&
				4179	alternative.node()->EmitQuickCheck(
				4180	compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
				4181	&alt_gen->possible_success, &alt_gen->quick_check_details,
				4182	fall_through_on_failure)) {
				4183	// Quick check was generated for this choice.
				4184	preload->preload_is_current_ = true;
				4185	preload->preload_has_checked_bounds_ = true;
				4186	// If we generated the quick check to fall through on possible success,
				4187	// we now need to generate the full check inline.
				4188	if (!fall_through_on_failure) {
				4189	macro_assembler->Bind(&alt_gen->possible_success);
				4190	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4191	new_trace.set_characters_preloaded(preload->preload_characters_);
				4192	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4193	generate_full_check_inline = true;
				4194	}
				4195	} else if (alt_gen->quick_check_details.cannot_match()) {
				4196	if (!fall_through_on_failure) {
				4197	macro_assembler->GoTo(trace->backtrack());
				4198	}
				4199	continue;
				4200	} else {
				4201	// No quick check was generated. Put the full code here.
				4202	// If this is not the first choice then there could be slow checks from
				4203	// previous cases that go here when they fail. There's no reason to
				4204	// insist that they preload characters since the slow check we are about
				4205	// to generate probably can't use it.
				4206	if (i != first_choice) {
				4207	alt_gen->expects_preload = false;
				4208	new_trace.InvalidateCurrentCharacter();
				4209	}
				4210	generate_full_check_inline = true;
				4211	}
				4212	if (generate_full_check_inline) {
				4213	if (new_trace.actions() != NULL) {
				4214	new_trace.set_flush_budget(new_flush_budget);
				4215	}
				4216	for (int j = 0; j < guard_count; j++) {
				4217	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				4218	}
				4219	alternative.node()->Emit(compiler, &new_trace);
				4220	preload->preload_is_current_ = false;
				4221	}
				4222	macro_assembler->Bind(&alt_gen->after);
				4223	}
				4224	}
				4225
				4226
				4227	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				4228	Trace* trace,
				4229	GuardedAlternative alternative,
				4230	AlternativeGeneration* alt_gen,
				4231	int preload_characters,
				4232	bool next_expects_preload) {
				4233	if (!alt_gen->possible_success.is_linked()) return;
				4234
				4235	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4236	macro_assembler->Bind(&alt_gen->possible_success);
				4237	Trace out_of_line_trace(*trace);
				4238	out_of_line_trace.set_characters_preloaded(preload_characters);
				4239	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4240	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
				4241	ZoneList<Guard> guards = alternative.guards();
				4242	int guard_count = (guards == NULL) ? 0 : guards->length();
				4243	if (next_expects_preload) {
				4244	Label reload_current_char;
				4245	out_of_line_trace.set_backtrack(&reload_current_char);
				4246	for (int j = 0; j < guard_count; j++) {
				4247	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4248	}
				4249	alternative.node()->Emit(compiler, &out_of_line_trace);
				4250	macro_assembler->Bind(&reload_current_char);
				4251	// Reload the current character, since the next quick check expects that.
				4252	// We don't need to check bounds here because we only get into this
				4253	// code through a quick check which already did the checked load.
				4254	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				4255	NULL,
				4256	false,
				4257	preload_characters);
				4258	macro_assembler->GoTo(&(alt_gen->after));
				4259	} else {
				4260	out_of_line_trace.set_backtrack(&(alt_gen->after));
				4261	for (int j = 0; j < guard_count; j++) {
				4262	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4263	}
				4264	alternative.node()->Emit(compiler, &out_of_line_trace);
				4265	}
				4266	}
				4267
				4268
				4269	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4270	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4271	LimitResult limit_result = LimitVersions(compiler, trace);
				4272	if (limit_result == DONE) return;
				4273	DCHECK(limit_result == CONTINUE);
				4274
				4275	RecursionCheck rc(compiler);
				4276
				4277	switch (action_type_) {
				4278	case STORE_POSITION: {
				4279	Trace::DeferredCapture
				4280	new_capture(data_.u_position_register.reg,
				4281	data_.u_position_register.is_capture,
				4282	trace);
				4283	Trace new_trace = *trace;
				4284	new_trace.add_action(&new_capture);
				4285	on_success()->Emit(compiler, &new_trace);
				4286	break;
				4287	}
				4288	case INCREMENT_REGISTER: {
				4289	Trace::DeferredIncrementRegister
				4290	new_increment(data_.u_increment_register.reg);
				4291	Trace new_trace = *trace;
				4292	new_trace.add_action(&new_increment);
				4293	on_success()->Emit(compiler, &new_trace);
				4294	break;
				4295	}
				4296	case SET_REGISTER: {
				4297	Trace::DeferredSetRegister
				4298	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				4299	Trace new_trace = *trace;
				4300	new_trace.add_action(&new_set);
				4301	on_success()->Emit(compiler, &new_trace);
				4302	break;
				4303	}
				4304	case CLEAR_CAPTURES: {
				4305	Trace::DeferredClearCaptures
				4306	new_capture(Interval(data_.u_clear_captures.range_from,
				4307	data_.u_clear_captures.range_to));
				4308	Trace new_trace = *trace;
				4309	new_trace.add_action(&new_capture);
				4310	on_success()->Emit(compiler, &new_trace);
				4311	break;
				4312	}
				4313	case BEGIN_SUBMATCH:
				4314	if (!trace->is_trivial()) {
				4315	trace->Flush(compiler, this);
				4316	} else {
				4317	assembler->WriteCurrentPositionToRegister(
				4318	data_.u_submatch.current_position_register, 0);
				4319	assembler->WriteStackPointerToRegister(
				4320	data_.u_submatch.stack_pointer_register);
				4321	on_success()->Emit(compiler, trace);
				4322	}
				4323	break;
				4324	case EMPTY_MATCH_CHECK: {
				4325	int start_pos_reg = data_.u_empty_match_check.start_register;
				4326	int stored_pos = 0;
				4327	int rep_reg = data_.u_empty_match_check.repetition_register;
				4328	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				4329	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				4330	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				4331	// If we know we haven't advanced and there is no minimum we
				4332	// can just backtrack immediately.
				4333	assembler->GoTo(trace->backtrack());
				4334	} else if (know_dist && stored_pos < trace->cp_offset()) {
				4335	// If we know we've advanced we can generate the continuation
				4336	// immediately.
				4337	on_success()->Emit(compiler, trace);
				4338	} else if (!trace->is_trivial()) {
				4339	trace->Flush(compiler, this);
				4340	} else {
				4341	Label skip_empty_check;
				4342	// If we have a minimum number of repetitions we check the current
				4343	// number first and skip the empty check if it's not enough.
				4344	if (has_minimum) {
				4345	int limit = data_.u_empty_match_check.repetition_limit;
				4346	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				4347	}
				4348	// If the match is empty we bail out, otherwise we fall through
				4349	// to the on-success continuation.
				4350	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				4351	trace->backtrack());
				4352	assembler->Bind(&skip_empty_check);
				4353	on_success()->Emit(compiler, trace);
				4354	}
				4355	break;
				4356	}
				4357	case POSITIVE_SUBMATCH_SUCCESS: {
				4358	if (!trace->is_trivial()) {
				4359	trace->Flush(compiler, this);
				4360	return;
				4361	}
				4362	assembler->ReadCurrentPositionFromRegister(
				4363	data_.u_submatch.current_position_register);
				4364	assembler->ReadStackPointerFromRegister(
				4365	data_.u_submatch.stack_pointer_register);
				4366	int clear_register_count = data_.u_submatch.clear_register_count;
				4367	if (clear_register_count == 0) {
				4368	on_success()->Emit(compiler, trace);
				4369	return;
				4370	}
				4371	int clear_registers_from = data_.u_submatch.clear_register_from;
				4372	Label clear_registers_backtrack;
				4373	Trace new_trace = *trace;
				4374	new_trace.set_backtrack(&clear_registers_backtrack);
				4375	on_success()->Emit(compiler, &new_trace);
				4376
				4377	assembler->Bind(&clear_registers_backtrack);
				4378	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				4379	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				4380
				4381	DCHECK(trace->backtrack() == NULL);
				4382	assembler->Backtrack();
				4383	return;
				4384	}
				4385	default:
				4386	UNREACHABLE();
				4387	}
				4388	}
				4389
				4390
				4391	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4392	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4393	if (!trace->is_trivial()) {
				4394	trace->Flush(compiler, this);
				4395	return;
				4396	}
				4397
				4398	LimitResult limit_result = LimitVersions(compiler, trace);
				4399	if (limit_result == DONE) return;
				4400	DCHECK(limit_result == CONTINUE);
				4401
				4402	RecursionCheck rc(compiler);
				4403
				4404	DCHECK_EQ(start_reg_ + 1, end_reg_);
				4405	if (compiler->ignore_case()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4406	assembler->CheckNotBackReferenceIgnoreCase(
				4407	start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4408	} else {
				4409	assembler->CheckNotBackReference(start_reg_, read_backward(),
				4410	trace->backtrack());
				4411	}
				4412	// We are going to advance backward, so we may end up at the start.
				4413	if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4414
				4415	// Check that the back reference does not end inside a surrogate pair.
				4416	if (compiler->unicode() && !compiler->one_byte()) {
				4417	assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
				4418	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4419	on_success()->Emit(compiler, trace);
				4420	}
				4421
				4422
				4423	// -------------------------------------------------------------------
				4424	// Dot/dotty output
				4425
				4426
				4427	#ifdef DEBUG
				4428
				4429
				4430	class DotPrinter: public NodeVisitor {
				4431	public:
				4432	DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
				4433	: os_(os),
				4434	ignore_case_(ignore_case) {}
				4435	void PrintNode(const char* label, RegExpNode* node);
				4436	void Visit(RegExpNode* node);
				4437	void PrintAttributes(RegExpNode* from);
				4438	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				4439	#define DECLARE_VISIT(Type) \
				4440	virtual void Visit##Type(Type##Node* that);
				4441	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				4442	#undef DECLARE_VISIT
				4443	private:
				4444	std::ostream& os_;
				4445	bool ignore_case_;
				4446	};
				4447
				4448
				4449	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				4450	os_ << "digraph G {\n graph [label=\"";
				4451	for (int i = 0; label[i]; i++) {
				4452	switch (label[i]) {
				4453	case '\\':
				4454	os_ << "\\\\";
				4455	break;
				4456	case '"':
				4457	os_ << "\"";
				4458	break;
				4459	default:
				4460	os_ << label[i];
				4461	break;
				4462	}
				4463	}
				4464	os_ << "\"];\n";
				4465	Visit(node);
				4466	os_ << "}" << std::endl;
				4467	}
				4468
				4469
				4470	void DotPrinter::Visit(RegExpNode* node) {
				4471	if (node->info()->visited) return;
				4472	node->info()->visited = true;
				4473	node->Accept(this);
				4474	}
				4475
				4476
				4477	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				4478	os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
				4479	Visit(on_failure);
				4480	}
				4481
				4482
				4483	class TableEntryBodyPrinter {
				4484	public:
				4485	TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
				4486	: os_(os),
				4487	choice_(choice) {}
				4488	void Call(uc16 from, DispatchTable::Entry entry) {
				4489	OutSet* out_set = entry.out_set();
				4490	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4491	if (out_set->Get(i)) {
				4492	os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
				4493	<< choice()->alternatives()->at(i).node() << ";\n";
				4494	}
				4495	}
				4496	}
				4497	private:
				4498	ChoiceNode* choice() { return choice_; }
				4499	std::ostream& os_;
				4500	ChoiceNode* choice_;
				4501	};
				4502
				4503
				4504	class TableEntryHeaderPrinter {
				4505	public:
				4506	explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
				4507	: first_(true),
				4508	os_(os) {}
				4509	void Call(uc16 from, DispatchTable::Entry entry) {
				4510	if (first_) {
				4511	first_ = false;
				4512	} else {
				4513	os_ << "\|";
				4514	}
				4515	os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "\|{";
				4516	OutSet* out_set = entry.out_set();
				4517	int priority = 0;
				4518	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4519	if (out_set->Get(i)) {
				4520	if (priority > 0) os_ << "\|";
				4521	os_ << "<s" << from << "o" << i << "> " << priority;
				4522	priority++;
				4523	}
				4524	}
				4525	os_ << "}}";
				4526	}
				4527
				4528	private:
				4529	bool first_;
				4530	std::ostream& os_;
				4531	};
				4532
				4533
				4534	class AttributePrinter {
				4535	public:
				4536	explicit AttributePrinter(std::ostream& os) // NOLINT
				4537	: os_(os),
				4538	first_(true) {}
				4539	void PrintSeparator() {
				4540	if (first_) {
				4541	first_ = false;
				4542	} else {
				4543	os_ << "\|";
				4544	}
				4545	}
				4546	void PrintBit(const char* name, bool value) {
				4547	if (!value) return;
				4548	PrintSeparator();
				4549	os_ << "{" << name << "}";
				4550	}
				4551	void PrintPositive(const char* name, int value) {
				4552	if (value < 0) return;
				4553	PrintSeparator();
				4554	os_ << "{" << name << "\|" << value << "}";
				4555	}
				4556
				4557	private:
				4558	std::ostream& os_;
				4559	bool first_;
				4560	};
				4561
				4562
				4563	void DotPrinter::PrintAttributes(RegExpNode* that) {
				4564	os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
				4565	<< "margin=0.1, fontsize=10, label=\"{";
				4566	AttributePrinter printer(os_);
				4567	NodeInfo* info = that->info();
				4568	printer.PrintBit("NI", info->follows_newline_interest);
				4569	printer.PrintBit("WI", info->follows_word_interest);
				4570	printer.PrintBit("SI", info->follows_start_interest);
				4571	Label* label = that->label();
				4572	if (label->is_bound())
				4573	printer.PrintPositive("@", label->pos());
				4574	os_ << "}\"];\n"
				4575	<< " a" << that << " -> n" << that
				4576	<< " [style=dashed, color=grey, arrowhead=none];\n";
				4577	}
				4578
				4579
				4580	static const bool kPrintDispatchTable = false;
				4581	void DotPrinter::VisitChoice(ChoiceNode* that) {
				4582	if (kPrintDispatchTable) {
				4583	os_ << " n" << that << " [shape=Mrecord, label=\"";
				4584	TableEntryHeaderPrinter header_printer(os_);
				4585	that->GetTable(ignore_case_)->ForEach(&header_printer);
				4586	os_ << "\"]\n";
				4587	PrintAttributes(that);
				4588	TableEntryBodyPrinter body_printer(os_, that);
				4589	that->GetTable(ignore_case_)->ForEach(&body_printer);
				4590	} else {
				4591	os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
				4592	for (int i = 0; i < that->alternatives()->length(); i++) {
				4593	GuardedAlternative alt = that->alternatives()->at(i);
				4594	os_ << " n" << that << " -> n" << alt.node();
				4595	}
				4596	}
				4597	for (int i = 0; i < that->alternatives()->length(); i++) {
				4598	GuardedAlternative alt = that->alternatives()->at(i);
				4599	alt.node()->Accept(this);
				4600	}
				4601	}
				4602
				4603
				4604	void DotPrinter::VisitText(TextNode* that) {
				4605	Zone* zone = that->zone();
				4606	os_ << " n" << that << " [label=\"";
				4607	for (int i = 0; i < that->elements()->length(); i++) {
				4608	if (i > 0) os_ << " ";
				4609	TextElement elm = that->elements()->at(i);
				4610	switch (elm.text_type()) {
				4611	case TextElement::ATOM: {
				4612	Vector<const uc16> data = elm.atom()->data();
				4613	for (int i = 0; i < data.length(); i++) {
				4614	os_ << static_cast<char>(data[i]);
				4615	}
				4616	break;
				4617	}
				4618	case TextElement::CHAR_CLASS: {
				4619	RegExpCharacterClass* node = elm.char_class();
				4620	os_ << "[";
				4621	if (node->is_negated()) os_ << "^";
				4622	for (int j = 0; j < node->ranges(zone)->length(); j++) {
				4623	CharacterRange range = node->ranges(zone)->at(j);
				4624	os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
				4625	}
				4626	os_ << "]";
				4627	break;
				4628	}
				4629	default:
				4630	UNREACHABLE();
				4631	}
				4632	}
				4633	os_ << "\", shape=box, peripheries=2];\n";
				4634	PrintAttributes(that);
				4635	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4636	Visit(that->on_success());
				4637	}
				4638
				4639
				4640	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				4641	os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
				4642	<< that->end_register() << "\", shape=doubleoctagon];\n";
				4643	PrintAttributes(that);
				4644	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4645	Visit(that->on_success());
				4646	}
				4647
				4648
				4649	void DotPrinter::VisitEnd(EndNode* that) {
				4650	os_ << " n" << that << " [style=bold, shape=point];\n";
				4651	PrintAttributes(that);
				4652	}
				4653
				4654
				4655	void DotPrinter::VisitAssertion(AssertionNode* that) {
				4656	os_ << " n" << that << " [";
				4657	switch (that->assertion_type()) {
				4658	case AssertionNode::AT_END:
				4659	os_ << "label=\"$\", shape=septagon";
				4660	break;
				4661	case AssertionNode::AT_START:
				4662	os_ << "label=\"^\", shape=septagon";
				4663	break;
				4664	case AssertionNode::AT_BOUNDARY:
				4665	os_ << "label=\"\\b\", shape=septagon";
				4666	break;
				4667	case AssertionNode::AT_NON_BOUNDARY:
				4668	os_ << "label=\"\\B\", shape=septagon";
				4669	break;
				4670	case AssertionNode::AFTER_NEWLINE:
				4671	os_ << "label=\"(?<=\\n)\", shape=septagon";
				4672	break;
				4673	}
				4674	os_ << "];\n";
				4675	PrintAttributes(that);
				4676	RegExpNode* successor = that->on_success();
				4677	os_ << " n" << that << " -> n" << successor << ";\n";
				4678	Visit(successor);
				4679	}
				4680
				4681
				4682	void DotPrinter::VisitAction(ActionNode* that) {
				4683	os_ << " n" << that << " [";
				4684	switch (that->action_type_) {
				4685	case ActionNode::SET_REGISTER:
				4686	os_ << "label=\"$" << that->data_.u_store_register.reg
				4687	<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
				4688	break;
				4689	case ActionNode::INCREMENT_REGISTER:
				4690	os_ << "label=\"$" << that->data_.u_increment_register.reg
				4691	<< "++\", shape=octagon";
				4692	break;
				4693	case ActionNode::STORE_POSITION:
				4694	os_ << "label=\"$" << that->data_.u_position_register.reg
				4695	<< ":=$pos\", shape=octagon";
				4696	break;
				4697	case ActionNode::BEGIN_SUBMATCH:
				4698	os_ << "label=\"$" << that->data_.u_submatch.current_position_register
				4699	<< ":=$pos,begin\", shape=septagon";
				4700	break;
				4701	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				4702	os_ << "label=\"escape\", shape=septagon";
				4703	break;
				4704	case ActionNode::EMPTY_MATCH_CHECK:
				4705	os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
				4706	<< "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
				4707	<< "<" << that->data_.u_empty_match_check.repetition_limit
				4708	<< "?\", shape=septagon";
				4709	break;
				4710	case ActionNode::CLEAR_CAPTURES: {
				4711	os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
				4712	<< " to $" << that->data_.u_clear_captures.range_to
				4713	<< "\", shape=septagon";
				4714	break;
				4715	}
				4716	}
				4717	os_ << "];\n";
				4718	PrintAttributes(that);
				4719	RegExpNode* successor = that->on_success();
				4720	os_ << " n" << that << " -> n" << successor << ";\n";
				4721	Visit(successor);
				4722	}
				4723
				4724
				4725	class DispatchTableDumper {
				4726	public:
				4727	explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
				4728	void Call(uc16 key, DispatchTable::Entry entry);
				4729	private:
				4730	std::ostream& os_;
				4731	};
				4732
				4733
				4734	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				4735	os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
				4736	OutSet* set = entry.out_set();
				4737	bool first = true;
				4738	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4739	if (set->Get(i)) {
				4740	if (first) {
				4741	first = false;
				4742	} else {
				4743	os_ << ", ";
				4744	}
				4745	os_ << i;
				4746	}
				4747	}
				4748	os_ << "}\n";
				4749	}
				4750
				4751
				4752	void DispatchTable::Dump() {
				4753	OFStream os(stderr);
				4754	DispatchTableDumper dumper(os);
				4755	tree()->ForEach(&dumper);
				4756	}
				4757
				4758
				4759	void RegExpEngine::DotPrint(const char* label,
				4760	RegExpNode* node,
				4761	bool ignore_case) {
				4762	OFStream os(stdout);
				4763	DotPrinter printer(os, ignore_case);
				4764	printer.PrintNode(label, node);
				4765	}
				4766
				4767
				4768	#endif // DEBUG
				4769
				4770
				4771	// -------------------------------------------------------------------
				4772	// Tree to graph conversion
				4773
				4774	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				4775	RegExpNode* on_success) {
				4776	ZoneList<TextElement>* elms =
				4777	new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
				4778	elms->Add(TextElement::Atom(this), compiler->zone());
				4779	return new (compiler->zone())
				4780	TextNode(elms, compiler->read_backward(), on_success);
				4781	}
				4782
				4783
				4784	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				4785	RegExpNode* on_success) {
				4786	return new (compiler->zone())
				4787	TextNode(elements(), compiler->read_backward(), on_success);
				4788	}
				4789
				4790
				4791	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				4792	const int* special_class,
				4793	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4794	length--; // Remove final marker.
				4795	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4796	DCHECK(ranges->length() != 0);
				4797	DCHECK(length != 0);
				4798	DCHECK(special_class[0] != 0);
				4799	if (ranges->length() != (length >> 1) + 1) {
				4800	return false;
				4801	}
				4802	CharacterRange range = ranges->at(0);
				4803	if (range.from() != 0) {
				4804	return false;
				4805	}
				4806	for (int i = 0; i < length; i += 2) {
				4807	if (special_class[i] != (range.to() + 1)) {
				4808	return false;
				4809	}
				4810	range = ranges->at((i >> 1) + 1);
				4811	if (special_class[i+1] != range.from()) {
				4812	return false;
				4813	}
				4814	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4815	if (range.to() != String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4816	return false;
				4817	}
				4818	return true;
				4819	}
				4820
				4821
				4822	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				4823	const int* special_class,
				4824	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4825	length--; // Remove final marker.
				4826	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4827	if (ranges->length() * 2 != length) {
				4828	return false;
				4829	}
				4830	for (int i = 0; i < length; i += 2) {
				4831	CharacterRange range = ranges->at(i >> 1);
				4832	if (range.from() != special_class[i] \|\|
				4833	range.to() != special_class[i + 1] - 1) {
				4834	return false;
				4835	}
				4836	}
				4837	return true;
				4838	}
				4839
				4840
				4841	bool RegExpCharacterClass::is_standard(Zone* zone) {
				4842	// TODO(lrn): Remove need for this function, by not throwing away information
				4843	// along the way.
				4844	if (is_negated_) {
				4845	return false;
				4846	}
				4847	if (set_.is_standard()) {
				4848	return true;
				4849	}
				4850	if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4851	set_.set_standard_set_type('s');
				4852	return true;
				4853	}
				4854	if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4855	set_.set_standard_set_type('S');
				4856	return true;
				4857	}
				4858	if (CompareInverseRanges(set_.ranges(zone),
				4859	kLineTerminatorRanges,
				4860	kLineTerminatorRangeCount)) {
				4861	set_.set_standard_set_type('.');
				4862	return true;
				4863	}
				4864	if (CompareRanges(set_.ranges(zone),
				4865	kLineTerminatorRanges,
				4866	kLineTerminatorRangeCount)) {
				4867	set_.set_standard_set_type('n');
				4868	return true;
				4869	}
				4870	if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4871	set_.set_standard_set_type('w');
				4872	return true;
				4873	}
				4874	if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4875	set_.set_standard_set_type('W');
				4876	return true;
				4877	}
				4878	return false;
				4879	}
				4880
				4881
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4882	UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
				4883	ZoneList<CharacterRange>* base)
				4884	: zone_(zone),
				4885	table_(zone),
				4886	bmp_(nullptr),
				4887	lead_surrogates_(nullptr),
				4888	trail_surrogates_(nullptr),
				4889	non_bmp_(nullptr) {
				4890	// The unicode range splitter categorizes given character ranges into:
				4891	// - Code points from the BMP representable by one code unit.
				4892	// - Code points outside the BMP that need to be split into surrogate pairs.
				4893	// - Lone lead surrogates.
				4894	// - Lone trail surrogates.
				4895	// Lone surrogates are valid code points, even though no actual characters.
				4896	// They require special matching to make sure we do not split surrogate pairs.
				4897	// We use the dispatch table to accomplish this. The base range is split up
				4898	// by the table by the overlay ranges, and the Call callback is used to
				4899	// filter and collect ranges for each category.
				4900	for (int i = 0; i < base->length(); i++) {
				4901	table_.AddRange(base->at(i), kBase, zone_);
				4902	}
				4903	// Add overlay ranges.
				4904	table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
				4905	kBmpCodePoints, zone_);
				4906	table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
				4907	kLeadSurrogates, zone_);
				4908	table_.AddRange(
				4909	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				4910	kTrailSurrogates, zone_);
				4911	table_.AddRange(
				4912	CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
				4913	kBmpCodePoints, zone_);
				4914	table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
				4915	kNonBmpCodePoints, zone_);
				4916	table_.ForEach(this);
				4917	}
				4918
				4919
				4920	void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
				4921	OutSet* outset = entry.out_set();
				4922	if (!outset->Get(kBase)) return;
				4923	ZoneList<CharacterRange>** target = NULL;
				4924	if (outset->Get(kBmpCodePoints)) {
				4925	target = &bmp_;
				4926	} else if (outset->Get(kLeadSurrogates)) {
				4927	target = &lead_surrogates_;
				4928	} else if (outset->Get(kTrailSurrogates)) {
				4929	target = &trail_surrogates_;
				4930	} else {
				4931	DCHECK(outset->Get(kNonBmpCodePoints));
				4932	target = &non_bmp_;
				4933	}
				4934	if (target == NULL) target = new (zone_) ZoneList<CharacterRange>(2, zone_);
				4935	(*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
				4936	}
				4937
				4938
				4939	void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
				4940	RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
				4941	ZoneList<CharacterRange>* bmp = splitter->bmp();
				4942	if (bmp == nullptr) return;
				4943	result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
				4944	compiler->zone(), bmp, compiler->read_backward(), on_success)));
				4945	}
				4946
				4947
				4948	void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
				4949	RegExpNode* on_success,
				4950	UnicodeRangeSplitter* splitter) {
				4951	ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
				4952	if (non_bmp == nullptr) return;
				4953	DCHECK(compiler->unicode());
				4954	DCHECK(!compiler->one_byte());
				4955	Zone* zone = compiler->zone();
				4956	CharacterRange::Canonicalize(non_bmp);
				4957	for (int i = 0; i < non_bmp->length(); i++) {
				4958	// Match surrogate pair.
				4959	// E.g. [\u10005-\u11005] becomes
				4960	// \ud800[\udc05-\udfff]\|
				4961	// [\ud801-\ud803][\udc00-\udfff]\|
				4962	// \ud804[\udc00-\udc05]
				4963	uc32 from = non_bmp->at(i).from();
				4964	uc32 to = non_bmp->at(i).to();
				4965	uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
				4966	uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
				4967	uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
				4968	uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
				4969	if (from_l == to_l) {
				4970	// The lead surrogate is the same.
				4971	result->AddAlternative(
				4972	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4973	zone, CharacterRange::Singleton(from_l),
				4974	CharacterRange::Range(from_t, to_t), compiler->read_backward(),
				4975	on_success)));
				4976	} else {
				4977	if (from_t != kTrailSurrogateStart) {
				4978	// Add [from_l][from_t-\udfff]
				4979	result->AddAlternative(
				4980	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4981	zone, CharacterRange::Singleton(from_l),
				4982	CharacterRange::Range(from_t, kTrailSurrogateEnd),
				4983	compiler->read_backward(), on_success)));
				4984	from_l++;
				4985	}
				4986	if (to_t != kTrailSurrogateEnd) {
				4987	// Add [to_l][\udc00-to_t]
				4988	result->AddAlternative(
				4989	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4990	zone, CharacterRange::Singleton(to_l),
				4991	CharacterRange::Range(kTrailSurrogateStart, to_t),
				4992	compiler->read_backward(), on_success)));
				4993	to_l--;
				4994	}
				4995	if (from_l <= to_l) {
				4996	// Add [from_l-to_l][\udc00-\udfff]
				4997	result->AddAlternative(
				4998	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4999	zone, CharacterRange::Range(from_l, to_l),
				5000	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				5001	compiler->read_backward(), on_success)));
				5002	}
				5003	}
				5004	}
				5005	}
				5006
				5007
				5008	RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
				5009	RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
				5010	ZoneList<CharacterRange>* match, RegExpNode* on_success,
				5011	bool read_backward) {
				5012	Zone* zone = compiler->zone();
				5013	RegExpNode* match_node = TextNode::CreateForCharacterRanges(
				5014	zone, match, read_backward, on_success);
				5015	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5016	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5017	RegExpLookaround::Builder lookaround(false, match_node, stack_register,
				5018	position_register);
				5019	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5020	zone, lookbehind, !read_backward, lookaround.on_match_success());
				5021	return lookaround.ForMatch(negative_match);
				5022	}
				5023
				5024
				5025	RegExpNode* MatchAndNegativeLookaroundInReadDirection(
				5026	RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
				5027	ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
				5028	bool read_backward) {
				5029	Zone* zone = compiler->zone();
				5030	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5031	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5032	RegExpLookaround::Builder lookaround(false, on_success, stack_register,
				5033	position_register);
				5034	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5035	zone, lookahead, read_backward, lookaround.on_match_success());
				5036	return TextNode::CreateForCharacterRanges(
				5037	zone, match, read_backward, lookaround.ForMatch(negative_match));
				5038	}
				5039
				5040
				5041	void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5042	RegExpNode* on_success,
				5043	UnicodeRangeSplitter* splitter) {
				5044	ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
				5045	if (lead_surrogates == nullptr) return;
				5046	Zone* zone = compiler->zone();
				5047	// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
				5048	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				5049	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				5050
				5051	RegExpNode* match;
				5052	if (compiler->read_backward()) {
				5053	// Reading backward. Assert that reading forward, there is no trail
				5054	// surrogate, and then backward match the lead surrogate.
				5055	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5056	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5057	} else {
				5058	// Reading forward. Forward match the lead surrogate and assert that
				5059	// no trail surrogate follows.
				5060	match = MatchAndNegativeLookaroundInReadDirection(
				5061	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5062	}
				5063	result->AddAlternative(GuardedAlternative(match));
				5064	}
				5065
				5066
				5067	void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5068	RegExpNode* on_success,
				5069	UnicodeRangeSplitter* splitter) {
				5070	ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
				5071	if (trail_surrogates == nullptr) return;
				5072	Zone* zone = compiler->zone();
				5073	// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
				5074	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				5075	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				5076
				5077	RegExpNode* match;
				5078	if (compiler->read_backward()) {
				5079	// Reading backward. Backward match the trail surrogate and assert that no
				5080	// lead surrogate precedes it.
				5081	match = MatchAndNegativeLookaroundInReadDirection(
				5082	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5083	} else {
				5084	// Reading forward. Assert that reading backward, there is no lead
				5085	// surrogate, and then forward match the trail surrogate.
				5086	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5087	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5088	}
				5089	result->AddAlternative(GuardedAlternative(match));
				5090	}
				5091
				5092	RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
				5093	RegExpNode* on_success) {
				5094	// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
				5095	DCHECK(!compiler->read_backward());
				5096	Zone* zone = compiler->zone();
				5097	// Advance any character. If the character happens to be a lead surrogate and
				5098	// we advanced into the middle of a surrogate pair, it will work out, as
				5099	// nothing will match from there. We will have to advance again, consuming
				5100	// the associated trail surrogate.
				5101	ZoneList<CharacterRange>* range = CharacterRange::List(
				5102	zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
				5103	return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
				5104	}
				5105
				5106
				5107	void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
				5108	ZoneList<CharacterRange>* ranges) {
				5109	#ifdef V8_I18N_SUPPORT
				5110	// Use ICU to compute the case fold closure over the ranges.
				5111	DCHECK(compiler->unicode());
				5112	DCHECK(compiler->ignore_case());
				5113	USet* set = uset_openEmpty();
				5114	for (int i = 0; i < ranges->length(); i++) {
				5115	uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
				5116	}
				5117	ranges->Clear();
				5118	uset_closeOver(set, USET_CASE_INSENSITIVE);
				5119	// Full case mapping map single characters to multiple characters.
				5120	// Those are represented as strings in the set. Remove them so that
				5121	// we end up with only simple and common case mappings.
				5122	uset_removeAllStrings(set);
				5123	int item_count = uset_getItemCount(set);
				5124	int item_result = 0;
				5125	UErrorCode ec = U_ZERO_ERROR;
				5126	Zone* zone = compiler->zone();
				5127	for (int i = 0; i < item_count; i++) {
				5128	uc32 start = 0;
				5129	uc32 end = 0;
				5130	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
				5131	ranges->Add(CharacterRange::Range(start, end), zone);
				5132	}
				5133	// No errors and everything we collected have been ranges.
				5134	DCHECK_EQ(U_ZERO_ERROR, ec);
				5135	DCHECK_EQ(0, item_result);
				5136	uset_close(set);
				5137	#else
				5138	// Fallback if ICU is not included.
				5139	CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
				5140	ranges, compiler->one_byte());
				5141	#endif // V8_I18N_SUPPORT
				5142	CharacterRange::Canonicalize(ranges);
				5143	}
				5144
				5145
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5146	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				5147	RegExpNode* on_success) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5148	set_.Canonicalize();
				5149	Zone* zone = compiler->zone();
				5150	ZoneList<CharacterRange>* ranges = this->ranges(zone);
				5151	if (compiler->unicode() && compiler->ignore_case()) {
				5152	AddUnicodeCaseEquivalents(compiler, ranges);
				5153	}
				5154	if (compiler->unicode() && !compiler->one_byte()) {
				5155	if (is_negated()) {
				5156	ZoneList<CharacterRange>* negated =
				5157	new (zone) ZoneList<CharacterRange>(2, zone);
				5158	CharacterRange::Negate(ranges, negated, zone);
				5159	ranges = negated;
				5160	}
				5161	if (ranges->length() == 0) {
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame^]	5162	ranges->Add(CharacterRange::Everything(), zone);
				5163	RegExpCharacterClass* fail =
				5164	new (zone) RegExpCharacterClass(ranges, true);
				5165	return new (zone) TextNode(fail, compiler->read_backward(), on_success);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5166	}
				5167	if (standard_type() == '*') {
				5168	return UnanchoredAdvance(compiler, on_success);
				5169	} else {
				5170	ChoiceNode* result = new (zone) ChoiceNode(2, zone);
				5171	UnicodeRangeSplitter splitter(zone, ranges);
				5172	AddBmpCharacters(compiler, result, on_success, &splitter);
				5173	AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
				5174	AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
				5175	AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
				5176	return result;
				5177	}
				5178	} else {
				5179	return new (zone) TextNode(this, compiler->read_backward(), on_success);
				5180	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5181	}
				5182
				5183
				5184	int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
				5185	RegExpAtom* atom1 = (*a)->AsAtom();
				5186	RegExpAtom* atom2 = (*b)->AsAtom();
				5187	uc16 character1 = atom1->data().at(0);
				5188	uc16 character2 = atom2->data().at(0);
				5189	if (character1 < character2) return -1;
				5190	if (character1 > character2) return 1;
				5191	return 0;
				5192	}
				5193
				5194
				5195	static unibrow::uchar Canonical(
				5196	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5197	unibrow::uchar c) {
				5198	unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
				5199	int length = canonicalize->get(c, '\0', chars);
				5200	DCHECK_LE(length, 1);
				5201	unibrow::uchar canonical = c;
				5202	if (length == 1) canonical = chars[0];
				5203	return canonical;
				5204	}
				5205
				5206
				5207	int CompareFirstCharCaseIndependent(
				5208	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5209	RegExpTree* const* a, RegExpTree* const* b) {
				5210	RegExpAtom* atom1 = (*a)->AsAtom();
				5211	RegExpAtom* atom2 = (*b)->AsAtom();
				5212	unibrow::uchar character1 = atom1->data().at(0);
				5213	unibrow::uchar character2 = atom2->data().at(0);
				5214	if (character1 == character2) return 0;
				5215	if (character1 >= 'a' \|\| character2 >= 'a') {
				5216	character1 = Canonical(canonicalize, character1);
				5217	character2 = Canonical(canonicalize, character2);
				5218	}
				5219	return static_cast<int>(character1) - static_cast<int>(character2);
				5220	}
				5221
				5222
				5223	// We can stable sort runs of atoms, since the order does not matter if they
				5224	// start with different characters.
				5225	// Returns true if any consecutive atoms were found.
				5226	bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
				5227	ZoneList<RegExpTree> alternatives = this->alternatives();
				5228	int length = alternatives->length();
				5229	bool found_consecutive_atoms = false;
				5230	for (int i = 0; i < length; i++) {
				5231	while (i < length) {
				5232	RegExpTree* alternative = alternatives->at(i);
				5233	if (alternative->IsAtom()) break;
				5234	i++;
				5235	}
				5236	// i is length or it is the index of an atom.
				5237	if (i == length) break;
				5238	int first_atom = i;
				5239	i++;
				5240	while (i < length) {
				5241	RegExpTree* alternative = alternatives->at(i);
				5242	if (!alternative->IsAtom()) break;
				5243	i++;
				5244	}
				5245	// Sort atoms to get ones with common prefixes together.
				5246	// This step is more tricky if we are in a case-independent regexp,
				5247	// because it would change /is\|I/ to /I\|is/, and order matters when
				5248	// the regexp parts don't match only disjoint starting points. To fix
				5249	// this we have a version of CompareFirstChar that uses case-
				5250	// independent character classes for comparison.
				5251	DCHECK_LT(first_atom, alternatives->length());
				5252	DCHECK_LE(i, alternatives->length());
				5253	DCHECK_LE(first_atom, i);
				5254	if (compiler->ignore_case()) {
				5255	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5256	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5257	auto compare_closure =
				5258	[canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
				5259	return CompareFirstCharCaseIndependent(canonicalize, a, b);
				5260	};
				5261	alternatives->StableSort(compare_closure, first_atom, i - first_atom);
				5262	} else {
				5263	alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
				5264	}
				5265	if (i - first_atom > 1) found_consecutive_atoms = true;
				5266	}
				5267	return found_consecutive_atoms;
				5268	}
				5269
				5270
				5271	// Optimizes ab\|ac\|az to a(?:b\|c\|d).
				5272	void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
				5273	Zone* zone = compiler->zone();
				5274	ZoneList<RegExpTree> alternatives = this->alternatives();
				5275	int length = alternatives->length();
				5276
				5277	int write_posn = 0;
				5278	int i = 0;
				5279	while (i < length) {
				5280	RegExpTree* alternative = alternatives->at(i);
				5281	if (!alternative->IsAtom()) {
				5282	alternatives->at(write_posn++) = alternatives->at(i);
				5283	i++;
				5284	continue;
				5285	}
				5286	RegExpAtom* atom = alternative->AsAtom();
				5287	unibrow::uchar common_prefix = atom->data().at(0);
				5288	int first_with_prefix = i;
				5289	int prefix_length = atom->length();
				5290	i++;
				5291	while (i < length) {
				5292	alternative = alternatives->at(i);
				5293	if (!alternative->IsAtom()) break;
				5294	atom = alternative->AsAtom();
				5295	unibrow::uchar new_prefix = atom->data().at(0);
				5296	if (new_prefix != common_prefix) {
				5297	if (!compiler->ignore_case()) break;
				5298	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5299	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5300	new_prefix = Canonical(canonicalize, new_prefix);
				5301	common_prefix = Canonical(canonicalize, common_prefix);
				5302	if (new_prefix != common_prefix) break;
				5303	}
				5304	prefix_length = Min(prefix_length, atom->length());
				5305	i++;
				5306	}
				5307	if (i > first_with_prefix + 2) {
				5308	// Found worthwhile run of alternatives with common prefix of at least one
				5309	// character. The sorting function above did not sort on more than one
				5310	// character for reasons of correctness, but there may still be a longer
				5311	// common prefix if the terms were similar or presorted in the input.
				5312	// Find out how long the common prefix is.
				5313	int run_length = i - first_with_prefix;
				5314	atom = alternatives->at(first_with_prefix)->AsAtom();
				5315	for (int j = 1; j < run_length && prefix_length > 1; j++) {
				5316	RegExpAtom* old_atom =
				5317	alternatives->at(j + first_with_prefix)->AsAtom();
				5318	for (int k = 1; k < prefix_length; k++) {
				5319	if (atom->data().at(k) != old_atom->data().at(k)) {
				5320	prefix_length = k;
				5321	break;
				5322	}
				5323	}
				5324	}
				5325	RegExpAtom* prefix =
				5326	new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
				5327	ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);
				5328	pair->Add(prefix, zone);
				5329	ZoneList<RegExpTree> suffixes =
				5330	new (zone) ZoneList<RegExpTree*>(run_length, zone);
				5331	for (int j = 0; j < run_length; j++) {
				5332	RegExpAtom* old_atom =
				5333	alternatives->at(j + first_with_prefix)->AsAtom();
				5334	int len = old_atom->length();
				5335	if (len == prefix_length) {
				5336	suffixes->Add(new (zone) RegExpEmpty(), zone);
				5337	} else {
				5338	RegExpTree* suffix = new (zone) RegExpAtom(
				5339	old_atom->data().SubVector(prefix_length, old_atom->length()));
				5340	suffixes->Add(suffix, zone);
				5341	}
				5342	}
				5343	pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
				5344	alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
				5345	} else {
				5346	// Just copy any non-worthwhile alternatives.
				5347	for (int j = first_with_prefix; j < i; j++) {
				5348	alternatives->at(write_posn++) = alternatives->at(j);
				5349	}
				5350	}
				5351	}
				5352	alternatives->Rewind(write_posn); // Trim end of array.
				5353	}
				5354
				5355
				5356	// Optimizes b\|c\|z to [bcz].
				5357	void RegExpDisjunction::FixSingleCharacterDisjunctions(
				5358	RegExpCompiler* compiler) {
				5359	Zone* zone = compiler->zone();
				5360	ZoneList<RegExpTree> alternatives = this->alternatives();
				5361	int length = alternatives->length();
				5362
				5363	int write_posn = 0;
				5364	int i = 0;
				5365	while (i < length) {
				5366	RegExpTree* alternative = alternatives->at(i);
				5367	if (!alternative->IsAtom()) {
				5368	alternatives->at(write_posn++) = alternatives->at(i);
				5369	i++;
				5370	continue;
				5371	}
				5372	RegExpAtom* atom = alternative->AsAtom();
				5373	if (atom->length() != 1) {
				5374	alternatives->at(write_posn++) = alternatives->at(i);
				5375	i++;
				5376	continue;
				5377	}
				5378	int first_in_run = i;
				5379	i++;
				5380	while (i < length) {
				5381	alternative = alternatives->at(i);
				5382	if (!alternative->IsAtom()) break;
				5383	atom = alternative->AsAtom();
				5384	if (atom->length() != 1) break;
				5385	i++;
				5386	}
				5387	if (i > first_in_run + 1) {
				5388	// Found non-trivial run of single-character alternatives.
				5389	int run_length = i - first_in_run;
				5390	ZoneList<CharacterRange>* ranges =
				5391	new (zone) ZoneList<CharacterRange>(2, zone);
				5392	for (int j = 0; j < run_length; j++) {
				5393	RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
				5394	DCHECK_EQ(old_atom->length(), 1);
				5395	ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
				5396	}
				5397	alternatives->at(write_posn++) =
				5398	new (zone) RegExpCharacterClass(ranges, false);
				5399	} else {
				5400	// Just copy any trivial alternatives.
				5401	for (int j = first_in_run; j < i; j++) {
				5402	alternatives->at(write_posn++) = alternatives->at(j);
				5403	}
				5404	}
				5405	}
				5406	alternatives->Rewind(write_posn); // Trim end of array.
				5407	}
				5408
				5409
				5410	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				5411	RegExpNode* on_success) {
				5412	ZoneList<RegExpTree> alternatives = this->alternatives();
				5413
				5414	if (alternatives->length() > 2) {
				5415	bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
				5416	if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
				5417	FixSingleCharacterDisjunctions(compiler);
				5418	if (alternatives->length() == 1) {
				5419	return alternatives->at(0)->ToNode(compiler, on_success);
				5420	}
				5421	}
				5422
				5423	int length = alternatives->length();
				5424
				5425	ChoiceNode* result =
				5426	new(compiler->zone()) ChoiceNode(length, compiler->zone());
				5427	for (int i = 0; i < length; i++) {
				5428	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				5429	on_success));
				5430	result->AddAlternative(alternative);
				5431	}
				5432	return result;
				5433	}
				5434
				5435
				5436	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				5437	RegExpNode* on_success) {
				5438	return ToNode(min(),
				5439	max(),
				5440	is_greedy(),
				5441	body(),
				5442	compiler,
				5443	on_success);
				5444	}
				5445
				5446
				5447	// Scoped object to keep track of how much we unroll quantifier loops in the
				5448	// regexp graph generator.
				5449	class RegExpExpansionLimiter {
				5450	public:
				5451	static const int kMaxExpansionFactor = 6;
				5452	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				5453	: compiler_(compiler),
				5454	saved_expansion_factor_(compiler->current_expansion_factor()),
				5455	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				5456	DCHECK(factor > 0);
				5457	if (ok_to_expand_) {
				5458	if (factor > kMaxExpansionFactor) {
				5459	// Avoid integer overflow of the current expansion factor.
				5460	ok_to_expand_ = false;
				5461	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				5462	} else {
				5463	int new_factor = saved_expansion_factor_ * factor;
				5464	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				5465	compiler->set_current_expansion_factor(new_factor);
				5466	}
				5467	}
				5468	}
				5469
				5470	~RegExpExpansionLimiter() {
				5471	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				5472	}
				5473
				5474	bool ok_to_expand() { return ok_to_expand_; }
				5475
				5476	private:
				5477	RegExpCompiler* compiler_;
				5478	int saved_expansion_factor_;
				5479	bool ok_to_expand_;
				5480
				5481	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				5482	};
				5483
				5484
				5485	RegExpNode* RegExpQuantifier::ToNode(int min,
				5486	int max,
				5487	bool is_greedy,
				5488	RegExpTree* body,
				5489	RegExpCompiler* compiler,
				5490	RegExpNode* on_success,
				5491	bool not_at_start) {
				5492	// x{f, t} becomes this:
				5493	//
				5494	// (r++)<-.
				5495	// \| `
				5496	// \| (x)
				5497	// v ^
				5498	// (r=0)-->(?)---/ [if r < t]
				5499	// \|
				5500	// [if r >= f] \----> ...
				5501	//
				5502
				5503	// 15.10.2.5 RepeatMatcher algorithm.
				5504	// The parser has already eliminated the case where max is 0. In the case
				5505	// where max_match is zero the parser has removed the quantifier if min was
				5506	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				5507
				5508	// If we know that we cannot match zero length then things are a little
				5509	// simpler since we don't need to make the special zero length match check
				5510	// from step 2.1. If the min and max are small we can unroll a little in
				5511	// this case.
				5512	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				5513	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				5514	if (max == 0) return on_success; // This can happen due to recursion.
				5515	bool body_can_be_empty = (body->min_match() == 0);
				5516	int body_start_reg = RegExpCompiler::kNoRegister;
				5517	Interval capture_registers = body->CaptureRegisters();
				5518	bool needs_capture_clearing = !capture_registers.is_empty();
				5519	Zone* zone = compiler->zone();
				5520
				5521	if (body_can_be_empty) {
				5522	body_start_reg = compiler->AllocateRegister();
				5523	} else if (compiler->optimize() && !needs_capture_clearing) {
				5524	// Only unroll if there are no captures and the body can't be
				5525	// empty.
				5526	{
				5527	RegExpExpansionLimiter limiter(
				5528	compiler, min + ((max != min) ? 1 : 0));
				5529	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				5530	int new_max = (max == kInfinity) ? max : max - min;
				5531	// Recurse once to get the loop or optional matches after the fixed
				5532	// ones.
				5533	RegExpNode* answer = ToNode(
				5534	0, new_max, is_greedy, body, compiler, on_success, true);
				5535	// Unroll the forced matches from 0 to min. This can cause chains of
				5536	// TextNodes (which the parser does not generate). These should be
				5537	// combined if it turns out they hinder good code generation.
				5538	for (int i = 0; i < min; i++) {
				5539	answer = body->ToNode(compiler, answer);
				5540	}
				5541	return answer;
				5542	}
				5543	}
				5544	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				5545	DCHECK(max > 0); // Due to the 'if' above.
				5546	RegExpExpansionLimiter limiter(compiler, max);
				5547	if (limiter.ok_to_expand()) {
				5548	// Unroll the optional matches up to max.
				5549	RegExpNode* answer = on_success;
				5550	for (int i = 0; i < max; i++) {
				5551	ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
				5552	if (is_greedy) {
				5553	alternation->AddAlternative(
				5554	GuardedAlternative(body->ToNode(compiler, answer)));
				5555	alternation->AddAlternative(GuardedAlternative(on_success));
				5556	} else {
				5557	alternation->AddAlternative(GuardedAlternative(on_success));
				5558	alternation->AddAlternative(
				5559	GuardedAlternative(body->ToNode(compiler, answer)));
				5560	}
				5561	answer = alternation;
				5562	if (not_at_start && !compiler->read_backward()) {
				5563	alternation->set_not_at_start();
				5564	}
				5565	}
				5566	return answer;
				5567	}
				5568	}
				5569	}
				5570	bool has_min = min > 0;
				5571	bool has_max = max < RegExpTree::kInfinity;
				5572	bool needs_counter = has_min \|\| has_max;
				5573	int reg_ctr = needs_counter
				5574	? compiler->AllocateRegister()
				5575	: RegExpCompiler::kNoRegister;
				5576	LoopChoiceNode* center = new (zone)
				5577	LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
				5578	if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
				5579	RegExpNode* loop_return = needs_counter
				5580	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				5581	: static_cast<RegExpNode*>(center);
				5582	if (body_can_be_empty) {
				5583	// If the body can be empty we need to check if it was and then
				5584	// backtrack.
				5585	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				5586	reg_ctr,
				5587	min,
				5588	loop_return);
				5589	}
				5590	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				5591	if (body_can_be_empty) {
				5592	// If the body can be empty we need to store the start position
				5593	// so we can bail out if it was empty.
				5594	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				5595	}
				5596	if (needs_capture_clearing) {
				5597	// Before entering the body of this loop we need to clear captures.
				5598	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				5599	}
				5600	GuardedAlternative body_alt(body_node);
				5601	if (has_max) {
				5602	Guard* body_guard =
				5603	new(zone) Guard(reg_ctr, Guard::LT, max);
				5604	body_alt.AddGuard(body_guard, zone);
				5605	}
				5606	GuardedAlternative rest_alt(on_success);
				5607	if (has_min) {
				5608	Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
				5609	rest_alt.AddGuard(rest_guard, zone);
				5610	}
				5611	if (is_greedy) {
				5612	center->AddLoopAlternative(body_alt);
				5613	center->AddContinueAlternative(rest_alt);
				5614	} else {
				5615	center->AddContinueAlternative(rest_alt);
				5616	center->AddLoopAlternative(body_alt);
				5617	}
				5618	if (needs_counter) {
				5619	return ActionNode::SetRegister(reg_ctr, 0, center);
				5620	} else {
				5621	return center;
				5622	}
				5623	}
				5624
				5625
				5626	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				5627	RegExpNode* on_success) {
				5628	NodeInfo info;
				5629	Zone* zone = compiler->zone();
				5630
				5631	switch (assertion_type()) {
				5632	case START_OF_LINE:
				5633	return AssertionNode::AfterNewline(on_success);
				5634	case START_OF_INPUT:
				5635	return AssertionNode::AtStart(on_success);
				5636	case BOUNDARY:
				5637	return AssertionNode::AtBoundary(on_success);
				5638	case NON_BOUNDARY:
				5639	return AssertionNode::AtNonBoundary(on_success);
				5640	case END_OF_INPUT:
				5641	return AssertionNode::AtEnd(on_success);
				5642	case END_OF_LINE: {
				5643	// Compile $ in multiline regexps as an alternation with a positive
				5644	// lookahead in one side and an end-of-input on the other side.
				5645	// We need two registers for the lookahead.
				5646	int stack_pointer_register = compiler->AllocateRegister();
				5647	int position_register = compiler->AllocateRegister();
				5648	// The ChoiceNode to distinguish between a newline and end-of-input.
				5649	ChoiceNode* result = new(zone) ChoiceNode(2, zone);
				5650	// Create a newline atom.
				5651	ZoneList<CharacterRange>* newline_ranges =
				5652	new(zone) ZoneList<CharacterRange>(3, zone);
				5653	CharacterRange::AddClassEscape('n', newline_ranges, zone);
				5654	RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
				5655	TextNode* newline_matcher = new (zone) TextNode(
				5656	newline_atom, false, ActionNode::PositiveSubmatchSuccess(
				5657	stack_pointer_register, position_register,
				5658	0, // No captures inside.
				5659	-1, // Ignored if no captures.
				5660	on_success));
				5661	// Create an end-of-input matcher.
				5662	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				5663	stack_pointer_register,
				5664	position_register,
				5665	newline_matcher);
				5666	// Add the two alternatives to the ChoiceNode.
				5667	GuardedAlternative eol_alternative(end_of_line);
				5668	result->AddAlternative(eol_alternative);
				5669	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				5670	result->AddAlternative(end_alternative);
				5671	return result;
				5672	}
				5673	default:
				5674	UNREACHABLE();
				5675	}
				5676	return on_success;
				5677	}
				5678
				5679
				5680	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				5681	RegExpNode* on_success) {
				5682	return new (compiler->zone())
				5683	BackReferenceNode(RegExpCapture::StartRegister(index()),
				5684	RegExpCapture::EndRegister(index()),
				5685	compiler->read_backward(), on_success);
				5686	}
				5687
				5688
				5689	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				5690	RegExpNode* on_success) {
				5691	return on_success;
				5692	}
				5693
				5694
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5695	RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
				5696	int stack_pointer_register,
				5697	int position_register,
				5698	int capture_register_count,
				5699	int capture_register_start)
				5700	: is_positive_(is_positive),
				5701	on_success_(on_success),
				5702	stack_pointer_register_(stack_pointer_register),
				5703	position_register_(position_register) {
				5704	if (is_positive_) {
				5705	on_match_success_ = ActionNode::PositiveSubmatchSuccess(
				5706	stack_pointer_register, position_register, capture_register_count,
				5707	capture_register_start, on_success_);
				5708	} else {
				5709	Zone* zone = on_success_->zone();
				5710	on_match_success_ = new (zone) NegativeSubmatchSuccess(
				5711	stack_pointer_register, position_register, capture_register_count,
				5712	capture_register_start, zone);
				5713	}
				5714	}
				5715
				5716
				5717	RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
				5718	if (is_positive_) {
				5719	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5720	position_register_, match);
				5721	} else {
				5722	Zone* zone = on_success_->zone();
				5723	// We use a ChoiceNode to represent the negative lookaround. The first
				5724	// alternative is the negative match. On success, the end node backtracks.
				5725	// On failure, the second alternative is tried and leads to success.
				5726	// NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
				5727	// first exit when calculating quick checks.
				5728	ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
				5729	GuardedAlternative(match), GuardedAlternative(on_success_), zone);
				5730	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5731	position_register_, choice_node);
				5732	}
				5733	}
				5734
				5735
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5736	RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
				5737	RegExpNode* on_success) {
				5738	int stack_pointer_register = compiler->AllocateRegister();
				5739	int position_register = compiler->AllocateRegister();
				5740
				5741	const int registers_per_capture = 2;
				5742	const int register_of_first_capture = 2;
				5743	int register_count = capture_count_ * registers_per_capture;
				5744	int register_start =
				5745	register_of_first_capture + capture_from_ * registers_per_capture;
				5746
				5747	RegExpNode* result;
				5748	bool was_reading_backward = compiler->read_backward();
				5749	compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5750	Builder builder(is_positive(), on_success, stack_pointer_register,
				5751	position_register, register_count, register_start);
				5752	RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
				5753	result = builder.ForMatch(match);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5754	compiler->set_read_backward(was_reading_backward);
				5755	return result;
				5756	}
				5757
				5758
				5759	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				5760	RegExpNode* on_success) {
				5761	return ToNode(body(), index(), compiler, on_success);
				5762	}
				5763
				5764
				5765	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				5766	int index,
				5767	RegExpCompiler* compiler,
				5768	RegExpNode* on_success) {
				5769	DCHECK_NOT_NULL(body);
				5770	int start_reg = RegExpCapture::StartRegister(index);
				5771	int end_reg = RegExpCapture::EndRegister(index);
				5772	if (compiler->read_backward()) std::swap(start_reg, end_reg);
				5773	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				5774	RegExpNode* body_node = body->ToNode(compiler, store_end);
				5775	return ActionNode::StorePosition(start_reg, true, body_node);
				5776	}
				5777
				5778
				5779	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				5780	RegExpNode* on_success) {
				5781	ZoneList<RegExpTree> children = nodes();
				5782	RegExpNode* current = on_success;
				5783	if (compiler->read_backward()) {
				5784	for (int i = 0; i < children->length(); i++) {
				5785	current = children->at(i)->ToNode(compiler, current);
				5786	}
				5787	} else {
				5788	for (int i = children->length() - 1; i >= 0; i--) {
				5789	current = children->at(i)->ToNode(compiler, current);
				5790	}
				5791	}
				5792	return current;
				5793	}
				5794
				5795
				5796	static void AddClass(const int* elmv,
				5797	int elmc,
				5798	ZoneList<CharacterRange>* ranges,
				5799	Zone* zone) {
				5800	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5801	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5802	for (int i = 0; i < elmc; i += 2) {
				5803	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5804	ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5805	}
				5806	}
				5807
				5808
				5809	static void AddClassNegated(const int *elmv,
				5810	int elmc,
				5811	ZoneList<CharacterRange>* ranges,
				5812	Zone* zone) {
				5813	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5814	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5815	DCHECK(elmv[0] != 0x0000);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5816	DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5817	uc16 last = 0x0000;
				5818	for (int i = 0; i < elmc; i += 2) {
				5819	DCHECK(last <= elmv[i] - 1);
				5820	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5821	ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5822	last = elmv[i + 1];
				5823	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5824	ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5825	}
				5826
				5827
				5828	void CharacterRange::AddClassEscape(uc16 type,
				5829	ZoneList<CharacterRange>* ranges,
				5830	Zone* zone) {
				5831	switch (type) {
				5832	case 's':
				5833	AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5834	break;
				5835	case 'S':
				5836	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5837	break;
				5838	case 'w':
				5839	AddClass(kWordRanges, kWordRangeCount, ranges, zone);
				5840	break;
				5841	case 'W':
				5842	AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
				5843	break;
				5844	case 'd':
				5845	AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
				5846	break;
				5847	case 'D':
				5848	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
				5849	break;
				5850	case '.':
				5851	AddClassNegated(kLineTerminatorRanges,
				5852	kLineTerminatorRangeCount,
				5853	ranges,
				5854	zone);
				5855	break;
				5856	// This is not a character range as defined by the spec but a
				5857	// convenient shorthand for a character class that matches any
				5858	// character.
				5859	case '*':
				5860	ranges->Add(CharacterRange::Everything(), zone);
				5861	break;
				5862	// This is the set of characters matched by the $ and ^ symbols
				5863	// in multiline mode.
				5864	case 'n':
				5865	AddClass(kLineTerminatorRanges,
				5866	kLineTerminatorRangeCount,
				5867	ranges,
				5868	zone);
				5869	break;
				5870	default:
				5871	UNREACHABLE();
				5872	}
				5873	}
				5874
				5875
				5876	Vector<const int> CharacterRange::GetWordBounds() {
				5877	return Vector<const int>(kWordRanges, kWordRangeCount - 1);
				5878	}
				5879
				5880
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5881	void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
				5882	ZoneList<CharacterRange>* ranges,
				5883	bool is_one_byte) {
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame^]	5884	CharacterRange::Canonicalize(ranges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5885	int range_count = ranges->length();
				5886	for (int i = 0; i < range_count; i++) {
				5887	CharacterRange range = ranges->at(i);
				5888	uc32 bottom = range.from();
				5889	if (bottom > String::kMaxUtf16CodeUnit) return;
				5890	uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
				5891	// Nothing to be done for surrogates.
				5892	if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
				5893	if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
				5894	if (bottom > String::kMaxOneByteCharCode) return;
				5895	if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5896	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5897	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5898	if (top == bottom) {
				5899	// If this is a singleton we just expand the one character.
				5900	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5901	for (int i = 0; i < length; i++) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5902	uc32 chr = chars[i];
				5903	if (chr != bottom) {
				5904	ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5905	}
				5906	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5907	} else {
				5908	// If this is a range we expand the characters block by block, expanding
				5909	// contiguous subranges (blocks) one at a time. The approach is as
				5910	// follows. For a given start character we look up the remainder of the
				5911	// block that contains it (represented by the end point), for instance we
				5912	// find 'z' if the character is 'c'. A block is characterized by the
				5913	// property that all characters uncanonicalize in the same way, except
				5914	// that each entry in the result is incremented by the distance from the
				5915	// first element. So a-z is a block because 'a' uncanonicalizes to ['a',
				5916	// 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
				5917	// we've found the end point we look up its uncanonicalization and
				5918	// produce a range for each element. For instance for [c-f] we look up
				5919	// ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
				5920	// it is not already contained in the input, so [c-f] will be skipped but
				5921	// [C-F] will be added. If this range is not completely contained in a
				5922	// block we do this for all the blocks covered by the range (handling
				5923	// characters that is not in a block as a "singleton block").
				5924	unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5925	int pos = bottom;
				5926	while (pos <= top) {
				5927	int length =
				5928	isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
				5929	uc32 block_end;
				5930	if (length == 0) {
				5931	block_end = pos;
				5932	} else {
				5933	DCHECK_EQ(1, length);
				5934	block_end = equivalents[0];
				5935	}
				5936	int end = (block_end > top) ? top : block_end;
				5937	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
				5938	equivalents);
				5939	for (int i = 0; i < length; i++) {
				5940	uc32 c = equivalents[i];
				5941	uc32 range_from = c - (block_end - pos);
				5942	uc32 range_to = c - (block_end - end);
				5943	if (!(bottom <= range_from && range_to <= top)) {
				5944	ranges->Add(CharacterRange::Range(range_from, range_to), zone);
				5945	}
				5946	}
				5947	pos = end + 1;
				5948	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5949	}
				5950	}
				5951	}
				5952
				5953
				5954	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				5955	DCHECK_NOT_NULL(ranges);
				5956	int n = ranges->length();
				5957	if (n <= 1) return true;
				5958	int max = ranges->at(0).to();
				5959	for (int i = 1; i < n; i++) {
				5960	CharacterRange next_range = ranges->at(i);
				5961	if (next_range.from() <= max + 1) return false;
				5962	max = next_range.to();
				5963	}
				5964	return true;
				5965	}
				5966
				5967
				5968	ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
				5969	if (ranges_ == NULL) {
				5970	ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
				5971	CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
				5972	}
				5973	return ranges_;
				5974	}
				5975
				5976
				5977	// Move a number of elements in a zonelist to another position
				5978	// in the same list. Handles overlapping source and target areas.
				5979	static void MoveRanges(ZoneList<CharacterRange>* list,
				5980	int from,
				5981	int to,
				5982	int count) {
				5983	// Ranges are potentially overlapping.
				5984	if (from < to) {
				5985	for (int i = count - 1; i >= 0; i--) {
				5986	list->at(to + i) = list->at(from + i);
				5987	}
				5988	} else {
				5989	for (int i = 0; i < count; i++) {
				5990	list->at(to + i) = list->at(from + i);
				5991	}
				5992	}
				5993	}
				5994
				5995
				5996	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				5997	int count,
				5998	CharacterRange insert) {
				5999	// Inserts a range into list[0..count[, which must be sorted
				6000	// by from value and non-overlapping and non-adjacent, using at most
				6001	// list[0..count] for the result. Returns the number of resulting
				6002	// canonicalized ranges. Inserting a range may collapse existing ranges into
				6003	// fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6004	uc32 from = insert.from();
				6005	uc32 to = insert.to();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6006	int start_pos = 0;
				6007	int end_pos = count;
				6008	for (int i = count - 1; i >= 0; i--) {
				6009	CharacterRange current = list->at(i);
				6010	if (current.from() > to + 1) {
				6011	end_pos = i;
				6012	} else if (current.to() + 1 < from) {
				6013	start_pos = i + 1;
				6014	break;
				6015	}
				6016	}
				6017
				6018	// Inserted range overlaps, or is adjacent to, ranges at positions
				6019	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				6020	// not affected by the insertion.
				6021	// If start_pos == end_pos, the range must be inserted before start_pos.
				6022	// if start_pos < end_pos, the entire range from start_pos to end_pos
				6023	// must be merged with the insert range.
				6024
				6025	if (start_pos == end_pos) {
				6026	// Insert between existing ranges at position start_pos.
				6027	if (start_pos < count) {
				6028	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				6029	}
				6030	list->at(start_pos) = insert;
				6031	return count + 1;
				6032	}
				6033	if (start_pos + 1 == end_pos) {
				6034	// Replace single existing range at position start_pos.
				6035	CharacterRange to_replace = list->at(start_pos);
				6036	int new_from = Min(to_replace.from(), from);
				6037	int new_to = Max(to_replace.to(), to);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6038	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6039	return count;
				6040	}
				6041	// Replace a number of existing ranges from start_pos to end_pos - 1.
				6042	// Move the remaining ranges down.
				6043
				6044	int new_from = Min(list->at(start_pos).from(), from);
				6045	int new_to = Max(list->at(end_pos - 1).to(), to);
				6046	if (end_pos < count) {
				6047	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				6048	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6049	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6050	return count - (end_pos - start_pos) + 1;
				6051	}
				6052
				6053
				6054	void CharacterSet::Canonicalize() {
				6055	// Special/default classes are always considered canonical. The result
				6056	// of calling ranges() will be sorted.
				6057	if (ranges_ == NULL) return;
				6058	CharacterRange::Canonicalize(ranges_);
				6059	}
				6060
				6061
				6062	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				6063	if (character_ranges->length() <= 1) return;
				6064	// Check whether ranges are already canonical (increasing, non-overlapping,
				6065	// non-adjacent).
				6066	int n = character_ranges->length();
				6067	int max = character_ranges->at(0).to();
				6068	int i = 1;
				6069	while (i < n) {
				6070	CharacterRange current = character_ranges->at(i);
				6071	if (current.from() <= max + 1) {
				6072	break;
				6073	}
				6074	max = current.to();
				6075	i++;
				6076	}
				6077	// Canonical until the i'th range. If that's all of them, we are done.
				6078	if (i == n) return;
				6079
				6080	// The ranges at index i and forward are not canonicalized. Make them so by
				6081	// doing the equivalent of insertion sort (inserting each into the previous
				6082	// list, in order).
				6083	// Notice that inserting a range can reduce the number of ranges in the
				6084	// result due to combining of adjacent and overlapping ranges.
				6085	int read = i; // Range to insert.
				6086	int num_canonical = i; // Length of canonicalized part of list.
				6087	do {
				6088	num_canonical = InsertRangeInCanonicalList(character_ranges,
				6089	num_canonical,
				6090	character_ranges->at(read));
				6091	read++;
				6092	} while (read < n);
				6093	character_ranges->Rewind(num_canonical);
				6094
				6095	DCHECK(CharacterRange::IsCanonical(character_ranges));
				6096	}
				6097
				6098
				6099	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				6100	ZoneList<CharacterRange>* negated_ranges,
				6101	Zone* zone) {
				6102	DCHECK(CharacterRange::IsCanonical(ranges));
				6103	DCHECK_EQ(0, negated_ranges->length());
				6104	int range_count = ranges->length();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6105	uc32 from = 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6106	int i = 0;
				6107	if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6108	from = ranges->at(0).to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6109	i = 1;
				6110	}
				6111	while (i < range_count) {
				6112	CharacterRange range = ranges->at(i);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6113	negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
				6114	from = range.to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6115	i++;
				6116	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6117	if (from < String::kMaxCodePoint) {
				6118	negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6119	zone);
				6120	}
				6121	}
				6122
				6123
				6124	// -------------------------------------------------------------------
				6125	// Splay tree
				6126
				6127
				6128	OutSet* OutSet::Extend(unsigned value, Zone* zone) {
				6129	if (Get(value))
				6130	return this;
				6131	if (successors(zone) != NULL) {
				6132	for (int i = 0; i < successors(zone)->length(); i++) {
				6133	OutSet* successor = successors(zone)->at(i);
				6134	if (successor->Get(value))
				6135	return successor;
				6136	}
				6137	} else {
				6138	successors_ = new(zone) ZoneList<OutSet*>(2, zone);
				6139	}
				6140	OutSet* result = new(zone) OutSet(first_, remaining_);
				6141	result->Set(value, zone);
				6142	successors(zone)->Add(result, zone);
				6143	return result;
				6144	}
				6145
				6146
				6147	void OutSet::Set(unsigned value, Zone *zone) {
				6148	if (value < kFirstLimit) {
				6149	first_ \|= (1 << value);
				6150	} else {
				6151	if (remaining_ == NULL)
				6152	remaining_ = new(zone) ZoneList<unsigned>(1, zone);
				6153	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				6154	remaining_->Add(value, zone);
				6155	}
				6156	}
				6157
				6158
				6159	bool OutSet::Get(unsigned value) const {
				6160	if (value < kFirstLimit) {
				6161	return (first_ & (1 << value)) != 0;
				6162	} else if (remaining_ == NULL) {
				6163	return false;
				6164	} else {
				6165	return remaining_->Contains(value);
				6166	}
				6167	}
				6168
				6169
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6170	const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6171
				6172
				6173	void DispatchTable::AddRange(CharacterRange full_range, int value,
				6174	Zone* zone) {
				6175	CharacterRange current = full_range;
				6176	if (tree()->is_empty()) {
				6177	// If this is the first range we just insert into the table.
				6178	ZoneSplayTree<Config>::Locator loc;
				6179	bool inserted = tree()->Insert(current.from(), &loc);
				6180	DCHECK(inserted);
				6181	USE(inserted);
				6182	loc.set_value(Entry(current.from(), current.to(),
				6183	empty()->Extend(value, zone)));
				6184	return;
				6185	}
				6186	// First see if there is a range to the left of this one that
				6187	// overlaps.
				6188	ZoneSplayTree<Config>::Locator loc;
				6189	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				6190	Entry* entry = &loc.value();
				6191	// If we've found a range that overlaps with this one, and it
				6192	// starts strictly to the left of this one, we have to fix it
				6193	// because the following code only handles ranges that start on
				6194	// or after the start point of the range we're adding.
				6195	if (entry->from() < current.from() && entry->to() >= current.from()) {
				6196	// Snap the overlapping range in half around the start point of
				6197	// the range we're adding.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6198	CharacterRange left =
				6199	CharacterRange::Range(entry->from(), current.from() - 1);
				6200	CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6201	// The left part of the overlapping range doesn't overlap.
				6202	// Truncate the whole entry to be just the left part.
				6203	entry->set_to(left.to());
				6204	// The right part is the one that overlaps. We add this part
				6205	// to the map and let the next step deal with merging it with
				6206	// the range we're adding.
				6207	ZoneSplayTree<Config>::Locator loc;
				6208	bool inserted = tree()->Insert(right.from(), &loc);
				6209	DCHECK(inserted);
				6210	USE(inserted);
				6211	loc.set_value(Entry(right.from(),
				6212	right.to(),
				6213	entry->out_set()));
				6214	}
				6215	}
				6216	while (current.is_valid()) {
				6217	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				6218	(loc.value().from() <= current.to()) &&
				6219	(loc.value().to() >= current.from())) {
				6220	Entry* entry = &loc.value();
				6221	// We have overlap. If there is space between the start point of
				6222	// the range we're adding and where the overlapping range starts
				6223	// then we have to add a range covering just that space.
				6224	if (current.from() < entry->from()) {
				6225	ZoneSplayTree<Config>::Locator ins;
				6226	bool inserted = tree()->Insert(current.from(), &ins);
				6227	DCHECK(inserted);
				6228	USE(inserted);
				6229	ins.set_value(Entry(current.from(),
				6230	entry->from() - 1,
				6231	empty()->Extend(value, zone)));
				6232	current.set_from(entry->from());
				6233	}
				6234	DCHECK_EQ(current.from(), entry->from());
				6235	// If the overlapping range extends beyond the one we want to add
				6236	// we have to snap the right part off and add it separately.
				6237	if (entry->to() > current.to()) {
				6238	ZoneSplayTree<Config>::Locator ins;
				6239	bool inserted = tree()->Insert(current.to() + 1, &ins);
				6240	DCHECK(inserted);
				6241	USE(inserted);
				6242	ins.set_value(Entry(current.to() + 1,
				6243	entry->to(),
				6244	entry->out_set()));
				6245	entry->set_to(current.to());
				6246	}
				6247	DCHECK(entry->to() <= current.to());
				6248	// The overlapping range is now completely contained by the range
				6249	// we're adding so we can just update it and move the start point
				6250	// of the range we're adding just past it.
				6251	entry->AddValue(value, zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6252	DCHECK(entry->to() + 1 > current.from());
				6253	current.set_from(entry->to() + 1);
				6254	} else {
				6255	// There is no overlap so we can just add the range
				6256	ZoneSplayTree<Config>::Locator ins;
				6257	bool inserted = tree()->Insert(current.from(), &ins);
				6258	DCHECK(inserted);
				6259	USE(inserted);
				6260	ins.set_value(Entry(current.from(),
				6261	current.to(),
				6262	empty()->Extend(value, zone)));
				6263	break;
				6264	}
				6265	}
				6266	}
				6267
				6268
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6269	OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6270	ZoneSplayTree<Config>::Locator loc;
				6271	if (!tree()->FindGreatestLessThan(value, &loc))
				6272	return empty();
				6273	Entry* entry = &loc.value();
				6274	if (value <= entry->to())
				6275	return entry->out_set();
				6276	else
				6277	return empty();
				6278	}
				6279
				6280
				6281	// -------------------------------------------------------------------
				6282	// Analysis
				6283
				6284
				6285	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				6286	StackLimitCheck check(isolate());
				6287	if (check.HasOverflowed()) {
				6288	fail("Stack overflow");
				6289	return;
				6290	}
				6291	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				6292	return;
				6293	that->info()->being_analyzed = true;
				6294	that->Accept(this);
				6295	that->info()->being_analyzed = false;
				6296	that->info()->been_analyzed = true;
				6297	}
				6298
				6299
				6300	void Analysis::VisitEnd(EndNode* that) {
				6301	// nothing to do
				6302	}
				6303
				6304
				6305	void TextNode::CalculateOffsets() {
				6306	int element_count = elements()->length();
				6307	// Set up the offsets of the elements relative to the start. This is a fixed
				6308	// quantity since a TextNode can only contain fixed-width things.
				6309	int cp_offset = 0;
				6310	for (int i = 0; i < element_count; i++) {
				6311	TextElement& elm = elements()->at(i);
				6312	elm.set_cp_offset(cp_offset);
				6313	cp_offset += elm.length();
				6314	}
				6315	}
				6316
				6317
				6318	void Analysis::VisitText(TextNode* that) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6319	if (ignore_case()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6320	that->MakeCaseIndependent(isolate(), is_one_byte_);
				6321	}
				6322	EnsureAnalyzed(that->on_success());
				6323	if (!has_failed()) {
				6324	that->CalculateOffsets();
				6325	}
				6326	}
				6327
				6328
				6329	void Analysis::VisitAction(ActionNode* that) {
				6330	RegExpNode* target = that->on_success();
				6331	EnsureAnalyzed(target);
				6332	if (!has_failed()) {
				6333	// If the next node is interested in what it follows then this node
				6334	// has to be interested too so it can pass the information on.
				6335	that->info()->AddFromFollowing(target->info());
				6336	}
				6337	}
				6338
				6339
				6340	void Analysis::VisitChoice(ChoiceNode* that) {
				6341	NodeInfo* info = that->info();
				6342	for (int i = 0; i < that->alternatives()->length(); i++) {
				6343	RegExpNode* node = that->alternatives()->at(i).node();
				6344	EnsureAnalyzed(node);
				6345	if (has_failed()) return;
				6346	// Anything the following nodes need to know has to be known by
				6347	// this node also, so it can pass it on.
				6348	info->AddFromFollowing(node->info());
				6349	}
				6350	}
				6351
				6352
				6353	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				6354	NodeInfo* info = that->info();
				6355	for (int i = 0; i < that->alternatives()->length(); i++) {
				6356	RegExpNode* node = that->alternatives()->at(i).node();
				6357	if (node != that->loop_node()) {
				6358	EnsureAnalyzed(node);
				6359	if (has_failed()) return;
				6360	info->AddFromFollowing(node->info());
				6361	}
				6362	}
				6363	// Check the loop last since it may need the value of this node
				6364	// to get a correct result.
				6365	EnsureAnalyzed(that->loop_node());
				6366	if (!has_failed()) {
				6367	info->AddFromFollowing(that->loop_node()->info());
				6368	}
				6369	}
				6370
				6371
				6372	void Analysis::VisitBackReference(BackReferenceNode* that) {
				6373	EnsureAnalyzed(that->on_success());
				6374	}
				6375
				6376
				6377	void Analysis::VisitAssertion(AssertionNode* that) {
				6378	EnsureAnalyzed(that->on_success());
				6379	}
				6380
				6381
				6382	void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6383	BoyerMooreLookahead* bm,
				6384	bool not_at_start) {
				6385	// Working out the set of characters that a backreference can match is too
				6386	// hard, so we just say that any character can match.
				6387	bm->SetRest(offset);
				6388	SaveBMInfo(bm, not_at_start, offset);
				6389	}
				6390
				6391
				6392	STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
				6393	RegExpMacroAssembler::kTableSize);
				6394
				6395
				6396	void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6397	BoyerMooreLookahead* bm, bool not_at_start) {
				6398	ZoneList<GuardedAlternative>* alts = alternatives();
				6399	budget = (budget - 1) / alts->length();
				6400	for (int i = 0; i < alts->length(); i++) {
				6401	GuardedAlternative& alt = alts->at(i);
				6402	if (alt.guards() != NULL && alt.guards()->length() != 0) {
				6403	bm->SetRest(offset); // Give up trying to fill in info.
				6404	SaveBMInfo(bm, not_at_start, offset);
				6405	return;
				6406	}
				6407	alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
				6408	}
				6409	SaveBMInfo(bm, not_at_start, offset);
				6410	}
				6411
				6412
				6413	void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
				6414	BoyerMooreLookahead* bm, bool not_at_start) {
				6415	if (initial_offset >= bm->length()) return;
				6416	int offset = initial_offset;
				6417	int max_char = bm->max_char();
				6418	for (int i = 0; i < elements()->length(); i++) {
				6419	if (offset >= bm->length()) {
				6420	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6421	return;
				6422	}
				6423	TextElement text = elements()->at(i);
				6424	if (text.text_type() == TextElement::ATOM) {
				6425	RegExpAtom* atom = text.atom();
				6426	for (int j = 0; j < atom->length(); j++, offset++) {
				6427	if (offset >= bm->length()) {
				6428	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6429	return;
				6430	}
				6431	uc16 character = atom->data()[j];
				6432	if (bm->compiler()->ignore_case()) {
				6433	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				6434	int length = GetCaseIndependentLetters(
				6435	isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
				6436	chars);
				6437	for (int j = 0; j < length; j++) {
				6438	bm->Set(offset, chars[j]);
				6439	}
				6440	} else {
				6441	if (character <= max_char) bm->Set(offset, character);
				6442	}
				6443	}
				6444	} else {
				6445	DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
				6446	RegExpCharacterClass* char_class = text.char_class();
				6447	ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
				6448	if (char_class->is_negated()) {
				6449	bm->SetAll(offset);
				6450	} else {
				6451	for (int k = 0; k < ranges->length(); k++) {
				6452	CharacterRange& range = ranges->at(k);
				6453	if (range.from() > max_char) continue;
				6454	int to = Min(max_char, static_cast<int>(range.to()));
				6455	bm->SetInterval(offset, Interval(range.from(), to));
				6456	}
				6457	}
				6458	offset++;
				6459	}
				6460	}
				6461	if (offset >= bm->length()) {
				6462	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6463	return;
				6464	}
				6465	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
				6466	true); // Not at start after a text node.
				6467	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6468	}
				6469
				6470
				6471	// -------------------------------------------------------------------
				6472	// Dispatch table construction
				6473
				6474
				6475	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				6476	AddRange(CharacterRange::Everything());
				6477	}
				6478
				6479
				6480	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				6481	node->set_being_calculated(true);
				6482	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				6483	for (int i = 0; i < alternatives->length(); i++) {
				6484	set_choice_index(i);
				6485	alternatives->at(i).node()->Accept(this);
				6486	}
				6487	node->set_being_calculated(false);
				6488	}
				6489
				6490
				6491	class AddDispatchRange {
				6492	public:
				6493	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				6494	: constructor_(constructor) { }
				6495	void Call(uc32 from, DispatchTable::Entry entry);
				6496	private:
				6497	DispatchTableConstructor* constructor_;
				6498	};
				6499
				6500
				6501	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6502	constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6503	}
				6504
				6505
				6506	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				6507	if (node->being_calculated())
				6508	return;
				6509	DispatchTable* table = node->GetTable(ignore_case_);
				6510	AddDispatchRange adder(this);
				6511	table->ForEach(&adder);
				6512	}
				6513
				6514
				6515	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				6516	// TODO(160): Find the node that we refer back to and propagate its start
				6517	// set back to here. For now we just accept anything.
				6518	AddRange(CharacterRange::Everything());
				6519	}
				6520
				6521
				6522	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				6523	RegExpNode* target = that->on_success();
				6524	target->Accept(this);
				6525	}
				6526
				6527
				6528	static int CompareRangeByFrom(const CharacterRange* a,
				6529	const CharacterRange* b) {
				6530	return Compare<uc16>(a->from(), b->from());
				6531	}
				6532
				6533
				6534	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				6535	ranges->Sort(CompareRangeByFrom);
				6536	uc16 last = 0;
				6537	for (int i = 0; i < ranges->length(); i++) {
				6538	CharacterRange range = ranges->at(i);
				6539	if (last < range.from())
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6540	AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6541	if (range.to() >= last) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6542	if (range.to() == String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6543	return;
				6544	} else {
				6545	last = range.to() + 1;
				6546	}
				6547	}
				6548	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6549	AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6550	}
				6551
				6552
				6553	void DispatchTableConstructor::VisitText(TextNode* that) {
				6554	TextElement elm = that->elements()->at(0);
				6555	switch (elm.text_type()) {
				6556	case TextElement::ATOM: {
				6557	uc16 c = elm.atom()->data()[0];
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6558	AddRange(CharacterRange::Range(c, c));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6559	break;
				6560	}
				6561	case TextElement::CHAR_CLASS: {
				6562	RegExpCharacterClass* tree = elm.char_class();
				6563	ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
				6564	if (tree->is_negated()) {
				6565	AddInverse(ranges);
				6566	} else {
				6567	for (int i = 0; i < ranges->length(); i++)
				6568	AddRange(ranges->at(i));
				6569	}
				6570	break;
				6571	}
				6572	default: {
				6573	UNIMPLEMENTED();
				6574	}
				6575	}
				6576	}
				6577
				6578
				6579	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				6580	RegExpNode* target = that->on_success();
				6581	target->Accept(this);
				6582	}
				6583
				6584
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6585	RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
				6586	RegExpNode* on_success) {
				6587	// If the regexp matching starts within a surrogate pair, step back
				6588	// to the lead surrogate and start matching from there.
				6589	DCHECK(!compiler->read_backward());
				6590	Zone* zone = compiler->zone();
				6591	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				6592	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				6593	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				6594	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				6595
				6596	ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
				6597
				6598	int stack_register = compiler->UnicodeLookaroundStackRegister();
				6599	int position_register = compiler->UnicodeLookaroundPositionRegister();
				6600	RegExpNode* step_back = TextNode::CreateForCharacterRanges(
				6601	zone, lead_surrogates, true, on_success);
				6602	RegExpLookaround::Builder builder(true, step_back, stack_register,
				6603	position_register);
				6604	RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
				6605	zone, trail_surrogates, false, builder.on_match_success());
				6606
				6607	optional_step_back->AddAlternative(
				6608	GuardedAlternative(builder.ForMatch(match_trail)));
				6609	optional_step_back->AddAlternative(GuardedAlternative(on_success));
				6610
				6611	return optional_step_back;
				6612	}
				6613
				6614
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6615	RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6616	Isolate* isolate, Zone* zone, RegExpCompileData* data,
				6617	JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6618	Handle<String> sample_subject, bool is_one_byte) {
				6619	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				6620	return IrregexpRegExpTooBig(isolate);
				6621	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6622	bool ignore_case = flags & JSRegExp::kIgnoreCase;
				6623	bool is_sticky = flags & JSRegExp::kSticky;
				6624	bool is_global = flags & JSRegExp::kGlobal;
				6625	bool is_unicode = flags & JSRegExp::kUnicode;
				6626	RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6627	is_one_byte);
				6628
				6629	if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
				6630
				6631	// Sample some characters from the middle of the string.
				6632	static const int kSampleSize = 128;
				6633
				6634	sample_subject = String::Flatten(sample_subject);
				6635	int chars_sampled = 0;
				6636	int half_way = (sample_subject->length() - kSampleSize) / 2;
				6637	for (int i = Max(0, half_way);
				6638	i < sample_subject->length() && chars_sampled < kSampleSize;
				6639	i++, chars_sampled++) {
				6640	compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
				6641	}
				6642
				6643	// Wrap the body of the regexp in capture #0.
				6644	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				6645	0,
				6646	&compiler,
				6647	compiler.accept());
				6648	RegExpNode* node = captured_body;
				6649	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				6650	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				6651	int max_length = data->tree->max_match();
				6652	if (!is_start_anchored && !is_sticky) {
				6653	// Add a .*? at the beginning, outside the body capture, unless
				6654	// this expression is anchored at the beginning or sticky.
				6655	RegExpNode* loop_node = RegExpQuantifier::ToNode(
				6656	0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
				6657	&compiler, captured_body, data->contains_anchor);
				6658
				6659	if (data->contains_anchor) {
				6660	// Unroll loop once, to take care of the case that might start
				6661	// at the start of input.
				6662	ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
				6663	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				6664	first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
				6665	new (zone) RegExpCharacterClass('*'), false, loop_node)));
				6666	node = first_step_node;
				6667	} else {
				6668	node = loop_node;
				6669	}
				6670	}
				6671	if (is_one_byte) {
				6672	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6673	// Do it again to propagate the new nodes to places where they were not
				6674	// put because they had not been calculated yet.
				6675	if (node != NULL) {
				6676	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6677	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6678	} else if (compiler.unicode() && (is_global \|\| is_sticky)) {
				6679	node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6680	}
				6681
				6682	if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
				6683	data->node = node;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6684	Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6685	analysis.EnsureAnalyzed(node);
				6686	if (analysis.has_failed()) {
				6687	const char* error_message = analysis.error_message();
				6688	return CompilationResult(isolate, error_message);
				6689	}
				6690
				6691	// Create the correct assembler for the architecture.
				6692	#ifndef V8_INTERPRETED_REGEXP
				6693	// Native regexp implementation.
				6694
				6695	NativeRegExpMacroAssembler::Mode mode =
				6696	is_one_byte ? NativeRegExpMacroAssembler::LATIN1
				6697	: NativeRegExpMacroAssembler::UC16;
				6698
				6699	#if V8_TARGET_ARCH_IA32
				6700	RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
				6701	(data->capture_count + 1) * 2);
				6702	#elif V8_TARGET_ARCH_X64
				6703	RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
				6704	(data->capture_count + 1) * 2);
				6705	#elif V8_TARGET_ARCH_ARM
				6706	RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
				6707	(data->capture_count + 1) * 2);
				6708	#elif V8_TARGET_ARCH_ARM64
				6709	RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
				6710	(data->capture_count + 1) * 2);
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	6711	#elif V8_TARGET_ARCH_S390
				6712	RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
				6713	(data->capture_count + 1) * 2);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6714	#elif V8_TARGET_ARCH_PPC
				6715	RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
				6716	(data->capture_count + 1) * 2);
				6717	#elif V8_TARGET_ARCH_MIPS
				6718	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6719	(data->capture_count + 1) * 2);
				6720	#elif V8_TARGET_ARCH_MIPS64
				6721	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6722	(data->capture_count + 1) * 2);
				6723	#elif V8_TARGET_ARCH_X87
				6724	RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
				6725	(data->capture_count + 1) * 2);
				6726	#else
				6727	#error "Unsupported architecture"
				6728	#endif
				6729
				6730	#else // V8_INTERPRETED_REGEXP
				6731	// Interpreted regexp implementation.
				6732	EmbeddedVector<byte, 1024> codes;
				6733	RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
				6734	#endif // V8_INTERPRETED_REGEXP
				6735
				6736	macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
				6737
				6738	// Inserted here, instead of in Assembler, because it depends on information
				6739	// in the AST that isn't replicated in the Node structure.
				6740	static const int kMaxBacksearchLimit = 1024;
				6741	if (is_end_anchored &&
				6742	!is_start_anchored &&
				6743	max_length < kMaxBacksearchLimit) {
				6744	macro_assembler.SetCurrentPositionFromEnd(max_length);
				6745	}
				6746
				6747	if (is_global) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6748	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
				6749	if (data->tree->min_match() > 0) {
				6750	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
				6751	} else if (is_unicode) {
				6752	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
				6753	}
				6754	macro_assembler.set_global_mode(mode);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6755	}
				6756
				6757	return compiler.Assemble(&macro_assembler,
				6758	node,
				6759	data->capture_count,
				6760	pattern);
				6761	}
				6762
				6763
				6764	bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
				6765	Heap* heap = pattern->GetHeap();
				6766	bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
				6767	if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame^]	6768	heap->memory_allocator()->SizeExecutable() >
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6769	RegExpImpl::kRegExpExecutableMemoryLimit) {
				6770	too_much = true;
				6771	}
				6772	return too_much;
				6773	}
				6774
				6775
				6776	Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
				6777	Object* key_pattern,
				6778	FixedArray** last_match_cache,
				6779	ResultsCacheType type) {
				6780	FixedArray* cache;
				6781	if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
				6782	if (type == STRING_SPLIT_SUBSTRINGS) {
				6783	DCHECK(key_pattern->IsString());
				6784	if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
				6785	cache = heap->string_split_cache();
				6786	} else {
				6787	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6788	DCHECK(key_pattern->IsFixedArray());
				6789	cache = heap->regexp_multiple_cache();
				6790	}
				6791
				6792	uint32_t hash = key_string->Hash();
				6793	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6794	~(kArrayEntriesPerCacheEntry - 1));
				6795	if (cache->get(index + kStringOffset) != key_string \|\|
				6796	cache->get(index + kPatternOffset) != key_pattern) {
				6797	index =
				6798	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6799	if (cache->get(index + kStringOffset) != key_string \|\|
				6800	cache->get(index + kPatternOffset) != key_pattern) {
				6801	return Smi::FromInt(0);
				6802	}
				6803	}
				6804
				6805	*last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
				6806	return cache->get(index + kArrayOffset);
				6807	}
				6808
				6809
				6810	void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
				6811	Handle<Object> key_pattern,
				6812	Handle<FixedArray> value_array,
				6813	Handle<FixedArray> last_match_cache,
				6814	ResultsCacheType type) {
				6815	Factory* factory = isolate->factory();
				6816	Handle<FixedArray> cache;
				6817	if (!key_string->IsInternalizedString()) return;
				6818	if (type == STRING_SPLIT_SUBSTRINGS) {
				6819	DCHECK(key_pattern->IsString());
				6820	if (!key_pattern->IsInternalizedString()) return;
				6821	cache = factory->string_split_cache();
				6822	} else {
				6823	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6824	DCHECK(key_pattern->IsFixedArray());
				6825	cache = factory->regexp_multiple_cache();
				6826	}
				6827
				6828	uint32_t hash = key_string->Hash();
				6829	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6830	~(kArrayEntriesPerCacheEntry - 1));
				6831	if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
				6832	cache->set(index + kStringOffset, *key_string);
				6833	cache->set(index + kPatternOffset, *key_pattern);
				6834	cache->set(index + kArrayOffset, *value_array);
				6835	cache->set(index + kLastMatchOffset, *last_match_cache);
				6836	} else {
				6837	uint32_t index2 =
				6838	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6839	if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
				6840	cache->set(index2 + kStringOffset, *key_string);
				6841	cache->set(index2 + kPatternOffset, *key_pattern);
				6842	cache->set(index2 + kArrayOffset, *value_array);
				6843	cache->set(index2 + kLastMatchOffset, *last_match_cache);
				6844	} else {
				6845	cache->set(index2 + kStringOffset, Smi::FromInt(0));
				6846	cache->set(index2 + kPatternOffset, Smi::FromInt(0));
				6847	cache->set(index2 + kArrayOffset, Smi::FromInt(0));
				6848	cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
				6849	cache->set(index + kStringOffset, *key_string);
				6850	cache->set(index + kPatternOffset, *key_pattern);
				6851	cache->set(index + kArrayOffset, *value_array);
				6852	cache->set(index + kLastMatchOffset, *last_match_cache);
				6853	}
				6854	}
				6855	// If the array is a reasonably short list of substrings, convert it into a
				6856	// list of internalized strings.
				6857	if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
				6858	for (int i = 0; i < value_array->length(); i++) {
				6859	Handle<String> str(String::cast(value_array->get(i)), isolate);
				6860	Handle<String> internalized_str = factory->InternalizeString(str);
				6861	value_array->set(i, *internalized_str);
				6862	}
				6863	}
				6864	// Convert backing store to a copy-on-write array.
				6865	value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
				6866	}
				6867
				6868
				6869	void RegExpResultsCache::Clear(FixedArray* cache) {
				6870	for (int i = 0; i < kRegExpResultsCacheSize; i++) {
				6871	cache->set(i, Smi::FromInt(0));
				6872	}
				6873	}
				6874
				6875	} // namespace internal
				6876	} // namespace v8