Blame - src/regexp/jsregexp.cc - fp2-dev/platform/external/v8

blob: c3b670b51825cbfa3baf3b989525660a150b9920 [file] [log] [blame]

Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1	// Copyright 2012 the V8 project authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "src/regexp/jsregexp.h"
				6
				7	#include "src/ast/ast.h"
				8	#include "src/base/platform/platform.h"
				9	#include "src/compilation-cache.h"
				10	#include "src/compiler.h"
				11	#include "src/execution.h"
				12	#include "src/factory.h"
				13	#include "src/isolate-inl.h"
				14	#include "src/messages.h"
				15	#include "src/ostreams.h"
				16	#include "src/regexp/interpreter-irregexp.h"
				17	#include "src/regexp/jsregexp-inl.h"
				18	#include "src/regexp/regexp-macro-assembler.h"
				19	#include "src/regexp/regexp-macro-assembler-irregexp.h"
				20	#include "src/regexp/regexp-macro-assembler-tracer.h"
				21	#include "src/regexp/regexp-parser.h"
				22	#include "src/regexp/regexp-stack.h"
				23	#include "src/runtime/runtime.h"
				24	#include "src/splay-tree-inl.h"
				25	#include "src/string-search.h"
				26	#include "src/unicode-decoder.h"
				27
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	28	#ifdef V8_I18N_SUPPORT
				29	#include "unicode/uset.h"
				30	#include "unicode/utypes.h"
				31	#endif // V8_I18N_SUPPORT
				32
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	33	#ifndef V8_INTERPRETED_REGEXP
				34	#if V8_TARGET_ARCH_IA32
				35	#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
				36	#elif V8_TARGET_ARCH_X64
				37	#include "src/regexp/x64/regexp-macro-assembler-x64.h"
				38	#elif V8_TARGET_ARCH_ARM64
				39	#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
				40	#elif V8_TARGET_ARCH_ARM
				41	#include "src/regexp/arm/regexp-macro-assembler-arm.h"
				42	#elif V8_TARGET_ARCH_PPC
				43	#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	44	#elif V8_TARGET_ARCH_S390
				45	#include "src/regexp/s390/regexp-macro-assembler-s390.h"
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	46	#elif V8_TARGET_ARCH_MIPS
				47	#include "src/regexp/mips/regexp-macro-assembler-mips.h"
				48	#elif V8_TARGET_ARCH_MIPS64
				49	#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
				50	#elif V8_TARGET_ARCH_X87
				51	#include "src/regexp/x87/regexp-macro-assembler-x87.h"
				52	#else
				53	#error Unsupported target architecture.
				54	#endif
				55	#endif
				56
				57
				58	namespace v8 {
				59	namespace internal {
				60
				61	MUST_USE_RESULT
				62	static inline MaybeHandle<Object> ThrowRegExpException(
				63	Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
				64	Isolate* isolate = re->GetIsolate();
				65	THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
				66	pattern, error_text),
				67	Object);
				68	}
				69
				70
				71	inline void ThrowRegExpException(Handle<JSRegExp> re,
				72	Handle<String> error_text) {
				73	USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
				74	}
				75
				76
				77	ContainedInLattice AddRange(ContainedInLattice containment,
				78	const int* ranges,
				79	int ranges_length,
				80	Interval new_range) {
				81	DCHECK((ranges_length & 1) == 1);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	82	DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	83	if (containment == kLatticeUnknown) return containment;
				84	bool inside = false;
				85	int last = 0;
				86	for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
				87	// Consider the range from last to ranges[i].
				88	// We haven't got to the new range yet.
				89	if (ranges[i] <= new_range.from()) continue;
				90	// New range is wholly inside last-ranges[i]. Note that new_range.to() is
				91	// inclusive, but the values in ranges are not.
				92	if (last <= new_range.from() && new_range.to() < ranges[i]) {
				93	return Combine(containment, inside ? kLatticeIn : kLatticeOut);
				94	}
				95	return kLatticeUnknown;
				96	}
				97	return containment;
				98	}
				99
				100
				101	// More makes code generation slower, less makes V8 benchmark score lower.
				102	const int kMaxLookaheadForBoyerMoore = 8;
				103	// In a 3-character pattern you can maximally step forwards 3 characters
				104	// at a time, which is not always enough to pay for the extra logic.
				105	const int kPatternTooShortForBoyerMoore = 2;
				106
				107
				108	// Identifies the sort of regexps where the regexp engine is faster
				109	// than the code used for atom matches.
				110	static bool HasFewDifferentCharacters(Handle<String> pattern) {
				111	int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
				112	if (length <= kPatternTooShortForBoyerMoore) return false;
				113	const int kMod = 128;
				114	bool character_found[kMod];
				115	int different = 0;
				116	memset(&character_found[0], 0, sizeof(character_found));
				117	for (int i = 0; i < length; i++) {
				118	int ch = (pattern->Get(i) & (kMod - 1));
				119	if (!character_found[ch]) {
				120	character_found[ch] = true;
				121	different++;
				122	// We declare a regexp low-alphabet if it has at least 3 times as many
				123	// characters as it has different characters.
				124	if (different * 3 > length) return false;
				125	}
				126	}
				127	return true;
				128	}
				129
				130
				131	// Generic RegExp methods. Dispatches to implementation specific methods.
				132
				133
				134	MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				135	Handle<String> pattern,
				136	JSRegExp::Flags flags) {
				137	Isolate* isolate = re->GetIsolate();
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	138	Zone zone(isolate->allocator());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	139	CompilationCache* compilation_cache = isolate->compilation_cache();
				140	MaybeHandle<FixedArray> maybe_cached =
				141	compilation_cache->LookupRegExp(pattern, flags);
				142	Handle<FixedArray> cached;
				143	bool in_cache = maybe_cached.ToHandle(&cached);
				144	LOG(isolate, RegExpCompileEvent(re, in_cache));
				145
				146	Handle<Object> result;
				147	if (in_cache) {
				148	re->set_data(*cached);
				149	return re;
				150	}
				151	pattern = String::Flatten(pattern);
				152	PostponeInterruptsScope postpone(isolate);
				153	RegExpCompileData parse_result;
				154	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	155	if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
				156	&parse_result)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	157	// Throw an exception if we fail to parse the pattern.
				158	return ThrowRegExpException(re, pattern, parse_result.error);
				159	}
				160
				161	bool has_been_compiled = false;
				162
				163	if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
				164	!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
				165	// Parse-tree is a single atom that is equal to the pattern.
				166	AtomCompile(re, pattern, flags, pattern);
				167	has_been_compiled = true;
				168	} else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
				169	!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
				170	RegExpAtom* atom = parse_result.tree->AsAtom();
				171	Vector<const uc16> atom_pattern = atom->data();
				172	Handle<String> atom_string;
				173	ASSIGN_RETURN_ON_EXCEPTION(
				174	isolate, atom_string,
				175	isolate->factory()->NewStringFromTwoByte(atom_pattern),
				176	Object);
				177	if (!HasFewDifferentCharacters(atom_string)) {
				178	AtomCompile(re, pattern, flags, atom_string);
				179	has_been_compiled = true;
				180	}
				181	}
				182	if (!has_been_compiled) {
				183	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
				184	}
				185	DCHECK(re->data()->IsFixedArray());
				186	// Compilation succeeded so the data is set on the regexp
				187	// and we can store it in the cache.
				188	Handle<FixedArray> data(FixedArray::cast(re->data()));
				189	compilation_cache->PutRegExp(pattern, flags, data);
				190
				191	return re;
				192	}
				193
				194
				195	MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				196	Handle<String> subject,
				197	int index,
				198	Handle<JSArray> last_match_info) {
				199	switch (regexp->TypeTag()) {
				200	case JSRegExp::ATOM:
				201	return AtomExec(regexp, subject, index, last_match_info);
				202	case JSRegExp::IRREGEXP: {
				203	return IrregexpExec(regexp, subject, index, last_match_info);
				204	}
				205	default:
				206	UNREACHABLE();
				207	return MaybeHandle<Object>();
				208	}
				209	}
				210
				211
				212	// RegExp Atom implementation: Simple string search using indexOf.
				213
				214
				215	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				216	Handle<String> pattern,
				217	JSRegExp::Flags flags,
				218	Handle<String> match_pattern) {
				219	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				220	JSRegExp::ATOM,
				221	pattern,
				222	flags,
				223	match_pattern);
				224	}
				225
				226
				227	static void SetAtomLastCapture(FixedArray* array,
				228	String* subject,
				229	int from,
				230	int to) {
				231	SealHandleScope shs(array->GetIsolate());
				232	RegExpImpl::SetLastCaptureCount(array, 2);
				233	RegExpImpl::SetLastSubject(array, subject);
				234	RegExpImpl::SetLastInput(array, subject);
				235	RegExpImpl::SetCapture(array, 0, from);
				236	RegExpImpl::SetCapture(array, 1, to);
				237	}
				238
				239
				240	int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
				241	Handle<String> subject,
				242	int index,
				243	int32_t* output,
				244	int output_size) {
				245	Isolate* isolate = regexp->GetIsolate();
				246
				247	DCHECK(0 <= index);
				248	DCHECK(index <= subject->length());
				249
				250	subject = String::Flatten(subject);
				251	DisallowHeapAllocation no_gc; // ensure vectors stay valid
				252
				253	String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
				254	int needle_len = needle->length();
				255	DCHECK(needle->IsFlat());
				256	DCHECK_LT(0, needle_len);
				257
				258	if (index + needle_len > subject->length()) {
				259	return RegExpImpl::RE_FAILURE;
				260	}
				261
				262	for (int i = 0; i < output_size; i += 2) {
				263	String::FlatContent needle_content = needle->GetFlatContent();
				264	String::FlatContent subject_content = subject->GetFlatContent();
				265	DCHECK(needle_content.IsFlat());
				266	DCHECK(subject_content.IsFlat());
				267	// dispatch on type of strings
				268	index =
				269	(needle_content.IsOneByte()
				270	? (subject_content.IsOneByte()
				271	? SearchString(isolate, subject_content.ToOneByteVector(),
				272	needle_content.ToOneByteVector(), index)
				273	: SearchString(isolate, subject_content.ToUC16Vector(),
				274	needle_content.ToOneByteVector(), index))
				275	: (subject_content.IsOneByte()
				276	? SearchString(isolate, subject_content.ToOneByteVector(),
				277	needle_content.ToUC16Vector(), index)
				278	: SearchString(isolate, subject_content.ToUC16Vector(),
				279	needle_content.ToUC16Vector(), index)));
				280	if (index == -1) {
				281	return i / 2; // Return number of matches.
				282	} else {
				283	output[i] = index;
				284	output[i+1] = index + needle_len;
				285	index += needle_len;
				286	}
				287	}
				288	return output_size / 2;
				289	}
				290
				291
				292	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				293	Handle<String> subject,
				294	int index,
				295	Handle<JSArray> last_match_info) {
				296	Isolate* isolate = re->GetIsolate();
				297
				298	static const int kNumRegisters = 2;
				299	STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
				300	int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
				301
				302	int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
				303
				304	if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
				305
				306	DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
				307	SealHandleScope shs(isolate);
				308	FixedArray* array = FixedArray::cast(last_match_info->elements());
				309	SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
				310	return last_match_info;
				311	}
				312
				313
				314	// Irregexp implementation.
				315
				316	// Ensures that the regexp object contains a compiled version of the
				317	// source for either one-byte or two-byte subject strings.
				318	// If the compiled version doesn't already exist, it is compiled
				319	// from the source pattern.
				320	// If compilation fails, an exception is thrown and this function
				321	// returns false.
				322	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
				323	Handle<String> sample_subject,
				324	bool is_one_byte) {
				325	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
				326	#ifdef V8_INTERPRETED_REGEXP
				327	if (compiled_code->IsByteArray()) return true;
				328	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				329	if (compiled_code->IsCode()) return true;
				330	#endif
				331	// We could potentially have marked this as flushable, but have kept
				332	// a saved version if we did not flush it yet.
				333	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				334	if (saved_code->IsCode()) {
				335	// Reinstate the code in the original place.
				336	re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
				337	DCHECK(compiled_code->IsSmi());
				338	return true;
				339	}
				340	return CompileIrregexp(re, sample_subject, is_one_byte);
				341	}
				342
				343
				344	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
				345	Handle<String> sample_subject,
				346	bool is_one_byte) {
				347	// Compile the RegExp.
				348	Isolate* isolate = re->GetIsolate();
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	349	Zone zone(isolate->allocator());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	350	PostponeInterruptsScope postpone(isolate);
				351	// If we had a compilation error the last time this is saved at the
				352	// saved code index.
				353	Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
				354	// When arriving here entry can only be a smi, either representing an
				355	// uncompiled regexp, a previous compilation error, or code that has
				356	// been flushed.
				357	DCHECK(entry->IsSmi());
				358	int entry_value = Smi::cast(entry)->value();
				359	DCHECK(entry_value == JSRegExp::kUninitializedValue \|\|
				360	entry_value == JSRegExp::kCompilationErrorValue \|\|
				361	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				362
				363	if (entry_value == JSRegExp::kCompilationErrorValue) {
				364	// A previous compilation failed and threw an error which we store in
				365	// the saved code index (we store the error message, not the actual
				366	// error). Recreate the error object and throw it.
				367	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
				368	DCHECK(error_string->IsString());
				369	Handle<String> error_message(String::cast(error_string));
				370	ThrowRegExpException(re, error_message);
				371	return false;
				372	}
				373
				374	JSRegExp::Flags flags = re->GetFlags();
				375
				376	Handle<String> pattern(re->Pattern());
				377	pattern = String::Flatten(pattern);
				378	RegExpCompileData compile_data;
				379	FlatStringReader reader(isolate, pattern);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	380	if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
				381	&compile_data)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	382	// Throw an exception if we fail to parse the pattern.
				383	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				384	USE(ThrowRegExpException(re, pattern, compile_data.error));
				385	return false;
				386	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	387	RegExpEngine::CompilationResult result =
				388	RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
				389	sample_subject, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	390	if (result.error_message != NULL) {
				391	// Unable to compile regexp.
				392	Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
				393	CStrVector(result.error_message)).ToHandleChecked();
				394	ThrowRegExpException(re, error_message);
				395	return false;
				396	}
				397
				398	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				399	data->set(JSRegExp::code_index(is_one_byte), result.code);
Ben Murdoch	61f157c	2016-09-16 13:49:30 +0100	[diff] [blame]	400	SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	401	int register_max = IrregexpMaxRegisterCount(*data);
				402	if (result.num_registers > register_max) {
				403	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				404	}
				405
				406	return true;
				407	}
				408
				409
				410	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				411	return Smi::cast(
				412	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				413	}
				414
				415
				416	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				417	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				418	}
				419
Ben Murdoch	61f157c	2016-09-16 13:49:30 +0100	[diff] [blame]	420	void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
				421	Handle<FixedArray> value) {
				422	if (value.is_null()) {
				423	re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
				424	} else {
				425	re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
				426	}
				427	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	428
				429	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				430	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				431	}
				432
				433
				434	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				435	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				436	}
				437
				438
				439	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
				440	return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
				441	}
				442
				443
				444	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
				445	return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
				446	}
				447
				448
				449	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				450	Handle<String> pattern,
				451	JSRegExp::Flags flags,
				452	int capture_count) {
				453	// Initialize compiled code entries to null.
				454	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				455	JSRegExp::IRREGEXP,
				456	pattern,
				457	flags,
				458	capture_count);
				459	}
				460
				461
				462	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				463	Handle<String> subject) {
				464	subject = String::Flatten(subject);
				465
				466	// Check representation of the underlying storage.
				467	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				468	if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
				469
				470	#ifdef V8_INTERPRETED_REGEXP
				471	// Byte-code regexp needs space allocated for all its registers.
				472	// The result captures are copied to the start of the registers array
				473	// if the match succeeds. This way those registers are not clobbered
				474	// when we set the last match info from last successful match.
				475	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
				476	(IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				477	#else // V8_INTERPRETED_REGEXP
				478	// Native regexp only needs room to output captures. Registers are handled
				479	// internally.
				480	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				481	#endif // V8_INTERPRETED_REGEXP
				482	}
				483
				484
				485	int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
				486	Handle<String> subject,
				487	int index,
				488	int32_t* output,
				489	int output_size) {
				490	Isolate* isolate = regexp->GetIsolate();
				491
				492	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
				493
				494	DCHECK(index >= 0);
				495	DCHECK(index <= subject->length());
				496	DCHECK(subject->IsFlat());
				497
				498	bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
				499
				500	#ifndef V8_INTERPRETED_REGEXP
				501	DCHECK(output_size >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
				502	do {
				503	EnsureCompiledIrregexp(regexp, subject, is_one_byte);
				504	Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
				505	// The stack is used to allocate registers for the compiled regexp code.
				506	// This means that in case of failure, the output registers array is left
				507	// untouched and contains the capture results from the previous successful
				508	// match. We can use that to set the last match info lazily.
				509	NativeRegExpMacroAssembler::Result res =
				510	NativeRegExpMacroAssembler::Match(code,
				511	subject,
				512	output,
				513	output_size,
				514	index,
				515	isolate);
				516	if (res != NativeRegExpMacroAssembler::RETRY) {
				517	DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
				518	isolate->has_pending_exception());
				519	STATIC_ASSERT(
				520	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				521	STATIC_ASSERT(
				522	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				523	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				524	== RE_EXCEPTION);
				525	return static_cast<IrregexpResult>(res);
				526	}
				527	// If result is RETRY, the string has changed representation, and we
				528	// must restart from scratch.
				529	// In this case, it means we must make sure we are prepared to handle
				530	// the, potentially, different subject (the string can switch between
				531	// being internal and external, and even between being Latin1 and UC16,
				532	// but the characters are always the same).
				533	IrregexpPrepare(regexp, subject);
				534	is_one_byte = subject->IsOneByteRepresentationUnderneath();
				535	} while (true);
				536	UNREACHABLE();
				537	return RE_EXCEPTION;
				538	#else // V8_INTERPRETED_REGEXP
				539
				540	DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
				541	// We must have done EnsureCompiledIrregexp, so we can get the number of
				542	// registers.
				543	int number_of_capture_registers =
				544	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				545	int32_t* raw_output = &output[number_of_capture_registers];
				546	// We do not touch the actual capture result registers until we know there
				547	// has been a match so that we can use those capture results to set the
				548	// last match info.
				549	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				550	raw_output[i] = -1;
				551	}
				552	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
				553	isolate);
				554
				555	IrregexpResult result = IrregexpInterpreter::Match(isolate,
				556	byte_codes,
				557	subject,
				558	raw_output,
				559	index);
				560	if (result == RE_SUCCESS) {
				561	// Copy capture results to the start of the registers array.
				562	MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
				563	}
				564	if (result == RE_EXCEPTION) {
				565	DCHECK(!isolate->has_pending_exception());
				566	isolate->StackOverflow();
				567	}
				568	return result;
				569	#endif // V8_INTERPRETED_REGEXP
				570	}
				571
				572
				573	MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
				574	Handle<String> subject,
				575	int previous_index,
				576	Handle<JSArray> last_match_info) {
				577	Isolate* isolate = regexp->GetIsolate();
				578	DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
				579
				580	// Prepare space for the return values.
				581	#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
				582	if (FLAG_trace_regexp_bytecodes) {
				583	String* pattern = regexp->Pattern();
				584	PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
				585	PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
				586	}
				587	#endif
				588	int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
				589	if (required_registers < 0) {
				590	// Compiling failed with an exception.
				591	DCHECK(isolate->has_pending_exception());
				592	return MaybeHandle<Object>();
				593	}
				594
				595	int32_t* output_registers = NULL;
				596	if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				597	output_registers = NewArray<int32_t>(required_registers);
				598	}
				599	base::SmartArrayPointer<int32_t> auto_release(output_registers);
				600	if (output_registers == NULL) {
				601	output_registers = isolate->jsregexp_static_offsets_vector();
				602	}
				603
				604	int res = RegExpImpl::IrregexpExecRaw(
				605	regexp, subject, previous_index, output_registers, required_registers);
				606	if (res == RE_SUCCESS) {
				607	int capture_count =
				608	IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
				609	return SetLastMatchInfo(
				610	last_match_info, subject, capture_count, output_registers);
				611	}
				612	if (res == RE_EXCEPTION) {
				613	DCHECK(isolate->has_pending_exception());
				614	return MaybeHandle<Object>();
				615	}
				616	DCHECK(res == RE_FAILURE);
				617	return isolate->factory()->null_value();
				618	}
				619
				620
				621	static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
				622	if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
				623	JSArray::SetLength(array, minimum_size);
				624	}
				625	}
				626
				627
				628	Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
				629	Handle<String> subject,
				630	int capture_count,
				631	int32_t* match) {
				632	DCHECK(last_match_info->HasFastObjectElements());
				633	int capture_register_count = (capture_count + 1) * 2;
				634	EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
				635	DisallowHeapAllocation no_allocation;
				636	FixedArray* array = FixedArray::cast(last_match_info->elements());
				637	if (match != NULL) {
				638	for (int i = 0; i < capture_register_count; i += 2) {
				639	SetCapture(array, i, match[i]);
				640	SetCapture(array, i + 1, match[i + 1]);
				641	}
				642	}
				643	SetLastCaptureCount(array, capture_register_count);
				644	SetLastSubject(array, *subject);
				645	SetLastInput(array, *subject);
				646	return last_match_info;
				647	}
				648
				649
				650	RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
				651	Handle<String> subject,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	652	Isolate* isolate)
				653	: register_array_(NULL),
				654	register_array_size_(0),
				655	regexp_(regexp),
				656	subject_(subject) {
				657	#ifdef V8_INTERPRETED_REGEXP
				658	bool interpreted = true;
				659	#else
				660	bool interpreted = false;
				661	#endif // V8_INTERPRETED_REGEXP
				662
				663	if (regexp_->TypeTag() == JSRegExp::ATOM) {
				664	static const int kAtomRegistersPerMatch = 2;
				665	registers_per_match_ = kAtomRegistersPerMatch;
				666	// There is no distinction between interpreted and native for atom regexps.
				667	interpreted = false;
				668	} else {
				669	registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
				670	if (registers_per_match_ < 0) {
				671	num_matches_ = -1; // Signal exception.
				672	return;
				673	}
				674	}
				675
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	676	DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
				677	if (!interpreted) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	678	register_array_size_ =
				679	Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
				680	max_matches_ = register_array_size_ / registers_per_match_;
				681	} else {
				682	// Global loop in interpreted regexp is not implemented. We choose
				683	// the size of the offsets vector so that it can only store one match.
				684	register_array_size_ = registers_per_match_;
				685	max_matches_ = 1;
				686	}
				687
				688	if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
				689	register_array_ = NewArray<int32_t>(register_array_size_);
				690	} else {
				691	register_array_ = isolate->jsregexp_static_offsets_vector();
				692	}
				693
				694	// Set state so that fetching the results the first time triggers a call
				695	// to the compiled regexp.
				696	current_match_index_ = max_matches_ - 1;
				697	num_matches_ = max_matches_;
				698	DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
				699	DCHECK_GE(register_array_size_, registers_per_match_);
				700	int32_t* last_match =
				701	&register_array_[current_match_index_ * registers_per_match_];
				702	last_match[0] = -1;
				703	last_match[1] = 0;
				704	}
				705
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	706	int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
				707	if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
				708	last_index + 1 < subject_->length() &&
				709	unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
				710	unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
				711	// Advance over the surrogate pair.
				712	return last_index + 2;
				713	}
				714	return last_index + 1;
				715	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	716
				717	// -------------------------------------------------------------------
				718	// Implementation of the Irregexp regular expression engine.
				719	//
				720	// The Irregexp regular expression engine is intended to be a complete
				721	// implementation of ECMAScript regular expressions. It generates either
				722	// bytecodes or native code.
				723
				724	// The Irregexp regexp engine is structured in three steps.
				725	// 1) The parser generates an abstract syntax tree. See ast.cc.
				726	// 2) From the AST a node network is created. The nodes are all
				727	// subclasses of RegExpNode. The nodes represent states when
				728	// executing a regular expression. Several optimizations are
				729	// performed on the node network.
				730	// 3) From the nodes we generate either byte codes or native code
				731	// that can actually execute the regular expression (perform
				732	// the search). The code generation step is described in more
				733	// detail below.
				734
				735	// Code generation.
				736	//
				737	// The nodes are divided into four main categories.
				738	// * Choice nodes
				739	// These represent places where the regular expression can
				740	// match in more than one way. For example on entry to an
				741	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				742	// * Action nodes
				743	// These represent places where some action should be
				744	// performed. Examples include recording the current position
				745	// in the input string to a register (in order to implement
				746	// captures) or other actions on register for example in order
				747	// to implement the counters needed for {} repetitions.
				748	// * Matching nodes
				749	// These attempt to match some element part of the input string.
				750	// Examples of elements include character classes, plain strings
				751	// or back references.
				752	// * End nodes
				753	// These are used to implement the actions required on finding
				754	// a successful match or failing to find a match.
				755	//
				756	// The code generated (whether as byte codes or native code) maintains
				757	// some state as it runs. This consists of the following elements:
				758	//
				759	// * The capture registers. Used for string captures.
				760	// * Other registers. Used for counters etc.
				761	// * The current position.
				762	// * The stack of backtracking information. Used when a matching node
				763	// fails to find a match and needs to try an alternative.
				764	//
				765	// Conceptual regular expression execution model:
				766	//
				767	// There is a simple conceptual model of regular expression execution
				768	// which will be presented first. The actual code generated is a more
				769	// efficient simulation of the simple conceptual model:
				770	//
				771	// * Choice nodes are implemented as follows:
				772	// For each choice except the last {
				773	// push current position
				774	// push backtrack code location
				775	// <generate code to test for choice>
				776	// backtrack code location:
				777	// pop current position
				778	// }
				779	// <generate code to test for last choice>
				780	//
				781	// * Actions nodes are generated as follows
				782	// <push affected registers on backtrack stack>
				783	// <generate code to perform action>
				784	// push backtrack code location
				785	// <generate code to test for following nodes>
				786	// backtrack code location:
				787	// <pop affected registers to restore their state>
				788	// <pop backtrack location from stack and go to it>
				789	//
				790	// * Matching nodes are generated as follows:
				791	// if input string matches at current position
				792	// update current position
				793	// <generate code to test for following nodes>
				794	// else
				795	// <pop backtrack location from stack and go to it>
				796	//
				797	// Thus it can be seen that the current position is saved and restored
				798	// by the choice nodes, whereas the registers are saved and restored by
				799	// by the action nodes that manipulate them.
				800	//
				801	// The other interesting aspect of this model is that nodes are generated
				802	// at the point where they are needed by a recursive call to Emit(). If
				803	// the node has already been code generated then the Emit() call will
				804	// generate a jump to the previously generated code instead. In order to
				805	// limit recursion it is possible for the Emit() function to put the node
				806	// on a work list for later generation and instead generate a jump. The
				807	// destination of the jump is resolved later when the code is generated.
				808	//
				809	// Actual regular expression code generation.
				810	//
				811	// Code generation is actually more complicated than the above. In order
				812	// to improve the efficiency of the generated code some optimizations are
				813	// performed
				814	//
				815	// * Choice nodes have 1-character lookahead.
				816	// A choice node looks at the following character and eliminates some of
				817	// the choices immediately based on that character. This is not yet
				818	// implemented.
				819	// * Simple greedy loops store reduced backtracking information.
				820	// A quantifier like /.*foo/m will greedily match the whole input. It will
				821	// then need to backtrack to a point where it can match "foo". The naive
				822	// implementation of this would push each character position onto the
				823	// backtracking stack, then pop them off one by one. This would use space
				824	// proportional to the length of the input string. However since the "."
				825	// can only match in one way and always has a constant length (in this case
				826	// of 1) it suffices to store the current position on the top of the stack
				827	// once. Matching now becomes merely incrementing the current position and
				828	// backtracking becomes decrementing the current position and checking the
				829	// result against the stored current position. This is faster and saves
				830	// space.
				831	// * The current state is virtualized.
				832	// This is used to defer expensive operations until it is clear that they
				833	// are needed and to generate code for a node more than once, allowing
				834	// specialized an efficient versions of the code to be created. This is
				835	// explained in the section below.
				836	//
				837	// Execution state virtualization.
				838	//
				839	// Instead of emitting code, nodes that manipulate the state can record their
				840	// manipulation in an object called the Trace. The Trace object can record a
				841	// current position offset, an optional backtrack code location on the top of
				842	// the virtualized backtrack stack and some register changes. When a node is
				843	// to be emitted it can flush the Trace or update it. Flushing the Trace
				844	// will emit code to bring the actual state into line with the virtual state.
				845	// Avoiding flushing the state can postpone some work (e.g. updates of capture
				846	// registers). Postponing work can save time when executing the regular
				847	// expression since it may be found that the work never has to be done as a
				848	// failure to match can occur. In addition it is much faster to jump to a
				849	// known backtrack code location than it is to pop an unknown backtrack
				850	// location from the stack and jump there.
				851	//
				852	// The virtual state found in the Trace affects code generation. For example
				853	// the virtual state contains the difference between the actual current
				854	// position and the virtual current position, and matching code needs to use
				855	// this offset to attempt a match in the correct location of the input
				856	// string. Therefore code generated for a non-trivial trace is specialized
				857	// to that trace. The code generator therefore has the ability to generate
				858	// code for each node several times. In order to limit the size of the
				859	// generated code there is an arbitrary limit on how many specialized sets of
				860	// code may be generated for a given node. If the limit is reached, the
				861	// trace is flushed and a generic version of the code for a node is emitted.
				862	// This is subsequently used for that node. The code emitted for non-generic
				863	// trace is not recorded in the node and so it cannot currently be reused in
				864	// the event that code generation is requested for an identical trace.
				865
				866
				867	void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
				868	UNREACHABLE();
				869	}
				870
				871
				872	void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
				873	text->AddElement(TextElement::Atom(this), zone);
				874	}
				875
				876
				877	void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
				878	text->AddElement(TextElement::CharClass(this), zone);
				879	}
				880
				881
				882	void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
				883	for (int i = 0; i < elements()->length(); i++)
				884	text->AddElement(elements()->at(i), zone);
				885	}
				886
				887
				888	TextElement TextElement::Atom(RegExpAtom* atom) {
				889	return TextElement(ATOM, atom);
				890	}
				891
				892
				893	TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
				894	return TextElement(CHAR_CLASS, char_class);
				895	}
				896
				897
				898	int TextElement::length() const {
				899	switch (text_type()) {
				900	case ATOM:
				901	return atom()->length();
				902
				903	case CHAR_CLASS:
				904	return 1;
				905	}
				906	UNREACHABLE();
				907	return 0;
				908	}
				909
				910
				911	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				912	if (table_ == NULL) {
				913	table_ = new(zone()) DispatchTable(zone());
				914	DispatchTableConstructor cons(table_, ignore_case, zone());
				915	cons.BuildTable(this);
				916	}
				917	return table_;
				918	}
				919
				920
				921	class FrequencyCollator {
				922	public:
				923	FrequencyCollator() : total_samples_(0) {
				924	for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
				925	frequencies_[i] = CharacterFrequency(i);
				926	}
				927	}
				928
				929	void CountCharacter(int character) {
				930	int index = (character & RegExpMacroAssembler::kTableMask);
				931	frequencies_[index].Increment();
				932	total_samples_++;
				933	}
				934
				935	// Does not measure in percent, but rather per-128 (the table size from the
				936	// regexp macro assembler).
				937	int Frequency(int in_character) {
				938	DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
				939	if (total_samples_ < 1) return 1; // Division by zero.
				940	int freq_in_per128 =
				941	(frequencies_[in_character].counter() * 128) / total_samples_;
				942	return freq_in_per128;
				943	}
				944
				945	private:
				946	class CharacterFrequency {
				947	public:
				948	CharacterFrequency() : counter_(0), character_(-1) { }
				949	explicit CharacterFrequency(int character)
				950	: counter_(0), character_(character) { }
				951
				952	void Increment() { counter_++; }
				953	int counter() { return counter_; }
				954	int character() { return character_; }
				955
				956	private:
				957	int counter_;
				958	int character_;
				959	};
				960
				961
				962	private:
				963	CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
				964	int total_samples_;
				965	};
				966
				967
				968	class RegExpCompiler {
				969	public:
				970	RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	971	JSRegExp::Flags flags, bool is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	972
				973	int AllocateRegister() {
				974	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				975	reg_exp_too_big_ = true;
				976	return next_register_;
				977	}
				978	return next_register_++;
				979	}
				980
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	981	// Lookarounds to match lone surrogates for unicode character class matches
				982	// are never nested. We can therefore reuse registers.
				983	int UnicodeLookaroundStackRegister() {
				984	if (unicode_lookaround_stack_register_ == kNoRegister) {
				985	unicode_lookaround_stack_register_ = AllocateRegister();
				986	}
				987	return unicode_lookaround_stack_register_;
				988	}
				989
				990	int UnicodeLookaroundPositionRegister() {
				991	if (unicode_lookaround_position_register_ == kNoRegister) {
				992	unicode_lookaround_position_register_ = AllocateRegister();
				993	}
				994	return unicode_lookaround_position_register_;
				995	}
				996
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	997	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				998	RegExpNode* start,
				999	int capture_count,
				1000	Handle<String> pattern);
				1001
				1002	inline void AddWork(RegExpNode* node) {
				1003	if (!node->on_work_list() && !node->label()->is_bound()) {
				1004	node->set_on_work_list(true);
				1005	work_list_->Add(node);
				1006	}
				1007	}
				1008
				1009	static const int kImplementationOffset = 0;
				1010	static const int kNumberOfRegistersOffset = 0;
				1011	static const int kCodeOffset = 1;
				1012
				1013	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				1014	EndNode* accept() { return accept_; }
				1015
				1016	static const int kMaxRecursion = 100;
				1017	inline int recursion_depth() { return recursion_depth_; }
				1018	inline void IncrementRecursionDepth() { recursion_depth_++; }
				1019	inline void DecrementRecursionDepth() { recursion_depth_--; }
				1020
				1021	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				1022
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1023	inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
				1024	inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1025	inline bool one_byte() { return one_byte_; }
				1026	inline bool optimize() { return optimize_; }
				1027	inline void set_optimize(bool value) { optimize_ = value; }
				1028	inline bool limiting_recursion() { return limiting_recursion_; }
				1029	inline void set_limiting_recursion(bool value) {
				1030	limiting_recursion_ = value;
				1031	}
				1032	bool read_backward() { return read_backward_; }
				1033	void set_read_backward(bool value) { read_backward_ = value; }
				1034	FrequencyCollator* frequency_collator() { return &frequency_collator_; }
				1035
				1036	int current_expansion_factor() { return current_expansion_factor_; }
				1037	void set_current_expansion_factor(int value) {
				1038	current_expansion_factor_ = value;
				1039	}
				1040
				1041	Isolate* isolate() const { return isolate_; }
				1042	Zone* zone() const { return zone_; }
				1043
				1044	static const int kNoRegister = -1;
				1045
				1046	private:
				1047	EndNode* accept_;
				1048	int next_register_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1049	int unicode_lookaround_stack_register_;
				1050	int unicode_lookaround_position_register_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1051	List<RegExpNode> work_list_;
				1052	int recursion_depth_;
				1053	RegExpMacroAssembler* macro_assembler_;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1054	JSRegExp::Flags flags_;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1055	bool one_byte_;
				1056	bool reg_exp_too_big_;
				1057	bool limiting_recursion_;
				1058	bool optimize_;
				1059	bool read_backward_;
				1060	int current_expansion_factor_;
				1061	FrequencyCollator frequency_collator_;
				1062	Isolate* isolate_;
				1063	Zone* zone_;
				1064	};
				1065
				1066
				1067	class RecursionCheck {
				1068	public:
				1069	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				1070	compiler->IncrementRecursionDepth();
				1071	}
				1072	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				1073	private:
				1074	RegExpCompiler* compiler_;
				1075	};
				1076
				1077
				1078	static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
				1079	return RegExpEngine::CompilationResult(isolate, "RegExp too big");
				1080	}
				1081
				1082
				1083	// Attempts to compile the regexp using an Irregexp code generator. Returns
				1084	// a fixed array or a null handle depending on whether it succeeded.
				1085	RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1086	JSRegExp::Flags flags, bool one_byte)
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1087	: next_register_(2 * (capture_count + 1)),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1088	unicode_lookaround_stack_register_(kNoRegister),
				1089	unicode_lookaround_position_register_(kNoRegister),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1090	work_list_(NULL),
				1091	recursion_depth_(0),
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1092	flags_(flags),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1093	one_byte_(one_byte),
				1094	reg_exp_too_big_(false),
				1095	limiting_recursion_(false),
				1096	optimize_(FLAG_regexp_optimization),
				1097	read_backward_(false),
				1098	current_expansion_factor_(1),
				1099	frequency_collator_(),
				1100	isolate_(isolate),
				1101	zone_(zone) {
				1102	accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
				1103	DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				1104	}
				1105
				1106
				1107	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				1108	RegExpMacroAssembler* macro_assembler,
				1109	RegExpNode* start,
				1110	int capture_count,
				1111	Handle<String> pattern) {
				1112	Heap* heap = pattern->GetHeap();
				1113
				1114	#ifdef DEBUG
				1115	if (FLAG_trace_regexp_assembler)
				1116	macro_assembler_ =
				1117	new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
				1118	else
				1119	#endif
				1120	macro_assembler_ = macro_assembler;
				1121
				1122	List <RegExpNode*> work_list(0);
				1123	work_list_ = &work_list;
				1124	Label fail;
				1125	macro_assembler_->PushBacktrack(&fail);
				1126	Trace new_trace;
				1127	start->Emit(this, &new_trace);
				1128	macro_assembler_->Bind(&fail);
				1129	macro_assembler_->Fail();
				1130	while (!work_list.is_empty()) {
				1131	RegExpNode* node = work_list.RemoveLast();
				1132	node->set_on_work_list(false);
				1133	if (!node->label()->is_bound()) node->Emit(this, &new_trace);
				1134	}
				1135	if (reg_exp_too_big_) {
				1136	macro_assembler_->AbortedCodeGeneration();
				1137	return IrregexpRegExpTooBig(isolate_);
				1138	}
				1139
				1140	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				1141	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
				1142	work_list_ = NULL;
				1143	#ifdef ENABLE_DISASSEMBLER
				1144	if (FLAG_print_code) {
				1145	CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
				1146	OFStream os(trace_scope.file());
				1147	Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
				1148	}
				1149	#endif
				1150	#ifdef DEBUG
				1151	if (FLAG_trace_regexp_assembler) {
				1152	delete macro_assembler_;
				1153	}
				1154	#endif
				1155	return RegExpEngine::CompilationResult(*code, next_register_);
				1156	}
				1157
				1158
				1159	bool Trace::DeferredAction::Mentions(int that) {
				1160	if (action_type() == ActionNode::CLEAR_CAPTURES) {
				1161	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				1162	return range.Contains(that);
				1163	} else {
				1164	return reg() == that;
				1165	}
				1166	}
				1167
				1168
				1169	bool Trace::mentions_reg(int reg) {
				1170	for (DeferredAction* action = actions_;
				1171	action != NULL;
				1172	action = action->next()) {
				1173	if (action->Mentions(reg))
				1174	return true;
				1175	}
				1176	return false;
				1177	}
				1178
				1179
				1180	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				1181	DCHECK_EQ(0, *cp_offset);
				1182	for (DeferredAction* action = actions_;
				1183	action != NULL;
				1184	action = action->next()) {
				1185	if (action->Mentions(reg)) {
				1186	if (action->action_type() == ActionNode::STORE_POSITION) {
				1187	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				1188	return true;
				1189	} else {
				1190	return false;
				1191	}
				1192	}
				1193	}
				1194	return false;
				1195	}
				1196
				1197
				1198	int Trace::FindAffectedRegisters(OutSet* affected_registers,
				1199	Zone* zone) {
				1200	int max_register = RegExpCompiler::kNoRegister;
				1201	for (DeferredAction* action = actions_;
				1202	action != NULL;
				1203	action = action->next()) {
				1204	if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
				1205	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				1206	for (int i = range.from(); i <= range.to(); i++)
				1207	affected_registers->Set(i, zone);
				1208	if (range.to() > max_register) max_register = range.to();
				1209	} else {
				1210	affected_registers->Set(action->reg(), zone);
				1211	if (action->reg() > max_register) max_register = action->reg();
				1212	}
				1213	}
				1214	return max_register;
				1215	}
				1216
				1217
				1218	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				1219	int max_register,
				1220	const OutSet& registers_to_pop,
				1221	const OutSet& registers_to_clear) {
				1222	for (int reg = max_register; reg >= 0; reg--) {
				1223	if (registers_to_pop.Get(reg)) {
				1224	assembler->PopRegister(reg);
				1225	} else if (registers_to_clear.Get(reg)) {
				1226	int clear_to = reg;
				1227	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				1228	reg--;
				1229	}
				1230	assembler->ClearRegisters(reg, clear_to);
				1231	}
				1232	}
				1233	}
				1234
				1235
				1236	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1237	int max_register,
				1238	const OutSet& affected_registers,
				1239	OutSet* registers_to_pop,
				1240	OutSet* registers_to_clear,
				1241	Zone* zone) {
				1242	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1243	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1244
				1245	// Count pushes performed to force a stack limit check occasionally.
				1246	int pushes = 0;
				1247
				1248	for (int reg = 0; reg <= max_register; reg++) {
				1249	if (!affected_registers.Get(reg)) {
				1250	continue;
				1251	}
				1252
				1253	// The chronologically first deferred action in the trace
				1254	// is used to infer the action needed to restore a register
				1255	// to its previous state (or not, if it's safe to ignore it).
				1256	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1257	DeferredActionUndoType undo_action = IGNORE;
				1258
				1259	int value = 0;
				1260	bool absolute = false;
				1261	bool clear = false;
				1262	static const int kNoStore = kMinInt;
				1263	int store_position = kNoStore;
				1264	// This is a little tricky because we are scanning the actions in reverse
				1265	// historical order (newest first).
				1266	for (DeferredAction* action = actions_;
				1267	action != NULL;
				1268	action = action->next()) {
				1269	if (action->Mentions(reg)) {
				1270	switch (action->action_type()) {
				1271	case ActionNode::SET_REGISTER: {
				1272	Trace::DeferredSetRegister* psr =
				1273	static_cast<Trace::DeferredSetRegister*>(action);
				1274	if (!absolute) {
				1275	value += psr->value();
				1276	absolute = true;
				1277	}
				1278	// SET_REGISTER is currently only used for newly introduced loop
				1279	// counters. They can have a significant previous value if they
				1280	// occour in a loop. TODO(lrn): Propagate this information, so
				1281	// we can set undo_action to IGNORE if we know there is no value to
				1282	// restore.
				1283	undo_action = RESTORE;
				1284	DCHECK_EQ(store_position, kNoStore);
				1285	DCHECK(!clear);
				1286	break;
				1287	}
				1288	case ActionNode::INCREMENT_REGISTER:
				1289	if (!absolute) {
				1290	value++;
				1291	}
				1292	DCHECK_EQ(store_position, kNoStore);
				1293	DCHECK(!clear);
				1294	undo_action = RESTORE;
				1295	break;
				1296	case ActionNode::STORE_POSITION: {
				1297	Trace::DeferredCapture* pc =
				1298	static_cast<Trace::DeferredCapture*>(action);
				1299	if (!clear && store_position == kNoStore) {
				1300	store_position = pc->cp_offset();
				1301	}
				1302
				1303	// For captures we know that stores and clears alternate.
				1304	// Other register, are never cleared, and if the occur
				1305	// inside a loop, they might be assigned more than once.
				1306	if (reg <= 1) {
				1307	// Registers zero and one, aka "capture zero", is
				1308	// always set correctly if we succeed. There is no
				1309	// need to undo a setting on backtrack, because we
				1310	// will set it again or fail.
				1311	undo_action = IGNORE;
				1312	} else {
				1313	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1314	}
				1315	DCHECK(!absolute);
				1316	DCHECK_EQ(value, 0);
				1317	break;
				1318	}
				1319	case ActionNode::CLEAR_CAPTURES: {
				1320	// Since we're scanning in reverse order, if we've already
				1321	// set the position we have to ignore historically earlier
				1322	// clearing operations.
				1323	if (store_position == kNoStore) {
				1324	clear = true;
				1325	}
				1326	undo_action = RESTORE;
				1327	DCHECK(!absolute);
				1328	DCHECK_EQ(value, 0);
				1329	break;
				1330	}
				1331	default:
				1332	UNREACHABLE();
				1333	break;
				1334	}
				1335	}
				1336	}
				1337	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1338	if (undo_action == RESTORE) {
				1339	pushes++;
				1340	RegExpMacroAssembler::StackCheckFlag stack_check =
				1341	RegExpMacroAssembler::kNoStackLimitCheck;
				1342	if (pushes == push_limit) {
				1343	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1344	pushes = 0;
				1345	}
				1346
				1347	assembler->PushRegister(reg, stack_check);
				1348	registers_to_pop->Set(reg, zone);
				1349	} else if (undo_action == CLEAR) {
				1350	registers_to_clear->Set(reg, zone);
				1351	}
				1352	// Perform the chronologically last action (or accumulated increment)
				1353	// for the register.
				1354	if (store_position != kNoStore) {
				1355	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1356	} else if (clear) {
				1357	assembler->ClearRegisters(reg, reg);
				1358	} else if (absolute) {
				1359	assembler->SetRegister(reg, value);
				1360	} else if (value != 0) {
				1361	assembler->AdvanceRegister(reg, value);
				1362	}
				1363	}
				1364	}
				1365
				1366
				1367	// This is called as we come into a loop choice node and some other tricky
				1368	// nodes. It normalizes the state of the code generator to ensure we can
				1369	// generate generic code.
				1370	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1371	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1372
				1373	DCHECK(!is_trivial());
				1374
				1375	if (actions_ == NULL && backtrack() == NULL) {
				1376	// Here we just have some deferred cp advances to fix and we are back to
				1377	// a normal situation. We may also have to forget some information gained
				1378	// through a quick check that was already performed.
				1379	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1380	// Create a new trivial state and generate the node with that.
				1381	Trace new_state;
				1382	successor->Emit(compiler, &new_state);
				1383	return;
				1384	}
				1385
				1386	// Generate deferred actions here along with code to undo them again.
				1387	OutSet affected_registers;
				1388
				1389	if (backtrack() != NULL) {
				1390	// Here we have a concrete backtrack location. These are set up by choice
				1391	// nodes and so they indicate that we have a deferred save of the current
				1392	// position which we may need to emit here.
				1393	assembler->PushCurrentPosition();
				1394	}
				1395
				1396	int max_register = FindAffectedRegisters(&affected_registers,
				1397	compiler->zone());
				1398	OutSet registers_to_pop;
				1399	OutSet registers_to_clear;
				1400	PerformDeferredActions(assembler,
				1401	max_register,
				1402	affected_registers,
				1403	&registers_to_pop,
				1404	&registers_to_clear,
				1405	compiler->zone());
				1406	if (cp_offset_ != 0) {
				1407	assembler->AdvanceCurrentPosition(cp_offset_);
				1408	}
				1409
				1410	// Create a new trivial state and generate the node with that.
				1411	Label undo;
				1412	assembler->PushBacktrack(&undo);
				1413	if (successor->KeepRecursing(compiler)) {
				1414	Trace new_state;
				1415	successor->Emit(compiler, &new_state);
				1416	} else {
				1417	compiler->AddWork(successor);
				1418	assembler->GoTo(successor->label());
				1419	}
				1420
				1421	// On backtrack we need to restore state.
				1422	assembler->Bind(&undo);
				1423	RestoreAffectedRegisters(assembler,
				1424	max_register,
				1425	registers_to_pop,
				1426	registers_to_clear);
				1427	if (backtrack() == NULL) {
				1428	assembler->Backtrack();
				1429	} else {
				1430	assembler->PopCurrentPosition();
				1431	assembler->GoTo(backtrack());
				1432	}
				1433	}
				1434
				1435
				1436	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1437	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1438
				1439	// Omit flushing the trace. We discard the entire stack frame anyway.
				1440
				1441	if (!label()->is_bound()) {
				1442	// We are completely independent of the trace, since we ignore it,
				1443	// so this code can be used as the generic version.
				1444	assembler->Bind(label());
				1445	}
				1446
				1447	// Throw away everything on the backtrack stack since the start
				1448	// of the negative submatch and restore the character position.
				1449	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1450	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1451	if (clear_capture_count_ > 0) {
				1452	// Clear any captures that might have been performed during the success
				1453	// of the body of the negative look-ahead.
				1454	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1455	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1456	}
				1457	// Now that we have unwound the stack we find at the top of the stack the
				1458	// backtrack that the BeginSubmatch node got.
				1459	assembler->Backtrack();
				1460	}
				1461
				1462
				1463	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1464	if (!trace->is_trivial()) {
				1465	trace->Flush(compiler, this);
				1466	return;
				1467	}
				1468	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1469	if (!label()->is_bound()) {
				1470	assembler->Bind(label());
				1471	}
				1472	switch (action_) {
				1473	case ACCEPT:
				1474	assembler->Succeed();
				1475	return;
				1476	case BACKTRACK:
				1477	assembler->GoTo(trace->backtrack());
				1478	return;
				1479	case NEGATIVE_SUBMATCH_SUCCESS:
				1480	// This case is handled in a different virtual method.
				1481	UNREACHABLE();
				1482	}
				1483	UNIMPLEMENTED();
				1484	}
				1485
				1486
				1487	void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
				1488	if (guards_ == NULL)
				1489	guards_ = new(zone) ZoneList<Guard*>(1, zone);
				1490	guards_->Add(guard, zone);
				1491	}
				1492
				1493
				1494	ActionNode* ActionNode::SetRegister(int reg,
				1495	int val,
				1496	RegExpNode* on_success) {
				1497	ActionNode* result =
				1498	new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
				1499	result->data_.u_store_register.reg = reg;
				1500	result->data_.u_store_register.value = val;
				1501	return result;
				1502	}
				1503
				1504
				1505	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1506	ActionNode* result =
				1507	new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
				1508	result->data_.u_increment_register.reg = reg;
				1509	return result;
				1510	}
				1511
				1512
				1513	ActionNode* ActionNode::StorePosition(int reg,
				1514	bool is_capture,
				1515	RegExpNode* on_success) {
				1516	ActionNode* result =
				1517	new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
				1518	result->data_.u_position_register.reg = reg;
				1519	result->data_.u_position_register.is_capture = is_capture;
				1520	return result;
				1521	}
				1522
				1523
				1524	ActionNode* ActionNode::ClearCaptures(Interval range,
				1525	RegExpNode* on_success) {
				1526	ActionNode* result =
				1527	new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
				1528	result->data_.u_clear_captures.range_from = range.from();
				1529	result->data_.u_clear_captures.range_to = range.to();
				1530	return result;
				1531	}
				1532
				1533
				1534	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1535	int position_reg,
				1536	RegExpNode* on_success) {
				1537	ActionNode* result =
				1538	new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
				1539	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1540	result->data_.u_submatch.current_position_register = position_reg;
				1541	return result;
				1542	}
				1543
				1544
				1545	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1546	int position_reg,
				1547	int clear_register_count,
				1548	int clear_register_from,
				1549	RegExpNode* on_success) {
				1550	ActionNode* result =
				1551	new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1552	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1553	result->data_.u_submatch.current_position_register = position_reg;
				1554	result->data_.u_submatch.clear_register_count = clear_register_count;
				1555	result->data_.u_submatch.clear_register_from = clear_register_from;
				1556	return result;
				1557	}
				1558
				1559
				1560	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1561	int repetition_register,
				1562	int repetition_limit,
				1563	RegExpNode* on_success) {
				1564	ActionNode* result =
				1565	new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
				1566	result->data_.u_empty_match_check.start_register = start_register;
				1567	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1568	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1569	return result;
				1570	}
				1571
				1572
				1573	#define DEFINE_ACCEPT(Type) \
				1574	void Type##Node::Accept(NodeVisitor* visitor) { \
				1575	visitor->Visit##Type(this); \
				1576	}
				1577	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1578	#undef DEFINE_ACCEPT
				1579
				1580
				1581	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1582	visitor->VisitLoopChoice(this);
				1583	}
				1584
				1585
				1586	// -------------------------------------------------------------------
				1587	// Emit code.
				1588
				1589
				1590	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1591	Guard* guard,
				1592	Trace* trace) {
				1593	switch (guard->op()) {
				1594	case Guard::LT:
				1595	DCHECK(!trace->mentions_reg(guard->reg()));
				1596	macro_assembler->IfRegisterGE(guard->reg(),
				1597	guard->value(),
				1598	trace->backtrack());
				1599	break;
				1600	case Guard::GEQ:
				1601	DCHECK(!trace->mentions_reg(guard->reg()));
				1602	macro_assembler->IfRegisterLT(guard->reg(),
				1603	guard->value(),
				1604	trace->backtrack());
				1605	break;
				1606	}
				1607	}
				1608
				1609
				1610	// Returns the number of characters in the equivalence class, omitting those
				1611	// that cannot occur in the source string because it is Latin1.
				1612	static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
				1613	bool one_byte_subject,
				1614	unibrow::uchar* letters) {
				1615	int length =
				1616	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
				1617	// Unibrow returns 0 or 1 for characters where case independence is
				1618	// trivial.
				1619	if (length == 0) {
				1620	letters[0] = character;
				1621	length = 1;
				1622	}
				1623
				1624	if (one_byte_subject) {
				1625	int new_length = 0;
				1626	for (int i = 0; i < length; i++) {
				1627	if (letters[i] <= String::kMaxOneByteCharCode) {
				1628	letters[new_length++] = letters[i];
				1629	}
				1630	}
				1631	length = new_length;
				1632	}
				1633
				1634	return length;
				1635	}
				1636
				1637
				1638	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1639	RegExpCompiler* compiler,
				1640	uc16 c,
				1641	Label* on_failure,
				1642	int cp_offset,
				1643	bool check,
				1644	bool preloaded) {
				1645	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1646	bool bound_checked = false;
				1647	if (!preloaded) {
				1648	assembler->LoadCurrentCharacter(
				1649	cp_offset,
				1650	on_failure,
				1651	check);
				1652	bound_checked = true;
				1653	}
				1654	assembler->CheckNotCharacter(c, on_failure);
				1655	return bound_checked;
				1656	}
				1657
				1658
				1659	// Only emits non-letters (things that don't have case). Only used for case
				1660	// independent matches.
				1661	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1662	RegExpCompiler* compiler,
				1663	uc16 c,
				1664	Label* on_failure,
				1665	int cp_offset,
				1666	bool check,
				1667	bool preloaded) {
				1668	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1669	bool one_byte = compiler->one_byte();
				1670	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1671	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1672	if (length < 1) {
				1673	// This can't match. Must be an one-byte subject and a non-one-byte
				1674	// character. We do not need to do anything since the one-byte pass
				1675	// already handled this.
				1676	return false; // Bounds not checked.
				1677	}
				1678	bool checked = false;
				1679	// We handle the length > 1 case in a later pass.
				1680	if (length == 1) {
				1681	if (one_byte && c > String::kMaxOneByteCharCodeU) {
				1682	// Can't match - see above.
				1683	return false; // Bounds not checked.
				1684	}
				1685	if (!preloaded) {
				1686	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1687	checked = check;
				1688	}
				1689	macro_assembler->CheckNotCharacter(c, on_failure);
				1690	}
				1691	return checked;
				1692	}
				1693
				1694
				1695	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1696	bool one_byte, uc16 c1, uc16 c2,
				1697	Label* on_failure) {
				1698	uc16 char_mask;
				1699	if (one_byte) {
				1700	char_mask = String::kMaxOneByteCharCode;
				1701	} else {
				1702	char_mask = String::kMaxUtf16CodeUnit;
				1703	}
				1704	uc16 exor = c1 ^ c2;
				1705	// Check whether exor has only one bit set.
				1706	if (((exor - 1) & exor) == 0) {
				1707	// If c1 and c2 differ only by one bit.
				1708	// Ecma262UnCanonicalize always gives the highest number last.
				1709	DCHECK(c2 > c1);
				1710	uc16 mask = char_mask ^ exor;
				1711	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1712	return true;
				1713	}
				1714	DCHECK(c2 > c1);
				1715	uc16 diff = c2 - c1;
				1716	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1717	// If the characters differ by 2^n but don't differ by one bit then
				1718	// subtract the difference from the found character, then do the or
				1719	// trick. We avoid the theoretical case where negative numbers are
				1720	// involved in order to simplify code generation.
				1721	uc16 mask = char_mask ^ diff;
				1722	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1723	diff,
				1724	mask,
				1725	on_failure);
				1726	return true;
				1727	}
				1728	return false;
				1729	}
				1730
				1731
				1732	typedef bool EmitCharacterFunction(Isolate* isolate,
				1733	RegExpCompiler* compiler,
				1734	uc16 c,
				1735	Label* on_failure,
				1736	int cp_offset,
				1737	bool check,
				1738	bool preloaded);
				1739
				1740	// Only emits letters (things that have case). Only used for case independent
				1741	// matches.
				1742	static inline bool EmitAtomLetter(Isolate* isolate,
				1743	RegExpCompiler* compiler,
				1744	uc16 c,
				1745	Label* on_failure,
				1746	int cp_offset,
				1747	bool check,
				1748	bool preloaded) {
				1749	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1750	bool one_byte = compiler->one_byte();
				1751	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1752	int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
				1753	if (length <= 1) return false;
				1754	// We may not need to check against the end of the input string
				1755	// if this character lies before a character that matched.
				1756	if (!preloaded) {
				1757	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1758	}
				1759	Label ok;
				1760	DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1761	switch (length) {
				1762	case 2: {
				1763	if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
				1764	chars[1], on_failure)) {
				1765	} else {
				1766	macro_assembler->CheckCharacter(chars[0], &ok);
				1767	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1768	macro_assembler->Bind(&ok);
				1769	}
				1770	break;
				1771	}
				1772	case 4:
				1773	macro_assembler->CheckCharacter(chars[3], &ok);
				1774	// Fall through!
				1775	case 3:
				1776	macro_assembler->CheckCharacter(chars[0], &ok);
				1777	macro_assembler->CheckCharacter(chars[1], &ok);
				1778	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1779	macro_assembler->Bind(&ok);
				1780	break;
				1781	default:
				1782	UNREACHABLE();
				1783	break;
				1784	}
				1785	return true;
				1786	}
				1787
				1788
				1789	static void EmitBoundaryTest(RegExpMacroAssembler* masm,
				1790	int border,
				1791	Label* fall_through,
				1792	Label* above_or_equal,
				1793	Label* below) {
				1794	if (below != fall_through) {
				1795	masm->CheckCharacterLT(border, below);
				1796	if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
				1797	} else {
				1798	masm->CheckCharacterGT(border - 1, above_or_equal);
				1799	}
				1800	}
				1801
				1802
				1803	static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
				1804	int first,
				1805	int last,
				1806	Label* fall_through,
				1807	Label* in_range,
				1808	Label* out_of_range) {
				1809	if (in_range == fall_through) {
				1810	if (first == last) {
				1811	masm->CheckNotCharacter(first, out_of_range);
				1812	} else {
				1813	masm->CheckCharacterNotInRange(first, last, out_of_range);
				1814	}
				1815	} else {
				1816	if (first == last) {
				1817	masm->CheckCharacter(first, in_range);
				1818	} else {
				1819	masm->CheckCharacterInRange(first, last, in_range);
				1820	}
				1821	if (out_of_range != fall_through) masm->GoTo(out_of_range);
				1822	}
				1823	}
				1824
				1825
				1826	// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
				1827	// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
				1828	static void EmitUseLookupTable(
				1829	RegExpMacroAssembler* masm,
				1830	ZoneList<int>* ranges,
				1831	int start_index,
				1832	int end_index,
				1833	int min_char,
				1834	Label* fall_through,
				1835	Label* even_label,
				1836	Label* odd_label) {
				1837	static const int kSize = RegExpMacroAssembler::kTableSize;
				1838	static const int kMask = RegExpMacroAssembler::kTableMask;
				1839
				1840	int base = (min_char & ~kMask);
				1841	USE(base);
				1842
				1843	// Assert that everything is on one kTableSize page.
				1844	for (int i = start_index; i <= end_index; i++) {
				1845	DCHECK_EQ(ranges->at(i) & ~kMask, base);
				1846	}
				1847	DCHECK(start_index == 0 \|\| (ranges->at(start_index - 1) & ~kMask) <= base);
				1848
				1849	char templ[kSize];
				1850	Label* on_bit_set;
				1851	Label* on_bit_clear;
				1852	int bit;
				1853	if (even_label == fall_through) {
				1854	on_bit_set = odd_label;
				1855	on_bit_clear = even_label;
				1856	bit = 1;
				1857	} else {
				1858	on_bit_set = even_label;
				1859	on_bit_clear = odd_label;
				1860	bit = 0;
				1861	}
				1862	for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
				1863	templ[i] = bit;
				1864	}
				1865	int j = 0;
				1866	bit ^= 1;
				1867	for (int i = start_index; i < end_index; i++) {
				1868	for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
				1869	templ[j] = bit;
				1870	}
				1871	bit ^= 1;
				1872	}
				1873	for (int i = j; i < kSize; i++) {
				1874	templ[i] = bit;
				1875	}
				1876	Factory* factory = masm->isolate()->factory();
				1877	// TODO(erikcorry): Cache these.
				1878	Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
				1879	for (int i = 0; i < kSize; i++) {
				1880	ba->set(i, templ[i]);
				1881	}
				1882	masm->CheckBitInTable(ba, on_bit_set);
				1883	if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
				1884	}
				1885
				1886
				1887	static void CutOutRange(RegExpMacroAssembler* masm,
				1888	ZoneList<int>* ranges,
				1889	int start_index,
				1890	int end_index,
				1891	int cut_index,
				1892	Label* even_label,
				1893	Label* odd_label) {
				1894	bool odd = (((cut_index - start_index) & 1) == 1);
				1895	Label* in_range_label = odd ? odd_label : even_label;
				1896	Label dummy;
				1897	EmitDoubleBoundaryTest(masm,
				1898	ranges->at(cut_index),
				1899	ranges->at(cut_index + 1) - 1,
				1900	&dummy,
				1901	in_range_label,
				1902	&dummy);
				1903	DCHECK(!dummy.is_linked());
				1904	// Cut out the single range by rewriting the array. This creates a new
				1905	// range that is a merger of the two ranges on either side of the one we
				1906	// are cutting out. The oddity of the labels is preserved.
				1907	for (int j = cut_index; j > start_index; j--) {
				1908	ranges->at(j) = ranges->at(j - 1);
				1909	}
				1910	for (int j = cut_index + 1; j < end_index; j++) {
				1911	ranges->at(j) = ranges->at(j + 1);
				1912	}
				1913	}
				1914
				1915
				1916	// Unicode case. Split the search space into kSize spaces that are handled
				1917	// with recursion.
				1918	static void SplitSearchSpace(ZoneList<int>* ranges,
				1919	int start_index,
				1920	int end_index,
				1921	int* new_start_index,
				1922	int* new_end_index,
				1923	int* border) {
				1924	static const int kSize = RegExpMacroAssembler::kTableSize;
				1925	static const int kMask = RegExpMacroAssembler::kTableMask;
				1926
				1927	int first = ranges->at(start_index);
				1928	int last = ranges->at(end_index) - 1;
				1929
				1930	*new_start_index = start_index;
				1931	*border = (ranges->at(start_index) & ~kMask) + kSize;
				1932	while (*new_start_index < end_index) {
				1933	if (ranges->at(new_start_index) > border) break;
				1934	(*new_start_index)++;
				1935	}
				1936	// new_start_index is the index of the first edge that is beyond the
				1937	// current kSize space.
				1938
				1939	// For very large search spaces we do a binary chop search of the non-Latin1
				1940	// space instead of just going to the end of the current kSize space. The
				1941	// heuristics are complicated a little by the fact that any 128-character
				1942	// encoding space can be quickly tested with a table lookup, so we don't
				1943	// wish to do binary chop search at a smaller granularity than that. A
				1944	// 128-character space can take up a lot of space in the ranges array if,
				1945	// for example, we only want to match every second character (eg. the lower
				1946	// case characters on some Unicode pages).
				1947	int binary_chop_index = (end_index + start_index) / 2;
				1948	// The first test ensures that we get to the code that handles the Latin1
				1949	// range with a single not-taken branch, speeding up this important
				1950	// character range (even non-Latin1 charset-based text has spaces and
				1951	// punctuation).
				1952	if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
				1953	end_index - start_index > (new_start_index - start_index) 2 &&
				1954	last - first > kSize * 2 && binary_chop_index > *new_start_index &&
				1955	ranges->at(binary_chop_index) >= first + 2 * kSize) {
				1956	int scan_forward_for_section_border = binary_chop_index;;
				1957	int new_border = (ranges->at(binary_chop_index) \| kMask) + 1;
				1958
				1959	while (scan_forward_for_section_border < end_index) {
				1960	if (ranges->at(scan_forward_for_section_border) > new_border) {
				1961	*new_start_index = scan_forward_for_section_border;
				1962	*border = new_border;
				1963	break;
				1964	}
				1965	scan_forward_for_section_border++;
				1966	}
				1967	}
				1968
				1969	DCHECK(*new_start_index > start_index);
				1970	new_end_index = new_start_index - 1;
				1971	if (ranges->at(new_end_index) == border) {
				1972	(*new_end_index)--;
				1973	}
				1974	if (*border >= ranges->at(end_index)) {
				1975	*border = ranges->at(end_index);
				1976	*new_start_index = end_index; // Won't be used.
				1977	*new_end_index = end_index - 1;
				1978	}
				1979	}
				1980
				1981
				1982	// Gets a series of segment boundaries representing a character class. If the
				1983	// character is in the range between an even and an odd boundary (counting from
				1984	// start_index) then go to even_label, otherwise go to odd_label. We already
				1985	// know that the character is in the range of min_char to max_char inclusive.
				1986	// Either label can be NULL indicating backtracking. Either label can also be
				1987	// equal to the fall_through label.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1988	static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
				1989	int start_index, int end_index, uc32 min_char,
				1990	uc32 max_char, Label* fall_through,
				1991	Label* even_label, Label* odd_label) {
				1992	DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
				1993	DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
				1994
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	1995	int first = ranges->at(start_index);
				1996	int last = ranges->at(end_index) - 1;
				1997
				1998	DCHECK_LT(min_char, first);
				1999
				2000	// Just need to test if the character is before or on-or-after
				2001	// a particular character.
				2002	if (start_index == end_index) {
				2003	EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
				2004	return;
				2005	}
				2006
				2007	// Another almost trivial case: There is one interval in the middle that is
				2008	// different from the end intervals.
				2009	if (start_index + 1 == end_index) {
				2010	EmitDoubleBoundaryTest(
				2011	masm, first, last, fall_through, even_label, odd_label);
				2012	return;
				2013	}
				2014
				2015	// It's not worth using table lookup if there are very few intervals in the
				2016	// character class.
				2017	if (end_index - start_index <= 6) {
				2018	// It is faster to test for individual characters, so we look for those
				2019	// first, then try arbitrary ranges in the second round.
				2020	static int kNoCutIndex = -1;
				2021	int cut = kNoCutIndex;
				2022	for (int i = start_index; i < end_index; i++) {
				2023	if (ranges->at(i) == ranges->at(i + 1) - 1) {
				2024	cut = i;
				2025	break;
				2026	}
				2027	}
				2028	if (cut == kNoCutIndex) cut = start_index;
				2029	CutOutRange(
				2030	masm, ranges, start_index, end_index, cut, even_label, odd_label);
				2031	DCHECK_GE(end_index - start_index, 2);
				2032	GenerateBranches(masm,
				2033	ranges,
				2034	start_index + 1,
				2035	end_index - 1,
				2036	min_char,
				2037	max_char,
				2038	fall_through,
				2039	even_label,
				2040	odd_label);
				2041	return;
				2042	}
				2043
				2044	// If there are a lot of intervals in the regexp, then we will use tables to
				2045	// determine whether the character is inside or outside the character class.
				2046	static const int kBits = RegExpMacroAssembler::kTableSizeBits;
				2047
				2048	if ((max_char >> kBits) == (min_char >> kBits)) {
				2049	EmitUseLookupTable(masm,
				2050	ranges,
				2051	start_index,
				2052	end_index,
				2053	min_char,
				2054	fall_through,
				2055	even_label,
				2056	odd_label);
				2057	return;
				2058	}
				2059
				2060	if ((min_char >> kBits) != (first >> kBits)) {
				2061	masm->CheckCharacterLT(first, odd_label);
				2062	GenerateBranches(masm,
				2063	ranges,
				2064	start_index + 1,
				2065	end_index,
				2066	first,
				2067	max_char,
				2068	fall_through,
				2069	odd_label,
				2070	even_label);
				2071	return;
				2072	}
				2073
				2074	int new_start_index = 0;
				2075	int new_end_index = 0;
				2076	int border = 0;
				2077
				2078	SplitSearchSpace(ranges,
				2079	start_index,
				2080	end_index,
				2081	&new_start_index,
				2082	&new_end_index,
				2083	&border);
				2084
				2085	Label handle_rest;
				2086	Label* above = &handle_rest;
				2087	if (border == last + 1) {
				2088	// We didn't find any section that started after the limit, so everything
				2089	// above the border is one of the terminal labels.
				2090	above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
				2091	DCHECK(new_end_index == end_index - 1);
				2092	}
				2093
				2094	DCHECK_LE(start_index, new_end_index);
				2095	DCHECK_LE(new_start_index, end_index);
				2096	DCHECK_LT(start_index, new_start_index);
				2097	DCHECK_LT(new_end_index, end_index);
				2098	DCHECK(new_end_index + 1 == new_start_index \|\|
				2099	(new_end_index + 2 == new_start_index &&
				2100	border == ranges->at(new_end_index + 1)));
				2101	DCHECK_LT(min_char, border - 1);
				2102	DCHECK_LT(border, max_char);
				2103	DCHECK_LT(ranges->at(new_end_index), border);
				2104	DCHECK(border < ranges->at(new_start_index) \|\|
				2105	(border == ranges->at(new_start_index) &&
				2106	new_start_index == end_index &&
				2107	new_end_index == end_index - 1 &&
				2108	border == last + 1));
				2109	DCHECK(new_start_index == 0 \|\| border >= ranges->at(new_start_index - 1));
				2110
				2111	masm->CheckCharacterGT(border - 1, above);
				2112	Label dummy;
				2113	GenerateBranches(masm,
				2114	ranges,
				2115	start_index,
				2116	new_end_index,
				2117	min_char,
				2118	border - 1,
				2119	&dummy,
				2120	even_label,
				2121	odd_label);
				2122	if (handle_rest.is_linked()) {
				2123	masm->Bind(&handle_rest);
				2124	bool flip = (new_start_index & 1) != (start_index & 1);
				2125	GenerateBranches(masm,
				2126	ranges,
				2127	new_start_index,
				2128	end_index,
				2129	border,
				2130	max_char,
				2131	&dummy,
				2132	flip ? odd_label : even_label,
				2133	flip ? even_label : odd_label);
				2134	}
				2135	}
				2136
				2137
				2138	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				2139	RegExpCharacterClass* cc, bool one_byte,
				2140	Label* on_failure, int cp_offset, bool check_offset,
				2141	bool preloaded, Zone* zone) {
				2142	ZoneList<CharacterRange>* ranges = cc->ranges(zone);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2143	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2144
				2145	int max_char;
				2146	if (one_byte) {
				2147	max_char = String::kMaxOneByteCharCode;
				2148	} else {
				2149	max_char = String::kMaxUtf16CodeUnit;
				2150	}
				2151
				2152	int range_count = ranges->length();
				2153
				2154	int last_valid_range = range_count - 1;
				2155	while (last_valid_range >= 0) {
				2156	CharacterRange& range = ranges->at(last_valid_range);
				2157	if (range.from() <= max_char) {
				2158	break;
				2159	}
				2160	last_valid_range--;
				2161	}
				2162
				2163	if (last_valid_range < 0) {
				2164	if (!cc->is_negated()) {
				2165	macro_assembler->GoTo(on_failure);
				2166	}
				2167	if (check_offset) {
				2168	macro_assembler->CheckPosition(cp_offset, on_failure);
				2169	}
				2170	return;
				2171	}
				2172
				2173	if (last_valid_range == 0 &&
				2174	ranges->at(0).IsEverything(max_char)) {
				2175	if (cc->is_negated()) {
				2176	macro_assembler->GoTo(on_failure);
				2177	} else {
				2178	// This is a common case hit by non-anchored expressions.
				2179	if (check_offset) {
				2180	macro_assembler->CheckPosition(cp_offset, on_failure);
				2181	}
				2182	}
				2183	return;
				2184	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2185
				2186	if (!preloaded) {
				2187	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				2188	}
				2189
				2190	if (cc->is_standard(zone) &&
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2191	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				2192	on_failure)) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2193	return;
				2194	}
				2195
				2196
				2197	// A new list with ascending entries. Each entry is a code unit
				2198	// where there is a boundary between code units that are part of
				2199	// the class and code units that are not. Normally we insert an
				2200	// entry at zero which goes to the failure label, but if there
				2201	// was already one there we fall through for success on that entry.
				2202	// Subsequent entries have alternating meaning (success/failure).
				2203	ZoneList<int>* range_boundaries =
				2204	new(zone) ZoneList<int>(last_valid_range, zone);
				2205
				2206	bool zeroth_entry_is_failure = !cc->is_negated();
				2207
				2208	for (int i = 0; i <= last_valid_range; i++) {
				2209	CharacterRange& range = ranges->at(i);
				2210	if (range.from() == 0) {
				2211	DCHECK_EQ(i, 0);
				2212	zeroth_entry_is_failure = !zeroth_entry_is_failure;
				2213	} else {
				2214	range_boundaries->Add(range.from(), zone);
				2215	}
				2216	range_boundaries->Add(range.to() + 1, zone);
				2217	}
				2218	int end_index = range_boundaries->length() - 1;
				2219	if (range_boundaries->at(end_index) > max_char) {
				2220	end_index--;
				2221	}
				2222
				2223	Label fall_through;
				2224	GenerateBranches(macro_assembler,
				2225	range_boundaries,
				2226	0, // start_index.
				2227	end_index,
				2228	0, // min_char.
				2229	max_char,
				2230	&fall_through,
				2231	zeroth_entry_is_failure ? &fall_through : on_failure,
				2232	zeroth_entry_is_failure ? on_failure : &fall_through);
				2233	macro_assembler->Bind(&fall_through);
				2234	}
				2235
				2236
				2237	RegExpNode::~RegExpNode() {
				2238	}
				2239
				2240
				2241	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				2242	Trace* trace) {
				2243	// If we are generating a greedy loop then don't stop and don't reuse code.
				2244	if (trace->stop_node() != NULL) {
				2245	return CONTINUE;
				2246	}
				2247
				2248	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2249	if (trace->is_trivial()) {
				2250	if (label_.is_bound() \|\| on_work_list() \|\| !KeepRecursing(compiler)) {
				2251	// If a generic version is already scheduled to be generated or we have
				2252	// recursed too deeply then just generate a jump to that code.
				2253	macro_assembler->GoTo(&label_);
				2254	// This will queue it up for generation of a generic version if it hasn't
				2255	// already been queued.
				2256	compiler->AddWork(this);
				2257	return DONE;
				2258	}
				2259	// Generate generic version of the node and bind the label for later use.
				2260	macro_assembler->Bind(&label_);
				2261	return CONTINUE;
				2262	}
				2263
				2264	// We are being asked to make a non-generic version. Keep track of how many
				2265	// non-generic versions we generate so as not to overdo it.
				2266	trace_count_++;
				2267	if (KeepRecursing(compiler) && compiler->optimize() &&
				2268	trace_count_ < kMaxCopiesCodeGenerated) {
				2269	return CONTINUE;
				2270	}
				2271
				2272	// If we get here code has been generated for this node too many times or
				2273	// recursion is too deep. Time to switch to a generic version. The code for
				2274	// generic versions above can handle deep recursion properly.
				2275	bool was_limiting = compiler->limiting_recursion();
				2276	compiler->set_limiting_recursion(true);
				2277	trace->Flush(compiler, this);
				2278	compiler->set_limiting_recursion(was_limiting);
				2279	return DONE;
				2280	}
				2281
				2282
				2283	bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
				2284	return !compiler->limiting_recursion() &&
				2285	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
				2286	}
				2287
				2288
				2289	int ActionNode::EatsAtLeast(int still_to_find,
				2290	int budget,
				2291	bool not_at_start) {
				2292	if (budget <= 0) return 0;
				2293	if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				2294	return on_success()->EatsAtLeast(still_to_find,
				2295	budget - 1,
				2296	not_at_start);
				2297	}
				2298
				2299
				2300	void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2301	BoyerMooreLookahead* bm, bool not_at_start) {
				2302	if (action_type_ == BEGIN_SUBMATCH) {
				2303	bm->SetRest(offset);
				2304	} else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
				2305	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2306	}
				2307	SaveBMInfo(bm, not_at_start, offset);
				2308	}
				2309
				2310
				2311	int AssertionNode::EatsAtLeast(int still_to_find,
				2312	int budget,
				2313	bool not_at_start) {
				2314	if (budget <= 0) return 0;
				2315	// If we know we are not at the start and we are asked "how many characters
				2316	// will you match if you succeed?" then we can answer anything since false
				2317	// implies false. So lets just return the max answer (still_to_find) since
				2318	// that won't prevent us from preloading a lot of characters for the other
				2319	// branches in the node graph.
				2320	if (assertion_type() == AT_START && not_at_start) return still_to_find;
				2321	return on_success()->EatsAtLeast(still_to_find,
				2322	budget - 1,
				2323	not_at_start);
				2324	}
				2325
				2326
				2327	void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2328	BoyerMooreLookahead* bm, bool not_at_start) {
				2329	// Match the behaviour of EatsAtLeast on this node.
				2330	if (assertion_type() == AT_START && not_at_start) return;
				2331	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2332	SaveBMInfo(bm, not_at_start, offset);
				2333	}
				2334
				2335
				2336	int BackReferenceNode::EatsAtLeast(int still_to_find,
				2337	int budget,
				2338	bool not_at_start) {
				2339	if (read_backward()) return 0;
				2340	if (budget <= 0) return 0;
				2341	return on_success()->EatsAtLeast(still_to_find,
				2342	budget - 1,
				2343	not_at_start);
				2344	}
				2345
				2346
				2347	int TextNode::EatsAtLeast(int still_to_find,
				2348	int budget,
				2349	bool not_at_start) {
				2350	if (read_backward()) return 0;
				2351	int answer = Length();
				2352	if (answer >= still_to_find) return answer;
				2353	if (budget <= 0) return answer;
				2354	// We are not at start after this node so we set the last argument to 'true'.
				2355	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				2356	budget - 1,
				2357	true);
				2358	}
				2359
				2360
				2361	int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
				2362	bool not_at_start) {
				2363	if (budget <= 0) return 0;
				2364	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2365	// afterwards.
				2366	RegExpNode* node = alternatives_->at(1).node();
				2367	return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
				2368	}
				2369
				2370
				2371	void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
				2372	QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
				2373	bool not_at_start) {
				2374	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2375	// afterwards.
				2376	RegExpNode* node = alternatives_->at(1).node();
				2377	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				2378	}
				2379
				2380
				2381	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				2382	int budget,
				2383	RegExpNode* ignore_this_node,
				2384	bool not_at_start) {
				2385	if (budget <= 0) return 0;
				2386	int min = 100;
				2387	int choice_count = alternatives_->length();
				2388	budget = (budget - 1) / choice_count;
				2389	for (int i = 0; i < choice_count; i++) {
				2390	RegExpNode* node = alternatives_->at(i).node();
				2391	if (node == ignore_this_node) continue;
				2392	int node_eats_at_least =
				2393	node->EatsAtLeast(still_to_find, budget, not_at_start);
				2394	if (node_eats_at_least < min) min = node_eats_at_least;
				2395	if (min == 0) return 0;
				2396	}
				2397	return min;
				2398	}
				2399
				2400
				2401	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				2402	int budget,
				2403	bool not_at_start) {
				2404	return EatsAtLeastHelper(still_to_find,
				2405	budget - 1,
				2406	loop_node_,
				2407	not_at_start);
				2408	}
				2409
				2410
				2411	int ChoiceNode::EatsAtLeast(int still_to_find,
				2412	int budget,
				2413	bool not_at_start) {
				2414	return EatsAtLeastHelper(still_to_find,
				2415	budget,
				2416	NULL,
				2417	not_at_start);
				2418	}
				2419
				2420
				2421	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				2422	static inline uint32_t SmearBitsRight(uint32_t v) {
				2423	v \|= v >> 1;
				2424	v \|= v >> 2;
				2425	v \|= v >> 4;
				2426	v \|= v >> 8;
				2427	v \|= v >> 16;
				2428	return v;
				2429	}
				2430
				2431
				2432	bool QuickCheckDetails::Rationalize(bool asc) {
				2433	bool found_useful_op = false;
				2434	uint32_t char_mask;
				2435	if (asc) {
				2436	char_mask = String::kMaxOneByteCharCode;
				2437	} else {
				2438	char_mask = String::kMaxUtf16CodeUnit;
				2439	}
				2440	mask_ = 0;
				2441	value_ = 0;
				2442	int char_shift = 0;
				2443	for (int i = 0; i < characters_; i++) {
				2444	Position* pos = &positions_[i];
				2445	if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
				2446	found_useful_op = true;
				2447	}
				2448	mask_ \|= (pos->mask & char_mask) << char_shift;
				2449	value_ \|= (pos->value & char_mask) << char_shift;
				2450	char_shift += asc ? 8 : 16;
				2451	}
				2452	return found_useful_op;
				2453	}
				2454
				2455
				2456	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				2457	Trace* bounds_check_trace,
				2458	Trace* trace,
				2459	bool preload_has_checked_bounds,
				2460	Label* on_possible_success,
				2461	QuickCheckDetails* details,
				2462	bool fall_through_on_failure) {
				2463	if (details->characters() == 0) return false;
				2464	GetQuickCheckDetails(
				2465	details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
				2466	if (details->cannot_match()) return false;
				2467	if (!details->Rationalize(compiler->one_byte())) return false;
				2468	DCHECK(details->characters() == 1 \|\|
				2469	compiler->macro_assembler()->CanReadUnaligned());
				2470	uint32_t mask = details->mask();
				2471	uint32_t value = details->value();
				2472
				2473	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2474
				2475	if (trace->characters_preloaded() != details->characters()) {
				2476	DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
				2477	// We are attempting to preload the minimum number of characters
				2478	// any choice would eat, so if the bounds check fails, then none of the
				2479	// choices can succeed, so we can just immediately backtrack, rather
				2480	// than go to the next choice.
				2481	assembler->LoadCurrentCharacter(trace->cp_offset(),
				2482	bounds_check_trace->backtrack(),
				2483	!preload_has_checked_bounds,
				2484	details->characters());
				2485	}
				2486
				2487
				2488	bool need_mask = true;
				2489
				2490	if (details->characters() == 1) {
				2491	// If number of characters preloaded is 1 then we used a byte or 16 bit
				2492	// load so the value is already masked down.
				2493	uint32_t char_mask;
				2494	if (compiler->one_byte()) {
				2495	char_mask = String::kMaxOneByteCharCode;
				2496	} else {
				2497	char_mask = String::kMaxUtf16CodeUnit;
				2498	}
				2499	if ((mask & char_mask) == char_mask) need_mask = false;
				2500	mask &= char_mask;
				2501	} else {
				2502	// For 2-character preloads in one-byte mode or 1-character preloads in
				2503	// two-byte mode we also use a 16 bit load with zero extend.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2504	static const uint32_t kTwoByteMask = 0xffff;
				2505	static const uint32_t kFourByteMask = 0xffffffff;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2506	if (details->characters() == 2 && compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2507	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2508	} else if (details->characters() == 1 && !compiler->one_byte()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2509	if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2510	} else {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2511	if (mask == kFourByteMask) need_mask = false;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2512	}
				2513	}
				2514
				2515	if (fall_through_on_failure) {
				2516	if (need_mask) {
				2517	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				2518	} else {
				2519	assembler->CheckCharacter(value, on_possible_success);
				2520	}
				2521	} else {
				2522	if (need_mask) {
				2523	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				2524	} else {
				2525	assembler->CheckNotCharacter(value, trace->backtrack());
				2526	}
				2527	}
				2528	return true;
				2529	}
				2530
				2531
				2532	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				2533	// super-class in the .h file).
				2534	//
				2535	// We iterate along the text object, building up for each character a
				2536	// mask and value that can be used to test for a quick failure to match.
				2537	// The masks and values for the positions will be combined into a single
				2538	// machine word for the current character width in order to be used in
				2539	// generating a quick check.
				2540	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2541	RegExpCompiler* compiler,
				2542	int characters_filled_in,
				2543	bool not_at_start) {
				2544	// Do not collect any quick check details if the text node reads backward,
				2545	// since it reads in the opposite direction than we use for quick checks.
				2546	if (read_backward()) return;
				2547	Isolate* isolate = compiler->macro_assembler()->isolate();
				2548	DCHECK(characters_filled_in < details->characters());
				2549	int characters = details->characters();
				2550	int char_mask;
				2551	if (compiler->one_byte()) {
				2552	char_mask = String::kMaxOneByteCharCode;
				2553	} else {
				2554	char_mask = String::kMaxUtf16CodeUnit;
				2555	}
				2556	for (int k = 0; k < elements()->length(); k++) {
				2557	TextElement elm = elements()->at(k);
				2558	if (elm.text_type() == TextElement::ATOM) {
				2559	Vector<const uc16> quarks = elm.atom()->data();
				2560	for (int i = 0; i < characters && i < quarks.length(); i++) {
				2561	QuickCheckDetails::Position* pos =
				2562	details->positions(characters_filled_in);
				2563	uc16 c = quarks[i];
				2564	if (compiler->ignore_case()) {
				2565	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				2566	int length = GetCaseIndependentLetters(isolate, c,
				2567	compiler->one_byte(), chars);
				2568	if (length == 0) {
				2569	// This can happen because all case variants are non-Latin1, but we
				2570	// know the input is Latin1.
				2571	details->set_cannot_match();
				2572	pos->determines_perfectly = false;
				2573	return;
				2574	}
				2575	if (length == 1) {
				2576	// This letter has no case equivalents, so it's nice and simple
				2577	// and the mask-compare will determine definitely whether we have
				2578	// a match at this character position.
				2579	pos->mask = char_mask;
				2580	pos->value = c;
				2581	pos->determines_perfectly = true;
				2582	} else {
				2583	uint32_t common_bits = char_mask;
				2584	uint32_t bits = chars[0];
				2585	for (int j = 1; j < length; j++) {
				2586	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				2587	common_bits ^= differing_bits;
				2588	bits &= common_bits;
				2589	}
				2590	// If length is 2 and common bits has only one zero in it then
				2591	// our mask and compare instruction will determine definitely
				2592	// whether we have a match at this character position. Otherwise
				2593	// it can only be an approximate check.
				2594	uint32_t one_zero = (common_bits \| ~char_mask);
				2595	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				2596	pos->determines_perfectly = true;
				2597	}
				2598	pos->mask = common_bits;
				2599	pos->value = bits;
				2600	}
				2601	} else {
				2602	// Don't ignore case. Nice simple case where the mask-compare will
				2603	// determine definitely whether we have a match at this character
				2604	// position.
				2605	if (c > char_mask) {
				2606	details->set_cannot_match();
				2607	pos->determines_perfectly = false;
				2608	return;
				2609	}
				2610	pos->mask = char_mask;
				2611	pos->value = c;
				2612	pos->determines_perfectly = true;
				2613	}
				2614	characters_filled_in++;
				2615	DCHECK(characters_filled_in <= details->characters());
				2616	if (characters_filled_in == details->characters()) {
				2617	return;
				2618	}
				2619	}
				2620	} else {
				2621	QuickCheckDetails::Position* pos =
				2622	details->positions(characters_filled_in);
				2623	RegExpCharacterClass* tree = elm.char_class();
				2624	ZoneList<CharacterRange>* ranges = tree->ranges(zone());
				2625	if (tree->is_negated()) {
				2626	// A quick check uses multi-character mask and compare. There is no
				2627	// useful way to incorporate a negative char class into this scheme
				2628	// so we just conservatively create a mask and value that will always
				2629	// succeed.
				2630	pos->mask = 0;
				2631	pos->value = 0;
				2632	} else {
				2633	int first_range = 0;
				2634	while (ranges->at(first_range).from() > char_mask) {
				2635	first_range++;
				2636	if (first_range == ranges->length()) {
				2637	details->set_cannot_match();
				2638	pos->determines_perfectly = false;
				2639	return;
				2640	}
				2641	}
				2642	CharacterRange range = ranges->at(first_range);
				2643	uc16 from = range.from();
				2644	uc16 to = range.to();
				2645	if (to > char_mask) {
				2646	to = char_mask;
				2647	}
				2648	uint32_t differing_bits = (from ^ to);
				2649	// A mask and compare is only perfect if the differing bits form a
				2650	// number like 00011111 with one single block of trailing 1s.
				2651	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2652	from + differing_bits == to) {
				2653	pos->determines_perfectly = true;
				2654	}
				2655	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2656	uint32_t bits = (from & common_bits);
				2657	for (int i = first_range + 1; i < ranges->length(); i++) {
				2658	CharacterRange range = ranges->at(i);
				2659	uc16 from = range.from();
				2660	uc16 to = range.to();
				2661	if (from > char_mask) continue;
				2662	if (to > char_mask) to = char_mask;
				2663	// Here we are combining more ranges into the mask and compare
				2664	// value. With each new range the mask becomes more sparse and
				2665	// so the chances of a false positive rise. A character class
				2666	// with multiple ranges is assumed never to be equivalent to a
				2667	// mask and compare operation.
				2668	pos->determines_perfectly = false;
				2669	uint32_t new_common_bits = (from ^ to);
				2670	new_common_bits = ~SmearBitsRight(new_common_bits);
				2671	common_bits &= new_common_bits;
				2672	bits &= new_common_bits;
				2673	uint32_t differing_bits = (from & common_bits) ^ bits;
				2674	common_bits ^= differing_bits;
				2675	bits &= common_bits;
				2676	}
				2677	pos->mask = common_bits;
				2678	pos->value = bits;
				2679	}
				2680	characters_filled_in++;
				2681	DCHECK(characters_filled_in <= details->characters());
				2682	if (characters_filled_in == details->characters()) {
				2683	return;
				2684	}
				2685	}
				2686	}
				2687	DCHECK(characters_filled_in != details->characters());
				2688	if (!details->cannot_match()) {
				2689	on_success()-> GetQuickCheckDetails(details,
				2690	compiler,
				2691	characters_filled_in,
				2692	true);
				2693	}
				2694	}
				2695
				2696
				2697	void QuickCheckDetails::Clear() {
				2698	for (int i = 0; i < characters_; i++) {
				2699	positions_[i].mask = 0;
				2700	positions_[i].value = 0;
				2701	positions_[i].determines_perfectly = false;
				2702	}
				2703	characters_ = 0;
				2704	}
				2705
				2706
				2707	void QuickCheckDetails::Advance(int by, bool one_byte) {
				2708	if (by >= characters_ \|\| by < 0) {
				2709	DCHECK_IMPLIES(by < 0, characters_ == 0);
				2710	Clear();
				2711	return;
				2712	}
				2713	DCHECK_LE(characters_ - by, 4);
				2714	DCHECK_LE(characters_, 4);
				2715	for (int i = 0; i < characters_ - by; i++) {
				2716	positions_[i] = positions_[by + i];
				2717	}
				2718	for (int i = characters_ - by; i < characters_; i++) {
				2719	positions_[i].mask = 0;
				2720	positions_[i].value = 0;
				2721	positions_[i].determines_perfectly = false;
				2722	}
				2723	characters_ -= by;
				2724	// We could change mask_ and value_ here but we would never advance unless
				2725	// they had already been used in a check and they won't be used again because
				2726	// it would gain us nothing. So there's no point.
				2727	}
				2728
				2729
				2730	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2731	DCHECK(characters_ == other->characters_);
				2732	if (other->cannot_match_) {
				2733	return;
				2734	}
				2735	if (cannot_match_) {
				2736	this = other;
				2737	return;
				2738	}
				2739	for (int i = from_index; i < characters_; i++) {
				2740	QuickCheckDetails::Position* pos = positions(i);
				2741	QuickCheckDetails::Position* other_pos = other->positions(i);
				2742	if (pos->mask != other_pos->mask \|\|
				2743	pos->value != other_pos->value \|\|
				2744	!other_pos->determines_perfectly) {
				2745	// Our mask-compare operation will be approximate unless we have the
				2746	// exact same operation on both sides of the alternation.
				2747	pos->determines_perfectly = false;
				2748	}
				2749	pos->mask &= other_pos->mask;
				2750	pos->value &= pos->mask;
				2751	other_pos->value &= pos->mask;
				2752	uc16 differing_bits = (pos->value ^ other_pos->value);
				2753	pos->mask &= ~differing_bits;
				2754	pos->value &= pos->mask;
				2755	}
				2756	}
				2757
				2758
				2759	class VisitMarker {
				2760	public:
				2761	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2762	DCHECK(!info->visited);
				2763	info->visited = true;
				2764	}
				2765	~VisitMarker() {
				2766	info_->visited = false;
				2767	}
				2768	private:
				2769	NodeInfo* info_;
				2770	};
				2771
				2772
				2773	RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
				2774	if (info()->replacement_calculated) return replacement();
				2775	if (depth < 0) return this;
				2776	DCHECK(!info()->visited);
				2777	VisitMarker marker(info());
				2778	return FilterSuccessor(depth - 1, ignore_case);
				2779	}
				2780
				2781
				2782	RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
				2783	RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
				2784	if (next == NULL) return set_replacement(NULL);
				2785	on_success_ = next;
				2786	return set_replacement(this);
				2787	}
				2788
				2789
				2790	// We need to check for the following characters: 0x39c 0x3bc 0x178.
				2791	static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
				2792	// TODO(dcarney): this could be a lot more efficient.
				2793	return range.Contains(0x39c) \|\|
				2794	range.Contains(0x3bc) \|\| range.Contains(0x178);
				2795	}
				2796
				2797
				2798	static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
				2799	for (int i = 0; i < ranges->length(); i++) {
				2800	// TODO(dcarney): this could be a lot more efficient.
				2801	if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
				2802	}
				2803	return false;
				2804	}
				2805
				2806
				2807	RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
				2808	if (info()->replacement_calculated) return replacement();
				2809	if (depth < 0) return this;
				2810	DCHECK(!info()->visited);
				2811	VisitMarker marker(info());
				2812	int element_count = elements()->length();
				2813	for (int i = 0; i < element_count; i++) {
				2814	TextElement elm = elements()->at(i);
				2815	if (elm.text_type() == TextElement::ATOM) {
				2816	Vector<const uc16> quarks = elm.atom()->data();
				2817	for (int j = 0; j < quarks.length(); j++) {
				2818	uint16_t c = quarks[j];
				2819	if (c <= String::kMaxOneByteCharCode) continue;
				2820	if (!ignore_case) return set_replacement(NULL);
				2821	// Here, we need to check for characters whose upper and lower cases
				2822	// are outside the Latin-1 range.
				2823	uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
				2824	// Character is outside Latin-1 completely
				2825	if (converted == 0) return set_replacement(NULL);
				2826	// Convert quark to Latin-1 in place.
				2827	uint16_t* copy = const_cast<uint16_t*>(quarks.start());
				2828	copy[j] = converted;
				2829	}
				2830	} else {
				2831	DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
				2832	RegExpCharacterClass* cc = elm.char_class();
				2833	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	2834	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	2835	// Now they are in order so we only need to look at the first.
				2836	int range_count = ranges->length();
				2837	if (cc->is_negated()) {
				2838	if (range_count != 0 &&
				2839	ranges->at(0).from() == 0 &&
				2840	ranges->at(0).to() >= String::kMaxOneByteCharCode) {
				2841	// This will be handled in a later filter.
				2842	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2843	return set_replacement(NULL);
				2844	}
				2845	} else {
				2846	if (range_count == 0 \|\|
				2847	ranges->at(0).from() > String::kMaxOneByteCharCode) {
				2848	// This will be handled in a later filter.
				2849	if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
				2850	return set_replacement(NULL);
				2851	}
				2852	}
				2853	}
				2854	}
				2855	return FilterSuccessor(depth - 1, ignore_case);
				2856	}
				2857
				2858
				2859	RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2860	if (info()->replacement_calculated) return replacement();
				2861	if (depth < 0) return this;
				2862	if (info()->visited) return this;
				2863	{
				2864	VisitMarker marker(info());
				2865
				2866	RegExpNode* continue_replacement =
				2867	continue_node_->FilterOneByte(depth - 1, ignore_case);
				2868	// If we can't continue after the loop then there is no sense in doing the
				2869	// loop.
				2870	if (continue_replacement == NULL) return set_replacement(NULL);
				2871	}
				2872
				2873	return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
				2874	}
				2875
				2876
				2877	RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
				2878	if (info()->replacement_calculated) return replacement();
				2879	if (depth < 0) return this;
				2880	if (info()->visited) return this;
				2881	VisitMarker marker(info());
				2882	int choice_count = alternatives_->length();
				2883
				2884	for (int i = 0; i < choice_count; i++) {
				2885	GuardedAlternative alternative = alternatives_->at(i);
				2886	if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
				2887	set_replacement(this);
				2888	return this;
				2889	}
				2890	}
				2891
				2892	int surviving = 0;
				2893	RegExpNode* survivor = NULL;
				2894	for (int i = 0; i < choice_count; i++) {
				2895	GuardedAlternative alternative = alternatives_->at(i);
				2896	RegExpNode* replacement =
				2897	alternative.node()->FilterOneByte(depth - 1, ignore_case);
				2898	DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
				2899	if (replacement != NULL) {
				2900	alternatives_->at(i).set_node(replacement);
				2901	surviving++;
				2902	survivor = replacement;
				2903	}
				2904	}
				2905	if (surviving < 2) return set_replacement(survivor);
				2906
				2907	set_replacement(this);
				2908	if (surviving == choice_count) {
				2909	return this;
				2910	}
				2911	// Only some of the nodes survived the filtering. We need to rebuild the
				2912	// alternatives list.
				2913	ZoneList<GuardedAlternative>* new_alternatives =
				2914	new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
				2915	for (int i = 0; i < choice_count; i++) {
				2916	RegExpNode* replacement =
				2917	alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
				2918	if (replacement != NULL) {
				2919	alternatives_->at(i).set_node(replacement);
				2920	new_alternatives->Add(alternatives_->at(i), zone());
				2921	}
				2922	}
				2923	alternatives_ = new_alternatives;
				2924	return this;
				2925	}
				2926
				2927
				2928	RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
				2929	bool ignore_case) {
				2930	if (info()->replacement_calculated) return replacement();
				2931	if (depth < 0) return this;
				2932	if (info()->visited) return this;
				2933	VisitMarker marker(info());
				2934	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				2935	// afterwards.
				2936	RegExpNode* node = alternatives_->at(1).node();
				2937	RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
				2938	if (replacement == NULL) return set_replacement(NULL);
				2939	alternatives_->at(1).set_node(replacement);
				2940
				2941	RegExpNode* neg_node = alternatives_->at(0).node();
				2942	RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
				2943	// If the negative lookahead is always going to fail then
				2944	// we don't need to check it.
				2945	if (neg_replacement == NULL) return set_replacement(replacement);
				2946	alternatives_->at(0).set_node(neg_replacement);
				2947	return set_replacement(this);
				2948	}
				2949
				2950
				2951	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2952	RegExpCompiler* compiler,
				2953	int characters_filled_in,
				2954	bool not_at_start) {
				2955	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2956	VisitMarker marker(info());
				2957	return ChoiceNode::GetQuickCheckDetails(details,
				2958	compiler,
				2959	characters_filled_in,
				2960	not_at_start);
				2961	}
				2962
				2963
				2964	void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				2965	BoyerMooreLookahead* bm, bool not_at_start) {
				2966	if (body_can_be_zero_length_ \|\| budget <= 0) {
				2967	bm->SetRest(offset);
				2968	SaveBMInfo(bm, not_at_start, offset);
				2969	return;
				2970	}
				2971	ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
				2972	SaveBMInfo(bm, not_at_start, offset);
				2973	}
				2974
				2975
				2976	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2977	RegExpCompiler* compiler,
				2978	int characters_filled_in,
				2979	bool not_at_start) {
				2980	not_at_start = (not_at_start \|\| not_at_start_);
				2981	int choice_count = alternatives_->length();
				2982	DCHECK(choice_count > 0);
				2983	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2984	compiler,
				2985	characters_filled_in,
				2986	not_at_start);
				2987	for (int i = 1; i < choice_count; i++) {
				2988	QuickCheckDetails new_details(details->characters());
				2989	RegExpNode* node = alternatives_->at(i).node();
				2990	node->GetQuickCheckDetails(&new_details, compiler,
				2991	characters_filled_in,
				2992	not_at_start);
				2993	// Here we merge the quick match details of the two branches.
				2994	details->Merge(&new_details, characters_filled_in);
				2995	}
				2996	}
				2997
				2998
				2999	// Check for [0-9A-Z_a-z].
				3000	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				3001	Label* word,
				3002	Label* non_word,
				3003	bool fall_through_on_word) {
				3004	if (assembler->CheckSpecialCharacterClass(
				3005	fall_through_on_word ? 'w' : 'W',
				3006	fall_through_on_word ? non_word : word)) {
				3007	// Optimized implementation available.
				3008	return;
				3009	}
				3010	assembler->CheckCharacterGT('z', non_word);
				3011	assembler->CheckCharacterLT('0', non_word);
				3012	assembler->CheckCharacterGT('a' - 1, word);
				3013	assembler->CheckCharacterLT('9' + 1, word);
				3014	assembler->CheckCharacterLT('A', non_word);
				3015	assembler->CheckCharacterLT('Z' + 1, word);
				3016	if (fall_through_on_word) {
				3017	assembler->CheckNotCharacter('_', non_word);
				3018	} else {
				3019	assembler->CheckCharacter('_', word);
				3020	}
				3021	}
				3022
				3023
				3024	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				3025	// that matches newline or the start of input).
				3026	static void EmitHat(RegExpCompiler* compiler,
				3027	RegExpNode* on_success,
				3028	Trace* trace) {
				3029	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3030	// We will be loading the previous character into the current character
				3031	// register.
				3032	Trace new_trace(*trace);
				3033	new_trace.InvalidateCurrentCharacter();
				3034
				3035	Label ok;
				3036	if (new_trace.cp_offset() == 0) {
				3037	// The start of input counts as a newline in this context, so skip to
				3038	// ok if we are at the start.
				3039	assembler->CheckAtStart(&ok);
				3040	}
				3041	// We already checked that we are not at the start of input so it must be
				3042	// OK to load the previous character.
				3043	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				3044	new_trace.backtrack(),
				3045	false);
				3046	if (!assembler->CheckSpecialCharacterClass('n',
				3047	new_trace.backtrack())) {
				3048	// Newline means \n, \r, 0x2028 or 0x2029.
				3049	if (!compiler->one_byte()) {
				3050	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				3051	}
				3052	assembler->CheckCharacter('\n', &ok);
				3053	assembler->CheckNotCharacter('\r', new_trace.backtrack());
				3054	}
				3055	assembler->Bind(&ok);
				3056	on_success->Emit(compiler, &new_trace);
				3057	}
				3058
				3059
				3060	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				3061	void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
				3062	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3063	Isolate* isolate = assembler->isolate();
				3064	Trace::TriBool next_is_word_character = Trace::UNKNOWN;
				3065	bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
				3066	BoyerMooreLookahead* lookahead = bm_info(not_at_start);
				3067	if (lookahead == NULL) {
				3068	int eats_at_least =
				3069	Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
				3070	kRecursionBudget,
				3071	not_at_start));
				3072	if (eats_at_least >= 1) {
				3073	BoyerMooreLookahead* bm =
				3074	new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
				3075	FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
				3076	if (bm->at(0)->is_non_word())
				3077	next_is_word_character = Trace::FALSE_VALUE;
				3078	if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
				3079	}
				3080	} else {
				3081	if (lookahead->at(0)->is_non_word())
				3082	next_is_word_character = Trace::FALSE_VALUE;
				3083	if (lookahead->at(0)->is_word())
				3084	next_is_word_character = Trace::TRUE_VALUE;
				3085	}
				3086	bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
				3087	if (next_is_word_character == Trace::UNKNOWN) {
				3088	Label before_non_word;
				3089	Label before_word;
				3090	if (trace->characters_preloaded() != 1) {
				3091	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				3092	}
				3093	// Fall through on non-word.
				3094	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				3095	// Next character is not a word character.
				3096	assembler->Bind(&before_non_word);
				3097	Label ok;
				3098	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3099	assembler->GoTo(&ok);
				3100
				3101	assembler->Bind(&before_word);
				3102	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3103	assembler->Bind(&ok);
				3104	} else if (next_is_word_character == Trace::TRUE_VALUE) {
				3105	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
				3106	} else {
				3107	DCHECK(next_is_word_character == Trace::FALSE_VALUE);
				3108	BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
				3109	}
				3110	}
				3111
				3112
				3113	void AssertionNode::BacktrackIfPrevious(
				3114	RegExpCompiler* compiler,
				3115	Trace* trace,
				3116	AssertionNode::IfPrevious backtrack_if_previous) {
				3117	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3118	Trace new_trace(*trace);
				3119	new_trace.InvalidateCurrentCharacter();
				3120
				3121	Label fall_through, dummy;
				3122
				3123	Label* non_word = backtrack_if_previous == kIsNonWord ?
				3124	new_trace.backtrack() :
				3125	&fall_through;
				3126	Label* word = backtrack_if_previous == kIsNonWord ?
				3127	&fall_through :
				3128	new_trace.backtrack();
				3129
				3130	if (new_trace.cp_offset() == 0) {
				3131	// The start of input counts as a non-word character, so the question is
				3132	// decided if we are at the start.
				3133	assembler->CheckAtStart(non_word);
				3134	}
				3135	// We already checked that we are not at the start of input so it must be
				3136	// OK to load the previous character.
				3137	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
				3138	EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
				3139
				3140	assembler->Bind(&fall_through);
				3141	on_success()->Emit(compiler, &new_trace);
				3142	}
				3143
				3144
				3145	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				3146	RegExpCompiler* compiler,
				3147	int filled_in,
				3148	bool not_at_start) {
				3149	if (assertion_type_ == AT_START && not_at_start) {
				3150	details->set_cannot_match();
				3151	return;
				3152	}
				3153	return on_success()->GetQuickCheckDetails(details,
				3154	compiler,
				3155	filled_in,
				3156	not_at_start);
				3157	}
				3158
				3159
				3160	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3161	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3162	switch (assertion_type_) {
				3163	case AT_END: {
				3164	Label ok;
				3165	assembler->CheckPosition(trace->cp_offset(), &ok);
				3166	assembler->GoTo(trace->backtrack());
				3167	assembler->Bind(&ok);
				3168	break;
				3169	}
				3170	case AT_START: {
				3171	if (trace->at_start() == Trace::FALSE_VALUE) {
				3172	assembler->GoTo(trace->backtrack());
				3173	return;
				3174	}
				3175	if (trace->at_start() == Trace::UNKNOWN) {
				3176	assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
				3177	Trace at_start_trace = *trace;
				3178	at_start_trace.set_at_start(Trace::TRUE_VALUE);
				3179	on_success()->Emit(compiler, &at_start_trace);
				3180	return;
				3181	}
				3182	}
				3183	break;
				3184	case AFTER_NEWLINE:
				3185	EmitHat(compiler, on_success(), trace);
				3186	return;
				3187	case AT_BOUNDARY:
				3188	case AT_NON_BOUNDARY: {
				3189	EmitBoundaryCheck(compiler, trace);
				3190	return;
				3191	}
				3192	}
				3193	on_success()->Emit(compiler, trace);
				3194	}
				3195
				3196
				3197	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				3198	if (quick_check == NULL) return false;
				3199	if (offset >= quick_check->characters()) return false;
				3200	return quick_check->positions(offset)->determines_perfectly;
				3201	}
				3202
				3203
				3204	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				3205	if (index > *checked_up_to) {
				3206	*checked_up_to = index;
				3207	}
				3208	}
				3209
				3210
				3211	// We call this repeatedly to generate code for each pass over the text node.
				3212	// The passes are in increasing order of difficulty because we hope one
				3213	// of the first passes will fail in which case we are saved the work of the
				3214	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				3215	// we will check the '%' in the first pass, the case independent 'a' in the
				3216	// second pass and the character class in the last pass.
				3217	//
				3218	// The passes are done from right to left, so for example to test for /bar/
				3219	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				3220	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				3221	// bounds check most of the time. In the example we only need to check for
				3222	// end-of-input when loading the putative 'r'.
				3223	//
				3224	// A slight complication involves the fact that the first character may already
				3225	// be fetched into a register by the previous node. In this case we want to
				3226	// do the test for that character first. We do this in separate passes. The
				3227	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				3228	// pass has been performed then subsequent passes will have true in
				3229	// first_element_checked to indicate that that character does not need to be
				3230	// checked again.
				3231	//
				3232	// In addition to all this we are passed a Trace, which can
				3233	// contain an AlternativeGeneration object. In this AlternativeGeneration
				3234	// object we can see details of any quick check that was already passed in
				3235	// order to get to the code we are now generating. The quick check can involve
				3236	// loading characters, which means we do not need to recheck the bounds
				3237	// up to the limit the quick check already checked. In addition the quick
				3238	// check can have involved a mask and compare operation which may simplify
				3239	// or obviate the need for further checks at some character positions.
				3240	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				3241	TextEmitPassType pass,
				3242	bool preloaded,
				3243	Trace* trace,
				3244	bool first_element_checked,
				3245	int* checked_up_to) {
				3246	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3247	Isolate* isolate = assembler->isolate();
				3248	bool one_byte = compiler->one_byte();
				3249	Label* backtrack = trace->backtrack();
				3250	QuickCheckDetails* quick_check = trace->quick_check_performed();
				3251	int element_count = elements()->length();
				3252	int backward_offset = read_backward() ? -Length() : 0;
				3253	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				3254	TextElement elm = elements()->at(i);
				3255	int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
				3256	if (elm.text_type() == TextElement::ATOM) {
				3257	Vector<const uc16> quarks = elm.atom()->data();
				3258	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				3259	if (first_element_checked && i == 0 && j == 0) continue;
				3260	if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
				3261	EmitCharacterFunction* emit_function = NULL;
				3262	switch (pass) {
				3263	case NON_LATIN1_MATCH:
				3264	DCHECK(one_byte);
				3265	if (quarks[j] > String::kMaxOneByteCharCode) {
				3266	assembler->GoTo(backtrack);
				3267	return;
				3268	}
				3269	break;
				3270	case NON_LETTER_CHARACTER_MATCH:
				3271	emit_function = &EmitAtomNonLetter;
				3272	break;
				3273	case SIMPLE_CHARACTER_MATCH:
				3274	emit_function = &EmitSimpleCharacter;
				3275	break;
				3276	case CASE_CHARACTER_MATCH:
				3277	emit_function = &EmitAtomLetter;
				3278	break;
				3279	default:
				3280	break;
				3281	}
				3282	if (emit_function != NULL) {
				3283	bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();
				3284	bool bound_checked =
				3285	emit_function(isolate, compiler, quarks[j], backtrack,
				3286	cp_offset + j, bounds_check, preloaded);
				3287	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				3288	}
				3289	}
				3290	} else {
				3291	DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
				3292	if (pass == CHARACTER_CLASS_MATCH) {
				3293	if (first_element_checked && i == 0) continue;
				3294	if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
				3295	RegExpCharacterClass* cc = elm.char_class();
				3296	bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();
				3297	EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
				3298	bounds_check, preloaded, zone());
				3299	UpdateBoundsCheck(cp_offset, checked_up_to);
				3300	}
				3301	}
				3302	}
				3303	}
				3304
				3305
				3306	int TextNode::Length() {
				3307	TextElement elm = elements()->last();
				3308	DCHECK(elm.cp_offset() >= 0);
				3309	return elm.cp_offset() + elm.length();
				3310	}
				3311
				3312
				3313	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				3314	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				3315	if (ignore_case) {
				3316	return pass == SIMPLE_CHARACTER_MATCH;
				3317	} else {
				3318	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				3319	}
				3320	}
				3321
				3322
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3323	TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
				3324	ZoneList<CharacterRange>* ranges,
				3325	bool read_backward,
				3326	RegExpNode* on_success) {
				3327	DCHECK_NOT_NULL(ranges);
				3328	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
				3329	elms->Add(
				3330	TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
				3331	zone);
				3332	return new (zone) TextNode(elms, read_backward, on_success);
				3333	}
				3334
				3335
				3336	TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
				3337	CharacterRange trail,
				3338	bool read_backward,
				3339	RegExpNode* on_success) {
				3340	ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
				3341	ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
				3342	ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
				3343	elms->Add(TextElement::CharClass(
				3344	new (zone) RegExpCharacterClass(lead_ranges, false)),
				3345	zone);
				3346	elms->Add(TextElement::CharClass(
				3347	new (zone) RegExpCharacterClass(trail_ranges, false)),
				3348	zone);
				3349	return new (zone) TextNode(elms, read_backward, on_success);
				3350	}
				3351
				3352
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3353	// This generates the code to match a text node. A text node can contain
				3354	// straight character sequences (possibly to be matched in a case-independent
				3355	// way) and character classes. For efficiency we do not do this in a single
				3356	// pass from left to right. Instead we pass over the text node several times,
				3357	// emitting code for some character positions every time. See the comment on
				3358	// TextEmitPass for details.
				3359	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3360	LimitResult limit_result = LimitVersions(compiler, trace);
				3361	if (limit_result == DONE) return;
				3362	DCHECK(limit_result == CONTINUE);
				3363
				3364	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				3365	compiler->SetRegExpTooBig();
				3366	return;
				3367	}
				3368
				3369	if (compiler->one_byte()) {
				3370	int dummy = 0;
				3371	TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
				3372	}
				3373
				3374	bool first_elt_done = false;
				3375	int bound_checked_to = trace->cp_offset() - 1;
				3376	bound_checked_to += trace->bound_checked_up_to();
				3377
				3378	// If a character is preloaded into the current character register then
				3379	// check that now.
				3380	if (trace->characters_preloaded() == 1) {
				3381	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3382	if (!SkipPass(pass, compiler->ignore_case())) {
				3383	TextEmitPass(compiler,
				3384	static_cast<TextEmitPassType>(pass),
				3385	true,
				3386	trace,
				3387	false,
				3388	&bound_checked_to);
				3389	}
				3390	}
				3391	first_elt_done = true;
				3392	}
				3393
				3394	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				3395	if (!SkipPass(pass, compiler->ignore_case())) {
				3396	TextEmitPass(compiler,
				3397	static_cast<TextEmitPassType>(pass),
				3398	false,
				3399	trace,
				3400	first_elt_done,
				3401	&bound_checked_to);
				3402	}
				3403	}
				3404
				3405	Trace successor_trace(*trace);
				3406	// If we advance backward, we may end up at the start.
				3407	successor_trace.AdvanceCurrentPositionInTrace(
				3408	read_backward() ? -Length() : Length(), compiler);
				3409	successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
				3410	: Trace::FALSE_VALUE);
				3411	RecursionCheck rc(compiler);
				3412	on_success()->Emit(compiler, &successor_trace);
				3413	}
				3414
				3415
				3416	void Trace::InvalidateCurrentCharacter() {
				3417	characters_preloaded_ = 0;
				3418	}
				3419
				3420
				3421	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				3422	// We don't have an instruction for shifting the current character register
				3423	// down or for using a shifted value for anything so lets just forget that
				3424	// we preloaded any characters into it.
				3425	characters_preloaded_ = 0;
				3426	// Adjust the offsets of the quick check performed information. This
				3427	// information is used to find out what we already determined about the
				3428	// characters by means of mask and compare.
				3429	quick_check_performed_.Advance(by, compiler->one_byte());
				3430	cp_offset_ += by;
				3431	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				3432	compiler->SetRegExpTooBig();
				3433	cp_offset_ = 0;
				3434	}
				3435	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				3436	}
				3437
				3438
				3439	void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
				3440	int element_count = elements()->length();
				3441	for (int i = 0; i < element_count; i++) {
				3442	TextElement elm = elements()->at(i);
				3443	if (elm.text_type() == TextElement::CHAR_CLASS) {
				3444	RegExpCharacterClass* cc = elm.char_class();
				3445	// None of the standard character classes is different in the case
				3446	// independent case and it slows us down if we don't know that.
				3447	if (cc->is_standard(zone())) continue;
				3448	ZoneList<CharacterRange>* ranges = cc->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3449	CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3450	}
				3451	}
				3452	}
				3453
				3454
				3455	int TextNode::GreedyLoopTextLength() { return Length(); }
				3456
				3457
				3458	RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
				3459	RegExpCompiler* compiler) {
				3460	if (read_backward()) return NULL;
				3461	if (elements()->length() != 1) return NULL;
				3462	TextElement elm = elements()->at(0);
				3463	if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
				3464	RegExpCharacterClass* node = elm.char_class();
				3465	ZoneList<CharacterRange>* ranges = node->ranges(zone());
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3466	CharacterRange::Canonicalize(ranges);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3467	if (node->is_negated()) {
				3468	return ranges->length() == 0 ? on_success() : NULL;
				3469	}
				3470	if (ranges->length() != 1) return NULL;
				3471	uint32_t max_char;
				3472	if (compiler->one_byte()) {
				3473	max_char = String::kMaxOneByteCharCode;
				3474	} else {
				3475	max_char = String::kMaxUtf16CodeUnit;
				3476	}
				3477	return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
				3478	}
				3479
				3480
				3481	// Finds the fixed match length of a sequence of nodes that goes from
				3482	// this alternative and back to this choice node. If there are variable
				3483	// length nodes or other complications in the way then return a sentinel
				3484	// value indicating that a greedy loop cannot be constructed.
				3485	int ChoiceNode::GreedyLoopTextLengthForAlternative(
				3486	GuardedAlternative* alternative) {
				3487	int length = 0;
				3488	RegExpNode* node = alternative->node();
				3489	// Later we will generate code for all these text nodes using recursion
				3490	// so we have to limit the max number.
				3491	int recursion_depth = 0;
				3492	while (node != this) {
				3493	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				3494	return kNodeIsTooComplexForGreedyLoops;
				3495	}
				3496	int node_length = node->GreedyLoopTextLength();
				3497	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				3498	return kNodeIsTooComplexForGreedyLoops;
				3499	}
				3500	length += node_length;
				3501	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				3502	node = seq_node->on_success();
				3503	}
				3504	return read_backward() ? -length : length;
				3505	}
				3506
				3507
				3508	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				3509	DCHECK_NULL(loop_node_);
				3510	AddAlternative(alt);
				3511	loop_node_ = alt.node();
				3512	}
				3513
				3514
				3515	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				3516	DCHECK_NULL(continue_node_);
				3517	AddAlternative(alt);
				3518	continue_node_ = alt.node();
				3519	}
				3520
				3521
				3522	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3523	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3524	if (trace->stop_node() == this) {
				3525	// Back edge of greedy optimized loop node graph.
				3526	int text_length =
				3527	GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
				3528	DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
				3529	// Update the counter-based backtracking info on the stack. This is an
				3530	// optimization for greedy loops (see below).
				3531	DCHECK(trace->cp_offset() == text_length);
				3532	macro_assembler->AdvanceCurrentPosition(text_length);
				3533	macro_assembler->GoTo(trace->loop_label());
				3534	return;
				3535	}
				3536	DCHECK_NULL(trace->stop_node());
				3537	if (!trace->is_trivial()) {
				3538	trace->Flush(compiler, this);
				3539	return;
				3540	}
				3541	ChoiceNode::Emit(compiler, trace);
				3542	}
				3543
				3544
				3545	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				3546	int eats_at_least) {
				3547	int preload_characters = Min(4, eats_at_least);
				3548	if (compiler->macro_assembler()->CanReadUnaligned()) {
				3549	bool one_byte = compiler->one_byte();
				3550	if (one_byte) {
				3551	if (preload_characters > 4) preload_characters = 4;
				3552	// We can't preload 3 characters because there is no machine instruction
				3553	// to do that. We can't just load 4 because we could be reading
				3554	// beyond the end of the string, which could cause a memory fault.
				3555	if (preload_characters == 3) preload_characters = 2;
				3556	} else {
				3557	if (preload_characters > 2) preload_characters = 2;
				3558	}
				3559	} else {
				3560	if (preload_characters > 1) preload_characters = 1;
				3561	}
				3562	return preload_characters;
				3563	}
				3564
				3565
				3566	// This class is used when generating the alternatives in a choice node. It
				3567	// records the way the alternative is being code generated.
				3568	class AlternativeGeneration: public Malloced {
				3569	public:
				3570	AlternativeGeneration()
				3571	: possible_success(),
				3572	expects_preload(false),
				3573	after(),
				3574	quick_check_details() { }
				3575	Label possible_success;
				3576	bool expects_preload;
				3577	Label after;
				3578	QuickCheckDetails quick_check_details;
				3579	};
				3580
				3581
				3582	// Creates a list of AlternativeGenerations. If the list has a reasonable
				3583	// size then it is on the stack, otherwise the excess is on the heap.
				3584	class AlternativeGenerationList {
				3585	public:
				3586	AlternativeGenerationList(int count, Zone* zone)
				3587	: alt_gens_(count, zone) {
				3588	for (int i = 0; i < count && i < kAFew; i++) {
				3589	alt_gens_.Add(a_few_alt_gens_ + i, zone);
				3590	}
				3591	for (int i = kAFew; i < count; i++) {
				3592	alt_gens_.Add(new AlternativeGeneration(), zone);
				3593	}
				3594	}
				3595	~AlternativeGenerationList() {
				3596	for (int i = kAFew; i < alt_gens_.length(); i++) {
				3597	delete alt_gens_[i];
				3598	alt_gens_[i] = NULL;
				3599	}
				3600	}
				3601
				3602	AlternativeGeneration* at(int i) {
				3603	return alt_gens_[i];
				3604	}
				3605
				3606	private:
				3607	static const int kAFew = 10;
				3608	ZoneList<AlternativeGeneration*> alt_gens_;
				3609	AlternativeGeneration a_few_alt_gens_[kAFew];
				3610	};
				3611
				3612
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3613	static const uc32 kRangeEndMarker = 0x110000;
				3614
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3615	// The '2' variant is has inclusive from and exclusive to.
				3616	// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
				3617	// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3618	static const int kSpaceRanges[] = {
				3619	'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
				3620	0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
				3621	0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3622	static const int kSpaceRangeCount = arraysize(kSpaceRanges);
				3623
				3624	static const int kWordRanges[] = {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3625	'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3626	static const int kWordRangeCount = arraysize(kWordRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3627	static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3628	static const int kDigitRangeCount = arraysize(kDigitRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3629	static const int kSurrogateRanges[] = {
				3630	kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3631	static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3632	static const int kLineTerminatorRanges[] = {
				3633	0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3634	static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
				3635
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3636	void BoyerMoorePositionInfo::Set(int character) {
				3637	SetInterval(Interval(character, character));
				3638	}
				3639
				3640
				3641	void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
				3642	s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
				3643	w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
				3644	d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
				3645	surrogate_ =
				3646	AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
				3647	if (interval.to() - interval.from() >= kMapSize - 1) {
				3648	if (map_count_ != kMapSize) {
				3649	map_count_ = kMapSize;
				3650	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3651	}
				3652	return;
				3653	}
				3654	for (int i = interval.from(); i <= interval.to(); i++) {
				3655	int mod_character = (i & kMask);
				3656	if (!map_->at(mod_character)) {
				3657	map_count_++;
				3658	map_->at(mod_character) = true;
				3659	}
				3660	if (map_count_ == kMapSize) return;
				3661	}
				3662	}
				3663
				3664
				3665	void BoyerMoorePositionInfo::SetAll() {
				3666	s_ = w_ = d_ = kLatticeUnknown;
				3667	if (map_count_ != kMapSize) {
				3668	map_count_ = kMapSize;
				3669	for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
				3670	}
				3671	}
				3672
				3673
				3674	BoyerMooreLookahead::BoyerMooreLookahead(
				3675	int length, RegExpCompiler* compiler, Zone* zone)
				3676	: length_(length),
				3677	compiler_(compiler) {
				3678	if (compiler->one_byte()) {
				3679	max_char_ = String::kMaxOneByteCharCode;
				3680	} else {
				3681	max_char_ = String::kMaxUtf16CodeUnit;
				3682	}
				3683	bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
				3684	for (int i = 0; i < length; i++) {
				3685	bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
				3686	}
				3687	}
				3688
				3689
				3690	// Find the longest range of lookahead that has the fewest number of different
				3691	// characters that can occur at a given position. Since we are optimizing two
				3692	// different parameters at once this is a tradeoff.
				3693	bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
				3694	int biggest_points = 0;
				3695	// If more than 32 characters out of 128 can occur it is unlikely that we can
				3696	// be lucky enough to step forwards much of the time.
				3697	const int kMaxMax = 32;
				3698	for (int max_number_of_chars = 4;
				3699	max_number_of_chars < kMaxMax;
				3700	max_number_of_chars *= 2) {
				3701	biggest_points =
				3702	FindBestInterval(max_number_of_chars, biggest_points, from, to);
				3703	}
				3704	if (biggest_points == 0) return false;
				3705	return true;
				3706	}
				3707
				3708
				3709	// Find the highest-points range between 0 and length_ where the character
				3710	// information is not too vague. 'Too vague' means that there are more than
				3711	// max_number_of_chars that can occur at this position. Calculates the number
				3712	// of points as the product of width-of-the-range and
				3713	// probability-of-finding-one-of-the-characters, where the probability is
				3714	// calculated using the frequency distribution of the sample subject string.
				3715	int BoyerMooreLookahead::FindBestInterval(
				3716	int max_number_of_chars, int old_biggest_points, int* from, int* to) {
				3717	int biggest_points = old_biggest_points;
				3718	static const int kSize = RegExpMacroAssembler::kTableSize;
				3719	for (int i = 0; i < length_; ) {
				3720	while (i < length_ && Count(i) > max_number_of_chars) i++;
				3721	if (i == length_) break;
				3722	int remembered_from = i;
				3723	bool union_map[kSize];
				3724	for (int j = 0; j < kSize; j++) union_map[j] = false;
				3725	while (i < length_ && Count(i) <= max_number_of_chars) {
				3726	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3727	for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);
				3728	i++;
				3729	}
				3730	int frequency = 0;
				3731	for (int j = 0; j < kSize; j++) {
				3732	if (union_map[j]) {
				3733	// Add 1 to the frequency to give a small per-character boost for
				3734	// the cases where our sampling is not good enough and many
				3735	// characters have a frequency of zero. This means the frequency
				3736	// can theoretically be up to 2*kSize though we treat it mostly as
				3737	// a fraction of kSize.
				3738	frequency += compiler_->frequency_collator()->Frequency(j) + 1;
				3739	}
				3740	}
				3741	// We use the probability of skipping times the distance we are skipping to
				3742	// judge the effectiveness of this. Actually we have a cut-off: By
				3743	// dividing by 2 we switch off the skipping if the probability of skipping
				3744	// is less than 50%. This is because the multibyte mask-and-compare
				3745	// skipping in quickcheck is more likely to do well on this case.
				3746	bool in_quickcheck_range =
				3747	((i - remembered_from < 4) \|\|
				3748	(compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
				3749	// Called 'probability' but it is only a rough estimate and can actually
				3750	// be outside the 0-kSize range.
				3751	int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
				3752	int points = (i - remembered_from) * probability;
				3753	if (points > biggest_points) {
				3754	*from = remembered_from;
				3755	*to = i - 1;
				3756	biggest_points = points;
				3757	}
				3758	}
				3759	return biggest_points;
				3760	}
				3761
				3762
				3763	// Take all the characters that will not prevent a successful match if they
				3764	// occur in the subject string in the range between min_lookahead and
				3765	// max_lookahead (inclusive) measured from the current position. If the
				3766	// character at max_lookahead offset is not one of these characters, then we
				3767	// can safely skip forwards by the number of characters in the range.
				3768	int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
				3769	int max_lookahead,
				3770	Handle<ByteArray> boolean_skip_table) {
				3771	const int kSize = RegExpMacroAssembler::kTableSize;
				3772
				3773	const int kSkipArrayEntry = 0;
				3774	const int kDontSkipArrayEntry = 1;
				3775
				3776	for (int i = 0; i < kSize; i++) {
				3777	boolean_skip_table->set(i, kSkipArrayEntry);
				3778	}
				3779	int skip = max_lookahead + 1 - min_lookahead;
				3780
				3781	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3782	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3783	for (int j = 0; j < kSize; j++) {
				3784	if (map->at(j)) {
				3785	boolean_skip_table->set(j, kDontSkipArrayEntry);
				3786	}
				3787	}
				3788	}
				3789
				3790	return skip;
				3791	}
				3792
				3793
				3794	// See comment above on the implementation of GetSkipTable.
				3795	void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
				3796	const int kSize = RegExpMacroAssembler::kTableSize;
				3797
				3798	int min_lookahead = 0;
				3799	int max_lookahead = 0;
				3800
				3801	if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
				3802
				3803	bool found_single_character = false;
				3804	int single_character = 0;
				3805	for (int i = max_lookahead; i >= min_lookahead; i--) {
				3806	BoyerMoorePositionInfo* map = bitmaps_->at(i);
				3807	if (map->map_count() > 1 \|\|
				3808	(found_single_character && map->map_count() != 0)) {
				3809	found_single_character = false;
				3810	break;
				3811	}
				3812	for (int j = 0; j < kSize; j++) {
				3813	if (map->at(j)) {
				3814	found_single_character = true;
				3815	single_character = j;
				3816	break;
				3817	}
				3818	}
				3819	}
				3820
				3821	int lookahead_width = max_lookahead + 1 - min_lookahead;
				3822
				3823	if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
				3824	// The mask-compare can probably handle this better.
				3825	return;
				3826	}
				3827
				3828	if (found_single_character) {
				3829	Label cont, again;
				3830	masm->Bind(&again);
				3831	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3832	if (max_char_ > kSize) {
				3833	masm->CheckCharacterAfterAnd(single_character,
				3834	RegExpMacroAssembler::kTableMask,
				3835	&cont);
				3836	} else {
				3837	masm->CheckCharacter(single_character, &cont);
				3838	}
				3839	masm->AdvanceCurrentPosition(lookahead_width);
				3840	masm->GoTo(&again);
				3841	masm->Bind(&cont);
				3842	return;
				3843	}
				3844
				3845	Factory* factory = masm->isolate()->factory();
				3846	Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
				3847	int skip_distance = GetSkipTable(
				3848	min_lookahead, max_lookahead, boolean_skip_table);
				3849	DCHECK(skip_distance != 0);
				3850
				3851	Label cont, again;
				3852	masm->Bind(&again);
				3853	masm->LoadCurrentCharacter(max_lookahead, &cont, true);
				3854	masm->CheckBitInTable(boolean_skip_table, &cont);
				3855	masm->AdvanceCurrentPosition(skip_distance);
				3856	masm->GoTo(&again);
				3857	masm->Bind(&cont);
				3858	}
				3859
				3860
				3861	/* Code generation for choice nodes.
				3862	*
				3863	* We generate quick checks that do a mask and compare to eliminate a
				3864	* choice. If the quick check succeeds then it jumps to the continuation to
				3865	* do slow checks and check subsequent nodes. If it fails (the common case)
				3866	* it falls through to the next choice.
				3867	*
				3868	* Here is the desired flow graph. Nodes directly below each other imply
				3869	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				3870	* 3 doesn't have a quick check so we have to call the slow check.
				3871	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				3872	* regexp continuation is generated directly after the Sn node, up to the
				3873	* next GoTo if we decide to reuse some already generated code. Some
				3874	* nodes expect preload_characters to be preloaded into the current
				3875	* character register. R nodes do this preloading. Vertices are marked
				3876	* F for failures and S for success (possible success in the case of quick
				3877	* nodes). L, V, < and > are used as arrow heads.
				3878	*
				3879	* ----------> R
				3880	* \|
				3881	* V
				3882	* Q1 -----> S1
				3883	* \| S /
				3884	* F\| /
				3885	* \| F/
				3886	* \| /
				3887	* \| R
				3888	* \| /
				3889	* V L
				3890	* Q2 -----> S2
				3891	* \| S /
				3892	* F\| /
				3893	* \| F/
				3894	* \| /
				3895	* \| R
				3896	* \| /
				3897	* V L
				3898	* S3
				3899	* \|
				3900	* F\|
				3901	* \|
				3902	* R
				3903	* \|
				3904	* backtrack V
				3905	* <----------Q4
				3906	* \ F \|
				3907	* \ \|S
				3908	* \ F V
				3909	* \-----S4
				3910	*
				3911	* For greedy loops we push the current position, then generate the code that
				3912	* eats the input specially in EmitGreedyLoop. The other choice (the
				3913	* continuation) is generated by the normal code in EmitChoices, and steps back
				3914	* in the input to the starting position when it fails to match. The loop code
				3915	* looks like this (U is the unwind code that steps back in the greedy loop).
				3916	*
				3917	* _____
				3918	* / \
				3919	* V \|
				3920	* ----------> S1 \|
				3921	* /\| \|
				3922	* / \|S \|
				3923	* F/ \_____/
				3924	* /
				3925	* \|<-----
				3926	* \| \
				3927	* V \|S
				3928	* Q2 ---> U----->backtrack
				3929	* \| F /
				3930	* S\| /
				3931	* V F /
				3932	* S2--/
				3933	*/
				3934
				3935	GreedyLoopState::GreedyLoopState(bool not_at_start) {
				3936	counter_backtrack_trace_.set_backtrack(&label_);
				3937	if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
				3938	}
				3939
				3940
				3941	void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
				3942	#ifdef DEBUG
				3943	int choice_count = alternatives_->length();
				3944	for (int i = 0; i < choice_count - 1; i++) {
				3945	GuardedAlternative alternative = alternatives_->at(i);
				3946	ZoneList<Guard> guards = alternative.guards();
				3947	int guard_count = (guards == NULL) ? 0 : guards->length();
				3948	for (int j = 0; j < guard_count; j++) {
				3949	DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
				3950	}
				3951	}
				3952	#endif
				3953	}
				3954
				3955
				3956	void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
				3957	Trace* current_trace,
				3958	PreloadState* state) {
				3959	if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
				3960	// Save some time by looking at most one machine word ahead.
				3961	state->eats_at_least_ =
				3962	EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
				3963	current_trace->at_start() == Trace::FALSE_VALUE);
				3964	}
				3965	state->preload_characters_ =
				3966	CalculatePreloadCharacters(compiler, state->eats_at_least_);
				3967
				3968	state->preload_is_current_ =
				3969	(current_trace->characters_preloaded() == state->preload_characters_);
				3970	state->preload_has_checked_bounds_ = state->preload_is_current_;
				3971	}
				3972
				3973
				3974	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3975	int choice_count = alternatives_->length();
				3976
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	3977	if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
				3978	alternatives_->at(0).node()->Emit(compiler, trace);
				3979	return;
				3980	}
				3981
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	3982	AssertGuardsMentionRegisters(trace);
				3983
				3984	LimitResult limit_result = LimitVersions(compiler, trace);
				3985	if (limit_result == DONE) return;
				3986	DCHECK(limit_result == CONTINUE);
				3987
				3988	// For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
				3989	// other choice nodes we only flush if we are out of code size budget.
				3990	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				3991	trace->Flush(compiler, this);
				3992	return;
				3993	}
				3994
				3995	RecursionCheck rc(compiler);
				3996
				3997	PreloadState preload;
				3998	preload.init();
				3999	GreedyLoopState greedy_loop_state(not_at_start());
				4000
				4001	int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
				4002	AlternativeGenerationList alt_gens(choice_count, zone());
				4003
				4004	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				4005	trace = EmitGreedyLoop(compiler,
				4006	trace,
				4007	&alt_gens,
				4008	&preload,
				4009	&greedy_loop_state,
				4010	text_length);
				4011	} else {
				4012	// TODO(erikcorry): Delete this. We don't need this label, but it makes us
				4013	// match the traces produced pre-cleanup.
				4014	Label second_choice;
				4015	compiler->macro_assembler()->Bind(&second_choice);
				4016
				4017	preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
				4018
				4019	EmitChoices(compiler,
				4020	&alt_gens,
				4021	0,
				4022	trace,
				4023	&preload);
				4024	}
				4025
				4026	// At this point we need to generate slow checks for the alternatives where
				4027	// the quick check was inlined. We can recognize these because the associated
				4028	// label was bound.
				4029	int new_flush_budget = trace->flush_budget() / choice_count;
				4030	for (int i = 0; i < choice_count; i++) {
				4031	AlternativeGeneration* alt_gen = alt_gens.at(i);
				4032	Trace new_trace(*trace);
				4033	// If there are actions to be flushed we have to limit how many times
				4034	// they are flushed. Take the budget of the parent trace and distribute
				4035	// it fairly amongst the children.
				4036	if (new_trace.actions() != NULL) {
				4037	new_trace.set_flush_budget(new_flush_budget);
				4038	}
				4039	bool next_expects_preload =
				4040	i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
				4041	EmitOutOfLineContinuation(compiler,
				4042	&new_trace,
				4043	alternatives_->at(i),
				4044	alt_gen,
				4045	preload.preload_characters_,
				4046	next_expects_preload);
				4047	}
				4048	}
				4049
				4050
				4051	Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
				4052	Trace* trace,
				4053	AlternativeGenerationList* alt_gens,
				4054	PreloadState* preload,
				4055	GreedyLoopState* greedy_loop_state,
				4056	int text_length) {
				4057	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4058	// Here we have special handling for greedy loops containing only text nodes
				4059	// and other simple nodes. These are handled by pushing the current
				4060	// position on the stack and then incrementing the current position each
				4061	// time around the switch. On backtrack we decrement the current position
				4062	// and check it against the pushed value. This avoids pushing backtrack
				4063	// information for each iteration of the loop, which could take up a lot of
				4064	// space.
				4065	DCHECK(trace->stop_node() == NULL);
				4066	macro_assembler->PushCurrentPosition();
				4067	Label greedy_match_failed;
				4068	Trace greedy_match_trace;
				4069	if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
				4070	greedy_match_trace.set_backtrack(&greedy_match_failed);
				4071	Label loop_label;
				4072	macro_assembler->Bind(&loop_label);
				4073	greedy_match_trace.set_stop_node(this);
				4074	greedy_match_trace.set_loop_label(&loop_label);
				4075	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				4076	macro_assembler->Bind(&greedy_match_failed);
				4077
				4078	Label second_choice; // For use in greedy matches.
				4079	macro_assembler->Bind(&second_choice);
				4080
				4081	Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
				4082
				4083	EmitChoices(compiler,
				4084	alt_gens,
				4085	1,
				4086	new_trace,
				4087	preload);
				4088
				4089	macro_assembler->Bind(greedy_loop_state->label());
				4090	// If we have unwound to the bottom then backtrack.
				4091	macro_assembler->CheckGreedyLoop(trace->backtrack());
				4092	// Otherwise try the second priority at an earlier position.
				4093	macro_assembler->AdvanceCurrentPosition(-text_length);
				4094	macro_assembler->GoTo(&second_choice);
				4095	return new_trace;
				4096	}
				4097
				4098	int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
				4099	Trace* trace) {
				4100	int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
				4101	if (alternatives_->length() != 2) return eats_at_least;
				4102
				4103	GuardedAlternative alt1 = alternatives_->at(1);
				4104	if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
				4105	return eats_at_least;
				4106	}
				4107	RegExpNode* eats_anything_node = alt1.node();
				4108	if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
				4109	return eats_at_least;
				4110	}
				4111
				4112	// Really we should be creating a new trace when we execute this function,
				4113	// but there is no need, because the code it generates cannot backtrack, and
				4114	// we always arrive here with a trivial trace (since it's the entry to a
				4115	// loop. That also implies that there are no preloaded characters, which is
				4116	// good, because it means we won't be violating any assumptions by
				4117	// overwriting those characters with new load instructions.
				4118	DCHECK(trace->is_trivial());
				4119
				4120	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4121	Isolate* isolate = macro_assembler->isolate();
				4122	// At this point we know that we are at a non-greedy loop that will eat
				4123	// any character one at a time. Any non-anchored regexp has such a
				4124	// loop prepended to it in order to find where it starts. We look for
				4125	// a pattern of the form ...abc... where we can look 6 characters ahead
				4126	// and step forwards 3 if the character is not one of abc. Abc need
				4127	// not be atoms, they can be any reasonably limited character class or
				4128	// small alternation.
				4129	BoyerMooreLookahead* bm = bm_info(false);
				4130	if (bm == NULL) {
				4131	eats_at_least = Min(kMaxLookaheadForBoyerMoore,
				4132	EatsAtLeast(kMaxLookaheadForBoyerMoore,
				4133	kRecursionBudget,
				4134	false));
				4135	if (eats_at_least >= 1) {
				4136	bm = new(zone()) BoyerMooreLookahead(eats_at_least,
				4137	compiler,
				4138	zone());
				4139	GuardedAlternative alt0 = alternatives_->at(0);
				4140	alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
				4141	}
				4142	}
				4143	if (bm != NULL) {
				4144	bm->EmitSkipInstructions(macro_assembler);
				4145	}
				4146	return eats_at_least;
				4147	}
				4148
				4149
				4150	void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
				4151	AlternativeGenerationList* alt_gens,
				4152	int first_choice,
				4153	Trace* trace,
				4154	PreloadState* preload) {
				4155	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4156	SetUpPreLoad(compiler, trace, preload);
				4157
				4158	// For now we just call all choices one after the other. The idea ultimately
				4159	// is to use the Dispatch table to try only the relevant ones.
				4160	int choice_count = alternatives_->length();
				4161
				4162	int new_flush_budget = trace->flush_budget() / choice_count;
				4163
				4164	for (int i = first_choice; i < choice_count; i++) {
				4165	bool is_last = i == choice_count - 1;
				4166	bool fall_through_on_failure = !is_last;
				4167	GuardedAlternative alternative = alternatives_->at(i);
				4168	AlternativeGeneration* alt_gen = alt_gens->at(i);
				4169	alt_gen->quick_check_details.set_characters(preload->preload_characters_);
				4170	ZoneList<Guard> guards = alternative.guards();
				4171	int guard_count = (guards == NULL) ? 0 : guards->length();
				4172	Trace new_trace(*trace);
				4173	new_trace.set_characters_preloaded(preload->preload_is_current_ ?
				4174	preload->preload_characters_ :
				4175	0);
				4176	if (preload->preload_has_checked_bounds_) {
				4177	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4178	}
				4179	new_trace.quick_check_performed()->Clear();
				4180	if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
				4181	if (!is_last) {
				4182	new_trace.set_backtrack(&alt_gen->after);
				4183	}
				4184	alt_gen->expects_preload = preload->preload_is_current_;
				4185	bool generate_full_check_inline = false;
				4186	if (compiler->optimize() &&
				4187	try_to_emit_quick_check_for_alternative(i == 0) &&
				4188	alternative.node()->EmitQuickCheck(
				4189	compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
				4190	&alt_gen->possible_success, &alt_gen->quick_check_details,
				4191	fall_through_on_failure)) {
				4192	// Quick check was generated for this choice.
				4193	preload->preload_is_current_ = true;
				4194	preload->preload_has_checked_bounds_ = true;
				4195	// If we generated the quick check to fall through on possible success,
				4196	// we now need to generate the full check inline.
				4197	if (!fall_through_on_failure) {
				4198	macro_assembler->Bind(&alt_gen->possible_success);
				4199	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4200	new_trace.set_characters_preloaded(preload->preload_characters_);
				4201	new_trace.set_bound_checked_up_to(preload->preload_characters_);
				4202	generate_full_check_inline = true;
				4203	}
				4204	} else if (alt_gen->quick_check_details.cannot_match()) {
				4205	if (!fall_through_on_failure) {
				4206	macro_assembler->GoTo(trace->backtrack());
				4207	}
				4208	continue;
				4209	} else {
				4210	// No quick check was generated. Put the full code here.
				4211	// If this is not the first choice then there could be slow checks from
				4212	// previous cases that go here when they fail. There's no reason to
				4213	// insist that they preload characters since the slow check we are about
				4214	// to generate probably can't use it.
				4215	if (i != first_choice) {
				4216	alt_gen->expects_preload = false;
				4217	new_trace.InvalidateCurrentCharacter();
				4218	}
				4219	generate_full_check_inline = true;
				4220	}
				4221	if (generate_full_check_inline) {
				4222	if (new_trace.actions() != NULL) {
				4223	new_trace.set_flush_budget(new_flush_budget);
				4224	}
				4225	for (int j = 0; j < guard_count; j++) {
				4226	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				4227	}
				4228	alternative.node()->Emit(compiler, &new_trace);
				4229	preload->preload_is_current_ = false;
				4230	}
				4231	macro_assembler->Bind(&alt_gen->after);
				4232	}
				4233	}
				4234
				4235
				4236	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				4237	Trace* trace,
				4238	GuardedAlternative alternative,
				4239	AlternativeGeneration* alt_gen,
				4240	int preload_characters,
				4241	bool next_expects_preload) {
				4242	if (!alt_gen->possible_success.is_linked()) return;
				4243
				4244	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				4245	macro_assembler->Bind(&alt_gen->possible_success);
				4246	Trace out_of_line_trace(*trace);
				4247	out_of_line_trace.set_characters_preloaded(preload_characters);
				4248	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				4249	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
				4250	ZoneList<Guard> guards = alternative.guards();
				4251	int guard_count = (guards == NULL) ? 0 : guards->length();
				4252	if (next_expects_preload) {
				4253	Label reload_current_char;
				4254	out_of_line_trace.set_backtrack(&reload_current_char);
				4255	for (int j = 0; j < guard_count; j++) {
				4256	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4257	}
				4258	alternative.node()->Emit(compiler, &out_of_line_trace);
				4259	macro_assembler->Bind(&reload_current_char);
				4260	// Reload the current character, since the next quick check expects that.
				4261	// We don't need to check bounds here because we only get into this
				4262	// code through a quick check which already did the checked load.
				4263	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				4264	NULL,
				4265	false,
				4266	preload_characters);
				4267	macro_assembler->GoTo(&(alt_gen->after));
				4268	} else {
				4269	out_of_line_trace.set_backtrack(&(alt_gen->after));
				4270	for (int j = 0; j < guard_count; j++) {
				4271	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				4272	}
				4273	alternative.node()->Emit(compiler, &out_of_line_trace);
				4274	}
				4275	}
				4276
				4277
				4278	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4279	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4280	LimitResult limit_result = LimitVersions(compiler, trace);
				4281	if (limit_result == DONE) return;
				4282	DCHECK(limit_result == CONTINUE);
				4283
				4284	RecursionCheck rc(compiler);
				4285
				4286	switch (action_type_) {
				4287	case STORE_POSITION: {
				4288	Trace::DeferredCapture
				4289	new_capture(data_.u_position_register.reg,
				4290	data_.u_position_register.is_capture,
				4291	trace);
				4292	Trace new_trace = *trace;
				4293	new_trace.add_action(&new_capture);
				4294	on_success()->Emit(compiler, &new_trace);
				4295	break;
				4296	}
				4297	case INCREMENT_REGISTER: {
				4298	Trace::DeferredIncrementRegister
				4299	new_increment(data_.u_increment_register.reg);
				4300	Trace new_trace = *trace;
				4301	new_trace.add_action(&new_increment);
				4302	on_success()->Emit(compiler, &new_trace);
				4303	break;
				4304	}
				4305	case SET_REGISTER: {
				4306	Trace::DeferredSetRegister
				4307	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				4308	Trace new_trace = *trace;
				4309	new_trace.add_action(&new_set);
				4310	on_success()->Emit(compiler, &new_trace);
				4311	break;
				4312	}
				4313	case CLEAR_CAPTURES: {
				4314	Trace::DeferredClearCaptures
				4315	new_capture(Interval(data_.u_clear_captures.range_from,
				4316	data_.u_clear_captures.range_to));
				4317	Trace new_trace = *trace;
				4318	new_trace.add_action(&new_capture);
				4319	on_success()->Emit(compiler, &new_trace);
				4320	break;
				4321	}
				4322	case BEGIN_SUBMATCH:
				4323	if (!trace->is_trivial()) {
				4324	trace->Flush(compiler, this);
				4325	} else {
				4326	assembler->WriteCurrentPositionToRegister(
				4327	data_.u_submatch.current_position_register, 0);
				4328	assembler->WriteStackPointerToRegister(
				4329	data_.u_submatch.stack_pointer_register);
				4330	on_success()->Emit(compiler, trace);
				4331	}
				4332	break;
				4333	case EMPTY_MATCH_CHECK: {
				4334	int start_pos_reg = data_.u_empty_match_check.start_register;
				4335	int stored_pos = 0;
				4336	int rep_reg = data_.u_empty_match_check.repetition_register;
				4337	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				4338	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				4339	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				4340	// If we know we haven't advanced and there is no minimum we
				4341	// can just backtrack immediately.
				4342	assembler->GoTo(trace->backtrack());
				4343	} else if (know_dist && stored_pos < trace->cp_offset()) {
				4344	// If we know we've advanced we can generate the continuation
				4345	// immediately.
				4346	on_success()->Emit(compiler, trace);
				4347	} else if (!trace->is_trivial()) {
				4348	trace->Flush(compiler, this);
				4349	} else {
				4350	Label skip_empty_check;
				4351	// If we have a minimum number of repetitions we check the current
				4352	// number first and skip the empty check if it's not enough.
				4353	if (has_minimum) {
				4354	int limit = data_.u_empty_match_check.repetition_limit;
				4355	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				4356	}
				4357	// If the match is empty we bail out, otherwise we fall through
				4358	// to the on-success continuation.
				4359	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				4360	trace->backtrack());
				4361	assembler->Bind(&skip_empty_check);
				4362	on_success()->Emit(compiler, trace);
				4363	}
				4364	break;
				4365	}
				4366	case POSITIVE_SUBMATCH_SUCCESS: {
				4367	if (!trace->is_trivial()) {
				4368	trace->Flush(compiler, this);
				4369	return;
				4370	}
				4371	assembler->ReadCurrentPositionFromRegister(
				4372	data_.u_submatch.current_position_register);
				4373	assembler->ReadStackPointerFromRegister(
				4374	data_.u_submatch.stack_pointer_register);
				4375	int clear_register_count = data_.u_submatch.clear_register_count;
				4376	if (clear_register_count == 0) {
				4377	on_success()->Emit(compiler, trace);
				4378	return;
				4379	}
				4380	int clear_registers_from = data_.u_submatch.clear_register_from;
				4381	Label clear_registers_backtrack;
				4382	Trace new_trace = *trace;
				4383	new_trace.set_backtrack(&clear_registers_backtrack);
				4384	on_success()->Emit(compiler, &new_trace);
				4385
				4386	assembler->Bind(&clear_registers_backtrack);
				4387	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				4388	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				4389
				4390	DCHECK(trace->backtrack() == NULL);
				4391	assembler->Backtrack();
				4392	return;
				4393	}
				4394	default:
				4395	UNREACHABLE();
				4396	}
				4397	}
				4398
				4399
				4400	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				4401	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				4402	if (!trace->is_trivial()) {
				4403	trace->Flush(compiler, this);
				4404	return;
				4405	}
				4406
				4407	LimitResult limit_result = LimitVersions(compiler, trace);
				4408	if (limit_result == DONE) return;
				4409	DCHECK(limit_result == CONTINUE);
				4410
				4411	RecursionCheck rc(compiler);
				4412
				4413	DCHECK_EQ(start_reg_ + 1, end_reg_);
				4414	if (compiler->ignore_case()) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4415	assembler->CheckNotBackReferenceIgnoreCase(
				4416	start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4417	} else {
				4418	assembler->CheckNotBackReference(start_reg_, read_backward(),
				4419	trace->backtrack());
				4420	}
				4421	// We are going to advance backward, so we may end up at the start.
				4422	if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4423
				4424	// Check that the back reference does not end inside a surrogate pair.
				4425	if (compiler->unicode() && !compiler->one_byte()) {
				4426	assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
				4427	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4428	on_success()->Emit(compiler, trace);
				4429	}
				4430
				4431
				4432	// -------------------------------------------------------------------
				4433	// Dot/dotty output
				4434
				4435
				4436	#ifdef DEBUG
				4437
				4438
				4439	class DotPrinter: public NodeVisitor {
				4440	public:
				4441	DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
				4442	: os_(os),
				4443	ignore_case_(ignore_case) {}
				4444	void PrintNode(const char* label, RegExpNode* node);
				4445	void Visit(RegExpNode* node);
				4446	void PrintAttributes(RegExpNode* from);
				4447	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				4448	#define DECLARE_VISIT(Type) \
				4449	virtual void Visit##Type(Type##Node* that);
				4450	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				4451	#undef DECLARE_VISIT
				4452	private:
				4453	std::ostream& os_;
				4454	bool ignore_case_;
				4455	};
				4456
				4457
				4458	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				4459	os_ << "digraph G {\n graph [label=\"";
				4460	for (int i = 0; label[i]; i++) {
				4461	switch (label[i]) {
				4462	case '\\':
				4463	os_ << "\\\\";
				4464	break;
				4465	case '"':
				4466	os_ << "\"";
				4467	break;
				4468	default:
				4469	os_ << label[i];
				4470	break;
				4471	}
				4472	}
				4473	os_ << "\"];\n";
				4474	Visit(node);
				4475	os_ << "}" << std::endl;
				4476	}
				4477
				4478
				4479	void DotPrinter::Visit(RegExpNode* node) {
				4480	if (node->info()->visited) return;
				4481	node->info()->visited = true;
				4482	node->Accept(this);
				4483	}
				4484
				4485
				4486	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				4487	os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
				4488	Visit(on_failure);
				4489	}
				4490
				4491
				4492	class TableEntryBodyPrinter {
				4493	public:
				4494	TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
				4495	: os_(os),
				4496	choice_(choice) {}
				4497	void Call(uc16 from, DispatchTable::Entry entry) {
				4498	OutSet* out_set = entry.out_set();
				4499	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4500	if (out_set->Get(i)) {
				4501	os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
				4502	<< choice()->alternatives()->at(i).node() << ";\n";
				4503	}
				4504	}
				4505	}
				4506	private:
				4507	ChoiceNode* choice() { return choice_; }
				4508	std::ostream& os_;
				4509	ChoiceNode* choice_;
				4510	};
				4511
				4512
				4513	class TableEntryHeaderPrinter {
				4514	public:
				4515	explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
				4516	: first_(true),
				4517	os_(os) {}
				4518	void Call(uc16 from, DispatchTable::Entry entry) {
				4519	if (first_) {
				4520	first_ = false;
				4521	} else {
				4522	os_ << "\|";
				4523	}
				4524	os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "\|{";
				4525	OutSet* out_set = entry.out_set();
				4526	int priority = 0;
				4527	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4528	if (out_set->Get(i)) {
				4529	if (priority > 0) os_ << "\|";
				4530	os_ << "<s" << from << "o" << i << "> " << priority;
				4531	priority++;
				4532	}
				4533	}
				4534	os_ << "}}";
				4535	}
				4536
				4537	private:
				4538	bool first_;
				4539	std::ostream& os_;
				4540	};
				4541
				4542
				4543	class AttributePrinter {
				4544	public:
				4545	explicit AttributePrinter(std::ostream& os) // NOLINT
				4546	: os_(os),
				4547	first_(true) {}
				4548	void PrintSeparator() {
				4549	if (first_) {
				4550	first_ = false;
				4551	} else {
				4552	os_ << "\|";
				4553	}
				4554	}
				4555	void PrintBit(const char* name, bool value) {
				4556	if (!value) return;
				4557	PrintSeparator();
				4558	os_ << "{" << name << "}";
				4559	}
				4560	void PrintPositive(const char* name, int value) {
				4561	if (value < 0) return;
				4562	PrintSeparator();
				4563	os_ << "{" << name << "\|" << value << "}";
				4564	}
				4565
				4566	private:
				4567	std::ostream& os_;
				4568	bool first_;
				4569	};
				4570
				4571
				4572	void DotPrinter::PrintAttributes(RegExpNode* that) {
				4573	os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
				4574	<< "margin=0.1, fontsize=10, label=\"{";
				4575	AttributePrinter printer(os_);
				4576	NodeInfo* info = that->info();
				4577	printer.PrintBit("NI", info->follows_newline_interest);
				4578	printer.PrintBit("WI", info->follows_word_interest);
				4579	printer.PrintBit("SI", info->follows_start_interest);
				4580	Label* label = that->label();
				4581	if (label->is_bound())
				4582	printer.PrintPositive("@", label->pos());
				4583	os_ << "}\"];\n"
				4584	<< " a" << that << " -> n" << that
				4585	<< " [style=dashed, color=grey, arrowhead=none];\n";
				4586	}
				4587
				4588
				4589	static const bool kPrintDispatchTable = false;
				4590	void DotPrinter::VisitChoice(ChoiceNode* that) {
				4591	if (kPrintDispatchTable) {
				4592	os_ << " n" << that << " [shape=Mrecord, label=\"";
				4593	TableEntryHeaderPrinter header_printer(os_);
				4594	that->GetTable(ignore_case_)->ForEach(&header_printer);
				4595	os_ << "\"]\n";
				4596	PrintAttributes(that);
				4597	TableEntryBodyPrinter body_printer(os_, that);
				4598	that->GetTable(ignore_case_)->ForEach(&body_printer);
				4599	} else {
				4600	os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
				4601	for (int i = 0; i < that->alternatives()->length(); i++) {
				4602	GuardedAlternative alt = that->alternatives()->at(i);
				4603	os_ << " n" << that << " -> n" << alt.node();
				4604	}
				4605	}
				4606	for (int i = 0; i < that->alternatives()->length(); i++) {
				4607	GuardedAlternative alt = that->alternatives()->at(i);
				4608	alt.node()->Accept(this);
				4609	}
				4610	}
				4611
				4612
				4613	void DotPrinter::VisitText(TextNode* that) {
				4614	Zone* zone = that->zone();
				4615	os_ << " n" << that << " [label=\"";
				4616	for (int i = 0; i < that->elements()->length(); i++) {
				4617	if (i > 0) os_ << " ";
				4618	TextElement elm = that->elements()->at(i);
				4619	switch (elm.text_type()) {
				4620	case TextElement::ATOM: {
				4621	Vector<const uc16> data = elm.atom()->data();
				4622	for (int i = 0; i < data.length(); i++) {
				4623	os_ << static_cast<char>(data[i]);
				4624	}
				4625	break;
				4626	}
				4627	case TextElement::CHAR_CLASS: {
				4628	RegExpCharacterClass* node = elm.char_class();
				4629	os_ << "[";
				4630	if (node->is_negated()) os_ << "^";
				4631	for (int j = 0; j < node->ranges(zone)->length(); j++) {
				4632	CharacterRange range = node->ranges(zone)->at(j);
				4633	os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
				4634	}
				4635	os_ << "]";
				4636	break;
				4637	}
				4638	default:
				4639	UNREACHABLE();
				4640	}
				4641	}
				4642	os_ << "\", shape=box, peripheries=2];\n";
				4643	PrintAttributes(that);
				4644	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4645	Visit(that->on_success());
				4646	}
				4647
				4648
				4649	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				4650	os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
				4651	<< that->end_register() << "\", shape=doubleoctagon];\n";
				4652	PrintAttributes(that);
				4653	os_ << " n" << that << " -> n" << that->on_success() << ";\n";
				4654	Visit(that->on_success());
				4655	}
				4656
				4657
				4658	void DotPrinter::VisitEnd(EndNode* that) {
				4659	os_ << " n" << that << " [style=bold, shape=point];\n";
				4660	PrintAttributes(that);
				4661	}
				4662
				4663
				4664	void DotPrinter::VisitAssertion(AssertionNode* that) {
				4665	os_ << " n" << that << " [";
				4666	switch (that->assertion_type()) {
				4667	case AssertionNode::AT_END:
				4668	os_ << "label=\"$\", shape=septagon";
				4669	break;
				4670	case AssertionNode::AT_START:
				4671	os_ << "label=\"^\", shape=septagon";
				4672	break;
				4673	case AssertionNode::AT_BOUNDARY:
				4674	os_ << "label=\"\\b\", shape=septagon";
				4675	break;
				4676	case AssertionNode::AT_NON_BOUNDARY:
				4677	os_ << "label=\"\\B\", shape=septagon";
				4678	break;
				4679	case AssertionNode::AFTER_NEWLINE:
				4680	os_ << "label=\"(?<=\\n)\", shape=septagon";
				4681	break;
				4682	}
				4683	os_ << "];\n";
				4684	PrintAttributes(that);
				4685	RegExpNode* successor = that->on_success();
				4686	os_ << " n" << that << " -> n" << successor << ";\n";
				4687	Visit(successor);
				4688	}
				4689
				4690
				4691	void DotPrinter::VisitAction(ActionNode* that) {
				4692	os_ << " n" << that << " [";
				4693	switch (that->action_type_) {
				4694	case ActionNode::SET_REGISTER:
				4695	os_ << "label=\"$" << that->data_.u_store_register.reg
				4696	<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
				4697	break;
				4698	case ActionNode::INCREMENT_REGISTER:
				4699	os_ << "label=\"$" << that->data_.u_increment_register.reg
				4700	<< "++\", shape=octagon";
				4701	break;
				4702	case ActionNode::STORE_POSITION:
				4703	os_ << "label=\"$" << that->data_.u_position_register.reg
				4704	<< ":=$pos\", shape=octagon";
				4705	break;
				4706	case ActionNode::BEGIN_SUBMATCH:
				4707	os_ << "label=\"$" << that->data_.u_submatch.current_position_register
				4708	<< ":=$pos,begin\", shape=septagon";
				4709	break;
				4710	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				4711	os_ << "label=\"escape\", shape=septagon";
				4712	break;
				4713	case ActionNode::EMPTY_MATCH_CHECK:
				4714	os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
				4715	<< "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
				4716	<< "<" << that->data_.u_empty_match_check.repetition_limit
				4717	<< "?\", shape=septagon";
				4718	break;
				4719	case ActionNode::CLEAR_CAPTURES: {
				4720	os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
				4721	<< " to $" << that->data_.u_clear_captures.range_to
				4722	<< "\", shape=septagon";
				4723	break;
				4724	}
				4725	}
				4726	os_ << "];\n";
				4727	PrintAttributes(that);
				4728	RegExpNode* successor = that->on_success();
				4729	os_ << " n" << that << " -> n" << successor << ";\n";
				4730	Visit(successor);
				4731	}
				4732
				4733
				4734	class DispatchTableDumper {
				4735	public:
				4736	explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
				4737	void Call(uc16 key, DispatchTable::Entry entry);
				4738	private:
				4739	std::ostream& os_;
				4740	};
				4741
				4742
				4743	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				4744	os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
				4745	OutSet* set = entry.out_set();
				4746	bool first = true;
				4747	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				4748	if (set->Get(i)) {
				4749	if (first) {
				4750	first = false;
				4751	} else {
				4752	os_ << ", ";
				4753	}
				4754	os_ << i;
				4755	}
				4756	}
				4757	os_ << "}\n";
				4758	}
				4759
				4760
				4761	void DispatchTable::Dump() {
				4762	OFStream os(stderr);
				4763	DispatchTableDumper dumper(os);
				4764	tree()->ForEach(&dumper);
				4765	}
				4766
				4767
				4768	void RegExpEngine::DotPrint(const char* label,
				4769	RegExpNode* node,
				4770	bool ignore_case) {
				4771	OFStream os(stdout);
				4772	DotPrinter printer(os, ignore_case);
				4773	printer.PrintNode(label, node);
				4774	}
				4775
				4776
				4777	#endif // DEBUG
				4778
				4779
				4780	// -------------------------------------------------------------------
				4781	// Tree to graph conversion
				4782
				4783	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				4784	RegExpNode* on_success) {
				4785	ZoneList<TextElement>* elms =
				4786	new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
				4787	elms->Add(TextElement::Atom(this), compiler->zone());
				4788	return new (compiler->zone())
				4789	TextNode(elms, compiler->read_backward(), on_success);
				4790	}
				4791
				4792
				4793	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				4794	RegExpNode* on_success) {
				4795	return new (compiler->zone())
				4796	TextNode(elements(), compiler->read_backward(), on_success);
				4797	}
				4798
				4799
				4800	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				4801	const int* special_class,
				4802	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4803	length--; // Remove final marker.
				4804	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4805	DCHECK(ranges->length() != 0);
				4806	DCHECK(length != 0);
				4807	DCHECK(special_class[0] != 0);
				4808	if (ranges->length() != (length >> 1) + 1) {
				4809	return false;
				4810	}
				4811	CharacterRange range = ranges->at(0);
				4812	if (range.from() != 0) {
				4813	return false;
				4814	}
				4815	for (int i = 0; i < length; i += 2) {
				4816	if (special_class[i] != (range.to() + 1)) {
				4817	return false;
				4818	}
				4819	range = ranges->at((i >> 1) + 1);
				4820	if (special_class[i+1] != range.from()) {
				4821	return false;
				4822	}
				4823	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4824	if (range.to() != String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4825	return false;
				4826	}
				4827	return true;
				4828	}
				4829
				4830
				4831	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				4832	const int* special_class,
				4833	int length) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4834	length--; // Remove final marker.
				4835	DCHECK(special_class[length] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	4836	if (ranges->length() * 2 != length) {
				4837	return false;
				4838	}
				4839	for (int i = 0; i < length; i += 2) {
				4840	CharacterRange range = ranges->at(i >> 1);
				4841	if (range.from() != special_class[i] \|\|
				4842	range.to() != special_class[i + 1] - 1) {
				4843	return false;
				4844	}
				4845	}
				4846	return true;
				4847	}
				4848
				4849
				4850	bool RegExpCharacterClass::is_standard(Zone* zone) {
				4851	// TODO(lrn): Remove need for this function, by not throwing away information
				4852	// along the way.
				4853	if (is_negated_) {
				4854	return false;
				4855	}
				4856	if (set_.is_standard()) {
				4857	return true;
				4858	}
				4859	if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4860	set_.set_standard_set_type('s');
				4861	return true;
				4862	}
				4863	if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
				4864	set_.set_standard_set_type('S');
				4865	return true;
				4866	}
				4867	if (CompareInverseRanges(set_.ranges(zone),
				4868	kLineTerminatorRanges,
				4869	kLineTerminatorRangeCount)) {
				4870	set_.set_standard_set_type('.');
				4871	return true;
				4872	}
				4873	if (CompareRanges(set_.ranges(zone),
				4874	kLineTerminatorRanges,
				4875	kLineTerminatorRangeCount)) {
				4876	set_.set_standard_set_type('n');
				4877	return true;
				4878	}
				4879	if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4880	set_.set_standard_set_type('w');
				4881	return true;
				4882	}
				4883	if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
				4884	set_.set_standard_set_type('W');
				4885	return true;
				4886	}
				4887	return false;
				4888	}
				4889
				4890
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	4891	UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
				4892	ZoneList<CharacterRange>* base)
				4893	: zone_(zone),
				4894	table_(zone),
				4895	bmp_(nullptr),
				4896	lead_surrogates_(nullptr),
				4897	trail_surrogates_(nullptr),
				4898	non_bmp_(nullptr) {
				4899	// The unicode range splitter categorizes given character ranges into:
				4900	// - Code points from the BMP representable by one code unit.
				4901	// - Code points outside the BMP that need to be split into surrogate pairs.
				4902	// - Lone lead surrogates.
				4903	// - Lone trail surrogates.
				4904	// Lone surrogates are valid code points, even though no actual characters.
				4905	// They require special matching to make sure we do not split surrogate pairs.
				4906	// We use the dispatch table to accomplish this. The base range is split up
				4907	// by the table by the overlay ranges, and the Call callback is used to
				4908	// filter and collect ranges for each category.
				4909	for (int i = 0; i < base->length(); i++) {
				4910	table_.AddRange(base->at(i), kBase, zone_);
				4911	}
				4912	// Add overlay ranges.
				4913	table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
				4914	kBmpCodePoints, zone_);
				4915	table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
				4916	kLeadSurrogates, zone_);
				4917	table_.AddRange(
				4918	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				4919	kTrailSurrogates, zone_);
				4920	table_.AddRange(
				4921	CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
				4922	kBmpCodePoints, zone_);
				4923	table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
				4924	kNonBmpCodePoints, zone_);
				4925	table_.ForEach(this);
				4926	}
				4927
				4928
				4929	void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
				4930	OutSet* outset = entry.out_set();
				4931	if (!outset->Get(kBase)) return;
				4932	ZoneList<CharacterRange>** target = NULL;
				4933	if (outset->Get(kBmpCodePoints)) {
				4934	target = &bmp_;
				4935	} else if (outset->Get(kLeadSurrogates)) {
				4936	target = &lead_surrogates_;
				4937	} else if (outset->Get(kTrailSurrogates)) {
				4938	target = &trail_surrogates_;
				4939	} else {
				4940	DCHECK(outset->Get(kNonBmpCodePoints));
				4941	target = &non_bmp_;
				4942	}
				4943	if (target == NULL) target = new (zone_) ZoneList<CharacterRange>(2, zone_);
				4944	(*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
				4945	}
				4946
				4947
				4948	void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
				4949	RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
				4950	ZoneList<CharacterRange>* bmp = splitter->bmp();
				4951	if (bmp == nullptr) return;
				4952	result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
				4953	compiler->zone(), bmp, compiler->read_backward(), on_success)));
				4954	}
				4955
				4956
				4957	void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
				4958	RegExpNode* on_success,
				4959	UnicodeRangeSplitter* splitter) {
				4960	ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
				4961	if (non_bmp == nullptr) return;
				4962	DCHECK(compiler->unicode());
				4963	DCHECK(!compiler->one_byte());
				4964	Zone* zone = compiler->zone();
				4965	CharacterRange::Canonicalize(non_bmp);
				4966	for (int i = 0; i < non_bmp->length(); i++) {
				4967	// Match surrogate pair.
				4968	// E.g. [\u10005-\u11005] becomes
				4969	// \ud800[\udc05-\udfff]\|
				4970	// [\ud801-\ud803][\udc00-\udfff]\|
				4971	// \ud804[\udc00-\udc05]
				4972	uc32 from = non_bmp->at(i).from();
				4973	uc32 to = non_bmp->at(i).to();
				4974	uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
				4975	uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
				4976	uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
				4977	uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
				4978	if (from_l == to_l) {
				4979	// The lead surrogate is the same.
				4980	result->AddAlternative(
				4981	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4982	zone, CharacterRange::Singleton(from_l),
				4983	CharacterRange::Range(from_t, to_t), compiler->read_backward(),
				4984	on_success)));
				4985	} else {
				4986	if (from_t != kTrailSurrogateStart) {
				4987	// Add [from_l][from_t-\udfff]
				4988	result->AddAlternative(
				4989	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4990	zone, CharacterRange::Singleton(from_l),
				4991	CharacterRange::Range(from_t, kTrailSurrogateEnd),
				4992	compiler->read_backward(), on_success)));
				4993	from_l++;
				4994	}
				4995	if (to_t != kTrailSurrogateEnd) {
				4996	// Add [to_l][\udc00-to_t]
				4997	result->AddAlternative(
				4998	GuardedAlternative(TextNode::CreateForSurrogatePair(
				4999	zone, CharacterRange::Singleton(to_l),
				5000	CharacterRange::Range(kTrailSurrogateStart, to_t),
				5001	compiler->read_backward(), on_success)));
				5002	to_l--;
				5003	}
				5004	if (from_l <= to_l) {
				5005	// Add [from_l-to_l][\udc00-\udfff]
				5006	result->AddAlternative(
				5007	GuardedAlternative(TextNode::CreateForSurrogatePair(
				5008	zone, CharacterRange::Range(from_l, to_l),
				5009	CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
				5010	compiler->read_backward(), on_success)));
				5011	}
				5012	}
				5013	}
				5014	}
				5015
				5016
				5017	RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
				5018	RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
				5019	ZoneList<CharacterRange>* match, RegExpNode* on_success,
				5020	bool read_backward) {
				5021	Zone* zone = compiler->zone();
				5022	RegExpNode* match_node = TextNode::CreateForCharacterRanges(
				5023	zone, match, read_backward, on_success);
				5024	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5025	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5026	RegExpLookaround::Builder lookaround(false, match_node, stack_register,
				5027	position_register);
				5028	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5029	zone, lookbehind, !read_backward, lookaround.on_match_success());
				5030	return lookaround.ForMatch(negative_match);
				5031	}
				5032
				5033
				5034	RegExpNode* MatchAndNegativeLookaroundInReadDirection(
				5035	RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
				5036	ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
				5037	bool read_backward) {
				5038	Zone* zone = compiler->zone();
				5039	int stack_register = compiler->UnicodeLookaroundStackRegister();
				5040	int position_register = compiler->UnicodeLookaroundPositionRegister();
				5041	RegExpLookaround::Builder lookaround(false, on_success, stack_register,
				5042	position_register);
				5043	RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
				5044	zone, lookahead, read_backward, lookaround.on_match_success());
				5045	return TextNode::CreateForCharacterRanges(
				5046	zone, match, read_backward, lookaround.ForMatch(negative_match));
				5047	}
				5048
				5049
				5050	void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5051	RegExpNode* on_success,
				5052	UnicodeRangeSplitter* splitter) {
				5053	ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
				5054	if (lead_surrogates == nullptr) return;
				5055	Zone* zone = compiler->zone();
				5056	// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
				5057	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				5058	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				5059
				5060	RegExpNode* match;
				5061	if (compiler->read_backward()) {
				5062	// Reading backward. Assert that reading forward, there is no trail
				5063	// surrogate, and then backward match the lead surrogate.
				5064	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5065	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5066	} else {
				5067	// Reading forward. Forward match the lead surrogate and assert that
				5068	// no trail surrogate follows.
				5069	match = MatchAndNegativeLookaroundInReadDirection(
				5070	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5071	}
				5072	result->AddAlternative(GuardedAlternative(match));
				5073	}
				5074
				5075
				5076	void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
				5077	RegExpNode* on_success,
				5078	UnicodeRangeSplitter* splitter) {
				5079	ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
				5080	if (trail_surrogates == nullptr) return;
				5081	Zone* zone = compiler->zone();
				5082	// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
				5083	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				5084	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				5085
				5086	RegExpNode* match;
				5087	if (compiler->read_backward()) {
				5088	// Reading backward. Backward match the trail surrogate and assert that no
				5089	// lead surrogate precedes it.
				5090	match = MatchAndNegativeLookaroundInReadDirection(
				5091	compiler, trail_surrogates, lead_surrogates, on_success, true);
				5092	} else {
				5093	// Reading forward. Assert that reading backward, there is no lead
				5094	// surrogate, and then forward match the trail surrogate.
				5095	match = NegativeLookaroundAgainstReadDirectionAndMatch(
				5096	compiler, lead_surrogates, trail_surrogates, on_success, false);
				5097	}
				5098	result->AddAlternative(GuardedAlternative(match));
				5099	}
				5100
				5101	RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
				5102	RegExpNode* on_success) {
				5103	// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
				5104	DCHECK(!compiler->read_backward());
				5105	Zone* zone = compiler->zone();
				5106	// Advance any character. If the character happens to be a lead surrogate and
				5107	// we advanced into the middle of a surrogate pair, it will work out, as
				5108	// nothing will match from there. We will have to advance again, consuming
				5109	// the associated trail surrogate.
				5110	ZoneList<CharacterRange>* range = CharacterRange::List(
				5111	zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
				5112	return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
				5113	}
				5114
				5115
				5116	void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
				5117	ZoneList<CharacterRange>* ranges) {
				5118	#ifdef V8_I18N_SUPPORT
				5119	// Use ICU to compute the case fold closure over the ranges.
				5120	DCHECK(compiler->unicode());
				5121	DCHECK(compiler->ignore_case());
				5122	USet* set = uset_openEmpty();
				5123	for (int i = 0; i < ranges->length(); i++) {
				5124	uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
				5125	}
				5126	ranges->Clear();
				5127	uset_closeOver(set, USET_CASE_INSENSITIVE);
				5128	// Full case mapping map single characters to multiple characters.
				5129	// Those are represented as strings in the set. Remove them so that
				5130	// we end up with only simple and common case mappings.
				5131	uset_removeAllStrings(set);
				5132	int item_count = uset_getItemCount(set);
				5133	int item_result = 0;
				5134	UErrorCode ec = U_ZERO_ERROR;
				5135	Zone* zone = compiler->zone();
				5136	for (int i = 0; i < item_count; i++) {
				5137	uc32 start = 0;
				5138	uc32 end = 0;
				5139	item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
				5140	ranges->Add(CharacterRange::Range(start, end), zone);
				5141	}
				5142	// No errors and everything we collected have been ranges.
				5143	DCHECK_EQ(U_ZERO_ERROR, ec);
				5144	DCHECK_EQ(0, item_result);
				5145	uset_close(set);
				5146	#else
				5147	// Fallback if ICU is not included.
				5148	CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
				5149	ranges, compiler->one_byte());
				5150	#endif // V8_I18N_SUPPORT
				5151	CharacterRange::Canonicalize(ranges);
				5152	}
				5153
				5154
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5155	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				5156	RegExpNode* on_success) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5157	set_.Canonicalize();
				5158	Zone* zone = compiler->zone();
				5159	ZoneList<CharacterRange>* ranges = this->ranges(zone);
				5160	if (compiler->unicode() && compiler->ignore_case()) {
				5161	AddUnicodeCaseEquivalents(compiler, ranges);
				5162	}
				5163	if (compiler->unicode() && !compiler->one_byte()) {
				5164	if (is_negated()) {
				5165	ZoneList<CharacterRange>* negated =
				5166	new (zone) ZoneList<CharacterRange>(2, zone);
				5167	CharacterRange::Negate(ranges, negated, zone);
				5168	ranges = negated;
				5169	}
				5170	if (ranges->length() == 0) {
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame]	5171	ranges->Add(CharacterRange::Everything(), zone);
				5172	RegExpCharacterClass* fail =
				5173	new (zone) RegExpCharacterClass(ranges, true);
				5174	return new (zone) TextNode(fail, compiler->read_backward(), on_success);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5175	}
				5176	if (standard_type() == '*') {
				5177	return UnanchoredAdvance(compiler, on_success);
				5178	} else {
				5179	ChoiceNode* result = new (zone) ChoiceNode(2, zone);
				5180	UnicodeRangeSplitter splitter(zone, ranges);
				5181	AddBmpCharacters(compiler, result, on_success, &splitter);
				5182	AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
				5183	AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
				5184	AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
				5185	return result;
				5186	}
				5187	} else {
				5188	return new (zone) TextNode(this, compiler->read_backward(), on_success);
				5189	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5190	}
				5191
				5192
				5193	int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
				5194	RegExpAtom* atom1 = (*a)->AsAtom();
				5195	RegExpAtom* atom2 = (*b)->AsAtom();
				5196	uc16 character1 = atom1->data().at(0);
				5197	uc16 character2 = atom2->data().at(0);
				5198	if (character1 < character2) return -1;
				5199	if (character1 > character2) return 1;
				5200	return 0;
				5201	}
				5202
				5203
				5204	static unibrow::uchar Canonical(
				5205	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5206	unibrow::uchar c) {
				5207	unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
				5208	int length = canonicalize->get(c, '\0', chars);
				5209	DCHECK_LE(length, 1);
				5210	unibrow::uchar canonical = c;
				5211	if (length == 1) canonical = chars[0];
				5212	return canonical;
				5213	}
				5214
				5215
				5216	int CompareFirstCharCaseIndependent(
				5217	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
				5218	RegExpTree* const* a, RegExpTree* const* b) {
				5219	RegExpAtom* atom1 = (*a)->AsAtom();
				5220	RegExpAtom* atom2 = (*b)->AsAtom();
				5221	unibrow::uchar character1 = atom1->data().at(0);
				5222	unibrow::uchar character2 = atom2->data().at(0);
				5223	if (character1 == character2) return 0;
				5224	if (character1 >= 'a' \|\| character2 >= 'a') {
				5225	character1 = Canonical(canonicalize, character1);
				5226	character2 = Canonical(canonicalize, character2);
				5227	}
				5228	return static_cast<int>(character1) - static_cast<int>(character2);
				5229	}
				5230
				5231
				5232	// We can stable sort runs of atoms, since the order does not matter if they
				5233	// start with different characters.
				5234	// Returns true if any consecutive atoms were found.
				5235	bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
				5236	ZoneList<RegExpTree> alternatives = this->alternatives();
				5237	int length = alternatives->length();
				5238	bool found_consecutive_atoms = false;
				5239	for (int i = 0; i < length; i++) {
				5240	while (i < length) {
				5241	RegExpTree* alternative = alternatives->at(i);
				5242	if (alternative->IsAtom()) break;
				5243	i++;
				5244	}
				5245	// i is length or it is the index of an atom.
				5246	if (i == length) break;
				5247	int first_atom = i;
				5248	i++;
				5249	while (i < length) {
				5250	RegExpTree* alternative = alternatives->at(i);
				5251	if (!alternative->IsAtom()) break;
				5252	i++;
				5253	}
				5254	// Sort atoms to get ones with common prefixes together.
				5255	// This step is more tricky if we are in a case-independent regexp,
				5256	// because it would change /is\|I/ to /I\|is/, and order matters when
				5257	// the regexp parts don't match only disjoint starting points. To fix
				5258	// this we have a version of CompareFirstChar that uses case-
				5259	// independent character classes for comparison.
				5260	DCHECK_LT(first_atom, alternatives->length());
				5261	DCHECK_LE(i, alternatives->length());
				5262	DCHECK_LE(first_atom, i);
				5263	if (compiler->ignore_case()) {
				5264	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5265	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5266	auto compare_closure =
				5267	[canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
				5268	return CompareFirstCharCaseIndependent(canonicalize, a, b);
				5269	};
				5270	alternatives->StableSort(compare_closure, first_atom, i - first_atom);
				5271	} else {
				5272	alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
				5273	}
				5274	if (i - first_atom > 1) found_consecutive_atoms = true;
				5275	}
				5276	return found_consecutive_atoms;
				5277	}
				5278
				5279
				5280	// Optimizes ab\|ac\|az to a(?:b\|c\|d).
				5281	void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
				5282	Zone* zone = compiler->zone();
				5283	ZoneList<RegExpTree> alternatives = this->alternatives();
				5284	int length = alternatives->length();
				5285
				5286	int write_posn = 0;
				5287	int i = 0;
				5288	while (i < length) {
				5289	RegExpTree* alternative = alternatives->at(i);
				5290	if (!alternative->IsAtom()) {
				5291	alternatives->at(write_posn++) = alternatives->at(i);
				5292	i++;
				5293	continue;
				5294	}
				5295	RegExpAtom* atom = alternative->AsAtom();
				5296	unibrow::uchar common_prefix = atom->data().at(0);
				5297	int first_with_prefix = i;
				5298	int prefix_length = atom->length();
				5299	i++;
				5300	while (i < length) {
				5301	alternative = alternatives->at(i);
				5302	if (!alternative->IsAtom()) break;
				5303	atom = alternative->AsAtom();
				5304	unibrow::uchar new_prefix = atom->data().at(0);
				5305	if (new_prefix != common_prefix) {
				5306	if (!compiler->ignore_case()) break;
				5307	unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
				5308	compiler->isolate()->regexp_macro_assembler_canonicalize();
				5309	new_prefix = Canonical(canonicalize, new_prefix);
				5310	common_prefix = Canonical(canonicalize, common_prefix);
				5311	if (new_prefix != common_prefix) break;
				5312	}
				5313	prefix_length = Min(prefix_length, atom->length());
				5314	i++;
				5315	}
				5316	if (i > first_with_prefix + 2) {
				5317	// Found worthwhile run of alternatives with common prefix of at least one
				5318	// character. The sorting function above did not sort on more than one
				5319	// character for reasons of correctness, but there may still be a longer
				5320	// common prefix if the terms were similar or presorted in the input.
				5321	// Find out how long the common prefix is.
				5322	int run_length = i - first_with_prefix;
				5323	atom = alternatives->at(first_with_prefix)->AsAtom();
				5324	for (int j = 1; j < run_length && prefix_length > 1; j++) {
				5325	RegExpAtom* old_atom =
				5326	alternatives->at(j + first_with_prefix)->AsAtom();
				5327	for (int k = 1; k < prefix_length; k++) {
				5328	if (atom->data().at(k) != old_atom->data().at(k)) {
				5329	prefix_length = k;
				5330	break;
				5331	}
				5332	}
				5333	}
				5334	RegExpAtom* prefix =
				5335	new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
				5336	ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);
				5337	pair->Add(prefix, zone);
				5338	ZoneList<RegExpTree> suffixes =
				5339	new (zone) ZoneList<RegExpTree*>(run_length, zone);
				5340	for (int j = 0; j < run_length; j++) {
				5341	RegExpAtom* old_atom =
				5342	alternatives->at(j + first_with_prefix)->AsAtom();
				5343	int len = old_atom->length();
				5344	if (len == prefix_length) {
				5345	suffixes->Add(new (zone) RegExpEmpty(), zone);
				5346	} else {
				5347	RegExpTree* suffix = new (zone) RegExpAtom(
				5348	old_atom->data().SubVector(prefix_length, old_atom->length()));
				5349	suffixes->Add(suffix, zone);
				5350	}
				5351	}
				5352	pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
				5353	alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
				5354	} else {
				5355	// Just copy any non-worthwhile alternatives.
				5356	for (int j = first_with_prefix; j < i; j++) {
				5357	alternatives->at(write_posn++) = alternatives->at(j);
				5358	}
				5359	}
				5360	}
				5361	alternatives->Rewind(write_posn); // Trim end of array.
				5362	}
				5363
				5364
				5365	// Optimizes b\|c\|z to [bcz].
				5366	void RegExpDisjunction::FixSingleCharacterDisjunctions(
				5367	RegExpCompiler* compiler) {
				5368	Zone* zone = compiler->zone();
				5369	ZoneList<RegExpTree> alternatives = this->alternatives();
				5370	int length = alternatives->length();
				5371
				5372	int write_posn = 0;
				5373	int i = 0;
				5374	while (i < length) {
				5375	RegExpTree* alternative = alternatives->at(i);
				5376	if (!alternative->IsAtom()) {
				5377	alternatives->at(write_posn++) = alternatives->at(i);
				5378	i++;
				5379	continue;
				5380	}
				5381	RegExpAtom* atom = alternative->AsAtom();
				5382	if (atom->length() != 1) {
				5383	alternatives->at(write_posn++) = alternatives->at(i);
				5384	i++;
				5385	continue;
				5386	}
				5387	int first_in_run = i;
				5388	i++;
				5389	while (i < length) {
				5390	alternative = alternatives->at(i);
				5391	if (!alternative->IsAtom()) break;
				5392	atom = alternative->AsAtom();
				5393	if (atom->length() != 1) break;
				5394	i++;
				5395	}
				5396	if (i > first_in_run + 1) {
				5397	// Found non-trivial run of single-character alternatives.
				5398	int run_length = i - first_in_run;
				5399	ZoneList<CharacterRange>* ranges =
				5400	new (zone) ZoneList<CharacterRange>(2, zone);
				5401	for (int j = 0; j < run_length; j++) {
				5402	RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
				5403	DCHECK_EQ(old_atom->length(), 1);
				5404	ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
				5405	}
				5406	alternatives->at(write_posn++) =
				5407	new (zone) RegExpCharacterClass(ranges, false);
				5408	} else {
				5409	// Just copy any trivial alternatives.
				5410	for (int j = first_in_run; j < i; j++) {
				5411	alternatives->at(write_posn++) = alternatives->at(j);
				5412	}
				5413	}
				5414	}
				5415	alternatives->Rewind(write_posn); // Trim end of array.
				5416	}
				5417
				5418
				5419	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				5420	RegExpNode* on_success) {
				5421	ZoneList<RegExpTree> alternatives = this->alternatives();
				5422
				5423	if (alternatives->length() > 2) {
				5424	bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
				5425	if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
				5426	FixSingleCharacterDisjunctions(compiler);
				5427	if (alternatives->length() == 1) {
				5428	return alternatives->at(0)->ToNode(compiler, on_success);
				5429	}
				5430	}
				5431
				5432	int length = alternatives->length();
				5433
				5434	ChoiceNode* result =
				5435	new(compiler->zone()) ChoiceNode(length, compiler->zone());
				5436	for (int i = 0; i < length; i++) {
				5437	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				5438	on_success));
				5439	result->AddAlternative(alternative);
				5440	}
				5441	return result;
				5442	}
				5443
				5444
				5445	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				5446	RegExpNode* on_success) {
				5447	return ToNode(min(),
				5448	max(),
				5449	is_greedy(),
				5450	body(),
				5451	compiler,
				5452	on_success);
				5453	}
				5454
				5455
				5456	// Scoped object to keep track of how much we unroll quantifier loops in the
				5457	// regexp graph generator.
				5458	class RegExpExpansionLimiter {
				5459	public:
				5460	static const int kMaxExpansionFactor = 6;
				5461	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				5462	: compiler_(compiler),
				5463	saved_expansion_factor_(compiler->current_expansion_factor()),
				5464	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				5465	DCHECK(factor > 0);
				5466	if (ok_to_expand_) {
				5467	if (factor > kMaxExpansionFactor) {
				5468	// Avoid integer overflow of the current expansion factor.
				5469	ok_to_expand_ = false;
				5470	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				5471	} else {
				5472	int new_factor = saved_expansion_factor_ * factor;
				5473	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				5474	compiler->set_current_expansion_factor(new_factor);
				5475	}
				5476	}
				5477	}
				5478
				5479	~RegExpExpansionLimiter() {
				5480	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				5481	}
				5482
				5483	bool ok_to_expand() { return ok_to_expand_; }
				5484
				5485	private:
				5486	RegExpCompiler* compiler_;
				5487	int saved_expansion_factor_;
				5488	bool ok_to_expand_;
				5489
				5490	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				5491	};
				5492
				5493
				5494	RegExpNode* RegExpQuantifier::ToNode(int min,
				5495	int max,
				5496	bool is_greedy,
				5497	RegExpTree* body,
				5498	RegExpCompiler* compiler,
				5499	RegExpNode* on_success,
				5500	bool not_at_start) {
				5501	// x{f, t} becomes this:
				5502	//
				5503	// (r++)<-.
				5504	// \| `
				5505	// \| (x)
				5506	// v ^
				5507	// (r=0)-->(?)---/ [if r < t]
				5508	// \|
				5509	// [if r >= f] \----> ...
				5510	//
				5511
				5512	// 15.10.2.5 RepeatMatcher algorithm.
				5513	// The parser has already eliminated the case where max is 0. In the case
				5514	// where max_match is zero the parser has removed the quantifier if min was
				5515	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				5516
				5517	// If we know that we cannot match zero length then things are a little
				5518	// simpler since we don't need to make the special zero length match check
				5519	// from step 2.1. If the min and max are small we can unroll a little in
				5520	// this case.
				5521	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				5522	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				5523	if (max == 0) return on_success; // This can happen due to recursion.
				5524	bool body_can_be_empty = (body->min_match() == 0);
				5525	int body_start_reg = RegExpCompiler::kNoRegister;
				5526	Interval capture_registers = body->CaptureRegisters();
				5527	bool needs_capture_clearing = !capture_registers.is_empty();
				5528	Zone* zone = compiler->zone();
				5529
				5530	if (body_can_be_empty) {
				5531	body_start_reg = compiler->AllocateRegister();
				5532	} else if (compiler->optimize() && !needs_capture_clearing) {
				5533	// Only unroll if there are no captures and the body can't be
				5534	// empty.
				5535	{
				5536	RegExpExpansionLimiter limiter(
				5537	compiler, min + ((max != min) ? 1 : 0));
				5538	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				5539	int new_max = (max == kInfinity) ? max : max - min;
				5540	// Recurse once to get the loop or optional matches after the fixed
				5541	// ones.
				5542	RegExpNode* answer = ToNode(
				5543	0, new_max, is_greedy, body, compiler, on_success, true);
				5544	// Unroll the forced matches from 0 to min. This can cause chains of
				5545	// TextNodes (which the parser does not generate). These should be
				5546	// combined if it turns out they hinder good code generation.
				5547	for (int i = 0; i < min; i++) {
				5548	answer = body->ToNode(compiler, answer);
				5549	}
				5550	return answer;
				5551	}
				5552	}
				5553	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				5554	DCHECK(max > 0); // Due to the 'if' above.
				5555	RegExpExpansionLimiter limiter(compiler, max);
				5556	if (limiter.ok_to_expand()) {
				5557	// Unroll the optional matches up to max.
				5558	RegExpNode* answer = on_success;
				5559	for (int i = 0; i < max; i++) {
				5560	ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
				5561	if (is_greedy) {
				5562	alternation->AddAlternative(
				5563	GuardedAlternative(body->ToNode(compiler, answer)));
				5564	alternation->AddAlternative(GuardedAlternative(on_success));
				5565	} else {
				5566	alternation->AddAlternative(GuardedAlternative(on_success));
				5567	alternation->AddAlternative(
				5568	GuardedAlternative(body->ToNode(compiler, answer)));
				5569	}
				5570	answer = alternation;
				5571	if (not_at_start && !compiler->read_backward()) {
				5572	alternation->set_not_at_start();
				5573	}
				5574	}
				5575	return answer;
				5576	}
				5577	}
				5578	}
				5579	bool has_min = min > 0;
				5580	bool has_max = max < RegExpTree::kInfinity;
				5581	bool needs_counter = has_min \|\| has_max;
				5582	int reg_ctr = needs_counter
				5583	? compiler->AllocateRegister()
				5584	: RegExpCompiler::kNoRegister;
				5585	LoopChoiceNode* center = new (zone)
				5586	LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
				5587	if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
				5588	RegExpNode* loop_return = needs_counter
				5589	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				5590	: static_cast<RegExpNode*>(center);
				5591	if (body_can_be_empty) {
				5592	// If the body can be empty we need to check if it was and then
				5593	// backtrack.
				5594	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				5595	reg_ctr,
				5596	min,
				5597	loop_return);
				5598	}
				5599	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				5600	if (body_can_be_empty) {
				5601	// If the body can be empty we need to store the start position
				5602	// so we can bail out if it was empty.
				5603	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				5604	}
				5605	if (needs_capture_clearing) {
				5606	// Before entering the body of this loop we need to clear captures.
				5607	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				5608	}
				5609	GuardedAlternative body_alt(body_node);
				5610	if (has_max) {
				5611	Guard* body_guard =
				5612	new(zone) Guard(reg_ctr, Guard::LT, max);
				5613	body_alt.AddGuard(body_guard, zone);
				5614	}
				5615	GuardedAlternative rest_alt(on_success);
				5616	if (has_min) {
				5617	Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
				5618	rest_alt.AddGuard(rest_guard, zone);
				5619	}
				5620	if (is_greedy) {
				5621	center->AddLoopAlternative(body_alt);
				5622	center->AddContinueAlternative(rest_alt);
				5623	} else {
				5624	center->AddContinueAlternative(rest_alt);
				5625	center->AddLoopAlternative(body_alt);
				5626	}
				5627	if (needs_counter) {
				5628	return ActionNode::SetRegister(reg_ctr, 0, center);
				5629	} else {
				5630	return center;
				5631	}
				5632	}
				5633
				5634
				5635	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				5636	RegExpNode* on_success) {
				5637	NodeInfo info;
				5638	Zone* zone = compiler->zone();
				5639
				5640	switch (assertion_type()) {
				5641	case START_OF_LINE:
				5642	return AssertionNode::AfterNewline(on_success);
				5643	case START_OF_INPUT:
				5644	return AssertionNode::AtStart(on_success);
				5645	case BOUNDARY:
				5646	return AssertionNode::AtBoundary(on_success);
				5647	case NON_BOUNDARY:
				5648	return AssertionNode::AtNonBoundary(on_success);
				5649	case END_OF_INPUT:
				5650	return AssertionNode::AtEnd(on_success);
				5651	case END_OF_LINE: {
				5652	// Compile $ in multiline regexps as an alternation with a positive
				5653	// lookahead in one side and an end-of-input on the other side.
				5654	// We need two registers for the lookahead.
				5655	int stack_pointer_register = compiler->AllocateRegister();
				5656	int position_register = compiler->AllocateRegister();
				5657	// The ChoiceNode to distinguish between a newline and end-of-input.
				5658	ChoiceNode* result = new(zone) ChoiceNode(2, zone);
				5659	// Create a newline atom.
				5660	ZoneList<CharacterRange>* newline_ranges =
				5661	new(zone) ZoneList<CharacterRange>(3, zone);
				5662	CharacterRange::AddClassEscape('n', newline_ranges, zone);
				5663	RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
				5664	TextNode* newline_matcher = new (zone) TextNode(
				5665	newline_atom, false, ActionNode::PositiveSubmatchSuccess(
				5666	stack_pointer_register, position_register,
				5667	0, // No captures inside.
				5668	-1, // Ignored if no captures.
				5669	on_success));
				5670	// Create an end-of-input matcher.
				5671	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				5672	stack_pointer_register,
				5673	position_register,
				5674	newline_matcher);
				5675	// Add the two alternatives to the ChoiceNode.
				5676	GuardedAlternative eol_alternative(end_of_line);
				5677	result->AddAlternative(eol_alternative);
				5678	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				5679	result->AddAlternative(end_alternative);
				5680	return result;
				5681	}
				5682	default:
				5683	UNREACHABLE();
				5684	}
				5685	return on_success;
				5686	}
				5687
				5688
				5689	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				5690	RegExpNode* on_success) {
				5691	return new (compiler->zone())
				5692	BackReferenceNode(RegExpCapture::StartRegister(index()),
				5693	RegExpCapture::EndRegister(index()),
				5694	compiler->read_backward(), on_success);
				5695	}
				5696
				5697
				5698	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				5699	RegExpNode* on_success) {
				5700	return on_success;
				5701	}
				5702
				5703
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5704	RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
				5705	int stack_pointer_register,
				5706	int position_register,
				5707	int capture_register_count,
				5708	int capture_register_start)
				5709	: is_positive_(is_positive),
				5710	on_success_(on_success),
				5711	stack_pointer_register_(stack_pointer_register),
				5712	position_register_(position_register) {
				5713	if (is_positive_) {
				5714	on_match_success_ = ActionNode::PositiveSubmatchSuccess(
				5715	stack_pointer_register, position_register, capture_register_count,
				5716	capture_register_start, on_success_);
				5717	} else {
				5718	Zone* zone = on_success_->zone();
				5719	on_match_success_ = new (zone) NegativeSubmatchSuccess(
				5720	stack_pointer_register, position_register, capture_register_count,
				5721	capture_register_start, zone);
				5722	}
				5723	}
				5724
				5725
				5726	RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
				5727	if (is_positive_) {
				5728	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5729	position_register_, match);
				5730	} else {
				5731	Zone* zone = on_success_->zone();
				5732	// We use a ChoiceNode to represent the negative lookaround. The first
				5733	// alternative is the negative match. On success, the end node backtracks.
				5734	// On failure, the second alternative is tried and leads to success.
				5735	// NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
				5736	// first exit when calculating quick checks.
				5737	ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
				5738	GuardedAlternative(match), GuardedAlternative(on_success_), zone);
				5739	return ActionNode::BeginSubmatch(stack_pointer_register_,
				5740	position_register_, choice_node);
				5741	}
				5742	}
				5743
				5744
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5745	RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
				5746	RegExpNode* on_success) {
				5747	int stack_pointer_register = compiler->AllocateRegister();
				5748	int position_register = compiler->AllocateRegister();
				5749
				5750	const int registers_per_capture = 2;
				5751	const int register_of_first_capture = 2;
				5752	int register_count = capture_count_ * registers_per_capture;
				5753	int register_start =
				5754	register_of_first_capture + capture_from_ * registers_per_capture;
				5755
				5756	RegExpNode* result;
				5757	bool was_reading_backward = compiler->read_backward();
				5758	compiler->set_read_backward(type() == LOOKBEHIND);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5759	Builder builder(is_positive(), on_success, stack_pointer_register,
				5760	position_register, register_count, register_start);
				5761	RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
				5762	result = builder.ForMatch(match);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5763	compiler->set_read_backward(was_reading_backward);
				5764	return result;
				5765	}
				5766
				5767
				5768	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				5769	RegExpNode* on_success) {
				5770	return ToNode(body(), index(), compiler, on_success);
				5771	}
				5772
				5773
				5774	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				5775	int index,
				5776	RegExpCompiler* compiler,
				5777	RegExpNode* on_success) {
				5778	DCHECK_NOT_NULL(body);
				5779	int start_reg = RegExpCapture::StartRegister(index);
				5780	int end_reg = RegExpCapture::EndRegister(index);
				5781	if (compiler->read_backward()) std::swap(start_reg, end_reg);
				5782	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				5783	RegExpNode* body_node = body->ToNode(compiler, store_end);
				5784	return ActionNode::StorePosition(start_reg, true, body_node);
				5785	}
				5786
				5787
				5788	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				5789	RegExpNode* on_success) {
				5790	ZoneList<RegExpTree> children = nodes();
				5791	RegExpNode* current = on_success;
				5792	if (compiler->read_backward()) {
				5793	for (int i = 0; i < children->length(); i++) {
				5794	current = children->at(i)->ToNode(compiler, current);
				5795	}
				5796	} else {
				5797	for (int i = children->length() - 1; i >= 0; i--) {
				5798	current = children->at(i)->ToNode(compiler, current);
				5799	}
				5800	}
				5801	return current;
				5802	}
				5803
				5804
				5805	static void AddClass(const int* elmv,
				5806	int elmc,
				5807	ZoneList<CharacterRange>* ranges,
				5808	Zone* zone) {
				5809	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5810	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5811	for (int i = 0; i < elmc; i += 2) {
				5812	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5813	ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5814	}
				5815	}
				5816
				5817
				5818	static void AddClassNegated(const int *elmv,
				5819	int elmc,
				5820	ZoneList<CharacterRange>* ranges,
				5821	Zone* zone) {
				5822	elmc--;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5823	DCHECK(elmv[elmc] == kRangeEndMarker);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5824	DCHECK(elmv[0] != 0x0000);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5825	DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5826	uc16 last = 0x0000;
				5827	for (int i = 0; i < elmc; i += 2) {
				5828	DCHECK(last <= elmv[i] - 1);
				5829	DCHECK(elmv[i] < elmv[i + 1]);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5830	ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5831	last = elmv[i + 1];
				5832	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5833	ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5834	}
				5835
				5836
				5837	void CharacterRange::AddClassEscape(uc16 type,
				5838	ZoneList<CharacterRange>* ranges,
				5839	Zone* zone) {
				5840	switch (type) {
				5841	case 's':
				5842	AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5843	break;
				5844	case 'S':
				5845	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
				5846	break;
				5847	case 'w':
				5848	AddClass(kWordRanges, kWordRangeCount, ranges, zone);
				5849	break;
				5850	case 'W':
				5851	AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
				5852	break;
				5853	case 'd':
				5854	AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
				5855	break;
				5856	case 'D':
				5857	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
				5858	break;
				5859	case '.':
				5860	AddClassNegated(kLineTerminatorRanges,
				5861	kLineTerminatorRangeCount,
				5862	ranges,
				5863	zone);
				5864	break;
				5865	// This is not a character range as defined by the spec but a
				5866	// convenient shorthand for a character class that matches any
				5867	// character.
				5868	case '*':
				5869	ranges->Add(CharacterRange::Everything(), zone);
				5870	break;
				5871	// This is the set of characters matched by the $ and ^ symbols
				5872	// in multiline mode.
				5873	case 'n':
				5874	AddClass(kLineTerminatorRanges,
				5875	kLineTerminatorRangeCount,
				5876	ranges,
				5877	zone);
				5878	break;
				5879	default:
				5880	UNREACHABLE();
				5881	}
				5882	}
				5883
				5884
				5885	Vector<const int> CharacterRange::GetWordBounds() {
				5886	return Vector<const int>(kWordRanges, kWordRangeCount - 1);
				5887	}
				5888
				5889
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5890	void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
				5891	ZoneList<CharacterRange>* ranges,
				5892	bool is_one_byte) {
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame]	5893	CharacterRange::Canonicalize(ranges);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5894	int range_count = ranges->length();
				5895	for (int i = 0; i < range_count; i++) {
				5896	CharacterRange range = ranges->at(i);
				5897	uc32 bottom = range.from();
				5898	if (bottom > String::kMaxUtf16CodeUnit) return;
				5899	uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
				5900	// Nothing to be done for surrogates.
				5901	if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
				5902	if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
				5903	if (bottom > String::kMaxOneByteCharCode) return;
				5904	if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5905	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5906	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5907	if (top == bottom) {
				5908	// If this is a singleton we just expand the one character.
				5909	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5910	for (int i = 0; i < length; i++) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5911	uc32 chr = chars[i];
				5912	if (chr != bottom) {
				5913	ranges->Add(CharacterRange::Singleton(chars[i]), zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5914	}
				5915	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	5916	} else {
				5917	// If this is a range we expand the characters block by block, expanding
				5918	// contiguous subranges (blocks) one at a time. The approach is as
				5919	// follows. For a given start character we look up the remainder of the
				5920	// block that contains it (represented by the end point), for instance we
				5921	// find 'z' if the character is 'c'. A block is characterized by the
				5922	// property that all characters uncanonicalize in the same way, except
				5923	// that each entry in the result is incremented by the distance from the
				5924	// first element. So a-z is a block because 'a' uncanonicalizes to ['a',
				5925	// 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
				5926	// we've found the end point we look up its uncanonicalization and
				5927	// produce a range for each element. For instance for [c-f] we look up
				5928	// ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
				5929	// it is not already contained in the input, so [c-f] will be skipped but
				5930	// [C-F] will be added. If this range is not completely contained in a
				5931	// block we do this for all the blocks covered by the range (handling
				5932	// characters that is not in a block as a "singleton block").
				5933	unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				5934	int pos = bottom;
				5935	while (pos <= top) {
				5936	int length =
				5937	isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
				5938	uc32 block_end;
				5939	if (length == 0) {
				5940	block_end = pos;
				5941	} else {
				5942	DCHECK_EQ(1, length);
				5943	block_end = equivalents[0];
				5944	}
				5945	int end = (block_end > top) ? top : block_end;
				5946	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
				5947	equivalents);
				5948	for (int i = 0; i < length; i++) {
				5949	uc32 c = equivalents[i];
				5950	uc32 range_from = c - (block_end - pos);
				5951	uc32 range_to = c - (block_end - end);
				5952	if (!(bottom <= range_from && range_to <= top)) {
				5953	ranges->Add(CharacterRange::Range(range_from, range_to), zone);
				5954	}
				5955	}
				5956	pos = end + 1;
				5957	}
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	5958	}
				5959	}
				5960	}
				5961
				5962
				5963	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				5964	DCHECK_NOT_NULL(ranges);
				5965	int n = ranges->length();
				5966	if (n <= 1) return true;
				5967	int max = ranges->at(0).to();
				5968	for (int i = 1; i < n; i++) {
				5969	CharacterRange next_range = ranges->at(i);
				5970	if (next_range.from() <= max + 1) return false;
				5971	max = next_range.to();
				5972	}
				5973	return true;
				5974	}
				5975
				5976
				5977	ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
				5978	if (ranges_ == NULL) {
				5979	ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
				5980	CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
				5981	}
				5982	return ranges_;
				5983	}
				5984
				5985
				5986	// Move a number of elements in a zonelist to another position
				5987	// in the same list. Handles overlapping source and target areas.
				5988	static void MoveRanges(ZoneList<CharacterRange>* list,
				5989	int from,
				5990	int to,
				5991	int count) {
				5992	// Ranges are potentially overlapping.
				5993	if (from < to) {
				5994	for (int i = count - 1; i >= 0; i--) {
				5995	list->at(to + i) = list->at(from + i);
				5996	}
				5997	} else {
				5998	for (int i = 0; i < count; i++) {
				5999	list->at(to + i) = list->at(from + i);
				6000	}
				6001	}
				6002	}
				6003
				6004
				6005	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				6006	int count,
				6007	CharacterRange insert) {
				6008	// Inserts a range into list[0..count[, which must be sorted
				6009	// by from value and non-overlapping and non-adjacent, using at most
				6010	// list[0..count] for the result. Returns the number of resulting
				6011	// canonicalized ranges. Inserting a range may collapse existing ranges into
				6012	// fewer ranges, so the return value can be anything in the range 1..count+1.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6013	uc32 from = insert.from();
				6014	uc32 to = insert.to();
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6015	int start_pos = 0;
				6016	int end_pos = count;
				6017	for (int i = count - 1; i >= 0; i--) {
				6018	CharacterRange current = list->at(i);
				6019	if (current.from() > to + 1) {
				6020	end_pos = i;
				6021	} else if (current.to() + 1 < from) {
				6022	start_pos = i + 1;
				6023	break;
				6024	}
				6025	}
				6026
				6027	// Inserted range overlaps, or is adjacent to, ranges at positions
				6028	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				6029	// not affected by the insertion.
				6030	// If start_pos == end_pos, the range must be inserted before start_pos.
				6031	// if start_pos < end_pos, the entire range from start_pos to end_pos
				6032	// must be merged with the insert range.
				6033
				6034	if (start_pos == end_pos) {
				6035	// Insert between existing ranges at position start_pos.
				6036	if (start_pos < count) {
				6037	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				6038	}
				6039	list->at(start_pos) = insert;
				6040	return count + 1;
				6041	}
				6042	if (start_pos + 1 == end_pos) {
				6043	// Replace single existing range at position start_pos.
				6044	CharacterRange to_replace = list->at(start_pos);
				6045	int new_from = Min(to_replace.from(), from);
				6046	int new_to = Max(to_replace.to(), to);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6047	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6048	return count;
				6049	}
				6050	// Replace a number of existing ranges from start_pos to end_pos - 1.
				6051	// Move the remaining ranges down.
				6052
				6053	int new_from = Min(list->at(start_pos).from(), from);
				6054	int new_to = Max(list->at(end_pos - 1).to(), to);
				6055	if (end_pos < count) {
				6056	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				6057	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6058	list->at(start_pos) = CharacterRange::Range(new_from, new_to);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6059	return count - (end_pos - start_pos) + 1;
				6060	}
				6061
				6062
				6063	void CharacterSet::Canonicalize() {
				6064	// Special/default classes are always considered canonical. The result
				6065	// of calling ranges() will be sorted.
				6066	if (ranges_ == NULL) return;
				6067	CharacterRange::Canonicalize(ranges_);
				6068	}
				6069
				6070
				6071	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				6072	if (character_ranges->length() <= 1) return;
				6073	// Check whether ranges are already canonical (increasing, non-overlapping,
				6074	// non-adjacent).
				6075	int n = character_ranges->length();
				6076	int max = character_ranges->at(0).to();
				6077	int i = 1;
				6078	while (i < n) {
				6079	CharacterRange current = character_ranges->at(i);
				6080	if (current.from() <= max + 1) {
				6081	break;
				6082	}
				6083	max = current.to();
				6084	i++;
				6085	}
				6086	// Canonical until the i'th range. If that's all of them, we are done.
				6087	if (i == n) return;
				6088
				6089	// The ranges at index i and forward are not canonicalized. Make them so by
				6090	// doing the equivalent of insertion sort (inserting each into the previous
				6091	// list, in order).
				6092	// Notice that inserting a range can reduce the number of ranges in the
				6093	// result due to combining of adjacent and overlapping ranges.
				6094	int read = i; // Range to insert.
				6095	int num_canonical = i; // Length of canonicalized part of list.
				6096	do {
				6097	num_canonical = InsertRangeInCanonicalList(character_ranges,
				6098	num_canonical,
				6099	character_ranges->at(read));
				6100	read++;
				6101	} while (read < n);
				6102	character_ranges->Rewind(num_canonical);
				6103
				6104	DCHECK(CharacterRange::IsCanonical(character_ranges));
				6105	}
				6106
				6107
				6108	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				6109	ZoneList<CharacterRange>* negated_ranges,
				6110	Zone* zone) {
				6111	DCHECK(CharacterRange::IsCanonical(ranges));
				6112	DCHECK_EQ(0, negated_ranges->length());
				6113	int range_count = ranges->length();
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6114	uc32 from = 0;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6115	int i = 0;
				6116	if (range_count > 0 && ranges->at(0).from() == 0) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6117	from = ranges->at(0).to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6118	i = 1;
				6119	}
				6120	while (i < range_count) {
				6121	CharacterRange range = ranges->at(i);
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6122	negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
				6123	from = range.to() + 1;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6124	i++;
				6125	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6126	if (from < String::kMaxCodePoint) {
				6127	negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6128	zone);
				6129	}
				6130	}
				6131
				6132
				6133	// -------------------------------------------------------------------
				6134	// Splay tree
				6135
				6136
				6137	OutSet* OutSet::Extend(unsigned value, Zone* zone) {
				6138	if (Get(value))
				6139	return this;
				6140	if (successors(zone) != NULL) {
				6141	for (int i = 0; i < successors(zone)->length(); i++) {
				6142	OutSet* successor = successors(zone)->at(i);
				6143	if (successor->Get(value))
				6144	return successor;
				6145	}
				6146	} else {
				6147	successors_ = new(zone) ZoneList<OutSet*>(2, zone);
				6148	}
				6149	OutSet* result = new(zone) OutSet(first_, remaining_);
				6150	result->Set(value, zone);
				6151	successors(zone)->Add(result, zone);
				6152	return result;
				6153	}
				6154
				6155
				6156	void OutSet::Set(unsigned value, Zone *zone) {
				6157	if (value < kFirstLimit) {
				6158	first_ \|= (1 << value);
				6159	} else {
				6160	if (remaining_ == NULL)
				6161	remaining_ = new(zone) ZoneList<unsigned>(1, zone);
				6162	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				6163	remaining_->Add(value, zone);
				6164	}
				6165	}
				6166
				6167
				6168	bool OutSet::Get(unsigned value) const {
				6169	if (value < kFirstLimit) {
				6170	return (first_ & (1 << value)) != 0;
				6171	} else if (remaining_ == NULL) {
				6172	return false;
				6173	} else {
				6174	return remaining_->Contains(value);
				6175	}
				6176	}
				6177
				6178
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6179	const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6180
				6181
				6182	void DispatchTable::AddRange(CharacterRange full_range, int value,
				6183	Zone* zone) {
				6184	CharacterRange current = full_range;
				6185	if (tree()->is_empty()) {
				6186	// If this is the first range we just insert into the table.
				6187	ZoneSplayTree<Config>::Locator loc;
				6188	bool inserted = tree()->Insert(current.from(), &loc);
				6189	DCHECK(inserted);
				6190	USE(inserted);
				6191	loc.set_value(Entry(current.from(), current.to(),
				6192	empty()->Extend(value, zone)));
				6193	return;
				6194	}
				6195	// First see if there is a range to the left of this one that
				6196	// overlaps.
				6197	ZoneSplayTree<Config>::Locator loc;
				6198	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				6199	Entry* entry = &loc.value();
				6200	// If we've found a range that overlaps with this one, and it
				6201	// starts strictly to the left of this one, we have to fix it
				6202	// because the following code only handles ranges that start on
				6203	// or after the start point of the range we're adding.
				6204	if (entry->from() < current.from() && entry->to() >= current.from()) {
				6205	// Snap the overlapping range in half around the start point of
				6206	// the range we're adding.
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6207	CharacterRange left =
				6208	CharacterRange::Range(entry->from(), current.from() - 1);
				6209	CharacterRange right = CharacterRange::Range(current.from(), entry->to());
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6210	// The left part of the overlapping range doesn't overlap.
				6211	// Truncate the whole entry to be just the left part.
				6212	entry->set_to(left.to());
				6213	// The right part is the one that overlaps. We add this part
				6214	// to the map and let the next step deal with merging it with
				6215	// the range we're adding.
				6216	ZoneSplayTree<Config>::Locator loc;
				6217	bool inserted = tree()->Insert(right.from(), &loc);
				6218	DCHECK(inserted);
				6219	USE(inserted);
				6220	loc.set_value(Entry(right.from(),
				6221	right.to(),
				6222	entry->out_set()));
				6223	}
				6224	}
				6225	while (current.is_valid()) {
				6226	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				6227	(loc.value().from() <= current.to()) &&
				6228	(loc.value().to() >= current.from())) {
				6229	Entry* entry = &loc.value();
				6230	// We have overlap. If there is space between the start point of
				6231	// the range we're adding and where the overlapping range starts
				6232	// then we have to add a range covering just that space.
				6233	if (current.from() < entry->from()) {
				6234	ZoneSplayTree<Config>::Locator ins;
				6235	bool inserted = tree()->Insert(current.from(), &ins);
				6236	DCHECK(inserted);
				6237	USE(inserted);
				6238	ins.set_value(Entry(current.from(),
				6239	entry->from() - 1,
				6240	empty()->Extend(value, zone)));
				6241	current.set_from(entry->from());
				6242	}
				6243	DCHECK_EQ(current.from(), entry->from());
				6244	// If the overlapping range extends beyond the one we want to add
				6245	// we have to snap the right part off and add it separately.
				6246	if (entry->to() > current.to()) {
				6247	ZoneSplayTree<Config>::Locator ins;
				6248	bool inserted = tree()->Insert(current.to() + 1, &ins);
				6249	DCHECK(inserted);
				6250	USE(inserted);
				6251	ins.set_value(Entry(current.to() + 1,
				6252	entry->to(),
				6253	entry->out_set()));
				6254	entry->set_to(current.to());
				6255	}
				6256	DCHECK(entry->to() <= current.to());
				6257	// The overlapping range is now completely contained by the range
				6258	// we're adding so we can just update it and move the start point
				6259	// of the range we're adding just past it.
				6260	entry->AddValue(value, zone);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6261	DCHECK(entry->to() + 1 > current.from());
				6262	current.set_from(entry->to() + 1);
				6263	} else {
				6264	// There is no overlap so we can just add the range
				6265	ZoneSplayTree<Config>::Locator ins;
				6266	bool inserted = tree()->Insert(current.from(), &ins);
				6267	DCHECK(inserted);
				6268	USE(inserted);
				6269	ins.set_value(Entry(current.from(),
				6270	current.to(),
				6271	empty()->Extend(value, zone)));
				6272	break;
				6273	}
				6274	}
				6275	}
				6276
				6277
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6278	OutSet* DispatchTable::Get(uc32 value) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6279	ZoneSplayTree<Config>::Locator loc;
				6280	if (!tree()->FindGreatestLessThan(value, &loc))
				6281	return empty();
				6282	Entry* entry = &loc.value();
				6283	if (value <= entry->to())
				6284	return entry->out_set();
				6285	else
				6286	return empty();
				6287	}
				6288
				6289
				6290	// -------------------------------------------------------------------
				6291	// Analysis
				6292
				6293
				6294	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				6295	StackLimitCheck check(isolate());
				6296	if (check.HasOverflowed()) {
				6297	fail("Stack overflow");
				6298	return;
				6299	}
				6300	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				6301	return;
				6302	that->info()->being_analyzed = true;
				6303	that->Accept(this);
				6304	that->info()->being_analyzed = false;
				6305	that->info()->been_analyzed = true;
				6306	}
				6307
				6308
				6309	void Analysis::VisitEnd(EndNode* that) {
				6310	// nothing to do
				6311	}
				6312
				6313
				6314	void TextNode::CalculateOffsets() {
				6315	int element_count = elements()->length();
				6316	// Set up the offsets of the elements relative to the start. This is a fixed
				6317	// quantity since a TextNode can only contain fixed-width things.
				6318	int cp_offset = 0;
				6319	for (int i = 0; i < element_count; i++) {
				6320	TextElement& elm = elements()->at(i);
				6321	elm.set_cp_offset(cp_offset);
				6322	cp_offset += elm.length();
				6323	}
				6324	}
				6325
				6326
				6327	void Analysis::VisitText(TextNode* that) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6328	if (ignore_case()) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6329	that->MakeCaseIndependent(isolate(), is_one_byte_);
				6330	}
				6331	EnsureAnalyzed(that->on_success());
				6332	if (!has_failed()) {
				6333	that->CalculateOffsets();
				6334	}
				6335	}
				6336
				6337
				6338	void Analysis::VisitAction(ActionNode* that) {
				6339	RegExpNode* target = that->on_success();
				6340	EnsureAnalyzed(target);
				6341	if (!has_failed()) {
				6342	// If the next node is interested in what it follows then this node
				6343	// has to be interested too so it can pass the information on.
				6344	that->info()->AddFromFollowing(target->info());
				6345	}
				6346	}
				6347
				6348
				6349	void Analysis::VisitChoice(ChoiceNode* that) {
				6350	NodeInfo* info = that->info();
				6351	for (int i = 0; i < that->alternatives()->length(); i++) {
				6352	RegExpNode* node = that->alternatives()->at(i).node();
				6353	EnsureAnalyzed(node);
				6354	if (has_failed()) return;
				6355	// Anything the following nodes need to know has to be known by
				6356	// this node also, so it can pass it on.
				6357	info->AddFromFollowing(node->info());
				6358	}
				6359	}
				6360
				6361
				6362	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				6363	NodeInfo* info = that->info();
				6364	for (int i = 0; i < that->alternatives()->length(); i++) {
				6365	RegExpNode* node = that->alternatives()->at(i).node();
				6366	if (node != that->loop_node()) {
				6367	EnsureAnalyzed(node);
				6368	if (has_failed()) return;
				6369	info->AddFromFollowing(node->info());
				6370	}
				6371	}
				6372	// Check the loop last since it may need the value of this node
				6373	// to get a correct result.
				6374	EnsureAnalyzed(that->loop_node());
				6375	if (!has_failed()) {
				6376	info->AddFromFollowing(that->loop_node()->info());
				6377	}
				6378	}
				6379
				6380
				6381	void Analysis::VisitBackReference(BackReferenceNode* that) {
				6382	EnsureAnalyzed(that->on_success());
				6383	}
				6384
				6385
				6386	void Analysis::VisitAssertion(AssertionNode* that) {
				6387	EnsureAnalyzed(that->on_success());
				6388	}
				6389
				6390
				6391	void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6392	BoyerMooreLookahead* bm,
				6393	bool not_at_start) {
				6394	// Working out the set of characters that a backreference can match is too
				6395	// hard, so we just say that any character can match.
				6396	bm->SetRest(offset);
				6397	SaveBMInfo(bm, not_at_start, offset);
				6398	}
				6399
				6400
				6401	STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
				6402	RegExpMacroAssembler::kTableSize);
				6403
				6404
				6405	void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
				6406	BoyerMooreLookahead* bm, bool not_at_start) {
				6407	ZoneList<GuardedAlternative>* alts = alternatives();
				6408	budget = (budget - 1) / alts->length();
				6409	for (int i = 0; i < alts->length(); i++) {
				6410	GuardedAlternative& alt = alts->at(i);
				6411	if (alt.guards() != NULL && alt.guards()->length() != 0) {
				6412	bm->SetRest(offset); // Give up trying to fill in info.
				6413	SaveBMInfo(bm, not_at_start, offset);
				6414	return;
				6415	}
				6416	alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
				6417	}
				6418	SaveBMInfo(bm, not_at_start, offset);
				6419	}
				6420
				6421
				6422	void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
				6423	BoyerMooreLookahead* bm, bool not_at_start) {
				6424	if (initial_offset >= bm->length()) return;
				6425	int offset = initial_offset;
				6426	int max_char = bm->max_char();
				6427	for (int i = 0; i < elements()->length(); i++) {
				6428	if (offset >= bm->length()) {
				6429	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6430	return;
				6431	}
				6432	TextElement text = elements()->at(i);
				6433	if (text.text_type() == TextElement::ATOM) {
				6434	RegExpAtom* atom = text.atom();
				6435	for (int j = 0; j < atom->length(); j++, offset++) {
				6436	if (offset >= bm->length()) {
				6437	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6438	return;
				6439	}
				6440	uc16 character = atom->data()[j];
				6441	if (bm->compiler()->ignore_case()) {
				6442	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				6443	int length = GetCaseIndependentLetters(
				6444	isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
				6445	chars);
				6446	for (int j = 0; j < length; j++) {
				6447	bm->Set(offset, chars[j]);
				6448	}
				6449	} else {
				6450	if (character <= max_char) bm->Set(offset, character);
				6451	}
				6452	}
				6453	} else {
				6454	DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
				6455	RegExpCharacterClass* char_class = text.char_class();
				6456	ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
				6457	if (char_class->is_negated()) {
				6458	bm->SetAll(offset);
				6459	} else {
				6460	for (int k = 0; k < ranges->length(); k++) {
				6461	CharacterRange& range = ranges->at(k);
				6462	if (range.from() > max_char) continue;
				6463	int to = Min(max_char, static_cast<int>(range.to()));
				6464	bm->SetInterval(offset, Interval(range.from(), to));
				6465	}
				6466	}
				6467	offset++;
				6468	}
				6469	}
				6470	if (offset >= bm->length()) {
				6471	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6472	return;
				6473	}
				6474	on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
				6475	true); // Not at start after a text node.
				6476	if (initial_offset == 0) set_bm_info(not_at_start, bm);
				6477	}
				6478
				6479
				6480	// -------------------------------------------------------------------
				6481	// Dispatch table construction
				6482
				6483
				6484	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				6485	AddRange(CharacterRange::Everything());
				6486	}
				6487
				6488
				6489	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				6490	node->set_being_calculated(true);
				6491	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				6492	for (int i = 0; i < alternatives->length(); i++) {
				6493	set_choice_index(i);
				6494	alternatives->at(i).node()->Accept(this);
				6495	}
				6496	node->set_being_calculated(false);
				6497	}
				6498
				6499
				6500	class AddDispatchRange {
				6501	public:
				6502	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				6503	: constructor_(constructor) { }
				6504	void Call(uc32 from, DispatchTable::Entry entry);
				6505	private:
				6506	DispatchTableConstructor* constructor_;
				6507	};
				6508
				6509
				6510	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6511	constructor_->AddRange(CharacterRange::Range(from, entry.to()));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6512	}
				6513
				6514
				6515	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				6516	if (node->being_calculated())
				6517	return;
				6518	DispatchTable* table = node->GetTable(ignore_case_);
				6519	AddDispatchRange adder(this);
				6520	table->ForEach(&adder);
				6521	}
				6522
				6523
				6524	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				6525	// TODO(160): Find the node that we refer back to and propagate its start
				6526	// set back to here. For now we just accept anything.
				6527	AddRange(CharacterRange::Everything());
				6528	}
				6529
				6530
				6531	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				6532	RegExpNode* target = that->on_success();
				6533	target->Accept(this);
				6534	}
				6535
				6536
				6537	static int CompareRangeByFrom(const CharacterRange* a,
				6538	const CharacterRange* b) {
				6539	return Compare<uc16>(a->from(), b->from());
				6540	}
				6541
				6542
				6543	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				6544	ranges->Sort(CompareRangeByFrom);
				6545	uc16 last = 0;
				6546	for (int i = 0; i < ranges->length(); i++) {
				6547	CharacterRange range = ranges->at(i);
				6548	if (last < range.from())
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6549	AddRange(CharacterRange::Range(last, range.from() - 1));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6550	if (range.to() >= last) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6551	if (range.to() == String::kMaxCodePoint) {
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6552	return;
				6553	} else {
				6554	last = range.to() + 1;
				6555	}
				6556	}
				6557	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6558	AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6559	}
				6560
				6561
				6562	void DispatchTableConstructor::VisitText(TextNode* that) {
				6563	TextElement elm = that->elements()->at(0);
				6564	switch (elm.text_type()) {
				6565	case TextElement::ATOM: {
				6566	uc16 c = elm.atom()->data()[0];
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6567	AddRange(CharacterRange::Range(c, c));
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6568	break;
				6569	}
				6570	case TextElement::CHAR_CLASS: {
				6571	RegExpCharacterClass* tree = elm.char_class();
				6572	ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
				6573	if (tree->is_negated()) {
				6574	AddInverse(ranges);
				6575	} else {
				6576	for (int i = 0; i < ranges->length(); i++)
				6577	AddRange(ranges->at(i));
				6578	}
				6579	break;
				6580	}
				6581	default: {
				6582	UNIMPLEMENTED();
				6583	}
				6584	}
				6585	}
				6586
				6587
				6588	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				6589	RegExpNode* target = that->on_success();
				6590	target->Accept(this);
				6591	}
				6592
				6593
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6594	RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
				6595	RegExpNode* on_success) {
				6596	// If the regexp matching starts within a surrogate pair, step back
				6597	// to the lead surrogate and start matching from there.
				6598	DCHECK(!compiler->read_backward());
				6599	Zone* zone = compiler->zone();
				6600	ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
				6601	zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
				6602	ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
				6603	zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
				6604
				6605	ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
				6606
				6607	int stack_register = compiler->UnicodeLookaroundStackRegister();
				6608	int position_register = compiler->UnicodeLookaroundPositionRegister();
				6609	RegExpNode* step_back = TextNode::CreateForCharacterRanges(
				6610	zone, lead_surrogates, true, on_success);
				6611	RegExpLookaround::Builder builder(true, step_back, stack_register,
				6612	position_register);
				6613	RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
				6614	zone, trail_surrogates, false, builder.on_match_success());
				6615
				6616	optional_step_back->AddAlternative(
				6617	GuardedAlternative(builder.ForMatch(match_trail)));
				6618	optional_step_back->AddAlternative(GuardedAlternative(on_success));
				6619
				6620	return optional_step_back;
				6621	}
				6622
				6623
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6624	RegExpEngine::CompilationResult RegExpEngine::Compile(
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6625	Isolate* isolate, Zone* zone, RegExpCompileData* data,
				6626	JSRegExp::Flags flags, Handle<String> pattern,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6627	Handle<String> sample_subject, bool is_one_byte) {
				6628	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				6629	return IrregexpRegExpTooBig(isolate);
				6630	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6631	bool ignore_case = flags & JSRegExp::kIgnoreCase;
				6632	bool is_sticky = flags & JSRegExp::kSticky;
				6633	bool is_global = flags & JSRegExp::kGlobal;
				6634	bool is_unicode = flags & JSRegExp::kUnicode;
				6635	RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6636	is_one_byte);
				6637
				6638	if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
				6639
				6640	// Sample some characters from the middle of the string.
				6641	static const int kSampleSize = 128;
				6642
				6643	sample_subject = String::Flatten(sample_subject);
				6644	int chars_sampled = 0;
				6645	int half_way = (sample_subject->length() - kSampleSize) / 2;
				6646	for (int i = Max(0, half_way);
				6647	i < sample_subject->length() && chars_sampled < kSampleSize;
				6648	i++, chars_sampled++) {
				6649	compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
				6650	}
				6651
				6652	// Wrap the body of the regexp in capture #0.
				6653	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				6654	0,
				6655	&compiler,
				6656	compiler.accept());
				6657	RegExpNode* node = captured_body;
				6658	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				6659	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				6660	int max_length = data->tree->max_match();
				6661	if (!is_start_anchored && !is_sticky) {
				6662	// Add a .*? at the beginning, outside the body capture, unless
				6663	// this expression is anchored at the beginning or sticky.
				6664	RegExpNode* loop_node = RegExpQuantifier::ToNode(
				6665	0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
				6666	&compiler, captured_body, data->contains_anchor);
				6667
				6668	if (data->contains_anchor) {
				6669	// Unroll loop once, to take care of the case that might start
				6670	// at the start of input.
				6671	ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
				6672	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				6673	first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
				6674	new (zone) RegExpCharacterClass('*'), false, loop_node)));
				6675	node = first_step_node;
				6676	} else {
				6677	node = loop_node;
				6678	}
				6679	}
				6680	if (is_one_byte) {
				6681	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6682	// Do it again to propagate the new nodes to places where they were not
				6683	// put because they had not been calculated yet.
				6684	if (node != NULL) {
				6685	node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
				6686	}
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6687	} else if (compiler.unicode() && (is_global \|\| is_sticky)) {
				6688	node = OptionallyStepBackToLeadSurrogate(&compiler, node);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6689	}
				6690
				6691	if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
				6692	data->node = node;
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6693	Analysis analysis(isolate, flags, is_one_byte);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6694	analysis.EnsureAnalyzed(node);
				6695	if (analysis.has_failed()) {
				6696	const char* error_message = analysis.error_message();
				6697	return CompilationResult(isolate, error_message);
				6698	}
				6699
				6700	// Create the correct assembler for the architecture.
				6701	#ifndef V8_INTERPRETED_REGEXP
				6702	// Native regexp implementation.
				6703
				6704	NativeRegExpMacroAssembler::Mode mode =
				6705	is_one_byte ? NativeRegExpMacroAssembler::LATIN1
				6706	: NativeRegExpMacroAssembler::UC16;
				6707
				6708	#if V8_TARGET_ARCH_IA32
				6709	RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
				6710	(data->capture_count + 1) * 2);
				6711	#elif V8_TARGET_ARCH_X64
				6712	RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
				6713	(data->capture_count + 1) * 2);
				6714	#elif V8_TARGET_ARCH_ARM
				6715	RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
				6716	(data->capture_count + 1) * 2);
				6717	#elif V8_TARGET_ARCH_ARM64
				6718	RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
				6719	(data->capture_count + 1) * 2);
Ben Murdoch	da12d29	2016-06-02 14:46:10 +0100	[diff] [blame]	6720	#elif V8_TARGET_ARCH_S390
				6721	RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
				6722	(data->capture_count + 1) * 2);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6723	#elif V8_TARGET_ARCH_PPC
				6724	RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
				6725	(data->capture_count + 1) * 2);
				6726	#elif V8_TARGET_ARCH_MIPS
				6727	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6728	(data->capture_count + 1) * 2);
				6729	#elif V8_TARGET_ARCH_MIPS64
				6730	RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
				6731	(data->capture_count + 1) * 2);
				6732	#elif V8_TARGET_ARCH_X87
				6733	RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
				6734	(data->capture_count + 1) * 2);
				6735	#else
				6736	#error "Unsupported architecture"
				6737	#endif
				6738
				6739	#else // V8_INTERPRETED_REGEXP
				6740	// Interpreted regexp implementation.
				6741	EmbeddedVector<byte, 1024> codes;
				6742	RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
				6743	#endif // V8_INTERPRETED_REGEXP
				6744
				6745	macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
				6746
				6747	// Inserted here, instead of in Assembler, because it depends on information
				6748	// in the AST that isn't replicated in the Node structure.
				6749	static const int kMaxBacksearchLimit = 1024;
				6750	if (is_end_anchored &&
				6751	!is_start_anchored &&
				6752	max_length < kMaxBacksearchLimit) {
				6753	macro_assembler.SetCurrentPositionFromEnd(max_length);
				6754	}
				6755
				6756	if (is_global) {
Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	6757	RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
				6758	if (data->tree->min_match() > 0) {
				6759	mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
				6760	} else if (is_unicode) {
				6761	mode = RegExpMacroAssembler::GLOBAL_UNICODE;
				6762	}
				6763	macro_assembler.set_global_mode(mode);
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6764	}
				6765
				6766	return compiler.Assemble(&macro_assembler,
				6767	node,
				6768	data->capture_count,
				6769	pattern);
				6770	}
				6771
				6772
				6773	bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
				6774	Heap* heap = pattern->GetHeap();
				6775	bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
				6776	if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
Ben Murdoch	c561043	2016-08-08 18:44:38 +0100	[diff] [blame]	6777	heap->memory_allocator()->SizeExecutable() >
Ben Murdoch	4a90d5f	2016-03-22 12:00:34 +0000	[diff] [blame]	6778	RegExpImpl::kRegExpExecutableMemoryLimit) {
				6779	too_much = true;
				6780	}
				6781	return too_much;
				6782	}
				6783
				6784
				6785	Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
				6786	Object* key_pattern,
				6787	FixedArray** last_match_cache,
				6788	ResultsCacheType type) {
				6789	FixedArray* cache;
				6790	if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
				6791	if (type == STRING_SPLIT_SUBSTRINGS) {
				6792	DCHECK(key_pattern->IsString());
				6793	if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
				6794	cache = heap->string_split_cache();
				6795	} else {
				6796	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6797	DCHECK(key_pattern->IsFixedArray());
				6798	cache = heap->regexp_multiple_cache();
				6799	}
				6800
				6801	uint32_t hash = key_string->Hash();
				6802	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6803	~(kArrayEntriesPerCacheEntry - 1));
				6804	if (cache->get(index + kStringOffset) != key_string \|\|
				6805	cache->get(index + kPatternOffset) != key_pattern) {
				6806	index =
				6807	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6808	if (cache->get(index + kStringOffset) != key_string \|\|
				6809	cache->get(index + kPatternOffset) != key_pattern) {
				6810	return Smi::FromInt(0);
				6811	}
				6812	}
				6813
				6814	*last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
				6815	return cache->get(index + kArrayOffset);
				6816	}
				6817
				6818
				6819	void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
				6820	Handle<Object> key_pattern,
				6821	Handle<FixedArray> value_array,
				6822	Handle<FixedArray> last_match_cache,
				6823	ResultsCacheType type) {
				6824	Factory* factory = isolate->factory();
				6825	Handle<FixedArray> cache;
				6826	if (!key_string->IsInternalizedString()) return;
				6827	if (type == STRING_SPLIT_SUBSTRINGS) {
				6828	DCHECK(key_pattern->IsString());
				6829	if (!key_pattern->IsInternalizedString()) return;
				6830	cache = factory->string_split_cache();
				6831	} else {
				6832	DCHECK(type == REGEXP_MULTIPLE_INDICES);
				6833	DCHECK(key_pattern->IsFixedArray());
				6834	cache = factory->regexp_multiple_cache();
				6835	}
				6836
				6837	uint32_t hash = key_string->Hash();
				6838	uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
				6839	~(kArrayEntriesPerCacheEntry - 1));
				6840	if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
				6841	cache->set(index + kStringOffset, *key_string);
				6842	cache->set(index + kPatternOffset, *key_pattern);
				6843	cache->set(index + kArrayOffset, *value_array);
				6844	cache->set(index + kLastMatchOffset, *last_match_cache);
				6845	} else {
				6846	uint32_t index2 =
				6847	((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
				6848	if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
				6849	cache->set(index2 + kStringOffset, *key_string);
				6850	cache->set(index2 + kPatternOffset, *key_pattern);
				6851	cache->set(index2 + kArrayOffset, *value_array);
				6852	cache->set(index2 + kLastMatchOffset, *last_match_cache);
				6853	} else {
				6854	cache->set(index2 + kStringOffset, Smi::FromInt(0));
				6855	cache->set(index2 + kPatternOffset, Smi::FromInt(0));
				6856	cache->set(index2 + kArrayOffset, Smi::FromInt(0));
				6857	cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
				6858	cache->set(index + kStringOffset, *key_string);
				6859	cache->set(index + kPatternOffset, *key_pattern);
				6860	cache->set(index + kArrayOffset, *value_array);
				6861	cache->set(index + kLastMatchOffset, *last_match_cache);
				6862	}
				6863	}
				6864	// If the array is a reasonably short list of substrings, convert it into a
				6865	// list of internalized strings.
				6866	if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
				6867	for (int i = 0; i < value_array->length(); i++) {
				6868	Handle<String> str(String::cast(value_array->get(i)), isolate);
				6869	Handle<String> internalized_str = factory->InternalizeString(str);
				6870	value_array->set(i, *internalized_str);
				6871	}
				6872	}
				6873	// Convert backing store to a copy-on-write array.
				6874	value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
				6875	}
				6876
				6877
				6878	void RegExpResultsCache::Clear(FixedArray* cache) {
				6879	for (int i = 0; i < kRegExpResultsCacheSize; i++) {
				6880	cache->set(i, Smi::FromInt(0));
				6881	}
				6882	}
				6883
				6884	} // namespace internal
				6885	} // namespace v8