Blame - src/jsregexp.cc - fp2-dev/platform/external/v8

blob: e518662d071fff60837c2925f0fa3bea7020fb74 [file] [log] [blame]

Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame^]	1	// Copyright 2006-2009 the V8 project authors. All rights reserved.
				2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "compiler.h"
				32	#include "execution.h"
				33	#include "factory.h"
				34	#include "jsregexp.h"
				35	#include "platform.h"
				36	#include "runtime.h"
				37	#include "top.h"
				38	#include "compilation-cache.h"
				39	#include "string-stream.h"
				40	#include "parser.h"
				41	#include "regexp-macro-assembler.h"
				42	#include "regexp-macro-assembler-tracer.h"
				43	#include "regexp-macro-assembler-irregexp.h"
				44	#include "regexp-stack.h"
				45
				46	#ifdef V8_NATIVE_REGEXP
				47	#if V8_TARGET_ARCH_IA32
				48	#include "ia32/macro-assembler-ia32.h"
				49	#include "ia32/regexp-macro-assembler-ia32.h"
				50	#elif V8_TARGET_ARCH_X64
				51	#include "x64/macro-assembler-x64.h"
				52	#include "x64/regexp-macro-assembler-x64.h"
				53	#elif V8_TARGET_ARCH_ARM
				54	#include "arm/macro-assembler-arm.h"
				55	#include "arm/regexp-macro-assembler-arm.h"
				56	#else
				57	#error Unsupported target architecture.
				58	#endif
				59	#endif
				60
				61	#include "interpreter-irregexp.h"
				62
				63
				64	namespace v8 {
				65	namespace internal {
				66
				67
				68	Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
				69	Handle<String> pattern,
				70	Handle<String> flags,
				71	bool* has_pending_exception) {
				72	// Ensure that the constructor function has been loaded.
				73	if (!constructor->IsLoaded()) {
				74	LoadLazy(constructor, has_pending_exception);
				75	if (*has_pending_exception) return Handle<Object>();
				76	}
				77	// Call the construct code with 2 arguments.
				78	Object** argv[2] = { Handle<Object>::cast(pattern).location(),
				79	Handle<Object>::cast(flags).location() };
				80	return Execution::New(constructor, 2, argv, has_pending_exception);
				81	}
				82
				83
				84	static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
				85	int flags = JSRegExp::NONE;
				86	for (int i = 0; i < str->length(); i++) {
				87	switch (str->Get(i)) {
				88	case 'i':
				89	flags \|= JSRegExp::IGNORE_CASE;
				90	break;
				91	case 'g':
				92	flags \|= JSRegExp::GLOBAL;
				93	break;
				94	case 'm':
				95	flags \|= JSRegExp::MULTILINE;
				96	break;
				97	}
				98	}
				99	return JSRegExp::Flags(flags);
				100	}
				101
				102
				103	static inline void ThrowRegExpException(Handle<JSRegExp> re,
				104	Handle<String> pattern,
				105	Handle<String> error_text,
				106	const char* message) {
				107	Handle<JSArray> array = Factory::NewJSArray(2);
				108	SetElement(array, 0, pattern);
				109	SetElement(array, 1, error_text);
				110	Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
				111	Top::Throw(*regexp_err);
				112	}
				113
				114
				115	// Generic RegExp methods. Dispatches to implementation specific methods.
				116
				117
				118	class OffsetsVector {
				119	public:
				120	inline OffsetsVector(int num_registers)
				121	: offsets_vector_length_(num_registers) {
				122	if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
				123	vector_ = NewArray<int>(offsets_vector_length_);
				124	} else {
				125	vector_ = static_offsets_vector_;
				126	}
				127	}
				128	inline ~OffsetsVector() {
				129	if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
				130	DeleteArray(vector_);
				131	vector_ = NULL;
				132	}
				133	}
				134	inline int* vector() { return vector_; }
				135	inline int length() { return offsets_vector_length_; }
				136
				137	private:
				138	int* vector_;
				139	int offsets_vector_length_;
				140	static const int kStaticOffsetsVectorSize = 50;
				141	static int static_offsets_vector_[kStaticOffsetsVectorSize];
				142	};
				143
				144
				145	int OffsetsVector::static_offsets_vector_[
				146	OffsetsVector::kStaticOffsetsVectorSize];
				147
				148
				149	Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				150	Handle<String> pattern,
				151	Handle<String> flag_str) {
				152	JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
				153	Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
				154	bool in_cache = !cached.is_null();
				155	LOG(RegExpCompileEvent(re, in_cache));
				156
				157	Handle<Object> result;
				158	if (in_cache) {
				159	re->set_data(*cached);
				160	return re;
				161	}
				162	FlattenString(pattern);
				163	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
				164	RegExpCompileData parse_result;
				165	FlatStringReader reader(pattern);
				166	if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
				167	// Throw an exception if we fail to parse the pattern.
				168	ThrowRegExpException(re,
				169	pattern,
				170	parse_result.error,
				171	"malformed_regexp");
				172	return Handle<Object>::null();
				173	}
				174
				175	if (parse_result.simple && !flags.is_ignore_case()) {
				176	// Parse-tree is a single atom that is equal to the pattern.
				177	AtomCompile(re, pattern, flags, pattern);
				178	} else if (parse_result.tree->IsAtom() &&
				179	!flags.is_ignore_case() &&
				180	parse_result.capture_count == 0) {
				181	RegExpAtom* atom = parse_result.tree->AsAtom();
				182	Vector<const uc16> atom_pattern = atom->data();
				183	Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
				184	AtomCompile(re, pattern, flags, atom_string);
				185	} else {
				186	IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
				187	}
				188	ASSERT(re->data()->IsFixedArray());
				189	// Compilation succeeded so the data is set on the regexp
				190	// and we can store it in the cache.
				191	Handle<FixedArray> data(FixedArray::cast(re->data()));
				192	CompilationCache::PutRegExp(pattern, flags, data);
				193
				194	return re;
				195	}
				196
				197
				198	Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				199	Handle<String> subject,
				200	int index,
				201	Handle<JSArray> last_match_info) {
				202	switch (regexp->TypeTag()) {
				203	case JSRegExp::ATOM:
				204	return AtomExec(regexp, subject, index, last_match_info);
				205	case JSRegExp::IRREGEXP: {
				206	Handle<Object> result =
				207	IrregexpExec(regexp, subject, index, last_match_info);
				208	ASSERT(!result.is_null() \|\| Top::has_pending_exception());
				209	return result;
				210	}
				211	default:
				212	UNREACHABLE();
				213	return Handle<Object>::null();
				214	}
				215	}
				216
				217
				218	// RegExp Atom implementation: Simple string search using indexOf.
				219
				220
				221	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				222	Handle<String> pattern,
				223	JSRegExp::Flags flags,
				224	Handle<String> match_pattern) {
				225	Factory::SetRegExpAtomData(re,
				226	JSRegExp::ATOM,
				227	pattern,
				228	flags,
				229	match_pattern);
				230	}
				231
				232
				233	static void SetAtomLastCapture(FixedArray* array,
				234	String* subject,
				235	int from,
				236	int to) {
				237	NoHandleAllocation no_handles;
				238	RegExpImpl::SetLastCaptureCount(array, 2);
				239	RegExpImpl::SetLastSubject(array, subject);
				240	RegExpImpl::SetLastInput(array, subject);
				241	RegExpImpl::SetCapture(array, 0, from);
				242	RegExpImpl::SetCapture(array, 1, to);
				243	}
				244
				245
				246	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				247	Handle<String> subject,
				248	int index,
				249	Handle<JSArray> last_match_info) {
				250	Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
				251
				252	uint32_t start_index = index;
				253
				254	int value = Runtime::StringMatch(subject, needle, start_index);
				255	if (value == -1) return Factory::null_value();
				256	ASSERT(last_match_info->HasFastElements());
				257
				258	{
				259	NoHandleAllocation no_handles;
				260	FixedArray* array = FixedArray::cast(last_match_info->elements());
				261	SetAtomLastCapture(array, *subject, value, value + needle->length());
				262	}
				263	return last_match_info;
				264	}
				265
				266
				267	// Irregexp implementation.
				268
				269	// Ensures that the regexp object contains a compiled version of the
				270	// source for either ASCII or non-ASCII strings.
				271	// If the compiled version doesn't already exist, it is compiled
				272	// from the source pattern.
				273	// If compilation fails, an exception is thrown and this function
				274	// returns false.
				275	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				276	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
				277	#ifdef V8_NATIVE_REGEXP
				278	if (compiled_code->IsCode()) return true;
				279	#else // ! V8_NATIVE_REGEXP (RegExp interpreter code)
				280	if (compiled_code->IsByteArray()) return true;
				281	#endif
				282	return CompileIrregexp(re, is_ascii);
				283	}
				284
				285
				286	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				287	// Compile the RegExp.
				288	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
				289	Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
				290	if (entry->IsJSObject()) {
				291	// If it's a JSObject, a previous compilation failed and threw this object.
				292	// Re-throw the object without trying again.
				293	Top::Throw(entry);
				294	return false;
				295	}
				296	ASSERT(entry->IsTheHole());
				297
				298	JSRegExp::Flags flags = re->GetFlags();
				299
				300	Handle<String> pattern(re->Pattern());
				301	if (!pattern->IsFlat()) {
				302	FlattenString(pattern);
				303	}
				304
				305	RegExpCompileData compile_data;
				306	FlatStringReader reader(pattern);
				307	if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
				308	// Throw an exception if we fail to parse the pattern.
				309	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				310	ThrowRegExpException(re,
				311	pattern,
				312	compile_data.error,
				313	"malformed_regexp");
				314	return false;
				315	}
				316	RegExpEngine::CompilationResult result =
				317	RegExpEngine::Compile(&compile_data,
				318	flags.is_ignore_case(),
				319	flags.is_multiline(),
				320	pattern,
				321	is_ascii);
				322	if (result.error_message != NULL) {
				323	// Unable to compile regexp.
				324	Handle<JSArray> array = Factory::NewJSArray(2);
				325	SetElement(array, 0, pattern);
				326	SetElement(array,
				327	1,
				328	Factory::NewStringFromUtf8(CStrVector(result.error_message)));
				329	Handle<Object> regexp_err =
				330	Factory::NewSyntaxError("malformed_regexp", array);
				331	Top::Throw(*regexp_err);
				332	re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
				333	return false;
				334	}
				335
				336	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				337	data->set(JSRegExp::code_index(is_ascii), result.code);
				338	int register_max = IrregexpMaxRegisterCount(*data);
				339	if (result.num_registers > register_max) {
				340	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				341	}
				342
				343	return true;
				344	}
				345
				346
				347	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				348	return Smi::cast(
				349	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				350	}
				351
				352
				353	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				354	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				355	}
				356
				357
				358	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				359	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				360	}
				361
				362
				363	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				364	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				365	}
				366
				367
				368	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
				369	return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
				370	}
				371
				372
				373	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
				374	return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
				375	}
				376
				377
				378	void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
				379	Handle<String> pattern,
				380	JSRegExp::Flags flags,
				381	int capture_count) {
				382	// Initialize compiled code entries to null.
				383	Factory::SetRegExpIrregexpData(re,
				384	JSRegExp::IRREGEXP,
				385	pattern,
				386	flags,
				387	capture_count);
				388	}
				389
				390
				391	Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
				392	Handle<String> subject,
				393	int previous_index,
				394	Handle<JSArray> last_match_info) {
				395	ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
				396
				397	// Prepare space for the return values.
				398	int number_of_capture_registers =
				399	(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
				400
				401	#ifndef V8_NATIVE_REGEXP
				402	#ifdef DEBUG
				403	if (FLAG_trace_regexp_bytecodes) {
				404	String* pattern = jsregexp->Pattern();
				405	PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
				406	PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
				407	}
				408	#endif
				409	#endif
				410
				411	if (!subject->IsFlat()) {
				412	FlattenString(subject);
				413	}
				414
				415	last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
				416
				417	Handle<FixedArray> array;
				418
				419	// Dispatch to the correct RegExp implementation.
				420	Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
				421
				422	#ifdef V8_NATIVE_REGEXP
				423
				424	OffsetsVector captures(number_of_capture_registers);
				425	int* captures_vector = captures.vector();
				426	NativeRegExpMacroAssembler::Result res;
				427	do {
				428	bool is_ascii = subject->IsAsciiRepresentation();
				429	if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
				430	return Handle<Object>::null();
				431	}
				432	Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
				433	res = NativeRegExpMacroAssembler::Match(code,
				434	subject,
				435	captures_vector,
				436	captures.length(),
				437	previous_index);
				438	// If result is RETRY, the string have changed representation, and we
				439	// must restart from scratch.
				440	} while (res == NativeRegExpMacroAssembler::RETRY);
				441	if (res == NativeRegExpMacroAssembler::EXCEPTION) {
				442	ASSERT(Top::has_pending_exception());
				443	return Handle<Object>::null();
				444	}
				445	ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
				446	\|\| res == NativeRegExpMacroAssembler::FAILURE);
				447
				448	if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
				449
				450	array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
				451	ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
				452	// The captures come in (start, end+1) pairs.
				453	for (int i = 0; i < number_of_capture_registers; i += 2) {
				454	SetCapture(*array, i, captures_vector[i]);
				455	SetCapture(*array, i + 1, captures_vector[i + 1]);
				456	}
				457
				458	#else // ! V8_NATIVE_REGEXP
				459
				460	bool is_ascii = subject->IsAsciiRepresentation();
				461	if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
				462	return Handle<Object>::null();
				463	}
				464	// Now that we have done EnsureCompiledIrregexp we can get the number of
				465	// registers.
				466	int number_of_registers =
				467	IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
				468	OffsetsVector registers(number_of_registers);
				469	int* register_vector = registers.vector();
				470	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				471	register_vector[i] = -1;
				472	}
				473	Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
				474
				475	if (!IrregexpInterpreter::Match(byte_codes,
				476	subject,
				477	register_vector,
				478	previous_index)) {
				479	return Factory::null_value();
				480	}
				481
				482	array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
				483	ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
				484	// The captures come in (start, end+1) pairs.
				485	for (int i = 0; i < number_of_capture_registers; i += 2) {
				486	SetCapture(*array, i, register_vector[i]);
				487	SetCapture(*array, i + 1, register_vector[i + 1]);
				488	}
				489
				490	#endif // V8_NATIVE_REGEXP
				491
				492	SetLastCaptureCount(*array, number_of_capture_registers);
				493	SetLastSubject(array, subject);
				494	SetLastInput(array, subject);
				495
				496	return last_match_info;
				497	}
				498
				499
				500	// -------------------------------------------------------------------
				501	// Implementation of the Irregexp regular expression engine.
				502	//
				503	// The Irregexp regular expression engine is intended to be a complete
				504	// implementation of ECMAScript regular expressions. It generates either
				505	// bytecodes or native code.
				506
				507	// The Irregexp regexp engine is structured in three steps.
				508	// 1) The parser generates an abstract syntax tree. See ast.cc.
				509	// 2) From the AST a node network is created. The nodes are all
				510	// subclasses of RegExpNode. The nodes represent states when
				511	// executing a regular expression. Several optimizations are
				512	// performed on the node network.
				513	// 3) From the nodes we generate either byte codes or native code
				514	// that can actually execute the regular expression (perform
				515	// the search). The code generation step is described in more
				516	// detail below.
				517
				518	// Code generation.
				519	//
				520	// The nodes are divided into four main categories.
				521	// * Choice nodes
				522	// These represent places where the regular expression can
				523	// match in more than one way. For example on entry to an
				524	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				525	// * Action nodes
				526	// These represent places where some action should be
				527	// performed. Examples include recording the current position
				528	// in the input string to a register (in order to implement
				529	// captures) or other actions on register for example in order
				530	// to implement the counters needed for {} repetitions.
				531	// * Matching nodes
				532	// These attempt to match some element part of the input string.
				533	// Examples of elements include character classes, plain strings
				534	// or back references.
				535	// * End nodes
				536	// These are used to implement the actions required on finding
				537	// a successful match or failing to find a match.
				538	//
				539	// The code generated (whether as byte codes or native code) maintains
				540	// some state as it runs. This consists of the following elements:
				541	//
				542	// * The capture registers. Used for string captures.
				543	// * Other registers. Used for counters etc.
				544	// * The current position.
				545	// * The stack of backtracking information. Used when a matching node
				546	// fails to find a match and needs to try an alternative.
				547	//
				548	// Conceptual regular expression execution model:
				549	//
				550	// There is a simple conceptual model of regular expression execution
				551	// which will be presented first. The actual code generated is a more
				552	// efficient simulation of the simple conceptual model:
				553	//
				554	// * Choice nodes are implemented as follows:
				555	// For each choice except the last {
				556	// push current position
				557	// push backtrack code location
				558	// <generate code to test for choice>
				559	// backtrack code location:
				560	// pop current position
				561	// }
				562	// <generate code to test for last choice>
				563	//
				564	// * Actions nodes are generated as follows
				565	// <push affected registers on backtrack stack>
				566	// <generate code to perform action>
				567	// push backtrack code location
				568	// <generate code to test for following nodes>
				569	// backtrack code location:
				570	// <pop affected registers to restore their state>
				571	// <pop backtrack location from stack and go to it>
				572	//
				573	// * Matching nodes are generated as follows:
				574	// if input string matches at current position
				575	// update current position
				576	// <generate code to test for following nodes>
				577	// else
				578	// <pop backtrack location from stack and go to it>
				579	//
				580	// Thus it can be seen that the current position is saved and restored
				581	// by the choice nodes, whereas the registers are saved and restored by
				582	// by the action nodes that manipulate them.
				583	//
				584	// The other interesting aspect of this model is that nodes are generated
				585	// at the point where they are needed by a recursive call to Emit(). If
				586	// the node has already been code generated then the Emit() call will
				587	// generate a jump to the previously generated code instead. In order to
				588	// limit recursion it is possible for the Emit() function to put the node
				589	// on a work list for later generation and instead generate a jump. The
				590	// destination of the jump is resolved later when the code is generated.
				591	//
				592	// Actual regular expression code generation.
				593	//
				594	// Code generation is actually more complicated than the above. In order
				595	// to improve the efficiency of the generated code some optimizations are
				596	// performed
				597	//
				598	// * Choice nodes have 1-character lookahead.
				599	// A choice node looks at the following character and eliminates some of
				600	// the choices immediately based on that character. This is not yet
				601	// implemented.
				602	// * Simple greedy loops store reduced backtracking information.
				603	// A quantifier like /.*foo/m will greedily match the whole input. It will
				604	// then need to backtrack to a point where it can match "foo". The naive
				605	// implementation of this would push each character position onto the
				606	// backtracking stack, then pop them off one by one. This would use space
				607	// proportional to the length of the input string. However since the "."
				608	// can only match in one way and always has a constant length (in this case
				609	// of 1) it suffices to store the current position on the top of the stack
				610	// once. Matching now becomes merely incrementing the current position and
				611	// backtracking becomes decrementing the current position and checking the
				612	// result against the stored current position. This is faster and saves
				613	// space.
				614	// * The current state is virtualized.
				615	// This is used to defer expensive operations until it is clear that they
				616	// are needed and to generate code for a node more than once, allowing
				617	// specialized an efficient versions of the code to be created. This is
				618	// explained in the section below.
				619	//
				620	// Execution state virtualization.
				621	//
				622	// Instead of emitting code, nodes that manipulate the state can record their
				623	// manipulation in an object called the Trace. The Trace object can record a
				624	// current position offset, an optional backtrack code location on the top of
				625	// the virtualized backtrack stack and some register changes. When a node is
				626	// to be emitted it can flush the Trace or update it. Flushing the Trace
				627	// will emit code to bring the actual state into line with the virtual state.
				628	// Avoiding flushing the state can postpone some work (eg updates of capture
				629	// registers). Postponing work can save time when executing the regular
				630	// expression since it may be found that the work never has to be done as a
				631	// failure to match can occur. In addition it is much faster to jump to a
				632	// known backtrack code location than it is to pop an unknown backtrack
				633	// location from the stack and jump there.
				634	//
				635	// The virtual state found in the Trace affects code generation. For example
				636	// the virtual state contains the difference between the actual current
				637	// position and the virtual current position, and matching code needs to use
				638	// this offset to attempt a match in the correct location of the input
				639	// string. Therefore code generated for a non-trivial trace is specialized
				640	// to that trace. The code generator therefore has the ability to generate
				641	// code for each node several times. In order to limit the size of the
				642	// generated code there is an arbitrary limit on how many specialized sets of
				643	// code may be generated for a given node. If the limit is reached, the
				644	// trace is flushed and a generic version of the code for a node is emitted.
				645	// This is subsequently used for that node. The code emitted for non-generic
				646	// trace is not recorded in the node and so it cannot currently be reused in
				647	// the event that code generation is requested for an identical trace.
				648
				649
				650	void RegExpTree::AppendToText(RegExpText* text) {
				651	UNREACHABLE();
				652	}
				653
				654
				655	void RegExpAtom::AppendToText(RegExpText* text) {
				656	text->AddElement(TextElement::Atom(this));
				657	}
				658
				659
				660	void RegExpCharacterClass::AppendToText(RegExpText* text) {
				661	text->AddElement(TextElement::CharClass(this));
				662	}
				663
				664
				665	void RegExpText::AppendToText(RegExpText* text) {
				666	for (int i = 0; i < elements()->length(); i++)
				667	text->AddElement(elements()->at(i));
				668	}
				669
				670
				671	TextElement TextElement::Atom(RegExpAtom* atom) {
				672	TextElement result = TextElement(ATOM);
				673	result.data.u_atom = atom;
				674	return result;
				675	}
				676
				677
				678	TextElement TextElement::CharClass(
				679	RegExpCharacterClass* char_class) {
				680	TextElement result = TextElement(CHAR_CLASS);
				681	result.data.u_char_class = char_class;
				682	return result;
				683	}
				684
				685
				686	int TextElement::length() {
				687	if (type == ATOM) {
				688	return data.u_atom->length();
				689	} else {
				690	ASSERT(type == CHAR_CLASS);
				691	return 1;
				692	}
				693	}
				694
				695
				696	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				697	if (table_ == NULL) {
				698	table_ = new DispatchTable();
				699	DispatchTableConstructor cons(table_, ignore_case);
				700	cons.BuildTable(this);
				701	}
				702	return table_;
				703	}
				704
				705
				706	class RegExpCompiler {
				707	public:
				708	RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
				709
				710	int AllocateRegister() {
				711	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				712	reg_exp_too_big_ = true;
				713	return next_register_;
				714	}
				715	return next_register_++;
				716	}
				717
				718	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				719	RegExpNode* start,
				720	int capture_count,
				721	Handle<String> pattern);
				722
				723	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
				724
				725	static const int kImplementationOffset = 0;
				726	static const int kNumberOfRegistersOffset = 0;
				727	static const int kCodeOffset = 1;
				728
				729	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				730	EndNode* accept() { return accept_; }
				731
				732	static const int kMaxRecursion = 100;
				733	inline int recursion_depth() { return recursion_depth_; }
				734	inline void IncrementRecursionDepth() { recursion_depth_++; }
				735	inline void DecrementRecursionDepth() { recursion_depth_--; }
				736
				737	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				738
				739	inline bool ignore_case() { return ignore_case_; }
				740	inline bool ascii() { return ascii_; }
				741
				742	static const int kNoRegister = -1;
				743	private:
				744	EndNode* accept_;
				745	int next_register_;
				746	List<RegExpNode> work_list_;
				747	int recursion_depth_;
				748	RegExpMacroAssembler* macro_assembler_;
				749	bool ignore_case_;
				750	bool ascii_;
				751	bool reg_exp_too_big_;
				752	};
				753
				754
				755	class RecursionCheck {
				756	public:
				757	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				758	compiler->IncrementRecursionDepth();
				759	}
				760	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				761	private:
				762	RegExpCompiler* compiler_;
				763	};
				764
				765
				766	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
				767	return RegExpEngine::CompilationResult("RegExp too big");
				768	}
				769
				770
				771	// Attempts to compile the regexp using an Irregexp code generator. Returns
				772	// a fixed array or a null handle depending on whether it succeeded.
				773	RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
				774	: next_register_(2 * (capture_count + 1)),
				775	work_list_(NULL),
				776	recursion_depth_(0),
				777	ignore_case_(ignore_case),
				778	ascii_(ascii),
				779	reg_exp_too_big_(false) {
				780	accept_ = new EndNode(EndNode::ACCEPT);
				781	ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				782	}
				783
				784
				785	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				786	RegExpMacroAssembler* macro_assembler,
				787	RegExpNode* start,
				788	int capture_count,
				789	Handle<String> pattern) {
				790	#ifdef DEBUG
				791	if (FLAG_trace_regexp_assembler)
				792	macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
				793	else
				794	#endif
				795	macro_assembler_ = macro_assembler;
				796	List <RegExpNode*> work_list(0);
				797	work_list_ = &work_list;
				798	Label fail;
				799	macro_assembler_->PushBacktrack(&fail);
				800	Trace new_trace;
				801	start->Emit(this, &new_trace);
				802	macro_assembler_->Bind(&fail);
				803	macro_assembler_->Fail();
				804	while (!work_list.is_empty()) {
				805	work_list.RemoveLast()->Emit(this, &new_trace);
				806	}
				807	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
				808
				809	Handle<Object> code = macro_assembler_->GetCode(pattern);
				810
				811	work_list_ = NULL;
				812	#ifdef DEBUG
				813	if (FLAG_trace_regexp_assembler) {
				814	delete macro_assembler_;
				815	}
				816	#endif
				817	return RegExpEngine::CompilationResult(*code, next_register_);
				818	}
				819
				820
				821	bool Trace::DeferredAction::Mentions(int that) {
				822	if (type() == ActionNode::CLEAR_CAPTURES) {
				823	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				824	return range.Contains(that);
				825	} else {
				826	return reg() == that;
				827	}
				828	}
				829
				830
				831	bool Trace::mentions_reg(int reg) {
				832	for (DeferredAction* action = actions_;
				833	action != NULL;
				834	action = action->next()) {
				835	if (action->Mentions(reg))
				836	return true;
				837	}
				838	return false;
				839	}
				840
				841
				842	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				843	ASSERT_EQ(0, *cp_offset);
				844	for (DeferredAction* action = actions_;
				845	action != NULL;
				846	action = action->next()) {
				847	if (action->Mentions(reg)) {
				848	if (action->type() == ActionNode::STORE_POSITION) {
				849	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				850	return true;
				851	} else {
				852	return false;
				853	}
				854	}
				855	}
				856	return false;
				857	}
				858
				859
				860	int Trace::FindAffectedRegisters(OutSet* affected_registers) {
				861	int max_register = RegExpCompiler::kNoRegister;
				862	for (DeferredAction* action = actions_;
				863	action != NULL;
				864	action = action->next()) {
				865	if (action->type() == ActionNode::CLEAR_CAPTURES) {
				866	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				867	for (int i = range.from(); i <= range.to(); i++)
				868	affected_registers->Set(i);
				869	if (range.to() > max_register) max_register = range.to();
				870	} else {
				871	affected_registers->Set(action->reg());
				872	if (action->reg() > max_register) max_register = action->reg();
				873	}
				874	}
				875	return max_register;
				876	}
				877
				878
				879	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				880	int max_register,
				881	OutSet& registers_to_pop,
				882	OutSet& registers_to_clear) {
				883	for (int reg = max_register; reg >= 0; reg--) {
				884	if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
				885	else if (registers_to_clear.Get(reg)) {
				886	int clear_to = reg;
				887	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				888	reg--;
				889	}
				890	assembler->ClearRegisters(reg, clear_to);
				891	}
				892	}
				893	}
				894
				895
				896	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				897	int max_register,
				898	OutSet& affected_registers,
				899	OutSet* registers_to_pop,
				900	OutSet* registers_to_clear) {
				901	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				902	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				903
				904	// Count pushes performed to force a stack limit check occasionally.
				905	int pushes = 0;
				906
				907	for (int reg = 0; reg <= max_register; reg++) {
				908	if (!affected_registers.Get(reg)) {
				909	continue;
				910	}
				911
				912	// The chronologically first deferred action in the trace
				913	// is used to infer the action needed to restore a register
				914	// to its previous state (or not, if it's safe to ignore it).
				915	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				916	DeferredActionUndoType undo_action = IGNORE;
				917
				918	int value = 0;
				919	bool absolute = false;
				920	bool clear = false;
				921	int store_position = -1;
				922	// This is a little tricky because we are scanning the actions in reverse
				923	// historical order (newest first).
				924	for (DeferredAction* action = actions_;
				925	action != NULL;
				926	action = action->next()) {
				927	if (action->Mentions(reg)) {
				928	switch (action->type()) {
				929	case ActionNode::SET_REGISTER: {
				930	Trace::DeferredSetRegister* psr =
				931	static_cast<Trace::DeferredSetRegister*>(action);
				932	if (!absolute) {
				933	value += psr->value();
				934	absolute = true;
				935	}
				936	// SET_REGISTER is currently only used for newly introduced loop
				937	// counters. They can have a significant previous value if they
				938	// occour in a loop. TODO(lrn): Propagate this information, so
				939	// we can set undo_action to IGNORE if we know there is no value to
				940	// restore.
				941	undo_action = RESTORE;
				942	ASSERT_EQ(store_position, -1);
				943	ASSERT(!clear);
				944	break;
				945	}
				946	case ActionNode::INCREMENT_REGISTER:
				947	if (!absolute) {
				948	value++;
				949	}
				950	ASSERT_EQ(store_position, -1);
				951	ASSERT(!clear);
				952	undo_action = RESTORE;
				953	break;
				954	case ActionNode::STORE_POSITION: {
				955	Trace::DeferredCapture* pc =
				956	static_cast<Trace::DeferredCapture*>(action);
				957	if (!clear && store_position == -1) {
				958	store_position = pc->cp_offset();
				959	}
				960
				961	// For captures we know that stores and clears alternate.
				962	// Other register, are never cleared, and if the occur
				963	// inside a loop, they might be assigned more than once.
				964	if (reg <= 1) {
				965	// Registers zero and one, aka "capture zero", is
				966	// always set correctly if we succeed. There is no
				967	// need to undo a setting on backtrack, because we
				968	// will set it again or fail.
				969	undo_action = IGNORE;
				970	} else {
				971	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				972	}
				973	ASSERT(!absolute);
				974	ASSERT_EQ(value, 0);
				975	break;
				976	}
				977	case ActionNode::CLEAR_CAPTURES: {
				978	// Since we're scanning in reverse order, if we've already
				979	// set the position we have to ignore historically earlier
				980	// clearing operations.
				981	if (store_position == -1) {
				982	clear = true;
				983	}
				984	undo_action = RESTORE;
				985	ASSERT(!absolute);
				986	ASSERT_EQ(value, 0);
				987	break;
				988	}
				989	default:
				990	UNREACHABLE();
				991	break;
				992	}
				993	}
				994	}
				995	// Prepare for the undo-action (e.g., push if it's going to be popped).
				996	if (undo_action == RESTORE) {
				997	pushes++;
				998	RegExpMacroAssembler::StackCheckFlag stack_check =
				999	RegExpMacroAssembler::kNoStackLimitCheck;
				1000	if (pushes == push_limit) {
				1001	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1002	pushes = 0;
				1003	}
				1004
				1005	assembler->PushRegister(reg, stack_check);
				1006	registers_to_pop->Set(reg);
				1007	} else if (undo_action == CLEAR) {
				1008	registers_to_clear->Set(reg);
				1009	}
				1010	// Perform the chronologically last action (or accumulated increment)
				1011	// for the register.
				1012	if (store_position != -1) {
				1013	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1014	} else if (clear) {
				1015	assembler->ClearRegisters(reg, reg);
				1016	} else if (absolute) {
				1017	assembler->SetRegister(reg, value);
				1018	} else if (value != 0) {
				1019	assembler->AdvanceRegister(reg, value);
				1020	}
				1021	}
				1022	}
				1023
				1024
				1025	// This is called as we come into a loop choice node and some other tricky
				1026	// nodes. It normalizes the state of the code generator to ensure we can
				1027	// generate generic code.
				1028	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1029	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1030
				1031	ASSERT(!is_trivial());
				1032
				1033	if (actions_ == NULL && backtrack() == NULL) {
				1034	// Here we just have some deferred cp advances to fix and we are back to
				1035	// a normal situation. We may also have to forget some information gained
				1036	// through a quick check that was already performed.
				1037	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1038	// Create a new trivial state and generate the node with that.
				1039	Trace new_state;
				1040	successor->Emit(compiler, &new_state);
				1041	return;
				1042	}
				1043
				1044	// Generate deferred actions here along with code to undo them again.
				1045	OutSet affected_registers;
				1046
				1047	if (backtrack() != NULL) {
				1048	// Here we have a concrete backtrack location. These are set up by choice
				1049	// nodes and so they indicate that we have a deferred save of the current
				1050	// position which we may need to emit here.
				1051	assembler->PushCurrentPosition();
				1052	}
				1053
				1054	int max_register = FindAffectedRegisters(&affected_registers);
				1055	OutSet registers_to_pop;
				1056	OutSet registers_to_clear;
				1057	PerformDeferredActions(assembler,
				1058	max_register,
				1059	affected_registers,
				1060	&registers_to_pop,
				1061	&registers_to_clear);
				1062	if (cp_offset_ != 0) {
				1063	assembler->AdvanceCurrentPosition(cp_offset_);
				1064	}
				1065
				1066	// Create a new trivial state and generate the node with that.
				1067	Label undo;
				1068	assembler->PushBacktrack(&undo);
				1069	Trace new_state;
				1070	successor->Emit(compiler, &new_state);
				1071
				1072	// On backtrack we need to restore state.
				1073	assembler->Bind(&undo);
				1074	RestoreAffectedRegisters(assembler,
				1075	max_register,
				1076	registers_to_pop,
				1077	registers_to_clear);
				1078	if (backtrack() == NULL) {
				1079	assembler->Backtrack();
				1080	} else {
				1081	assembler->PopCurrentPosition();
				1082	assembler->GoTo(backtrack());
				1083	}
				1084	}
				1085
				1086
				1087	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1088	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1089
				1090	// Omit flushing the trace. We discard the entire stack frame anyway.
				1091
				1092	if (!label()->is_bound()) {
				1093	// We are completely independent of the trace, since we ignore it,
				1094	// so this code can be used as the generic version.
				1095	assembler->Bind(label());
				1096	}
				1097
				1098	// Throw away everything on the backtrack stack since the start
				1099	// of the negative submatch and restore the character position.
				1100	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1101	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1102	if (clear_capture_count_ > 0) {
				1103	// Clear any captures that might have been performed during the success
				1104	// of the body of the negative look-ahead.
				1105	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1106	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1107	}
				1108	// Now that we have unwound the stack we find at the top of the stack the
				1109	// backtrack that the BeginSubmatch node got.
				1110	assembler->Backtrack();
				1111	}
				1112
				1113
				1114	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1115	if (!trace->is_trivial()) {
				1116	trace->Flush(compiler, this);
				1117	return;
				1118	}
				1119	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1120	if (!label()->is_bound()) {
				1121	assembler->Bind(label());
				1122	}
				1123	switch (action_) {
				1124	case ACCEPT:
				1125	assembler->Succeed();
				1126	return;
				1127	case BACKTRACK:
				1128	assembler->GoTo(trace->backtrack());
				1129	return;
				1130	case NEGATIVE_SUBMATCH_SUCCESS:
				1131	// This case is handled in a different virtual method.
				1132	UNREACHABLE();
				1133	}
				1134	UNIMPLEMENTED();
				1135	}
				1136
				1137
				1138	void GuardedAlternative::AddGuard(Guard* guard) {
				1139	if (guards_ == NULL)
				1140	guards_ = new ZoneList<Guard*>(1);
				1141	guards_->Add(guard);
				1142	}
				1143
				1144
				1145	ActionNode* ActionNode::SetRegister(int reg,
				1146	int val,
				1147	RegExpNode* on_success) {
				1148	ActionNode* result = new ActionNode(SET_REGISTER, on_success);
				1149	result->data_.u_store_register.reg = reg;
				1150	result->data_.u_store_register.value = val;
				1151	return result;
				1152	}
				1153
				1154
				1155	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1156	ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
				1157	result->data_.u_increment_register.reg = reg;
				1158	return result;
				1159	}
				1160
				1161
				1162	ActionNode* ActionNode::StorePosition(int reg,
				1163	bool is_capture,
				1164	RegExpNode* on_success) {
				1165	ActionNode* result = new ActionNode(STORE_POSITION, on_success);
				1166	result->data_.u_position_register.reg = reg;
				1167	result->data_.u_position_register.is_capture = is_capture;
				1168	return result;
				1169	}
				1170
				1171
				1172	ActionNode* ActionNode::ClearCaptures(Interval range,
				1173	RegExpNode* on_success) {
				1174	ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
				1175	result->data_.u_clear_captures.range_from = range.from();
				1176	result->data_.u_clear_captures.range_to = range.to();
				1177	return result;
				1178	}
				1179
				1180
				1181	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1182	int position_reg,
				1183	RegExpNode* on_success) {
				1184	ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
				1185	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1186	result->data_.u_submatch.current_position_register = position_reg;
				1187	return result;
				1188	}
				1189
				1190
				1191	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1192	int position_reg,
				1193	int clear_register_count,
				1194	int clear_register_from,
				1195	RegExpNode* on_success) {
				1196	ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1197	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1198	result->data_.u_submatch.current_position_register = position_reg;
				1199	result->data_.u_submatch.clear_register_count = clear_register_count;
				1200	result->data_.u_submatch.clear_register_from = clear_register_from;
				1201	return result;
				1202	}
				1203
				1204
				1205	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1206	int repetition_register,
				1207	int repetition_limit,
				1208	RegExpNode* on_success) {
				1209	ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
				1210	result->data_.u_empty_match_check.start_register = start_register;
				1211	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1212	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1213	return result;
				1214	}
				1215
				1216
				1217	#define DEFINE_ACCEPT(Type) \
				1218	void Type##Node::Accept(NodeVisitor* visitor) { \
				1219	visitor->Visit##Type(this); \
				1220	}
				1221	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1222	#undef DEFINE_ACCEPT
				1223
				1224
				1225	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1226	visitor->VisitLoopChoice(this);
				1227	}
				1228
				1229
				1230	// -------------------------------------------------------------------
				1231	// Emit code.
				1232
				1233
				1234	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1235	Guard* guard,
				1236	Trace* trace) {
				1237	switch (guard->op()) {
				1238	case Guard::LT:
				1239	ASSERT(!trace->mentions_reg(guard->reg()));
				1240	macro_assembler->IfRegisterGE(guard->reg(),
				1241	guard->value(),
				1242	trace->backtrack());
				1243	break;
				1244	case Guard::GEQ:
				1245	ASSERT(!trace->mentions_reg(guard->reg()));
				1246	macro_assembler->IfRegisterLT(guard->reg(),
				1247	guard->value(),
				1248	trace->backtrack());
				1249	break;
				1250	}
				1251	}
				1252
				1253
				1254	static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
				1255	static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
				1256
				1257
				1258	// Returns the number of characters in the equivalence class, omitting those
				1259	// that cannot occur in the source string because it is ASCII.
				1260	static int GetCaseIndependentLetters(uc16 character,
				1261	bool ascii_subject,
				1262	unibrow::uchar* letters) {
				1263	int length = uncanonicalize.get(character, '\0', letters);
				1264	// Unibrow returns 0 or 1 for characters where case independependence is
				1265	// trivial.
				1266	if (length == 0) {
				1267	letters[0] = character;
				1268	length = 1;
				1269	}
				1270	if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {
				1271	return length;
				1272	}
				1273	// The standard requires that non-ASCII characters cannot have ASCII
				1274	// character codes in their equivalence class.
				1275	return 0;
				1276	}
				1277
				1278
				1279	static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
				1280	uc16 c,
				1281	Label* on_failure,
				1282	int cp_offset,
				1283	bool check,
				1284	bool preloaded) {
				1285	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1286	bool bound_checked = false;
				1287	if (!preloaded) {
				1288	assembler->LoadCurrentCharacter(
				1289	cp_offset,
				1290	on_failure,
				1291	check);
				1292	bound_checked = true;
				1293	}
				1294	assembler->CheckNotCharacter(c, on_failure);
				1295	return bound_checked;
				1296	}
				1297
				1298
				1299	// Only emits non-letters (things that don't have case). Only used for case
				1300	// independent matches.
				1301	static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
				1302	uc16 c,
				1303	Label* on_failure,
				1304	int cp_offset,
				1305	bool check,
				1306	bool preloaded) {
				1307	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1308	bool ascii = compiler->ascii();
				1309	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1310	int length = GetCaseIndependentLetters(c, ascii, chars);
				1311	if (length < 1) {
				1312	// This can't match. Must be an ASCII subject and a non-ASCII character.
				1313	// We do not need to do anything since the ASCII pass already handled this.
				1314	return false; // Bounds not checked.
				1315	}
				1316	bool checked = false;
				1317	// We handle the length > 1 case in a later pass.
				1318	if (length == 1) {
				1319	if (ascii && c > String::kMaxAsciiCharCodeU) {
				1320	// Can't match - see above.
				1321	return false; // Bounds not checked.
				1322	}
				1323	if (!preloaded) {
				1324	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1325	checked = check;
				1326	}
				1327	macro_assembler->CheckNotCharacter(c, on_failure);
				1328	}
				1329	return checked;
				1330	}
				1331
				1332
				1333	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1334	bool ascii,
				1335	uc16 c1,
				1336	uc16 c2,
				1337	Label* on_failure) {
				1338	uc16 char_mask;
				1339	if (ascii) {
				1340	char_mask = String::kMaxAsciiCharCode;
				1341	} else {
				1342	char_mask = String::kMaxUC16CharCode;
				1343	}
				1344	uc16 exor = c1 ^ c2;
				1345	// Check whether exor has only one bit set.
				1346	if (((exor - 1) & exor) == 0) {
				1347	// If c1 and c2 differ only by one bit.
				1348	// Ecma262UnCanonicalize always gives the highest number last.
				1349	ASSERT(c2 > c1);
				1350	uc16 mask = char_mask ^ exor;
				1351	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1352	return true;
				1353	}
				1354	ASSERT(c2 > c1);
				1355	uc16 diff = c2 - c1;
				1356	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1357	// If the characters differ by 2^n but don't differ by one bit then
				1358	// subtract the difference from the found character, then do the or
				1359	// trick. We avoid the theoretical case where negative numbers are
				1360	// involved in order to simplify code generation.
				1361	uc16 mask = char_mask ^ diff;
				1362	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1363	diff,
				1364	mask,
				1365	on_failure);
				1366	return true;
				1367	}
				1368	return false;
				1369	}
				1370
				1371
				1372	typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
				1373	uc16 c,
				1374	Label* on_failure,
				1375	int cp_offset,
				1376	bool check,
				1377	bool preloaded);
				1378
				1379	// Only emits letters (things that have case). Only used for case independent
				1380	// matches.
				1381	static inline bool EmitAtomLetter(RegExpCompiler* compiler,
				1382	uc16 c,
				1383	Label* on_failure,
				1384	int cp_offset,
				1385	bool check,
				1386	bool preloaded) {
				1387	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1388	bool ascii = compiler->ascii();
				1389	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1390	int length = GetCaseIndependentLetters(c, ascii, chars);
				1391	if (length <= 1) return false;
				1392	// We may not need to check against the end of the input string
				1393	// if this character lies before a character that matched.
				1394	if (!preloaded) {
				1395	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1396	}
				1397	Label ok;
				1398	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1399	switch (length) {
				1400	case 2: {
				1401	if (ShortCutEmitCharacterPair(macro_assembler,
				1402	ascii,
				1403	chars[0],
				1404	chars[1],
				1405	on_failure)) {
				1406	} else {
				1407	macro_assembler->CheckCharacter(chars[0], &ok);
				1408	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1409	macro_assembler->Bind(&ok);
				1410	}
				1411	break;
				1412	}
				1413	case 4:
				1414	macro_assembler->CheckCharacter(chars[3], &ok);
				1415	// Fall through!
				1416	case 3:
				1417	macro_assembler->CheckCharacter(chars[0], &ok);
				1418	macro_assembler->CheckCharacter(chars[1], &ok);
				1419	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1420	macro_assembler->Bind(&ok);
				1421	break;
				1422	default:
				1423	UNREACHABLE();
				1424	break;
				1425	}
				1426	return true;
				1427	}
				1428
				1429
				1430	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				1431	RegExpCharacterClass* cc,
				1432	bool ascii,
				1433	Label* on_failure,
				1434	int cp_offset,
				1435	bool check_offset,
				1436	bool preloaded) {
				1437	if (cc->is_standard() &&
				1438	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				1439	cp_offset,
				1440	check_offset,
				1441	on_failure)) {
				1442	return;
				1443	}
				1444
				1445	ZoneList<CharacterRange>* ranges = cc->ranges();
				1446	int max_char;
				1447	if (ascii) {
				1448	max_char = String::kMaxAsciiCharCode;
				1449	} else {
				1450	max_char = String::kMaxUC16CharCode;
				1451	}
				1452
				1453	Label success;
				1454
				1455	Label* char_is_in_class =
				1456	cc->is_negated() ? on_failure : &success;
				1457
				1458	int range_count = ranges->length();
				1459
				1460	int last_valid_range = range_count - 1;
				1461	while (last_valid_range >= 0) {
				1462	CharacterRange& range = ranges->at(last_valid_range);
				1463	if (range.from() <= max_char) {
				1464	break;
				1465	}
				1466	last_valid_range--;
				1467	}
				1468
				1469	if (last_valid_range < 0) {
				1470	if (!cc->is_negated()) {
				1471	// TODO(plesner): We can remove this when the node level does our
				1472	// ASCII optimizations for us.
				1473	macro_assembler->GoTo(on_failure);
				1474	}
				1475	if (check_offset) {
				1476	macro_assembler->CheckPosition(cp_offset, on_failure);
				1477	}
				1478	return;
				1479	}
				1480
				1481	if (last_valid_range == 0 &&
				1482	!cc->is_negated() &&
				1483	ranges->at(0).IsEverything(max_char)) {
				1484	// This is a common case hit by non-anchored expressions.
				1485	if (check_offset) {
				1486	macro_assembler->CheckPosition(cp_offset, on_failure);
				1487	}
				1488	return;
				1489	}
				1490
				1491	if (!preloaded) {
				1492	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				1493	}
				1494
				1495	for (int i = 0; i < last_valid_range; i++) {
				1496	CharacterRange& range = ranges->at(i);
				1497	Label next_range;
				1498	uc16 from = range.from();
				1499	uc16 to = range.to();
				1500	if (from > max_char) {
				1501	continue;
				1502	}
				1503	if (to > max_char) to = max_char;
				1504	if (to == from) {
				1505	macro_assembler->CheckCharacter(to, char_is_in_class);
				1506	} else {
				1507	if (from != 0) {
				1508	macro_assembler->CheckCharacterLT(from, &next_range);
				1509	}
				1510	if (to != max_char) {
				1511	macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
				1512	} else {
				1513	macro_assembler->GoTo(char_is_in_class);
				1514	}
				1515	}
				1516	macro_assembler->Bind(&next_range);
				1517	}
				1518
				1519	CharacterRange& range = ranges->at(last_valid_range);
				1520	uc16 from = range.from();
				1521	uc16 to = range.to();
				1522
				1523	if (to > max_char) to = max_char;
				1524	ASSERT(to >= from);
				1525
				1526	if (to == from) {
				1527	if (cc->is_negated()) {
				1528	macro_assembler->CheckCharacter(to, on_failure);
				1529	} else {
				1530	macro_assembler->CheckNotCharacter(to, on_failure);
				1531	}
				1532	} else {
				1533	if (from != 0) {
				1534	if (cc->is_negated()) {
				1535	macro_assembler->CheckCharacterLT(from, &success);
				1536	} else {
				1537	macro_assembler->CheckCharacterLT(from, on_failure);
				1538	}
				1539	}
				1540	if (to != String::kMaxUC16CharCode) {
				1541	if (cc->is_negated()) {
				1542	macro_assembler->CheckCharacterLT(to + 1, on_failure);
				1543	} else {
				1544	macro_assembler->CheckCharacterGT(to, on_failure);
				1545	}
				1546	} else {
				1547	if (cc->is_negated()) {
				1548	macro_assembler->GoTo(on_failure);
				1549	}
				1550	}
				1551	}
				1552	macro_assembler->Bind(&success);
				1553	}
				1554
				1555
				1556	RegExpNode::~RegExpNode() {
				1557	}
				1558
				1559
				1560	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				1561	Trace* trace) {
				1562	// If we are generating a greedy loop then don't stop and don't reuse code.
				1563	if (trace->stop_node() != NULL) {
				1564	return CONTINUE;
				1565	}
				1566
				1567	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1568	if (trace->is_trivial()) {
				1569	if (label_.is_bound()) {
				1570	// We are being asked to generate a generic version, but that's already
				1571	// been done so just go to it.
				1572	macro_assembler->GoTo(&label_);
				1573	return DONE;
				1574	}
				1575	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
				1576	// To avoid too deep recursion we push the node to the work queue and just
				1577	// generate a goto here.
				1578	compiler->AddWork(this);
				1579	macro_assembler->GoTo(&label_);
				1580	return DONE;
				1581	}
				1582	// Generate generic version of the node and bind the label for later use.
				1583	macro_assembler->Bind(&label_);
				1584	return CONTINUE;
				1585	}
				1586
				1587	// We are being asked to make a non-generic version. Keep track of how many
				1588	// non-generic versions we generate so as not to overdo it.
				1589	trace_count_++;
				1590	if (FLAG_regexp_optimization &&
				1591	trace_count_ < kMaxCopiesCodeGenerated &&
				1592	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
				1593	return CONTINUE;
				1594	}
				1595
				1596	// If we get here code has been generated for this node too many times or
				1597	// recursion is too deep. Time to switch to a generic version. The code for
				1598	// generic versions above can handle deep recursion properly.
				1599	trace->Flush(compiler, this);
				1600	return DONE;
				1601	}
				1602
				1603
				1604	int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1605	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1606	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				1607	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1608	}
				1609
				1610
				1611	int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1612	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1613	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1614	}
				1615
				1616
				1617	int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1618	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1619	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1620	}
				1621
				1622
				1623	int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1624	int answer = Length();
				1625	if (answer >= still_to_find) return answer;
				1626	if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
				1627	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				1628	recursion_depth + 1);
				1629	}
				1630
				1631
				1632	int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
				1633	int recursion_depth) {
				1634	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1635	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1636	// afterwards.
				1637	RegExpNode* node = alternatives_->at(1).node();
				1638	return node->EatsAtLeast(still_to_find, recursion_depth + 1);
				1639	}
				1640
				1641
				1642	void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
				1643	QuickCheckDetails* details,
				1644	RegExpCompiler* compiler,
				1645	int filled_in,
				1646	bool not_at_start) {
				1647	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1648	// afterwards.
				1649	RegExpNode* node = alternatives_->at(1).node();
				1650	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				1651	}
				1652
				1653
				1654	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				1655	int recursion_depth,
				1656	RegExpNode* ignore_this_node) {
				1657	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1658	int min = 100;
				1659	int choice_count = alternatives_->length();
				1660	for (int i = 0; i < choice_count; i++) {
				1661	RegExpNode* node = alternatives_->at(i).node();
				1662	if (node == ignore_this_node) continue;
				1663	int node_eats_at_least = node->EatsAtLeast(still_to_find,
				1664	recursion_depth + 1);
				1665	if (node_eats_at_least < min) min = node_eats_at_least;
				1666	}
				1667	return min;
				1668	}
				1669
				1670
				1671	int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1672	return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
				1673	}
				1674
				1675
				1676	int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1677	return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
				1678	}
				1679
				1680
				1681	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				1682	static inline uint32_t SmearBitsRight(uint32_t v) {
				1683	v \|= v >> 1;
				1684	v \|= v >> 2;
				1685	v \|= v >> 4;
				1686	v \|= v >> 8;
				1687	v \|= v >> 16;
				1688	return v;
				1689	}
				1690
				1691
				1692	bool QuickCheckDetails::Rationalize(bool asc) {
				1693	bool found_useful_op = false;
				1694	uint32_t char_mask;
				1695	if (asc) {
				1696	char_mask = String::kMaxAsciiCharCode;
				1697	} else {
				1698	char_mask = String::kMaxUC16CharCode;
				1699	}
				1700	mask_ = 0;
				1701	value_ = 0;
				1702	int char_shift = 0;
				1703	for (int i = 0; i < characters_; i++) {
				1704	Position* pos = &positions_[i];
				1705	if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
				1706	found_useful_op = true;
				1707	}
				1708	mask_ \|= (pos->mask & char_mask) << char_shift;
				1709	value_ \|= (pos->value & char_mask) << char_shift;
				1710	char_shift += asc ? 8 : 16;
				1711	}
				1712	return found_useful_op;
				1713	}
				1714
				1715
				1716	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				1717	Trace* trace,
				1718	bool preload_has_checked_bounds,
				1719	Label* on_possible_success,
				1720	QuickCheckDetails* details,
				1721	bool fall_through_on_failure) {
				1722	if (details->characters() == 0) return false;
				1723	GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
				1724	if (details->cannot_match()) return false;
				1725	if (!details->Rationalize(compiler->ascii())) return false;
				1726	ASSERT(details->characters() == 1 \|\|
				1727	compiler->macro_assembler()->CanReadUnaligned());
				1728	uint32_t mask = details->mask();
				1729	uint32_t value = details->value();
				1730
				1731	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1732
				1733	if (trace->characters_preloaded() != details->characters()) {
				1734	assembler->LoadCurrentCharacter(trace->cp_offset(),
				1735	trace->backtrack(),
				1736	!preload_has_checked_bounds,
				1737	details->characters());
				1738	}
				1739
				1740
				1741	bool need_mask = true;
				1742
				1743	if (details->characters() == 1) {
				1744	// If number of characters preloaded is 1 then we used a byte or 16 bit
				1745	// load so the value is already masked down.
				1746	uint32_t char_mask;
				1747	if (compiler->ascii()) {
				1748	char_mask = String::kMaxAsciiCharCode;
				1749	} else {
				1750	char_mask = String::kMaxUC16CharCode;
				1751	}
				1752	if ((mask & char_mask) == char_mask) need_mask = false;
				1753	mask &= char_mask;
				1754	} else {
				1755	// For 2-character preloads in ASCII mode we also use a 16 bit load with
				1756	// zero extend.
				1757	if (details->characters() == 2 && compiler->ascii()) {
				1758	if ((mask & 0xffff) == 0xffff) need_mask = false;
				1759	} else {
				1760	if (mask == 0xffffffff) need_mask = false;
				1761	}
				1762	}
				1763
				1764	if (fall_through_on_failure) {
				1765	if (need_mask) {
				1766	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				1767	} else {
				1768	assembler->CheckCharacter(value, on_possible_success);
				1769	}
				1770	} else {
				1771	if (need_mask) {
				1772	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				1773	} else {
				1774	assembler->CheckNotCharacter(value, trace->backtrack());
				1775	}
				1776	}
				1777	return true;
				1778	}
				1779
				1780
				1781	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				1782	// super-class in the .h file).
				1783	//
				1784	// We iterate along the text object, building up for each character a
				1785	// mask and value that can be used to test for a quick failure to match.
				1786	// The masks and values for the positions will be combined into a single
				1787	// machine word for the current character width in order to be used in
				1788	// generating a quick check.
				1789	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				1790	RegExpCompiler* compiler,
				1791	int characters_filled_in,
				1792	bool not_at_start) {
				1793	ASSERT(characters_filled_in < details->characters());
				1794	int characters = details->characters();
				1795	int char_mask;
				1796	int char_shift;
				1797	if (compiler->ascii()) {
				1798	char_mask = String::kMaxAsciiCharCode;
				1799	char_shift = 8;
				1800	} else {
				1801	char_mask = String::kMaxUC16CharCode;
				1802	char_shift = 16;
				1803	}
				1804	for (int k = 0; k < elms_->length(); k++) {
				1805	TextElement elm = elms_->at(k);
				1806	if (elm.type == TextElement::ATOM) {
				1807	Vector<const uc16> quarks = elm.data.u_atom->data();
				1808	for (int i = 0; i < characters && i < quarks.length(); i++) {
				1809	QuickCheckDetails::Position* pos =
				1810	details->positions(characters_filled_in);
				1811	uc16 c = quarks[i];
				1812	if (c > char_mask) {
				1813	// If we expect a non-ASCII character from an ASCII string,
				1814	// there is no way we can match. Not even case independent
				1815	// matching can turn an ASCII character into non-ASCII or
				1816	// vice versa.
				1817	details->set_cannot_match();
				1818	pos->determines_perfectly = false;
				1819	return;
				1820	}
				1821	if (compiler->ignore_case()) {
				1822	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1823	int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
				1824	ASSERT(length != 0); // Can only happen if c > char_mask (see above).
				1825	if (length == 1) {
				1826	// This letter has no case equivalents, so it's nice and simple
				1827	// and the mask-compare will determine definitely whether we have
				1828	// a match at this character position.
				1829	pos->mask = char_mask;
				1830	pos->value = c;
				1831	pos->determines_perfectly = true;
				1832	} else {
				1833	uint32_t common_bits = char_mask;
				1834	uint32_t bits = chars[0];
				1835	for (int j = 1; j < length; j++) {
				1836	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				1837	common_bits ^= differing_bits;
				1838	bits &= common_bits;
				1839	}
				1840	// If length is 2 and common bits has only one zero in it then
				1841	// our mask and compare instruction will determine definitely
				1842	// whether we have a match at this character position. Otherwise
				1843	// it can only be an approximate check.
				1844	uint32_t one_zero = (common_bits \| ~char_mask);
				1845	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				1846	pos->determines_perfectly = true;
				1847	}
				1848	pos->mask = common_bits;
				1849	pos->value = bits;
				1850	}
				1851	} else {
				1852	// Don't ignore case. Nice simple case where the mask-compare will
				1853	// determine definitely whether we have a match at this character
				1854	// position.
				1855	pos->mask = char_mask;
				1856	pos->value = c;
				1857	pos->determines_perfectly = true;
				1858	}
				1859	characters_filled_in++;
				1860	ASSERT(characters_filled_in <= details->characters());
				1861	if (characters_filled_in == details->characters()) {
				1862	return;
				1863	}
				1864	}
				1865	} else {
				1866	QuickCheckDetails::Position* pos =
				1867	details->positions(characters_filled_in);
				1868	RegExpCharacterClass* tree = elm.data.u_char_class;
				1869	ZoneList<CharacterRange>* ranges = tree->ranges();
				1870	if (tree->is_negated()) {
				1871	// A quick check uses multi-character mask and compare. There is no
				1872	// useful way to incorporate a negative char class into this scheme
				1873	// so we just conservatively create a mask and value that will always
				1874	// succeed.
				1875	pos->mask = 0;
				1876	pos->value = 0;
				1877	} else {
				1878	int first_range = 0;
				1879	while (ranges->at(first_range).from() > char_mask) {
				1880	first_range++;
				1881	if (first_range == ranges->length()) {
				1882	details->set_cannot_match();
				1883	pos->determines_perfectly = false;
				1884	return;
				1885	}
				1886	}
				1887	CharacterRange range = ranges->at(first_range);
				1888	uc16 from = range.from();
				1889	uc16 to = range.to();
				1890	if (to > char_mask) {
				1891	to = char_mask;
				1892	}
				1893	uint32_t differing_bits = (from ^ to);
				1894	// A mask and compare is only perfect if the differing bits form a
				1895	// number like 00011111 with one single block of trailing 1s.
				1896	if ((differing_bits & (differing_bits + 1)) == 0 &&
				1897	from + differing_bits == to) {
				1898	pos->determines_perfectly = true;
				1899	}
				1900	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				1901	uint32_t bits = (from & common_bits);
				1902	for (int i = first_range + 1; i < ranges->length(); i++) {
				1903	CharacterRange range = ranges->at(i);
				1904	uc16 from = range.from();
				1905	uc16 to = range.to();
				1906	if (from > char_mask) continue;
				1907	if (to > char_mask) to = char_mask;
				1908	// Here we are combining more ranges into the mask and compare
				1909	// value. With each new range the mask becomes more sparse and
				1910	// so the chances of a false positive rise. A character class
				1911	// with multiple ranges is assumed never to be equivalent to a
				1912	// mask and compare operation.
				1913	pos->determines_perfectly = false;
				1914	uint32_t new_common_bits = (from ^ to);
				1915	new_common_bits = ~SmearBitsRight(new_common_bits);
				1916	common_bits &= new_common_bits;
				1917	bits &= new_common_bits;
				1918	uint32_t differing_bits = (from & common_bits) ^ bits;
				1919	common_bits ^= differing_bits;
				1920	bits &= common_bits;
				1921	}
				1922	pos->mask = common_bits;
				1923	pos->value = bits;
				1924	}
				1925	characters_filled_in++;
				1926	ASSERT(characters_filled_in <= details->characters());
				1927	if (characters_filled_in == details->characters()) {
				1928	return;
				1929	}
				1930	}
				1931	}
				1932	ASSERT(characters_filled_in != details->characters());
				1933	on_success()-> GetQuickCheckDetails(details,
				1934	compiler,
				1935	characters_filled_in,
				1936	true);
				1937	}
				1938
				1939
				1940	void QuickCheckDetails::Clear() {
				1941	for (int i = 0; i < characters_; i++) {
				1942	positions_[i].mask = 0;
				1943	positions_[i].value = 0;
				1944	positions_[i].determines_perfectly = false;
				1945	}
				1946	characters_ = 0;
				1947	}
				1948
				1949
				1950	void QuickCheckDetails::Advance(int by, bool ascii) {
				1951	ASSERT(by >= 0);
				1952	if (by >= characters_) {
				1953	Clear();
				1954	return;
				1955	}
				1956	for (int i = 0; i < characters_ - by; i++) {
				1957	positions_[i] = positions_[by + i];
				1958	}
				1959	for (int i = characters_ - by; i < characters_; i++) {
				1960	positions_[i].mask = 0;
				1961	positions_[i].value = 0;
				1962	positions_[i].determines_perfectly = false;
				1963	}
				1964	characters_ -= by;
				1965	// We could change mask_ and value_ here but we would never advance unless
				1966	// they had already been used in a check and they won't be used again because
				1967	// it would gain us nothing. So there's no point.
				1968	}
				1969
				1970
				1971	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				1972	ASSERT(characters_ == other->characters_);
				1973	if (other->cannot_match_) {
				1974	return;
				1975	}
				1976	if (cannot_match_) {
				1977	this = other;
				1978	return;
				1979	}
				1980	for (int i = from_index; i < characters_; i++) {
				1981	QuickCheckDetails::Position* pos = positions(i);
				1982	QuickCheckDetails::Position* other_pos = other->positions(i);
				1983	if (pos->mask != other_pos->mask \|\|
				1984	pos->value != other_pos->value \|\|
				1985	!other_pos->determines_perfectly) {
				1986	// Our mask-compare operation will be approximate unless we have the
				1987	// exact same operation on both sides of the alternation.
				1988	pos->determines_perfectly = false;
				1989	}
				1990	pos->mask &= other_pos->mask;
				1991	pos->value &= pos->mask;
				1992	other_pos->value &= pos->mask;
				1993	uc16 differing_bits = (pos->value ^ other_pos->value);
				1994	pos->mask &= ~differing_bits;
				1995	pos->value &= pos->mask;
				1996	}
				1997	}
				1998
				1999
				2000	class VisitMarker {
				2001	public:
				2002	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2003	ASSERT(!info->visited);
				2004	info->visited = true;
				2005	}
				2006	~VisitMarker() {
				2007	info_->visited = false;
				2008	}
				2009	private:
				2010	NodeInfo* info_;
				2011	};
				2012
				2013
				2014	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2015	RegExpCompiler* compiler,
				2016	int characters_filled_in,
				2017	bool not_at_start) {
				2018	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2019	VisitMarker marker(info());
				2020	return ChoiceNode::GetQuickCheckDetails(details,
				2021	compiler,
				2022	characters_filled_in,
				2023	not_at_start);
				2024	}
				2025
				2026
				2027	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2028	RegExpCompiler* compiler,
				2029	int characters_filled_in,
				2030	bool not_at_start) {
				2031	not_at_start = (not_at_start \|\| not_at_start_);
				2032	int choice_count = alternatives_->length();
				2033	ASSERT(choice_count > 0);
				2034	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2035	compiler,
				2036	characters_filled_in,
				2037	not_at_start);
				2038	for (int i = 1; i < choice_count; i++) {
				2039	QuickCheckDetails new_details(details->characters());
				2040	RegExpNode* node = alternatives_->at(i).node();
				2041	node->GetQuickCheckDetails(&new_details, compiler,
				2042	characters_filled_in,
				2043	not_at_start);
				2044	// Here we merge the quick match details of the two branches.
				2045	details->Merge(&new_details, characters_filled_in);
				2046	}
				2047	}
				2048
				2049
				2050	// Check for [0-9A-Z_a-z].
				2051	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2052	Label* word,
				2053	Label* non_word,
				2054	bool fall_through_on_word) {
				2055	assembler->CheckCharacterGT('z', non_word);
				2056	assembler->CheckCharacterLT('0', non_word);
				2057	assembler->CheckCharacterGT('a' - 1, word);
				2058	assembler->CheckCharacterLT('9' + 1, word);
				2059	assembler->CheckCharacterLT('A', non_word);
				2060	assembler->CheckCharacterLT('Z' + 1, word);
				2061	if (fall_through_on_word) {
				2062	assembler->CheckNotCharacter('_', non_word);
				2063	} else {
				2064	assembler->CheckCharacter('_', word);
				2065	}
				2066	}
				2067
				2068
				2069	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				2070	// that matches newline or the start of input).
				2071	static void EmitHat(RegExpCompiler* compiler,
				2072	RegExpNode* on_success,
				2073	Trace* trace) {
				2074	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2075	// We will be loading the previous character into the current character
				2076	// register.
				2077	Trace new_trace(*trace);
				2078	new_trace.InvalidateCurrentCharacter();
				2079
				2080	Label ok;
				2081	if (new_trace.cp_offset() == 0) {
				2082	// The start of input counts as a newline in this context, so skip to
				2083	// ok if we are at the start.
				2084	assembler->CheckAtStart(&ok);
				2085	}
				2086	// We already checked that we are not at the start of input so it must be
				2087	// OK to load the previous character.
				2088	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				2089	new_trace.backtrack(),
				2090	false);
				2091	// Newline means \n, \r, 0x2028 or 0x2029.
				2092	if (!compiler->ascii()) {
				2093	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				2094	}
				2095	assembler->CheckCharacter('\n', &ok);
				2096	assembler->CheckNotCharacter('\r', new_trace.backtrack());
				2097	assembler->Bind(&ok);
				2098	on_success->Emit(compiler, &new_trace);
				2099	}
				2100
				2101
				2102	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				2103	static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
				2104	RegExpCompiler* compiler,
				2105	RegExpNode* on_success,
				2106	Trace* trace) {
				2107	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2108	Label before_non_word;
				2109	Label before_word;
				2110	if (trace->characters_preloaded() != 1) {
				2111	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				2112	}
				2113	// Fall through on non-word.
				2114	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				2115
				2116	// We will be loading the previous character into the current character
				2117	// register.
				2118	Trace new_trace(*trace);
				2119	new_trace.InvalidateCurrentCharacter();
				2120
				2121	Label ok;
				2122	Label* boundary;
				2123	Label* not_boundary;
				2124	if (type == AssertionNode::AT_BOUNDARY) {
				2125	boundary = &ok;
				2126	not_boundary = new_trace.backtrack();
				2127	} else {
				2128	not_boundary = &ok;
				2129	boundary = new_trace.backtrack();
				2130	}
				2131
				2132	// Next character is not a word character.
				2133	assembler->Bind(&before_non_word);
				2134	if (new_trace.cp_offset() == 0) {
				2135	// The start of input counts as a non-word character, so the question is
				2136	// decided if we are at the start.
				2137	assembler->CheckAtStart(not_boundary);
				2138	}
				2139	// We already checked that we are not at the start of input so it must be
				2140	// OK to load the previous character.
				2141	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2142	&ok, // Unused dummy label in this call.
				2143	false);
				2144	// Fall through on non-word.
				2145	EmitWordCheck(assembler, boundary, not_boundary, false);
				2146	assembler->GoTo(not_boundary);
				2147
				2148	// Next character is a word character.
				2149	assembler->Bind(&before_word);
				2150	if (new_trace.cp_offset() == 0) {
				2151	// The start of input counts as a non-word character, so the question is
				2152	// decided if we are at the start.
				2153	assembler->CheckAtStart(boundary);
				2154	}
				2155	// We already checked that we are not at the start of input so it must be
				2156	// OK to load the previous character.
				2157	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2158	&ok, // Unused dummy label in this call.
				2159	false);
				2160	bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
				2161	EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
				2162
				2163	assembler->Bind(&ok);
				2164
				2165	on_success->Emit(compiler, &new_trace);
				2166	}
				2167
				2168
				2169	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2170	RegExpCompiler* compiler,
				2171	int filled_in,
				2172	bool not_at_start) {
				2173	if (type_ == AT_START && not_at_start) {
				2174	details->set_cannot_match();
				2175	return;
				2176	}
				2177	return on_success()->GetQuickCheckDetails(details,
				2178	compiler,
				2179	filled_in,
				2180	not_at_start);
				2181	}
				2182
				2183
				2184	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2185	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2186	switch (type_) {
				2187	case AT_END: {
				2188	Label ok;
				2189	assembler->CheckPosition(trace->cp_offset(), &ok);
				2190	assembler->GoTo(trace->backtrack());
				2191	assembler->Bind(&ok);
				2192	break;
				2193	}
				2194	case AT_START: {
				2195	if (trace->at_start() == Trace::FALSE) {
				2196	assembler->GoTo(trace->backtrack());
				2197	return;
				2198	}
				2199	if (trace->at_start() == Trace::UNKNOWN) {
				2200	assembler->CheckNotAtStart(trace->backtrack());
				2201	Trace at_start_trace = *trace;
				2202	at_start_trace.set_at_start(true);
				2203	on_success()->Emit(compiler, &at_start_trace);
				2204	return;
				2205	}
				2206	}
				2207	break;
				2208	case AFTER_NEWLINE:
				2209	EmitHat(compiler, on_success(), trace);
				2210	return;
				2211	case AT_NON_BOUNDARY:
				2212	case AT_BOUNDARY:
				2213	EmitBoundaryCheck(type_, compiler, on_success(), trace);
				2214	return;
				2215	}
				2216	on_success()->Emit(compiler, trace);
				2217	}
				2218
				2219
				2220	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				2221	if (quick_check == NULL) return false;
				2222	if (offset >= quick_check->characters()) return false;
				2223	return quick_check->positions(offset)->determines_perfectly;
				2224	}
				2225
				2226
				2227	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				2228	if (index > *checked_up_to) {
				2229	*checked_up_to = index;
				2230	}
				2231	}
				2232
				2233
				2234	// We call this repeatedly to generate code for each pass over the text node.
				2235	// The passes are in increasing order of difficulty because we hope one
				2236	// of the first passes will fail in which case we are saved the work of the
				2237	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				2238	// we will check the '%' in the first pass, the case independent 'a' in the
				2239	// second pass and the character class in the last pass.
				2240	//
				2241	// The passes are done from right to left, so for example to test for /bar/
				2242	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				2243	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				2244	// bounds check most of the time. In the example we only need to check for
				2245	// end-of-input when loading the putative 'r'.
				2246	//
				2247	// A slight complication involves the fact that the first character may already
				2248	// be fetched into a register by the previous node. In this case we want to
				2249	// do the test for that character first. We do this in separate passes. The
				2250	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				2251	// pass has been performed then subsequent passes will have true in
				2252	// first_element_checked to indicate that that character does not need to be
				2253	// checked again.
				2254	//
				2255	// In addition to all this we are passed a Trace, which can
				2256	// contain an AlternativeGeneration object. In this AlternativeGeneration
				2257	// object we can see details of any quick check that was already passed in
				2258	// order to get to the code we are now generating. The quick check can involve
				2259	// loading characters, which means we do not need to recheck the bounds
				2260	// up to the limit the quick check already checked. In addition the quick
				2261	// check can have involved a mask and compare operation which may simplify
				2262	// or obviate the need for further checks at some character positions.
				2263	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				2264	TextEmitPassType pass,
				2265	bool preloaded,
				2266	Trace* trace,
				2267	bool first_element_checked,
				2268	int* checked_up_to) {
				2269	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2270	bool ascii = compiler->ascii();
				2271	Label* backtrack = trace->backtrack();
				2272	QuickCheckDetails* quick_check = trace->quick_check_performed();
				2273	int element_count = elms_->length();
				2274	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				2275	TextElement elm = elms_->at(i);
				2276	int cp_offset = trace->cp_offset() + elm.cp_offset;
				2277	if (elm.type == TextElement::ATOM) {
				2278	Vector<const uc16> quarks = elm.data.u_atom->data();
				2279	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				2280	if (first_element_checked && i == 0 && j == 0) continue;
				2281	if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
				2282	EmitCharacterFunction* emit_function = NULL;
				2283	switch (pass) {
				2284	case NON_ASCII_MATCH:
				2285	ASSERT(ascii);
				2286	if (quarks[j] > String::kMaxAsciiCharCode) {
				2287	assembler->GoTo(backtrack);
				2288	return;
				2289	}
				2290	break;
				2291	case NON_LETTER_CHARACTER_MATCH:
				2292	emit_function = &EmitAtomNonLetter;
				2293	break;
				2294	case SIMPLE_CHARACTER_MATCH:
				2295	emit_function = &EmitSimpleCharacter;
				2296	break;
				2297	case CASE_CHARACTER_MATCH:
				2298	emit_function = &EmitAtomLetter;
				2299	break;
				2300	default:
				2301	break;
				2302	}
				2303	if (emit_function != NULL) {
				2304	bool bound_checked = emit_function(compiler,
				2305	quarks[j],
				2306	backtrack,
				2307	cp_offset + j,
				2308	*checked_up_to < cp_offset + j,
				2309	preloaded);
				2310	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				2311	}
				2312	}
				2313	} else {
				2314	ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
				2315	if (pass == CHARACTER_CLASS_MATCH) {
				2316	if (first_element_checked && i == 0) continue;
				2317	if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
				2318	RegExpCharacterClass* cc = elm.data.u_char_class;
				2319	EmitCharClass(assembler,
				2320	cc,
				2321	ascii,
				2322	backtrack,
				2323	cp_offset,
				2324	*checked_up_to < cp_offset,
				2325	preloaded);
				2326	UpdateBoundsCheck(cp_offset, checked_up_to);
				2327	}
				2328	}
				2329	}
				2330	}
				2331
				2332
				2333	int TextNode::Length() {
				2334	TextElement elm = elms_->last();
				2335	ASSERT(elm.cp_offset >= 0);
				2336	if (elm.type == TextElement::ATOM) {
				2337	return elm.cp_offset + elm.data.u_atom->data().length();
				2338	} else {
				2339	return elm.cp_offset + 1;
				2340	}
				2341	}
				2342
				2343
				2344	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				2345	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				2346	if (ignore_case) {
				2347	return pass == SIMPLE_CHARACTER_MATCH;
				2348	} else {
				2349	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				2350	}
				2351	}
				2352
				2353
				2354	// This generates the code to match a text node. A text node can contain
				2355	// straight character sequences (possibly to be matched in a case-independent
				2356	// way) and character classes. For efficiency we do not do this in a single
				2357	// pass from left to right. Instead we pass over the text node several times,
				2358	// emitting code for some character positions every time. See the comment on
				2359	// TextEmitPass for details.
				2360	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2361	LimitResult limit_result = LimitVersions(compiler, trace);
				2362	if (limit_result == DONE) return;
				2363	ASSERT(limit_result == CONTINUE);
				2364
				2365	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				2366	compiler->SetRegExpTooBig();
				2367	return;
				2368	}
				2369
				2370	if (compiler->ascii()) {
				2371	int dummy = 0;
				2372	TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
				2373	}
				2374
				2375	bool first_elt_done = false;
				2376	int bound_checked_to = trace->cp_offset() - 1;
				2377	bound_checked_to += trace->bound_checked_up_to();
				2378
				2379	// If a character is preloaded into the current character register then
				2380	// check that now.
				2381	if (trace->characters_preloaded() == 1) {
				2382	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2383	if (!SkipPass(pass, compiler->ignore_case())) {
				2384	TextEmitPass(compiler,
				2385	static_cast<TextEmitPassType>(pass),
				2386	true,
				2387	trace,
				2388	false,
				2389	&bound_checked_to);
				2390	}
				2391	}
				2392	first_elt_done = true;
				2393	}
				2394
				2395	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2396	if (!SkipPass(pass, compiler->ignore_case())) {
				2397	TextEmitPass(compiler,
				2398	static_cast<TextEmitPassType>(pass),
				2399	false,
				2400	trace,
				2401	first_elt_done,
				2402	&bound_checked_to);
				2403	}
				2404	}
				2405
				2406	Trace successor_trace(*trace);
				2407	successor_trace.set_at_start(false);
				2408	successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
				2409	RecursionCheck rc(compiler);
				2410	on_success()->Emit(compiler, &successor_trace);
				2411	}
				2412
				2413
				2414	void Trace::InvalidateCurrentCharacter() {
				2415	characters_preloaded_ = 0;
				2416	}
				2417
				2418
				2419	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				2420	ASSERT(by > 0);
				2421	// We don't have an instruction for shifting the current character register
				2422	// down or for using a shifted value for anything so lets just forget that
				2423	// we preloaded any characters into it.
				2424	characters_preloaded_ = 0;
				2425	// Adjust the offsets of the quick check performed information. This
				2426	// information is used to find out what we already determined about the
				2427	// characters by means of mask and compare.
				2428	quick_check_performed_.Advance(by, compiler->ascii());
				2429	cp_offset_ += by;
				2430	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				2431	compiler->SetRegExpTooBig();
				2432	cp_offset_ = 0;
				2433	}
				2434	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				2435	}
				2436
				2437
				2438	void TextNode::MakeCaseIndependent() {
				2439	int element_count = elms_->length();
				2440	for (int i = 0; i < element_count; i++) {
				2441	TextElement elm = elms_->at(i);
				2442	if (elm.type == TextElement::CHAR_CLASS) {
				2443	RegExpCharacterClass* cc = elm.data.u_char_class;
				2444	ZoneList<CharacterRange>* ranges = cc->ranges();
				2445	int range_count = ranges->length();
				2446	for (int i = 0; i < range_count; i++) {
				2447	ranges->at(i).AddCaseEquivalents(ranges);
				2448	}
				2449	}
				2450	}
				2451	}
				2452
				2453
				2454	int TextNode::GreedyLoopTextLength() {
				2455	TextElement elm = elms_->at(elms_->length() - 1);
				2456	if (elm.type == TextElement::CHAR_CLASS) {
				2457	return elm.cp_offset + 1;
				2458	} else {
				2459	return elm.cp_offset + elm.data.u_atom->data().length();
				2460	}
				2461	}
				2462
				2463
				2464	// Finds the fixed match length of a sequence of nodes that goes from
				2465	// this alternative and back to this choice node. If there are variable
				2466	// length nodes or other complications in the way then return a sentinel
				2467	// value indicating that a greedy loop cannot be constructed.
				2468	int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
				2469	int length = 0;
				2470	RegExpNode* node = alternative->node();
				2471	// Later we will generate code for all these text nodes using recursion
				2472	// so we have to limit the max number.
				2473	int recursion_depth = 0;
				2474	while (node != this) {
				2475	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				2476	return kNodeIsTooComplexForGreedyLoops;
				2477	}
				2478	int node_length = node->GreedyLoopTextLength();
				2479	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				2480	return kNodeIsTooComplexForGreedyLoops;
				2481	}
				2482	length += node_length;
				2483	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				2484	node = seq_node->on_success();
				2485	}
				2486	return length;
				2487	}
				2488
				2489
				2490	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				2491	ASSERT_EQ(loop_node_, NULL);
				2492	AddAlternative(alt);
				2493	loop_node_ = alt.node();
				2494	}
				2495
				2496
				2497	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				2498	ASSERT_EQ(continue_node_, NULL);
				2499	AddAlternative(alt);
				2500	continue_node_ = alt.node();
				2501	}
				2502
				2503
				2504	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2505	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2506	if (trace->stop_node() == this) {
				2507	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2508	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
				2509	// Update the counter-based backtracking info on the stack. This is an
				2510	// optimization for greedy loops (see below).
				2511	ASSERT(trace->cp_offset() == text_length);
				2512	macro_assembler->AdvanceCurrentPosition(text_length);
				2513	macro_assembler->GoTo(trace->loop_label());
				2514	return;
				2515	}
				2516	ASSERT(trace->stop_node() == NULL);
				2517	if (!trace->is_trivial()) {
				2518	trace->Flush(compiler, this);
				2519	return;
				2520	}
				2521	ChoiceNode::Emit(compiler, trace);
				2522	}
				2523
				2524
				2525	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
				2526	int preload_characters = EatsAtLeast(4, 0);
				2527	if (compiler->macro_assembler()->CanReadUnaligned()) {
				2528	bool ascii = compiler->ascii();
				2529	if (ascii) {
				2530	if (preload_characters > 4) preload_characters = 4;
				2531	// We can't preload 3 characters because there is no machine instruction
				2532	// to do that. We can't just load 4 because we could be reading
				2533	// beyond the end of the string, which could cause a memory fault.
				2534	if (preload_characters == 3) preload_characters = 2;
				2535	} else {
				2536	if (preload_characters > 2) preload_characters = 2;
				2537	}
				2538	} else {
				2539	if (preload_characters > 1) preload_characters = 1;
				2540	}
				2541	return preload_characters;
				2542	}
				2543
				2544
				2545	// This class is used when generating the alternatives in a choice node. It
				2546	// records the way the alternative is being code generated.
				2547	class AlternativeGeneration: public Malloced {
				2548	public:
				2549	AlternativeGeneration()
				2550	: possible_success(),
				2551	expects_preload(false),
				2552	after(),
				2553	quick_check_details() { }
				2554	Label possible_success;
				2555	bool expects_preload;
				2556	Label after;
				2557	QuickCheckDetails quick_check_details;
				2558	};
				2559
				2560
				2561	// Creates a list of AlternativeGenerations. If the list has a reasonable
				2562	// size then it is on the stack, otherwise the excess is on the heap.
				2563	class AlternativeGenerationList {
				2564	public:
				2565	explicit AlternativeGenerationList(int count)
				2566	: alt_gens_(count) {
				2567	for (int i = 0; i < count && i < kAFew; i++) {
				2568	alt_gens_.Add(a_few_alt_gens_ + i);
				2569	}
				2570	for (int i = kAFew; i < count; i++) {
				2571	alt_gens_.Add(new AlternativeGeneration());
				2572	}
				2573	}
				2574	~AlternativeGenerationList() {
				2575	for (int i = kAFew; i < alt_gens_.length(); i++) {
				2576	delete alt_gens_[i];
				2577	alt_gens_[i] = NULL;
				2578	}
				2579	}
				2580
				2581	AlternativeGeneration* at(int i) {
				2582	return alt_gens_[i];
				2583	}
				2584	private:
				2585	static const int kAFew = 10;
				2586	ZoneList<AlternativeGeneration*> alt_gens_;
				2587	AlternativeGeneration a_few_alt_gens_[kAFew];
				2588	};
				2589
				2590
				2591	/* Code generation for choice nodes.
				2592	*
				2593	* We generate quick checks that do a mask and compare to eliminate a
				2594	* choice. If the quick check succeeds then it jumps to the continuation to
				2595	* do slow checks and check subsequent nodes. If it fails (the common case)
				2596	* it falls through to the next choice.
				2597	*
				2598	* Here is the desired flow graph. Nodes directly below each other imply
				2599	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				2600	* 3 doesn't have a quick check so we have to call the slow check.
				2601	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				2602	* regexp continuation is generated directly after the Sn node, up to the
				2603	* next GoTo if we decide to reuse some already generated code. Some
				2604	* nodes expect preload_characters to be preloaded into the current
				2605	* character register. R nodes do this preloading. Vertices are marked
				2606	* F for failures and S for success (possible success in the case of quick
				2607	* nodes). L, V, < and > are used as arrow heads.
				2608	*
				2609	* ----------> R
				2610	* \|
				2611	* V
				2612	* Q1 -----> S1
				2613	* \| S /
				2614	* F\| /
				2615	* \| F/
				2616	* \| /
				2617	* \| R
				2618	* \| /
				2619	* V L
				2620	* Q2 -----> S2
				2621	* \| S /
				2622	* F\| /
				2623	* \| F/
				2624	* \| /
				2625	* \| R
				2626	* \| /
				2627	* V L
				2628	* S3
				2629	* \|
				2630	* F\|
				2631	* \|
				2632	* R
				2633	* \|
				2634	* backtrack V
				2635	* <----------Q4
				2636	* \ F \|
				2637	* \ \|S
				2638	* \ F V
				2639	* \-----S4
				2640	*
				2641	* For greedy loops we reverse our expectation and expect to match rather
				2642	* than fail. Therefore we want the loop code to look like this (U is the
				2643	* unwind code that steps back in the greedy loop). The following alternatives
				2644	* look the same as above.
				2645	* _____
				2646	* / \
				2647	* V \|
				2648	* ----------> S1 \|
				2649	* /\| \|
				2650	* / \|S \|
				2651	* F/ \_____/
				2652	* /
				2653	* \|<-----------
				2654	* \| \
				2655	* V \
				2656	* Q2 ---> S2 \
				2657	* \| S / \|
				2658	* F\| / \|
				2659	* \| F/ \|
				2660	* \| / \|
				2661	* \| R \|
				2662	* \| / \|
				2663	* F VL \|
				2664	* <------U \|
				2665	* back \|S \|
				2666	* \______________/
				2667	*/
				2668
				2669
				2670	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2671	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2672	int choice_count = alternatives_->length();
				2673	#ifdef DEBUG
				2674	for (int i = 0; i < choice_count - 1; i++) {
				2675	GuardedAlternative alternative = alternatives_->at(i);
				2676	ZoneList<Guard> guards = alternative.guards();
				2677	int guard_count = (guards == NULL) ? 0 : guards->length();
				2678	for (int j = 0; j < guard_count; j++) {
				2679	ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
				2680	}
				2681	}
				2682	#endif
				2683
				2684	LimitResult limit_result = LimitVersions(compiler, trace);
				2685	if (limit_result == DONE) return;
				2686	ASSERT(limit_result == CONTINUE);
				2687
				2688	int new_flush_budget = trace->flush_budget() / choice_count;
				2689	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				2690	trace->Flush(compiler, this);
				2691	return;
				2692	}
				2693
				2694	RecursionCheck rc(compiler);
				2695
				2696	Trace* current_trace = trace;
				2697
				2698	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2699	bool greedy_loop = false;
				2700	Label greedy_loop_label;
				2701	Trace counter_backtrack_trace;
				2702	counter_backtrack_trace.set_backtrack(&greedy_loop_label);
				2703	if (not_at_start()) counter_backtrack_trace.set_at_start(false);
				2704
				2705	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				2706	// Here we have special handling for greedy loops containing only text nodes
				2707	// and other simple nodes. These are handled by pushing the current
				2708	// position on the stack and then incrementing the current position each
				2709	// time around the switch. On backtrack we decrement the current position
				2710	// and check it against the pushed value. This avoids pushing backtrack
				2711	// information for each iteration of the loop, which could take up a lot of
				2712	// space.
				2713	greedy_loop = true;
				2714	ASSERT(trace->stop_node() == NULL);
				2715	macro_assembler->PushCurrentPosition();
				2716	current_trace = &counter_backtrack_trace;
				2717	Label greedy_match_failed;
				2718	Trace greedy_match_trace;
				2719	if (not_at_start()) greedy_match_trace.set_at_start(false);
				2720	greedy_match_trace.set_backtrack(&greedy_match_failed);
				2721	Label loop_label;
				2722	macro_assembler->Bind(&loop_label);
				2723	greedy_match_trace.set_stop_node(this);
				2724	greedy_match_trace.set_loop_label(&loop_label);
				2725	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				2726	macro_assembler->Bind(&greedy_match_failed);
				2727	}
				2728
				2729	Label second_choice; // For use in greedy matches.
				2730	macro_assembler->Bind(&second_choice);
				2731
				2732	int first_normal_choice = greedy_loop ? 1 : 0;
				2733
				2734	int preload_characters = CalculatePreloadCharacters(compiler);
				2735	bool preload_is_current =
				2736	(current_trace->characters_preloaded() == preload_characters);
				2737	bool preload_has_checked_bounds = preload_is_current;
				2738
				2739	AlternativeGenerationList alt_gens(choice_count);
				2740
				2741	// For now we just call all choices one after the other. The idea ultimately
				2742	// is to use the Dispatch table to try only the relevant ones.
				2743	for (int i = first_normal_choice; i < choice_count; i++) {
				2744	GuardedAlternative alternative = alternatives_->at(i);
				2745	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2746	alt_gen->quick_check_details.set_characters(preload_characters);
				2747	ZoneList<Guard> guards = alternative.guards();
				2748	int guard_count = (guards == NULL) ? 0 : guards->length();
				2749	Trace new_trace(*current_trace);
				2750	new_trace.set_characters_preloaded(preload_is_current ?
				2751	preload_characters :
				2752	0);
				2753	if (preload_has_checked_bounds) {
				2754	new_trace.set_bound_checked_up_to(preload_characters);
				2755	}
				2756	new_trace.quick_check_performed()->Clear();
				2757	if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
				2758	alt_gen->expects_preload = preload_is_current;
				2759	bool generate_full_check_inline = false;
				2760	if (FLAG_regexp_optimization &&
				2761	try_to_emit_quick_check_for_alternative(i) &&
				2762	alternative.node()->EmitQuickCheck(compiler,
				2763	&new_trace,
				2764	preload_has_checked_bounds,
				2765	&alt_gen->possible_success,
				2766	&alt_gen->quick_check_details,
				2767	i < choice_count - 1)) {
				2768	// Quick check was generated for this choice.
				2769	preload_is_current = true;
				2770	preload_has_checked_bounds = true;
				2771	// On the last choice in the ChoiceNode we generated the quick
				2772	// check to fall through on possible success. So now we need to
				2773	// generate the full check inline.
				2774	if (i == choice_count - 1) {
				2775	macro_assembler->Bind(&alt_gen->possible_success);
				2776	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2777	new_trace.set_characters_preloaded(preload_characters);
				2778	new_trace.set_bound_checked_up_to(preload_characters);
				2779	generate_full_check_inline = true;
				2780	}
				2781	} else if (alt_gen->quick_check_details.cannot_match()) {
				2782	if (i == choice_count - 1 && !greedy_loop) {
				2783	macro_assembler->GoTo(trace->backtrack());
				2784	}
				2785	continue;
				2786	} else {
				2787	// No quick check was generated. Put the full code here.
				2788	// If this is not the first choice then there could be slow checks from
				2789	// previous cases that go here when they fail. There's no reason to
				2790	// insist that they preload characters since the slow check we are about
				2791	// to generate probably can't use it.
				2792	if (i != first_normal_choice) {
				2793	alt_gen->expects_preload = false;
				2794	new_trace.set_characters_preloaded(0);
				2795	}
				2796	if (i < choice_count - 1) {
				2797	new_trace.set_backtrack(&alt_gen->after);
				2798	}
				2799	generate_full_check_inline = true;
				2800	}
				2801	if (generate_full_check_inline) {
				2802	if (new_trace.actions() != NULL) {
				2803	new_trace.set_flush_budget(new_flush_budget);
				2804	}
				2805	for (int j = 0; j < guard_count; j++) {
				2806	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				2807	}
				2808	alternative.node()->Emit(compiler, &new_trace);
				2809	preload_is_current = false;
				2810	}
				2811	macro_assembler->Bind(&alt_gen->after);
				2812	}
				2813	if (greedy_loop) {
				2814	macro_assembler->Bind(&greedy_loop_label);
				2815	// If we have unwound to the bottom then backtrack.
				2816	macro_assembler->CheckGreedyLoop(trace->backtrack());
				2817	// Otherwise try the second priority at an earlier position.
				2818	macro_assembler->AdvanceCurrentPosition(-text_length);
				2819	macro_assembler->GoTo(&second_choice);
				2820	}
				2821
				2822	// At this point we need to generate slow checks for the alternatives where
				2823	// the quick check was inlined. We can recognize these because the associated
				2824	// label was bound.
				2825	for (int i = first_normal_choice; i < choice_count - 1; i++) {
				2826	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2827	Trace new_trace(*current_trace);
				2828	// If there are actions to be flushed we have to limit how many times
				2829	// they are flushed. Take the budget of the parent trace and distribute
				2830	// it fairly amongst the children.
				2831	if (new_trace.actions() != NULL) {
				2832	new_trace.set_flush_budget(new_flush_budget);
				2833	}
				2834	EmitOutOfLineContinuation(compiler,
				2835	&new_trace,
				2836	alternatives_->at(i),
				2837	alt_gen,
				2838	preload_characters,
				2839	alt_gens.at(i + 1)->expects_preload);
				2840	}
				2841	}
				2842
				2843
				2844	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				2845	Trace* trace,
				2846	GuardedAlternative alternative,
				2847	AlternativeGeneration* alt_gen,
				2848	int preload_characters,
				2849	bool next_expects_preload) {
				2850	if (!alt_gen->possible_success.is_linked()) return;
				2851
				2852	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2853	macro_assembler->Bind(&alt_gen->possible_success);
				2854	Trace out_of_line_trace(*trace);
				2855	out_of_line_trace.set_characters_preloaded(preload_characters);
				2856	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2857	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
				2858	ZoneList<Guard> guards = alternative.guards();
				2859	int guard_count = (guards == NULL) ? 0 : guards->length();
				2860	if (next_expects_preload) {
				2861	Label reload_current_char;
				2862	out_of_line_trace.set_backtrack(&reload_current_char);
				2863	for (int j = 0; j < guard_count; j++) {
				2864	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				2865	}
				2866	alternative.node()->Emit(compiler, &out_of_line_trace);
				2867	macro_assembler->Bind(&reload_current_char);
				2868	// Reload the current character, since the next quick check expects that.
				2869	// We don't need to check bounds here because we only get into this
				2870	// code through a quick check which already did the checked load.
				2871	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				2872	NULL,
				2873	false,
				2874	preload_characters);
				2875	macro_assembler->GoTo(&(alt_gen->after));
				2876	} else {
				2877	out_of_line_trace.set_backtrack(&(alt_gen->after));
				2878	for (int j = 0; j < guard_count; j++) {
				2879	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				2880	}
				2881	alternative.node()->Emit(compiler, &out_of_line_trace);
				2882	}
				2883	}
				2884
				2885
				2886	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2887	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2888	LimitResult limit_result = LimitVersions(compiler, trace);
				2889	if (limit_result == DONE) return;
				2890	ASSERT(limit_result == CONTINUE);
				2891
				2892	RecursionCheck rc(compiler);
				2893
				2894	switch (type_) {
				2895	case STORE_POSITION: {
				2896	Trace::DeferredCapture
				2897	new_capture(data_.u_position_register.reg,
				2898	data_.u_position_register.is_capture,
				2899	trace);
				2900	Trace new_trace = *trace;
				2901	new_trace.add_action(&new_capture);
				2902	on_success()->Emit(compiler, &new_trace);
				2903	break;
				2904	}
				2905	case INCREMENT_REGISTER: {
				2906	Trace::DeferredIncrementRegister
				2907	new_increment(data_.u_increment_register.reg);
				2908	Trace new_trace = *trace;
				2909	new_trace.add_action(&new_increment);
				2910	on_success()->Emit(compiler, &new_trace);
				2911	break;
				2912	}
				2913	case SET_REGISTER: {
				2914	Trace::DeferredSetRegister
				2915	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				2916	Trace new_trace = *trace;
				2917	new_trace.add_action(&new_set);
				2918	on_success()->Emit(compiler, &new_trace);
				2919	break;
				2920	}
				2921	case CLEAR_CAPTURES: {
				2922	Trace::DeferredClearCaptures
				2923	new_capture(Interval(data_.u_clear_captures.range_from,
				2924	data_.u_clear_captures.range_to));
				2925	Trace new_trace = *trace;
				2926	new_trace.add_action(&new_capture);
				2927	on_success()->Emit(compiler, &new_trace);
				2928	break;
				2929	}
				2930	case BEGIN_SUBMATCH:
				2931	if (!trace->is_trivial()) {
				2932	trace->Flush(compiler, this);
				2933	} else {
				2934	assembler->WriteCurrentPositionToRegister(
				2935	data_.u_submatch.current_position_register, 0);
				2936	assembler->WriteStackPointerToRegister(
				2937	data_.u_submatch.stack_pointer_register);
				2938	on_success()->Emit(compiler, trace);
				2939	}
				2940	break;
				2941	case EMPTY_MATCH_CHECK: {
				2942	int start_pos_reg = data_.u_empty_match_check.start_register;
				2943	int stored_pos = 0;
				2944	int rep_reg = data_.u_empty_match_check.repetition_register;
				2945	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				2946	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				2947	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				2948	// If we know we haven't advanced and there is no minimum we
				2949	// can just backtrack immediately.
				2950	assembler->GoTo(trace->backtrack());
				2951	} else if (know_dist && stored_pos < trace->cp_offset()) {
				2952	// If we know we've advanced we can generate the continuation
				2953	// immediately.
				2954	on_success()->Emit(compiler, trace);
				2955	} else if (!trace->is_trivial()) {
				2956	trace->Flush(compiler, this);
				2957	} else {
				2958	Label skip_empty_check;
				2959	// If we have a minimum number of repetitions we check the current
				2960	// number first and skip the empty check if it's not enough.
				2961	if (has_minimum) {
				2962	int limit = data_.u_empty_match_check.repetition_limit;
				2963	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				2964	}
				2965	// If the match is empty we bail out, otherwise we fall through
				2966	// to the on-success continuation.
				2967	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				2968	trace->backtrack());
				2969	assembler->Bind(&skip_empty_check);
				2970	on_success()->Emit(compiler, trace);
				2971	}
				2972	break;
				2973	}
				2974	case POSITIVE_SUBMATCH_SUCCESS: {
				2975	if (!trace->is_trivial()) {
				2976	trace->Flush(compiler, this);
				2977	return;
				2978	}
				2979	assembler->ReadCurrentPositionFromRegister(
				2980	data_.u_submatch.current_position_register);
				2981	assembler->ReadStackPointerFromRegister(
				2982	data_.u_submatch.stack_pointer_register);
				2983	int clear_register_count = data_.u_submatch.clear_register_count;
				2984	if (clear_register_count == 0) {
				2985	on_success()->Emit(compiler, trace);
				2986	return;
				2987	}
				2988	int clear_registers_from = data_.u_submatch.clear_register_from;
				2989	Label clear_registers_backtrack;
				2990	Trace new_trace = *trace;
				2991	new_trace.set_backtrack(&clear_registers_backtrack);
				2992	on_success()->Emit(compiler, &new_trace);
				2993
				2994	assembler->Bind(&clear_registers_backtrack);
				2995	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				2996	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				2997
				2998	ASSERT(trace->backtrack() == NULL);
				2999	assembler->Backtrack();
				3000	return;
				3001	}
				3002	default:
				3003	UNREACHABLE();
				3004	}
				3005	}
				3006
				3007
				3008	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3009	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3010	if (!trace->is_trivial()) {
				3011	trace->Flush(compiler, this);
				3012	return;
				3013	}
				3014
				3015	LimitResult limit_result = LimitVersions(compiler, trace);
				3016	if (limit_result == DONE) return;
				3017	ASSERT(limit_result == CONTINUE);
				3018
				3019	RecursionCheck rc(compiler);
				3020
				3021	ASSERT_EQ(start_reg_ + 1, end_reg_);
				3022	if (compiler->ignore_case()) {
				3023	assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
				3024	trace->backtrack());
				3025	} else {
				3026	assembler->CheckNotBackReference(start_reg_, trace->backtrack());
				3027	}
				3028	on_success()->Emit(compiler, trace);
				3029	}
				3030
				3031
				3032	// -------------------------------------------------------------------
				3033	// Dot/dotty output
				3034
				3035
				3036	#ifdef DEBUG
				3037
				3038
				3039	class DotPrinter: public NodeVisitor {
				3040	public:
				3041	explicit DotPrinter(bool ignore_case)
				3042	: ignore_case_(ignore_case),
				3043	stream_(&alloc_) { }
				3044	void PrintNode(const char* label, RegExpNode* node);
				3045	void Visit(RegExpNode* node);
				3046	void PrintAttributes(RegExpNode* from);
				3047	StringStream* stream() { return &stream_; }
				3048	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				3049	#define DECLARE_VISIT(Type) \
				3050	virtual void Visit##Type(Type##Node* that);
				3051	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				3052	#undef DECLARE_VISIT
				3053	private:
				3054	bool ignore_case_;
				3055	HeapStringAllocator alloc_;
				3056	StringStream stream_;
				3057	};
				3058
				3059
				3060	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				3061	stream()->Add("digraph G {\n graph [label=\"");
				3062	for (int i = 0; label[i]; i++) {
				3063	switch (label[i]) {
				3064	case '\\':
				3065	stream()->Add("\\\\");
				3066	break;
				3067	case '"':
				3068	stream()->Add("\"");
				3069	break;
				3070	default:
				3071	stream()->Put(label[i]);
				3072	break;
				3073	}
				3074	}
				3075	stream()->Add("\"];\n");
				3076	Visit(node);
				3077	stream()->Add("}\n");
				3078	printf("%s", *(stream()->ToCString()));
				3079	}
				3080
				3081
				3082	void DotPrinter::Visit(RegExpNode* node) {
				3083	if (node->info()->visited) return;
				3084	node->info()->visited = true;
				3085	node->Accept(this);
				3086	}
				3087
				3088
				3089	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				3090	stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
				3091	Visit(on_failure);
				3092	}
				3093
				3094
				3095	class TableEntryBodyPrinter {
				3096	public:
				3097	TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
				3098	: stream_(stream), choice_(choice) { }
				3099	void Call(uc16 from, DispatchTable::Entry entry) {
				3100	OutSet* out_set = entry.out_set();
				3101	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3102	if (out_set->Get(i)) {
				3103	stream()->Add(" n%p:s%io%i -> n%p;\n",
				3104	choice(),
				3105	from,
				3106	i,
				3107	choice()->alternatives()->at(i).node());
				3108	}
				3109	}
				3110	}
				3111	private:
				3112	StringStream* stream() { return stream_; }
				3113	ChoiceNode* choice() { return choice_; }
				3114	StringStream* stream_;
				3115	ChoiceNode* choice_;
				3116	};
				3117
				3118
				3119	class TableEntryHeaderPrinter {
				3120	public:
				3121	explicit TableEntryHeaderPrinter(StringStream* stream)
				3122	: first_(true), stream_(stream) { }
				3123	void Call(uc16 from, DispatchTable::Entry entry) {
				3124	if (first_) {
				3125	first_ = false;
				3126	} else {
				3127	stream()->Add("\|");
				3128	}
				3129	stream()->Add("{\\%k-\\%k\|{", from, entry.to());
				3130	OutSet* out_set = entry.out_set();
				3131	int priority = 0;
				3132	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3133	if (out_set->Get(i)) {
				3134	if (priority > 0) stream()->Add("\|");
				3135	stream()->Add("<s%io%i> %i", from, i, priority);
				3136	priority++;
				3137	}
				3138	}
				3139	stream()->Add("}}");
				3140	}
				3141	private:
				3142	bool first_;
				3143	StringStream* stream() { return stream_; }
				3144	StringStream* stream_;
				3145	};
				3146
				3147
				3148	class AttributePrinter {
				3149	public:
				3150	explicit AttributePrinter(DotPrinter* out)
				3151	: out_(out), first_(true) { }
				3152	void PrintSeparator() {
				3153	if (first_) {
				3154	first_ = false;
				3155	} else {
				3156	out_->stream()->Add("\|");
				3157	}
				3158	}
				3159	void PrintBit(const char* name, bool value) {
				3160	if (!value) return;
				3161	PrintSeparator();
				3162	out_->stream()->Add("{%s}", name);
				3163	}
				3164	void PrintPositive(const char* name, int value) {
				3165	if (value < 0) return;
				3166	PrintSeparator();
				3167	out_->stream()->Add("{%s\|%x}", name, value);
				3168	}
				3169	private:
				3170	DotPrinter* out_;
				3171	bool first_;
				3172	};
				3173
				3174
				3175	void DotPrinter::PrintAttributes(RegExpNode* that) {
				3176	stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
				3177	"margin=0.1, fontsize=10, label=\"{",
				3178	that);
				3179	AttributePrinter printer(this);
				3180	NodeInfo* info = that->info();
				3181	printer.PrintBit("NI", info->follows_newline_interest);
				3182	printer.PrintBit("WI", info->follows_word_interest);
				3183	printer.PrintBit("SI", info->follows_start_interest);
				3184	Label* label = that->label();
				3185	if (label->is_bound())
				3186	printer.PrintPositive("@", label->pos());
				3187	stream()->Add("}\"];\n");
				3188	stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
				3189	"arrowhead=none];\n", that, that);
				3190	}
				3191
				3192
				3193	static const bool kPrintDispatchTable = false;
				3194	void DotPrinter::VisitChoice(ChoiceNode* that) {
				3195	if (kPrintDispatchTable) {
				3196	stream()->Add(" n%p [shape=Mrecord, label=\"", that);
				3197	TableEntryHeaderPrinter header_printer(stream());
				3198	that->GetTable(ignore_case_)->ForEach(&header_printer);
				3199	stream()->Add("\"]\n", that);
				3200	PrintAttributes(that);
				3201	TableEntryBodyPrinter body_printer(stream(), that);
				3202	that->GetTable(ignore_case_)->ForEach(&body_printer);
				3203	} else {
				3204	stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
				3205	for (int i = 0; i < that->alternatives()->length(); i++) {
				3206	GuardedAlternative alt = that->alternatives()->at(i);
				3207	stream()->Add(" n%p -> n%p;\n", that, alt.node());
				3208	}
				3209	}
				3210	for (int i = 0; i < that->alternatives()->length(); i++) {
				3211	GuardedAlternative alt = that->alternatives()->at(i);
				3212	alt.node()->Accept(this);
				3213	}
				3214	}
				3215
				3216
				3217	void DotPrinter::VisitText(TextNode* that) {
				3218	stream()->Add(" n%p [label=\"", that);
				3219	for (int i = 0; i < that->elements()->length(); i++) {
				3220	if (i > 0) stream()->Add(" ");
				3221	TextElement elm = that->elements()->at(i);
				3222	switch (elm.type) {
				3223	case TextElement::ATOM: {
				3224	stream()->Add("'%w'", elm.data.u_atom->data());
				3225	break;
				3226	}
				3227	case TextElement::CHAR_CLASS: {
				3228	RegExpCharacterClass* node = elm.data.u_char_class;
				3229	stream()->Add("[");
				3230	if (node->is_negated())
				3231	stream()->Add("^");
				3232	for (int j = 0; j < node->ranges()->length(); j++) {
				3233	CharacterRange range = node->ranges()->at(j);
				3234	stream()->Add("%k-%k", range.from(), range.to());
				3235	}
				3236	stream()->Add("]");
				3237	break;
				3238	}
				3239	default:
				3240	UNREACHABLE();
				3241	}
				3242	}
				3243	stream()->Add("\", shape=box, peripheries=2];\n");
				3244	PrintAttributes(that);
				3245	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3246	Visit(that->on_success());
				3247	}
				3248
				3249
				3250	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				3251	stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
				3252	that,
				3253	that->start_register(),
				3254	that->end_register());
				3255	PrintAttributes(that);
				3256	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3257	Visit(that->on_success());
				3258	}
				3259
				3260
				3261	void DotPrinter::VisitEnd(EndNode* that) {
				3262	stream()->Add(" n%p [style=bold, shape=point];\n", that);
				3263	PrintAttributes(that);
				3264	}
				3265
				3266
				3267	void DotPrinter::VisitAssertion(AssertionNode* that) {
				3268	stream()->Add(" n%p [", that);
				3269	switch (that->type()) {
				3270	case AssertionNode::AT_END:
				3271	stream()->Add("label=\"$\", shape=septagon");
				3272	break;
				3273	case AssertionNode::AT_START:
				3274	stream()->Add("label=\"^\", shape=septagon");
				3275	break;
				3276	case AssertionNode::AT_BOUNDARY:
				3277	stream()->Add("label=\"\\b\", shape=septagon");
				3278	break;
				3279	case AssertionNode::AT_NON_BOUNDARY:
				3280	stream()->Add("label=\"\\B\", shape=septagon");
				3281	break;
				3282	case AssertionNode::AFTER_NEWLINE:
				3283	stream()->Add("label=\"(?<=\\n)\", shape=septagon");
				3284	break;
				3285	}
				3286	stream()->Add("];\n");
				3287	PrintAttributes(that);
				3288	RegExpNode* successor = that->on_success();
				3289	stream()->Add(" n%p -> n%p;\n", that, successor);
				3290	Visit(successor);
				3291	}
				3292
				3293
				3294	void DotPrinter::VisitAction(ActionNode* that) {
				3295	stream()->Add(" n%p [", that);
				3296	switch (that->type_) {
				3297	case ActionNode::SET_REGISTER:
				3298	stream()->Add("label=\"$%i:=%i\", shape=octagon",
				3299	that->data_.u_store_register.reg,
				3300	that->data_.u_store_register.value);
				3301	break;
				3302	case ActionNode::INCREMENT_REGISTER:
				3303	stream()->Add("label=\"$%i++\", shape=octagon",
				3304	that->data_.u_increment_register.reg);
				3305	break;
				3306	case ActionNode::STORE_POSITION:
				3307	stream()->Add("label=\"$%i:=$pos\", shape=octagon",
				3308	that->data_.u_position_register.reg);
				3309	break;
				3310	case ActionNode::BEGIN_SUBMATCH:
				3311	stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
				3312	that->data_.u_submatch.current_position_register);
				3313	break;
				3314	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				3315	stream()->Add("label=\"escape\", shape=septagon");
				3316	break;
				3317	case ActionNode::EMPTY_MATCH_CHECK:
				3318	stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
				3319	that->data_.u_empty_match_check.start_register,
				3320	that->data_.u_empty_match_check.repetition_register,
				3321	that->data_.u_empty_match_check.repetition_limit);
				3322	break;
				3323	case ActionNode::CLEAR_CAPTURES: {
				3324	stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
				3325	that->data_.u_clear_captures.range_from,
				3326	that->data_.u_clear_captures.range_to);
				3327	break;
				3328	}
				3329	}
				3330	stream()->Add("];\n");
				3331	PrintAttributes(that);
				3332	RegExpNode* successor = that->on_success();
				3333	stream()->Add(" n%p -> n%p;\n", that, successor);
				3334	Visit(successor);
				3335	}
				3336
				3337
				3338	class DispatchTableDumper {
				3339	public:
				3340	explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
				3341	void Call(uc16 key, DispatchTable::Entry entry);
				3342	StringStream* stream() { return stream_; }
				3343	private:
				3344	StringStream* stream_;
				3345	};
				3346
				3347
				3348	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				3349	stream()->Add("[%k-%k]: {", key, entry.to());
				3350	OutSet* set = entry.out_set();
				3351	bool first = true;
				3352	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3353	if (set->Get(i)) {
				3354	if (first) {
				3355	first = false;
				3356	} else {
				3357	stream()->Add(", ");
				3358	}
				3359	stream()->Add("%i", i);
				3360	}
				3361	}
				3362	stream()->Add("}\n");
				3363	}
				3364
				3365
				3366	void DispatchTable::Dump() {
				3367	HeapStringAllocator alloc;
				3368	StringStream stream(&alloc);
				3369	DispatchTableDumper dumper(&stream);
				3370	tree()->ForEach(&dumper);
				3371	OS::PrintError("%s", *stream.ToCString());
				3372	}
				3373
				3374
				3375	void RegExpEngine::DotPrint(const char* label,
				3376	RegExpNode* node,
				3377	bool ignore_case) {
				3378	DotPrinter printer(ignore_case);
				3379	printer.PrintNode(label, node);
				3380	}
				3381
				3382
				3383	#endif // DEBUG
				3384
				3385
				3386	// -------------------------------------------------------------------
				3387	// Tree to graph conversion
				3388
				3389	static const int kSpaceRangeCount = 20;
				3390	static const int kSpaceRangeAsciiCount = 4;
				3391	static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
				3392	0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
				3393	0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
				3394
				3395	static const int kWordRangeCount = 8;
				3396	static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
				3397	'_', 'a', 'z' };
				3398
				3399	static const int kDigitRangeCount = 2;
				3400	static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
				3401
				3402	static const int kLineTerminatorRangeCount = 6;
				3403	static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
				3404	0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
				3405
				3406	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				3407	RegExpNode* on_success) {
				3408	ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
				3409	elms->Add(TextElement::Atom(this));
				3410	return new TextNode(elms, on_success);
				3411	}
				3412
				3413
				3414	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				3415	RegExpNode* on_success) {
				3416	return new TextNode(elements(), on_success);
				3417	}
				3418
				3419	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				3420	const uc16* special_class,
				3421	int length) {
				3422	ASSERT(ranges->length() != 0);
				3423	ASSERT(length != 0);
				3424	ASSERT(special_class[0] != 0);
				3425	if (ranges->length() != (length >> 1) + 1) {
				3426	return false;
				3427	}
				3428	CharacterRange range = ranges->at(0);
				3429	if (range.from() != 0) {
				3430	return false;
				3431	}
				3432	for (int i = 0; i < length; i += 2) {
				3433	if (special_class[i] != (range.to() + 1)) {
				3434	return false;
				3435	}
				3436	range = ranges->at((i >> 1) + 1);
				3437	if (special_class[i+1] != range.from() - 1) {
				3438	return false;
				3439	}
				3440	}
				3441	if (range.to() != 0xffff) {
				3442	return false;
				3443	}
				3444	return true;
				3445	}
				3446
				3447
				3448	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				3449	const uc16* special_class,
				3450	int length) {
				3451	if (ranges->length() * 2 != length) {
				3452	return false;
				3453	}
				3454	for (int i = 0; i < length; i += 2) {
				3455	CharacterRange range = ranges->at(i >> 1);
				3456	if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {
				3457	return false;
				3458	}
				3459	}
				3460	return true;
				3461	}
				3462
				3463
				3464	bool RegExpCharacterClass::is_standard() {
				3465	// TODO(lrn): Remove need for this function, by not throwing away information
				3466	// along the way.
				3467	if (is_negated_) {
				3468	return false;
				3469	}
				3470	if (set_.is_standard()) {
				3471	return true;
				3472	}
				3473	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3474	set_.set_standard_set_type('s');
				3475	return true;
				3476	}
				3477	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3478	set_.set_standard_set_type('S');
				3479	return true;
				3480	}
				3481	if (CompareInverseRanges(set_.ranges(),
				3482	kLineTerminatorRanges,
				3483	kLineTerminatorRangeCount)) {
				3484	set_.set_standard_set_type('.');
				3485	return true;
				3486	}
				3487	return false;
				3488	}
				3489
				3490
				3491	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				3492	RegExpNode* on_success) {
				3493	return new TextNode(this, on_success);
				3494	}
				3495
				3496
				3497	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				3498	RegExpNode* on_success) {
				3499	ZoneList<RegExpTree> alternatives = this->alternatives();
				3500	int length = alternatives->length();
				3501	ChoiceNode* result = new ChoiceNode(length);
				3502	for (int i = 0; i < length; i++) {
				3503	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				3504	on_success));
				3505	result->AddAlternative(alternative);
				3506	}
				3507	return result;
				3508	}
				3509
				3510
				3511	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				3512	RegExpNode* on_success) {
				3513	return ToNode(min(),
				3514	max(),
				3515	is_greedy(),
				3516	body(),
				3517	compiler,
				3518	on_success);
				3519	}
				3520
				3521
				3522	RegExpNode* RegExpQuantifier::ToNode(int min,
				3523	int max,
				3524	bool is_greedy,
				3525	RegExpTree* body,
				3526	RegExpCompiler* compiler,
				3527	RegExpNode* on_success,
				3528	bool not_at_start) {
				3529	// x{f, t} becomes this:
				3530	//
				3531	// (r++)<-.
				3532	// \| `
				3533	// \| (x)
				3534	// v ^
				3535	// (r=0)-->(?)---/ [if r < t]
				3536	// \|
				3537	// [if r >= f] \----> ...
				3538	//
				3539
				3540	// 15.10.2.5 RepeatMatcher algorithm.
				3541	// The parser has already eliminated the case where max is 0. In the case
				3542	// where max_match is zero the parser has removed the quantifier if min was
				3543	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				3544
				3545	// If we know that we cannot match zero length then things are a little
				3546	// simpler since we don't need to make the special zero length match check
				3547	// from step 2.1. If the min and max are small we can unroll a little in
				3548	// this case.
				3549	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				3550	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				3551	if (max == 0) return on_success; // This can happen due to recursion.
				3552	bool body_can_be_empty = (body->min_match() == 0);
				3553	int body_start_reg = RegExpCompiler::kNoRegister;
				3554	Interval capture_registers = body->CaptureRegisters();
				3555	bool needs_capture_clearing = !capture_registers.is_empty();
				3556	if (body_can_be_empty) {
				3557	body_start_reg = compiler->AllocateRegister();
				3558	} else if (FLAG_regexp_optimization && !needs_capture_clearing) {
				3559	// Only unroll if there are no captures and the body can't be
				3560	// empty.
				3561	if (min > 0 && min <= kMaxUnrolledMinMatches) {
				3562	int new_max = (max == kInfinity) ? max : max - min;
				3563	// Recurse once to get the loop or optional matches after the fixed ones.
				3564	RegExpNode* answer = ToNode(
				3565	0, new_max, is_greedy, body, compiler, on_success, true);
				3566	// Unroll the forced matches from 0 to min. This can cause chains of
				3567	// TextNodes (which the parser does not generate). These should be
				3568	// combined if it turns out they hinder good code generation.
				3569	for (int i = 0; i < min; i++) {
				3570	answer = body->ToNode(compiler, answer);
				3571	}
				3572	return answer;
				3573	}
				3574	if (max <= kMaxUnrolledMaxMatches) {
				3575	ASSERT(min == 0);
				3576	// Unroll the optional matches up to max.
				3577	RegExpNode* answer = on_success;
				3578	for (int i = 0; i < max; i++) {
				3579	ChoiceNode* alternation = new ChoiceNode(2);
				3580	if (is_greedy) {
				3581	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3582	answer)));
				3583	alternation->AddAlternative(GuardedAlternative(on_success));
				3584	} else {
				3585	alternation->AddAlternative(GuardedAlternative(on_success));
				3586	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3587	answer)));
				3588	}
				3589	answer = alternation;
				3590	if (not_at_start) alternation->set_not_at_start();
				3591	}
				3592	return answer;
				3593	}
				3594	}
				3595	bool has_min = min > 0;
				3596	bool has_max = max < RegExpTree::kInfinity;
				3597	bool needs_counter = has_min \|\| has_max;
				3598	int reg_ctr = needs_counter
				3599	? compiler->AllocateRegister()
				3600	: RegExpCompiler::kNoRegister;
				3601	LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
				3602	if (not_at_start) center->set_not_at_start();
				3603	RegExpNode* loop_return = needs_counter
				3604	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				3605	: static_cast<RegExpNode*>(center);
				3606	if (body_can_be_empty) {
				3607	// If the body can be empty we need to check if it was and then
				3608	// backtrack.
				3609	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				3610	reg_ctr,
				3611	min,
				3612	loop_return);
				3613	}
				3614	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				3615	if (body_can_be_empty) {
				3616	// If the body can be empty we need to store the start position
				3617	// so we can bail out if it was empty.
				3618	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				3619	}
				3620	if (needs_capture_clearing) {
				3621	// Before entering the body of this loop we need to clear captures.
				3622	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				3623	}
				3624	GuardedAlternative body_alt(body_node);
				3625	if (has_max) {
				3626	Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
				3627	body_alt.AddGuard(body_guard);
				3628	}
				3629	GuardedAlternative rest_alt(on_success);
				3630	if (has_min) {
				3631	Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
				3632	rest_alt.AddGuard(rest_guard);
				3633	}
				3634	if (is_greedy) {
				3635	center->AddLoopAlternative(body_alt);
				3636	center->AddContinueAlternative(rest_alt);
				3637	} else {
				3638	center->AddContinueAlternative(rest_alt);
				3639	center->AddLoopAlternative(body_alt);
				3640	}
				3641	if (needs_counter) {
				3642	return ActionNode::SetRegister(reg_ctr, 0, center);
				3643	} else {
				3644	return center;
				3645	}
				3646	}
				3647
				3648
				3649	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				3650	RegExpNode* on_success) {
				3651	NodeInfo info;
				3652	switch (type()) {
				3653	case START_OF_LINE:
				3654	return AssertionNode::AfterNewline(on_success);
				3655	case START_OF_INPUT:
				3656	return AssertionNode::AtStart(on_success);
				3657	case BOUNDARY:
				3658	return AssertionNode::AtBoundary(on_success);
				3659	case NON_BOUNDARY:
				3660	return AssertionNode::AtNonBoundary(on_success);
				3661	case END_OF_INPUT:
				3662	return AssertionNode::AtEnd(on_success);
				3663	case END_OF_LINE: {
				3664	// Compile $ in multiline regexps as an alternation with a positive
				3665	// lookahead in one side and an end-of-input on the other side.
				3666	// We need two registers for the lookahead.
				3667	int stack_pointer_register = compiler->AllocateRegister();
				3668	int position_register = compiler->AllocateRegister();
				3669	// The ChoiceNode to distinguish between a newline and end-of-input.
				3670	ChoiceNode* result = new ChoiceNode(2);
				3671	// Create a newline atom.
				3672	ZoneList<CharacterRange>* newline_ranges =
				3673	new ZoneList<CharacterRange>(3);
				3674	CharacterRange::AddClassEscape('n', newline_ranges);
				3675	RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
				3676	TextNode* newline_matcher = new TextNode(
				3677	newline_atom,
				3678	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3679	position_register,
				3680	0, // No captures inside.
				3681	-1, // Ignored if no captures.
				3682	on_success));
				3683	// Create an end-of-input matcher.
				3684	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				3685	stack_pointer_register,
				3686	position_register,
				3687	newline_matcher);
				3688	// Add the two alternatives to the ChoiceNode.
				3689	GuardedAlternative eol_alternative(end_of_line);
				3690	result->AddAlternative(eol_alternative);
				3691	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				3692	result->AddAlternative(end_alternative);
				3693	return result;
				3694	}
				3695	default:
				3696	UNREACHABLE();
				3697	}
				3698	return on_success;
				3699	}
				3700
				3701
				3702	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				3703	RegExpNode* on_success) {
				3704	return new BackReferenceNode(RegExpCapture::StartRegister(index()),
				3705	RegExpCapture::EndRegister(index()),
				3706	on_success);
				3707	}
				3708
				3709
				3710	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				3711	RegExpNode* on_success) {
				3712	return on_success;
				3713	}
				3714
				3715
				3716	RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
				3717	RegExpNode* on_success) {
				3718	int stack_pointer_register = compiler->AllocateRegister();
				3719	int position_register = compiler->AllocateRegister();
				3720
				3721	const int registers_per_capture = 2;
				3722	const int register_of_first_capture = 2;
				3723	int register_count = capture_count_ * registers_per_capture;
				3724	int register_start =
				3725	register_of_first_capture + capture_from_ * registers_per_capture;
				3726
				3727	RegExpNode* success;
				3728	if (is_positive()) {
				3729	RegExpNode* node = ActionNode::BeginSubmatch(
				3730	stack_pointer_register,
				3731	position_register,
				3732	body()->ToNode(
				3733	compiler,
				3734	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3735	position_register,
				3736	register_count,
				3737	register_start,
				3738	on_success)));
				3739	return node;
				3740	} else {
				3741	// We use a ChoiceNode for a negative lookahead because it has most of
				3742	// the characteristics we need. It has the body of the lookahead as its
				3743	// first alternative and the expression after the lookahead of the second
				3744	// alternative. If the first alternative succeeds then the
				3745	// NegativeSubmatchSuccess will unwind the stack including everything the
				3746	// choice node set up and backtrack. If the first alternative fails then
				3747	// the second alternative is tried, which is exactly the desired result
				3748	// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
				3749	// ChoiceNode that knows to ignore the first exit when calculating quick
				3750	// checks.
				3751	GuardedAlternative body_alt(
				3752	body()->ToNode(
				3753	compiler,
				3754	success = new NegativeSubmatchSuccess(stack_pointer_register,
				3755	position_register,
				3756	register_count,
				3757	register_start)));
				3758	ChoiceNode* choice_node =
				3759	new NegativeLookaheadChoiceNode(body_alt,
				3760	GuardedAlternative(on_success));
				3761	return ActionNode::BeginSubmatch(stack_pointer_register,
				3762	position_register,
				3763	choice_node);
				3764	}
				3765	}
				3766
				3767
				3768	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				3769	RegExpNode* on_success) {
				3770	return ToNode(body(), index(), compiler, on_success);
				3771	}
				3772
				3773
				3774	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				3775	int index,
				3776	RegExpCompiler* compiler,
				3777	RegExpNode* on_success) {
				3778	int start_reg = RegExpCapture::StartRegister(index);
				3779	int end_reg = RegExpCapture::EndRegister(index);
				3780	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				3781	RegExpNode* body_node = body->ToNode(compiler, store_end);
				3782	return ActionNode::StorePosition(start_reg, true, body_node);
				3783	}
				3784
				3785
				3786	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				3787	RegExpNode* on_success) {
				3788	ZoneList<RegExpTree> children = nodes();
				3789	RegExpNode* current = on_success;
				3790	for (int i = children->length() - 1; i >= 0; i--) {
				3791	current = children->at(i)->ToNode(compiler, current);
				3792	}
				3793	return current;
				3794	}
				3795
				3796
				3797	static void AddClass(const uc16* elmv,
				3798	int elmc,
				3799	ZoneList<CharacterRange>* ranges) {
				3800	for (int i = 0; i < elmc; i += 2) {
				3801	ASSERT(elmv[i] <= elmv[i + 1]);
				3802	ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
				3803	}
				3804	}
				3805
				3806
				3807	static void AddClassNegated(const uc16 *elmv,
				3808	int elmc,
				3809	ZoneList<CharacterRange>* ranges) {
				3810	ASSERT(elmv[0] != 0x0000);
				3811	ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
				3812	uc16 last = 0x0000;
				3813	for (int i = 0; i < elmc; i += 2) {
				3814	ASSERT(last <= elmv[i] - 1);
				3815	ASSERT(elmv[i] <= elmv[i + 1]);
				3816	ranges->Add(CharacterRange(last, elmv[i] - 1));
				3817	last = elmv[i + 1] + 1;
				3818	}
				3819	ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
				3820	}
				3821
				3822
				3823	void CharacterRange::AddClassEscape(uc16 type,
				3824	ZoneList<CharacterRange>* ranges) {
				3825	switch (type) {
				3826	case 's':
				3827	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
				3828	break;
				3829	case 'S':
				3830	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
				3831	break;
				3832	case 'w':
				3833	AddClass(kWordRanges, kWordRangeCount, ranges);
				3834	break;
				3835	case 'W':
				3836	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
				3837	break;
				3838	case 'd':
				3839	AddClass(kDigitRanges, kDigitRangeCount, ranges);
				3840	break;
				3841	case 'D':
				3842	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
				3843	break;
				3844	case '.':
				3845	AddClassNegated(kLineTerminatorRanges,
				3846	kLineTerminatorRangeCount,
				3847	ranges);
				3848	break;
				3849	// This is not a character range as defined by the spec but a
				3850	// convenient shorthand for a character class that matches any
				3851	// character.
				3852	case '*':
				3853	ranges->Add(CharacterRange::Everything());
				3854	break;
				3855	// This is the set of characters matched by the $ and ^ symbols
				3856	// in multiline mode.
				3857	case 'n':
				3858	AddClass(kLineTerminatorRanges,
				3859	kLineTerminatorRangeCount,
				3860	ranges);
				3861	break;
				3862	default:
				3863	UNREACHABLE();
				3864	}
				3865	}
				3866
				3867
				3868	Vector<const uc16> CharacterRange::GetWordBounds() {
				3869	return Vector<const uc16>(kWordRanges, kWordRangeCount);
				3870	}
				3871
				3872
				3873	class CharacterRangeSplitter {
				3874	public:
				3875	CharacterRangeSplitter(ZoneList<CharacterRange>** included,
				3876	ZoneList<CharacterRange>** excluded)
				3877	: included_(included),
				3878	excluded_(excluded) { }
				3879	void Call(uc16 from, DispatchTable::Entry entry);
				3880
				3881	static const int kInBase = 0;
				3882	static const int kInOverlay = 1;
				3883
				3884	private:
				3885	ZoneList<CharacterRange>** included_;
				3886	ZoneList<CharacterRange>** excluded_;
				3887	};
				3888
				3889
				3890	void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
				3891	if (!entry.out_set()->Get(kInBase)) return;
				3892	ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
				3893	? included_
				3894	: excluded_;
				3895	if (target == NULL) target = new ZoneList<CharacterRange>(2);
				3896	(*target)->Add(CharacterRange(entry.from(), entry.to()));
				3897	}
				3898
				3899
				3900	void CharacterRange::Split(ZoneList<CharacterRange>* base,
				3901	Vector<const uc16> overlay,
				3902	ZoneList<CharacterRange>** included,
				3903	ZoneList<CharacterRange>** excluded) {
				3904	ASSERT_EQ(NULL, *included);
				3905	ASSERT_EQ(NULL, *excluded);
				3906	DispatchTable table;
				3907	for (int i = 0; i < base->length(); i++)
				3908	table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
				3909	for (int i = 0; i < overlay.length(); i += 2) {
				3910	table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
				3911	CharacterRangeSplitter::kInOverlay);
				3912	}
				3913	CharacterRangeSplitter callback(included, excluded);
				3914	table.ForEach(&callback);
				3915	}
				3916
				3917
				3918	void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
				3919	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				3920	if (IsSingleton()) {
				3921	// If this is a singleton we just expand the one character.
				3922	int length = uncanonicalize.get(from(), '\0', chars);
				3923	for (int i = 0; i < length; i++) {
				3924	uc32 chr = chars[i];
				3925	if (chr != from()) {
				3926	ranges->Add(CharacterRange::Singleton(chars[i]));
				3927	}
				3928	}
				3929	} else if (from() <= kRangeCanonicalizeMax &&
				3930	to() <= kRangeCanonicalizeMax) {
				3931	// If this is a range we expand the characters block by block,
				3932	// expanding contiguous subranges (blocks) one at a time.
				3933	// The approach is as follows. For a given start character we
				3934	// look up the block that contains it, for instance 'a' if the
				3935	// start character is 'c'. A block is characterized by the property
				3936	// that all characters uncanonicalize in the same way as the first
				3937	// element, except that each entry in the result is incremented
				3938	// by the distance from the first element. So a-z is a block
				3939	// because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
				3940	// uncanonicalizes to ['a' + k, 'A' + k].
				3941	// Once we've found the start point we look up its uncanonicalization
				3942	// and produce a range for each element. For instance for [c-f]
				3943	// we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
				3944	// add a range if it is not already contained in the input, so [c-f]
				3945	// will be skipped but [C-F] will be added. If this range is not
				3946	// completely contained in a block we do this for all the blocks
				3947	// covered by the range.
				3948	unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				3949	// First, look up the block that contains the 'from' character.
				3950	int length = canonrange.get(from(), '\0', range);
				3951	if (length == 0) {
				3952	range[0] = from();
				3953	} else {
				3954	ASSERT_EQ(1, length);
				3955	}
				3956	int pos = from();
				3957	// The start of the current block. Note that except for the first
				3958	// iteration 'start' is always equal to 'pos'.
				3959	int start;
				3960	// If it is not the start point of a block the entry contains the
				3961	// offset of the character from the start point.
				3962	if ((range[0] & kStartMarker) == 0) {
				3963	start = pos - range[0];
				3964	} else {
				3965	start = pos;
				3966	}
				3967	// Then we add the ranges on at a time, incrementing the current
				3968	// position to be after the last block each time. The position
				3969	// always points to the start of a block.
				3970	while (pos < to()) {
				3971	length = canonrange.get(start, '\0', range);
				3972	if (length == 0) {
				3973	range[0] = start;
				3974	} else {
				3975	ASSERT_EQ(1, length);
				3976	}
				3977	ASSERT((range[0] & kStartMarker) != 0);
				3978	// The start point of a block contains the distance to the end
				3979	// of the range.
				3980	int block_end = start + (range[0] & kPayloadMask) - 1;
				3981	int end = (block_end > to()) ? to() : block_end;
				3982	length = uncanonicalize.get(start, '\0', range);
				3983	for (int i = 0; i < length; i++) {
				3984	uc32 c = range[i];
				3985	uc16 range_from = c + (pos - start);
				3986	uc16 range_to = c + (end - start);
				3987	if (!(from() <= range_from && range_to <= to())) {
				3988	ranges->Add(CharacterRange(range_from, range_to));
				3989	}
				3990	}
				3991	start = pos = block_end + 1;
				3992	}
				3993	} else {
				3994	// TODO(plesner) when we've fixed the 2^11 bug in unibrow.
				3995	}
				3996	}
				3997
				3998
				3999	ZoneList<CharacterRange>* CharacterSet::ranges() {
				4000	if (ranges_ == NULL) {
				4001	ranges_ = new ZoneList<CharacterRange>(2);
				4002	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
				4003	}
				4004	return ranges_;
				4005	}
				4006
				4007
				4008
				4009	// -------------------------------------------------------------------
				4010	// Interest propagation
				4011
				4012
				4013	RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
				4014	for (int i = 0; i < siblings_.length(); i++) {
				4015	RegExpNode* sibling = siblings_.Get(i);
				4016	if (sibling->info()->Matches(info))
				4017	return sibling;
				4018	}
				4019	return NULL;
				4020	}
				4021
				4022
				4023	RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
				4024	ASSERT_EQ(false, *cloned);
				4025	siblings_.Ensure(this);
				4026	RegExpNode* result = TryGetSibling(info);
				4027	if (result != NULL) return result;
				4028	result = this->Clone();
				4029	NodeInfo* new_info = result->info();
				4030	new_info->ResetCompilationState();
				4031	new_info->AddFromPreceding(info);
				4032	AddSibling(result);
				4033	*cloned = true;
				4034	return result;
				4035	}
				4036
				4037
				4038	template <class C>
				4039	static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
				4040	NodeInfo full_info(*node->info());
				4041	full_info.AddFromPreceding(info);
				4042	bool cloned = false;
				4043	return RegExpNode::EnsureSibling(node, &full_info, &cloned);
				4044	}
				4045
				4046
				4047	// -------------------------------------------------------------------
				4048	// Splay tree
				4049
				4050
				4051	OutSet* OutSet::Extend(unsigned value) {
				4052	if (Get(value))
				4053	return this;
				4054	if (successors() != NULL) {
				4055	for (int i = 0; i < successors()->length(); i++) {
				4056	OutSet* successor = successors()->at(i);
				4057	if (successor->Get(value))
				4058	return successor;
				4059	}
				4060	} else {
				4061	successors_ = new ZoneList<OutSet*>(2);
				4062	}
				4063	OutSet* result = new OutSet(first_, remaining_);
				4064	result->Set(value);
				4065	successors()->Add(result);
				4066	return result;
				4067	}
				4068
				4069
				4070	void OutSet::Set(unsigned value) {
				4071	if (value < kFirstLimit) {
				4072	first_ \|= (1 << value);
				4073	} else {
				4074	if (remaining_ == NULL)
				4075	remaining_ = new ZoneList<unsigned>(1);
				4076	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				4077	remaining_->Add(value);
				4078	}
				4079	}
				4080
				4081
				4082	bool OutSet::Get(unsigned value) {
				4083	if (value < kFirstLimit) {
				4084	return (first_ & (1 << value)) != 0;
				4085	} else if (remaining_ == NULL) {
				4086	return false;
				4087	} else {
				4088	return remaining_->Contains(value);
				4089	}
				4090	}
				4091
				4092
				4093	const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
				4094	const DispatchTable::Entry DispatchTable::Config::kNoValue;
				4095
				4096
				4097	void DispatchTable::AddRange(CharacterRange full_range, int value) {
				4098	CharacterRange current = full_range;
				4099	if (tree()->is_empty()) {
				4100	// If this is the first range we just insert into the table.
				4101	ZoneSplayTree<Config>::Locator loc;
				4102	ASSERT_RESULT(tree()->Insert(current.from(), &loc));
				4103	loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
				4104	return;
				4105	}
				4106	// First see if there is a range to the left of this one that
				4107	// overlaps.
				4108	ZoneSplayTree<Config>::Locator loc;
				4109	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				4110	Entry* entry = &loc.value();
				4111	// If we've found a range that overlaps with this one, and it
				4112	// starts strictly to the left of this one, we have to fix it
				4113	// because the following code only handles ranges that start on
				4114	// or after the start point of the range we're adding.
				4115	if (entry->from() < current.from() && entry->to() >= current.from()) {
				4116	// Snap the overlapping range in half around the start point of
				4117	// the range we're adding.
				4118	CharacterRange left(entry->from(), current.from() - 1);
				4119	CharacterRange right(current.from(), entry->to());
				4120	// The left part of the overlapping range doesn't overlap.
				4121	// Truncate the whole entry to be just the left part.
				4122	entry->set_to(left.to());
				4123	// The right part is the one that overlaps. We add this part
				4124	// to the map and let the next step deal with merging it with
				4125	// the range we're adding.
				4126	ZoneSplayTree<Config>::Locator loc;
				4127	ASSERT_RESULT(tree()->Insert(right.from(), &loc));
				4128	loc.set_value(Entry(right.from(),
				4129	right.to(),
				4130	entry->out_set()));
				4131	}
				4132	}
				4133	while (current.is_valid()) {
				4134	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				4135	(loc.value().from() <= current.to()) &&
				4136	(loc.value().to() >= current.from())) {
				4137	Entry* entry = &loc.value();
				4138	// We have overlap. If there is space between the start point of
				4139	// the range we're adding and where the overlapping range starts
				4140	// then we have to add a range covering just that space.
				4141	if (current.from() < entry->from()) {
				4142	ZoneSplayTree<Config>::Locator ins;
				4143	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4144	ins.set_value(Entry(current.from(),
				4145	entry->from() - 1,
				4146	empty()->Extend(value)));
				4147	current.set_from(entry->from());
				4148	}
				4149	ASSERT_EQ(current.from(), entry->from());
				4150	// If the overlapping range extends beyond the one we want to add
				4151	// we have to snap the right part off and add it separately.
				4152	if (entry->to() > current.to()) {
				4153	ZoneSplayTree<Config>::Locator ins;
				4154	ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
				4155	ins.set_value(Entry(current.to() + 1,
				4156	entry->to(),
				4157	entry->out_set()));
				4158	entry->set_to(current.to());
				4159	}
				4160	ASSERT(entry->to() <= current.to());
				4161	// The overlapping range is now completely contained by the range
				4162	// we're adding so we can just update it and move the start point
				4163	// of the range we're adding just past it.
				4164	entry->AddValue(value);
				4165	// Bail out if the last interval ended at 0xFFFF since otherwise
				4166	// adding 1 will wrap around to 0.
				4167	if (entry->to() == String::kMaxUC16CharCode)
				4168	break;
				4169	ASSERT(entry->to() + 1 > current.from());
				4170	current.set_from(entry->to() + 1);
				4171	} else {
				4172	// There is no overlap so we can just add the range
				4173	ZoneSplayTree<Config>::Locator ins;
				4174	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4175	ins.set_value(Entry(current.from(),
				4176	current.to(),
				4177	empty()->Extend(value)));
				4178	break;
				4179	}
				4180	}
				4181	}
				4182
				4183
				4184	OutSet* DispatchTable::Get(uc16 value) {
				4185	ZoneSplayTree<Config>::Locator loc;
				4186	if (!tree()->FindGreatestLessThan(value, &loc))
				4187	return empty();
				4188	Entry* entry = &loc.value();
				4189	if (value <= entry->to())
				4190	return entry->out_set();
				4191	else
				4192	return empty();
				4193	}
				4194
				4195
				4196	// -------------------------------------------------------------------
				4197	// Analysis
				4198
				4199
				4200	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				4201	StackLimitCheck check;
				4202	if (check.HasOverflowed()) {
				4203	fail("Stack overflow");
				4204	return;
				4205	}
				4206	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				4207	return;
				4208	that->info()->being_analyzed = true;
				4209	that->Accept(this);
				4210	that->info()->being_analyzed = false;
				4211	that->info()->been_analyzed = true;
				4212	}
				4213
				4214
				4215	void Analysis::VisitEnd(EndNode* that) {
				4216	// nothing to do
				4217	}
				4218
				4219
				4220	void TextNode::CalculateOffsets() {
				4221	int element_count = elements()->length();
				4222	// Set up the offsets of the elements relative to the start. This is a fixed
				4223	// quantity since a TextNode can only contain fixed-width things.
				4224	int cp_offset = 0;
				4225	for (int i = 0; i < element_count; i++) {
				4226	TextElement& elm = elements()->at(i);
				4227	elm.cp_offset = cp_offset;
				4228	if (elm.type == TextElement::ATOM) {
				4229	cp_offset += elm.data.u_atom->data().length();
				4230	} else {
				4231	cp_offset++;
				4232	Vector<const uc16> quarks = elm.data.u_atom->data();
				4233	}
				4234	}
				4235	}
				4236
				4237
				4238	void Analysis::VisitText(TextNode* that) {
				4239	if (ignore_case_) {
				4240	that->MakeCaseIndependent();
				4241	}
				4242	EnsureAnalyzed(that->on_success());
				4243	if (!has_failed()) {
				4244	that->CalculateOffsets();
				4245	}
				4246	}
				4247
				4248
				4249	void Analysis::VisitAction(ActionNode* that) {
				4250	RegExpNode* target = that->on_success();
				4251	EnsureAnalyzed(target);
				4252	if (!has_failed()) {
				4253	// If the next node is interested in what it follows then this node
				4254	// has to be interested too so it can pass the information on.
				4255	that->info()->AddFromFollowing(target->info());
				4256	}
				4257	}
				4258
				4259
				4260	void Analysis::VisitChoice(ChoiceNode* that) {
				4261	NodeInfo* info = that->info();
				4262	for (int i = 0; i < that->alternatives()->length(); i++) {
				4263	RegExpNode* node = that->alternatives()->at(i).node();
				4264	EnsureAnalyzed(node);
				4265	if (has_failed()) return;
				4266	// Anything the following nodes need to know has to be known by
				4267	// this node also, so it can pass it on.
				4268	info->AddFromFollowing(node->info());
				4269	}
				4270	}
				4271
				4272
				4273	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				4274	NodeInfo* info = that->info();
				4275	for (int i = 0; i < that->alternatives()->length(); i++) {
				4276	RegExpNode* node = that->alternatives()->at(i).node();
				4277	if (node != that->loop_node()) {
				4278	EnsureAnalyzed(node);
				4279	if (has_failed()) return;
				4280	info->AddFromFollowing(node->info());
				4281	}
				4282	}
				4283	// Check the loop last since it may need the value of this node
				4284	// to get a correct result.
				4285	EnsureAnalyzed(that->loop_node());
				4286	if (!has_failed()) {
				4287	info->AddFromFollowing(that->loop_node()->info());
				4288	}
				4289	}
				4290
				4291
				4292	void Analysis::VisitBackReference(BackReferenceNode* that) {
				4293	EnsureAnalyzed(that->on_success());
				4294	}
				4295
				4296
				4297	void Analysis::VisitAssertion(AssertionNode* that) {
				4298	EnsureAnalyzed(that->on_success());
				4299	}
				4300
				4301
				4302	// -------------------------------------------------------------------
				4303	// Dispatch table construction
				4304
				4305
				4306	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				4307	AddRange(CharacterRange::Everything());
				4308	}
				4309
				4310
				4311	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				4312	node->set_being_calculated(true);
				4313	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				4314	for (int i = 0; i < alternatives->length(); i++) {
				4315	set_choice_index(i);
				4316	alternatives->at(i).node()->Accept(this);
				4317	}
				4318	node->set_being_calculated(false);
				4319	}
				4320
				4321
				4322	class AddDispatchRange {
				4323	public:
				4324	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				4325	: constructor_(constructor) { }
				4326	void Call(uc32 from, DispatchTable::Entry entry);
				4327	private:
				4328	DispatchTableConstructor* constructor_;
				4329	};
				4330
				4331
				4332	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
				4333	CharacterRange range(from, entry.to());
				4334	constructor_->AddRange(range);
				4335	}
				4336
				4337
				4338	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				4339	if (node->being_calculated())
				4340	return;
				4341	DispatchTable* table = node->GetTable(ignore_case_);
				4342	AddDispatchRange adder(this);
				4343	table->ForEach(&adder);
				4344	}
				4345
				4346
				4347	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				4348	// TODO(160): Find the node that we refer back to and propagate its start
				4349	// set back to here. For now we just accept anything.
				4350	AddRange(CharacterRange::Everything());
				4351	}
				4352
				4353
				4354	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				4355	RegExpNode* target = that->on_success();
				4356	target->Accept(this);
				4357	}
				4358
				4359
				4360
				4361	static int CompareRangeByFrom(const CharacterRange* a,
				4362	const CharacterRange* b) {
				4363	return Compare<uc16>(a->from(), b->from());
				4364	}
				4365
				4366
				4367	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				4368	ranges->Sort(CompareRangeByFrom);
				4369	uc16 last = 0;
				4370	for (int i = 0; i < ranges->length(); i++) {
				4371	CharacterRange range = ranges->at(i);
				4372	if (last < range.from())
				4373	AddRange(CharacterRange(last, range.from() - 1));
				4374	if (range.to() >= last) {
				4375	if (range.to() == String::kMaxUC16CharCode) {
				4376	return;
				4377	} else {
				4378	last = range.to() + 1;
				4379	}
				4380	}
				4381	}
				4382	AddRange(CharacterRange(last, String::kMaxUC16CharCode));
				4383	}
				4384
				4385
				4386	void DispatchTableConstructor::VisitText(TextNode* that) {
				4387	TextElement elm = that->elements()->at(0);
				4388	switch (elm.type) {
				4389	case TextElement::ATOM: {
				4390	uc16 c = elm.data.u_atom->data()[0];
				4391	AddRange(CharacterRange(c, c));
				4392	break;
				4393	}
				4394	case TextElement::CHAR_CLASS: {
				4395	RegExpCharacterClass* tree = elm.data.u_char_class;
				4396	ZoneList<CharacterRange>* ranges = tree->ranges();
				4397	if (tree->is_negated()) {
				4398	AddInverse(ranges);
				4399	} else {
				4400	for (int i = 0; i < ranges->length(); i++)
				4401	AddRange(ranges->at(i));
				4402	}
				4403	break;
				4404	}
				4405	default: {
				4406	UNIMPLEMENTED();
				4407	}
				4408	}
				4409	}
				4410
				4411
				4412	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				4413	RegExpNode* target = that->on_success();
				4414	target->Accept(this);
				4415	}
				4416
				4417
				4418	RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
				4419	bool ignore_case,
				4420	bool is_multiline,
				4421	Handle<String> pattern,
				4422	bool is_ascii) {
				4423	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				4424	return IrregexpRegExpTooBig();
				4425	}
				4426	RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
				4427	// Wrap the body of the regexp in capture #0.
				4428	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				4429	0,
				4430	&compiler,
				4431	compiler.accept());
				4432	RegExpNode* node = captured_body;
				4433	if (!data->tree->IsAnchored()) {
				4434	// Add a .*? at the beginning, outside the body capture, unless
				4435	// this expression is anchored at the beginning.
				4436	RegExpNode* loop_node =
				4437	RegExpQuantifier::ToNode(0,
				4438	RegExpTree::kInfinity,
				4439	false,
				4440	new RegExpCharacterClass('*'),
				4441	&compiler,
				4442	captured_body,
				4443	data->contains_anchor);
				4444
				4445	if (data->contains_anchor) {
				4446	// Unroll loop once, to take care of the case that might start
				4447	// at the start of input.
				4448	ChoiceNode* first_step_node = new ChoiceNode(2);
				4449	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				4450	first_step_node->AddAlternative(GuardedAlternative(
				4451	new TextNode(new RegExpCharacterClass('*'), loop_node)));
				4452	node = first_step_node;
				4453	} else {
				4454	node = loop_node;
				4455	}
				4456	}
				4457	data->node = node;
				4458	Analysis analysis(ignore_case);
				4459	analysis.EnsureAnalyzed(node);
				4460	if (analysis.has_failed()) {
				4461	const char* error_message = analysis.error_message();
				4462	return CompilationResult(error_message);
				4463	}
				4464
				4465	NodeInfo info = *node->info();
				4466
				4467	// Create the correct assembler for the architecture.
				4468	#ifdef V8_NATIVE_REGEXP
				4469	// Native regexp implementation.
				4470
				4471	NativeRegExpMacroAssembler::Mode mode =
				4472	is_ascii ? NativeRegExpMacroAssembler::ASCII
				4473	: NativeRegExpMacroAssembler::UC16;
				4474
				4475	#if V8_TARGET_ARCH_IA32
				4476	RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
				4477	#elif V8_TARGET_ARCH_X64
				4478	RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
				4479	#elif V8_TARGET_ARCH_ARM
				4480	RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
				4481	#endif
				4482
				4483	#else // ! V8_NATIVE_REGEXP
				4484	// Interpreted regexp implementation.
				4485	EmbeddedVector<byte, 1024> codes;
				4486	RegExpMacroAssemblerIrregexp macro_assembler(codes);
				4487	#endif
				4488
				4489	return compiler.Assemble(&macro_assembler,
				4490	node,
				4491	data->capture_count,
				4492	pattern);
				4493	}
				4494
				4495	}} // namespace v8::internal