Blame - src/jsregexp.cc - platform/external/v8

blob: bc47df8f23c0017fce39e41813007dec4a0952a1 [file] [log] [blame]

Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	1	// Copyright 2011 the V8 project authors. All rights reserved.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "compiler.h"
				32	#include "execution.h"
				33	#include "factory.h"
				34	#include "jsregexp.h"
				35	#include "platform.h"
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	36	#include "string-search.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	37	#include "runtime.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	38	#include "compilation-cache.h"
				39	#include "string-stream.h"
				40	#include "parser.h"
				41	#include "regexp-macro-assembler.h"
				42	#include "regexp-macro-assembler-tracer.h"
				43	#include "regexp-macro-assembler-irregexp.h"
				44	#include "regexp-stack.h"
				45
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	46	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	47	#if V8_TARGET_ARCH_IA32
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	48	#include "ia32/regexp-macro-assembler-ia32.h"
				49	#elif V8_TARGET_ARCH_X64
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	50	#include "x64/regexp-macro-assembler-x64.h"
				51	#elif V8_TARGET_ARCH_ARM
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	52	#include "arm/regexp-macro-assembler-arm.h"
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	53	#elif V8_TARGET_ARCH_MIPS
				54	#include "mips/regexp-macro-assembler-mips.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	55	#else
				56	#error Unsupported target architecture.
				57	#endif
				58	#endif
				59
				60	#include "interpreter-irregexp.h"
				61
				62
				63	namespace v8 {
				64	namespace internal {
				65
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	66	Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
				67	Handle<String> pattern,
				68	Handle<String> flags,
				69	bool* has_pending_exception) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	70	// Call the construct code with 2 arguments.
				71	Object** argv[2] = { Handle<Object>::cast(pattern).location(),
				72	Handle<Object>::cast(flags).location() };
				73	return Execution::New(constructor, 2, argv, has_pending_exception);
				74	}
				75
				76
				77	static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
				78	int flags = JSRegExp::NONE;
				79	for (int i = 0; i < str->length(); i++) {
				80	switch (str->Get(i)) {
				81	case 'i':
				82	flags \|= JSRegExp::IGNORE_CASE;
				83	break;
				84	case 'g':
				85	flags \|= JSRegExp::GLOBAL;
				86	break;
				87	case 'm':
				88	flags \|= JSRegExp::MULTILINE;
				89	break;
				90	}
				91	}
				92	return JSRegExp::Flags(flags);
				93	}
				94
				95
				96	static inline void ThrowRegExpException(Handle<JSRegExp> re,
				97	Handle<String> pattern,
				98	Handle<String> error_text,
				99	const char* message) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	100	Isolate* isolate = re->GetIsolate();
				101	Factory* factory = isolate->factory();
				102	Handle<FixedArray> elements = factory->NewFixedArray(2);
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame]	103	elements->set(0, *pattern);
				104	elements->set(1, *error_text);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	105	Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
				106	Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
				107	isolate->Throw(*regexp_err);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	108	}
				109
				110
				111	// Generic RegExp methods. Dispatches to implementation specific methods.
				112
				113
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	114	Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				115	Handle<String> pattern,
				116	Handle<String> flag_str) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	117	Isolate* isolate = re->GetIsolate();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	118	JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	119	CompilationCache* compilation_cache = isolate->compilation_cache();
				120	Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	121	bool in_cache = !cached.is_null();
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	122	LOG(isolate, RegExpCompileEvent(re, in_cache));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	123
				124	Handle<Object> result;
				125	if (in_cache) {
				126	re->set_data(*cached);
				127	return re;
				128	}
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	129	pattern = FlattenGetString(pattern);
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	130	ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	131	PostponeInterruptsScope postpone(isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	132	RegExpCompileData parse_result;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	133	FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	134	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				135	&parse_result)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	136	// Throw an exception if we fail to parse the pattern.
				137	ThrowRegExpException(re,
				138	pattern,
				139	parse_result.error,
				140	"malformed_regexp");
				141	return Handle<Object>::null();
				142	}
				143
				144	if (parse_result.simple && !flags.is_ignore_case()) {
				145	// Parse-tree is a single atom that is equal to the pattern.
				146	AtomCompile(re, pattern, flags, pattern);
				147	} else if (parse_result.tree->IsAtom() &&
				148	!flags.is_ignore_case() &&
				149	parse_result.capture_count == 0) {
				150	RegExpAtom* atom = parse_result.tree->AsAtom();
				151	Vector<const uc16> atom_pattern = atom->data();
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	152	Handle<String> atom_string =
				153	isolate->factory()->NewStringFromTwoByte(atom_pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	154	AtomCompile(re, pattern, flags, atom_string);
				155	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	156	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	157	}
				158	ASSERT(re->data()->IsFixedArray());
				159	// Compilation succeeded so the data is set on the regexp
				160	// and we can store it in the cache.
				161	Handle<FixedArray> data(FixedArray::cast(re->data()));
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	162	compilation_cache->PutRegExp(pattern, flags, data);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	163
				164	return re;
				165	}
				166
				167
				168	Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				169	Handle<String> subject,
				170	int index,
				171	Handle<JSArray> last_match_info) {
				172	switch (regexp->TypeTag()) {
				173	case JSRegExp::ATOM:
				174	return AtomExec(regexp, subject, index, last_match_info);
				175	case JSRegExp::IRREGEXP: {
				176	Handle<Object> result =
				177	IrregexpExec(regexp, subject, index, last_match_info);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	178	ASSERT(!result.is_null() \|\| Isolate::Current()->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	179	return result;
				180	}
				181	default:
				182	UNREACHABLE();
				183	return Handle<Object>::null();
				184	}
				185	}
				186
				187
				188	// RegExp Atom implementation: Simple string search using indexOf.
				189
				190
				191	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				192	Handle<String> pattern,
				193	JSRegExp::Flags flags,
				194	Handle<String> match_pattern) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	195	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				196	JSRegExp::ATOM,
				197	pattern,
				198	flags,
				199	match_pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	200	}
				201
				202
				203	static void SetAtomLastCapture(FixedArray* array,
				204	String* subject,
				205	int from,
				206	int to) {
				207	NoHandleAllocation no_handles;
				208	RegExpImpl::SetLastCaptureCount(array, 2);
				209	RegExpImpl::SetLastSubject(array, subject);
				210	RegExpImpl::SetLastInput(array, subject);
				211	RegExpImpl::SetCapture(array, 0, from);
				212	RegExpImpl::SetCapture(array, 1, to);
				213	}
				214
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	215	/* template <typename SubjectChar>, typename PatternChar>
				216	static int ReStringMatch(Vector<const SubjectChar> sub_vector,
				217	Vector<const PatternChar> pat_vector,
				218	int start_index) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	219
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	220	int pattern_length = pat_vector.length();
				221	if (pattern_length == 0) return start_index;
				222
				223	int subject_length = sub_vector.length();
				224	if (start_index + pattern_length > subject_length) return -1;
				225	return SearchString(sub_vector, pat_vector, start_index);
				226	}
				227	*/
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	228	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				229	Handle<String> subject,
				230	int index,
				231	Handle<JSArray> last_match_info) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	232	Isolate* isolate = re->GetIsolate();
				233
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	234	ASSERT(0 <= index);
				235	ASSERT(index <= subject->length());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	236
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	237	if (!subject->IsFlat()) FlattenString(subject);
				238	AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
				239	// Extract flattened substrings of cons strings before determining asciiness.
				240	String* seq_sub = *subject;
				241	if (seq_sub->IsConsString()) seq_sub = ConsString::cast(seq_sub)->first();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	242
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	243	String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
				244	int needle_len = needle->length();
				245
				246	if (needle_len != 0) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	247	if (index + needle_len > subject->length())
				248	return isolate->factory()->null_value();
				249
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	250	// dispatch on type of strings
				251	index = (needle->IsAsciiRepresentation()
				252	? (seq_sub->IsAsciiRepresentation()
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	253	? SearchString(isolate,
				254	seq_sub->ToAsciiVector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	255	needle->ToAsciiVector(),
				256	index)
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	257	: SearchString(isolate,
				258	seq_sub->ToUC16Vector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	259	needle->ToAsciiVector(),
				260	index))
				261	: (seq_sub->IsAsciiRepresentation()
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	262	? SearchString(isolate,
				263	seq_sub->ToAsciiVector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	264	needle->ToUC16Vector(),
				265	index)
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	266	: SearchString(isolate,
				267	seq_sub->ToUC16Vector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	268	needle->ToUC16Vector(),
				269	index)));
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	270	if (index == -1) return FACTORY->null_value();
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	271	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	272	ASSERT(last_match_info->HasFastElements());
				273
				274	{
				275	NoHandleAllocation no_handles;
				276	FixedArray* array = FixedArray::cast(last_match_info->elements());
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	277	SetAtomLastCapture(array, *subject, index, index + needle_len);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	278	}
				279	return last_match_info;
				280	}
				281
				282
				283	// Irregexp implementation.
				284
				285	// Ensures that the regexp object contains a compiled version of the
				286	// source for either ASCII or non-ASCII strings.
				287	// If the compiled version doesn't already exist, it is compiled
				288	// from the source pattern.
				289	// If compilation fails, an exception is thrown and this function
				290	// returns false.
				291	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				292	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	293	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	294	if (compiled_code->IsByteArray()) return true;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	295	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				296	if (compiled_code->IsCode()) return true;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	297	#endif
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	298	// We could potentially have marked this as flushable, but have kept
				299	// a saved version if we did not flush it yet.
				300	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
				301	if (saved_code->IsCode()) {
				302	// Reinstate the code in the original place.
				303	re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
				304	ASSERT(compiled_code->IsSmi());
				305	return true;
				306	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	307	return CompileIrregexp(re, is_ascii);
				308	}
				309
				310
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	311	static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
				312	bool is_ascii,
				313	Handle<String> error_message,
				314	Isolate* isolate) {
				315	Factory* factory = isolate->factory();
				316	Handle<FixedArray> elements = factory->NewFixedArray(2);
				317	elements->set(0, re->Pattern());
				318	elements->set(1, *error_message);
				319	Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
				320	Handle<Object> regexp_err =
				321	factory->NewSyntaxError("malformed_regexp", array);
				322	isolate->Throw(*regexp_err);
				323	return false;
				324	}
				325
				326
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	327	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				328	// Compile the RegExp.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	329	Isolate* isolate = re->GetIsolate();
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	330	ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	331	PostponeInterruptsScope postpone(isolate);
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	332	// If we had a compilation error the last time this is saved at the
				333	// saved code index.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	334	Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	335	// When arriving here entry can only be a smi, either representing an
				336	// uncompiled regexp, a previous compilation error, or code that has
				337	// been flushed.
				338	ASSERT(entry->IsSmi());
				339	int entry_value = Smi::cast(entry)->value();
				340	ASSERT(entry_value == JSRegExp::kUninitializedValue \|\|
				341	entry_value == JSRegExp::kCompilationErrorValue \|\|
				342	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				343
				344	if (entry_value == JSRegExp::kCompilationErrorValue) {
				345	// A previous compilation failed and threw an error which we store in
				346	// the saved code index (we store the error message, not the actual
				347	// error). Recreate the error object and throw it.
				348	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
				349	ASSERT(error_string->IsString());
				350	Handle<String> error_message(String::cast(error_string));
				351	CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	352	return false;
				353	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	354
				355	JSRegExp::Flags flags = re->GetFlags();
				356
				357	Handle<String> pattern(re->Pattern());
				358	if (!pattern->IsFlat()) {
				359	FlattenString(pattern);
				360	}
				361
				362	RegExpCompileData compile_data;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	363	FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	364	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				365	&compile_data)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	366	// Throw an exception if we fail to parse the pattern.
				367	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				368	ThrowRegExpException(re,
				369	pattern,
				370	compile_data.error,
				371	"malformed_regexp");
				372	return false;
				373	}
				374	RegExpEngine::CompilationResult result =
				375	RegExpEngine::Compile(&compile_data,
				376	flags.is_ignore_case(),
				377	flags.is_multiline(),
				378	pattern,
				379	is_ascii);
				380	if (result.error_message != NULL) {
				381	// Unable to compile regexp.
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame]	382	Handle<String> error_message =
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	383	isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
				384	CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	385	return false;
				386	}
				387
				388	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				389	data->set(JSRegExp::code_index(is_ascii), result.code);
				390	int register_max = IrregexpMaxRegisterCount(*data);
				391	if (result.num_registers > register_max) {
				392	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				393	}
				394
				395	return true;
				396	}
				397
				398
				399	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				400	return Smi::cast(
				401	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				402	}
				403
				404
				405	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				406	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				407	}
				408
				409
				410	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				411	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				412	}
				413
				414
				415	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				416	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				417	}
				418
				419
				420	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
				421	return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
				422	}
				423
				424
				425	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
				426	return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
				427	}
				428
				429
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	430	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				431	Handle<String> pattern,
				432	JSRegExp::Flags flags,
				433	int capture_count) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	434	// Initialize compiled code entries to null.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	435	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				436	JSRegExp::IRREGEXP,
				437	pattern,
				438	flags,
				439	capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	440	}
				441
				442
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	443	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				444	Handle<String> subject) {
				445	if (!subject->IsFlat()) {
				446	FlattenString(subject);
				447	}
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	448	// Check the asciiness of the underlying storage.
				449	bool is_ascii;
				450	{
				451	AssertNoAllocation no_gc;
				452	String* sequential_string = *subject;
				453	if (subject->IsConsString()) {
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	454	sequential_string = ConsString::cast(*subject)->first();
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	455	}
				456	is_ascii = sequential_string->IsAsciiRepresentation();
				457	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	458	if (!EnsureCompiledIrregexp(regexp, is_ascii)) {
				459	return -1;
				460	}
				461	#ifdef V8_INTERPRETED_REGEXP
				462	// Byte-code regexp needs space allocated for all its registers.
				463	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
				464	#else // V8_INTERPRETED_REGEXP
				465	// Native regexp only needs room to output captures. Registers are handled
				466	// internally.
				467	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				468	#endif // V8_INTERPRETED_REGEXP
				469	}
				470
				471
Steve Block	791712a	2010-08-27 10:21:07 +0100	[diff] [blame]	472	RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
				473	Handle<JSRegExp> regexp,
				474	Handle<String> subject,
				475	int index,
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	476	Vector<int> output) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	477	Isolate* isolate = regexp->GetIsolate();
				478
				479	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	480
				481	ASSERT(index >= 0);
				482	ASSERT(index <= subject->length());
				483	ASSERT(subject->IsFlat());
				484
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	485	// A flat ASCII string might have a two-byte first part.
				486	if (subject->IsConsString()) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	487	subject = Handle<String>(ConsString::cast(*subject)->first(), isolate);
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	488	}
				489
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	490	#ifndef V8_INTERPRETED_REGEXP
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	491	ASSERT(output.length() >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	492	do {
				493	bool is_ascii = subject->IsAsciiRepresentation();
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	494	EnsureCompiledIrregexp(regexp, is_ascii);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	495	Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	496	NativeRegExpMacroAssembler::Result res =
				497	NativeRegExpMacroAssembler::Match(code,
				498	subject,
				499	output.start(),
				500	output.length(),
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	501	index,
				502	isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	503	if (res != NativeRegExpMacroAssembler::RETRY) {
				504	ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	505	isolate->has_pending_exception());
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	506	STATIC_ASSERT(
				507	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				508	STATIC_ASSERT(
				509	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				510	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				511	== RE_EXCEPTION);
				512	return static_cast<IrregexpResult>(res);
				513	}
				514	// If result is RETRY, the string has changed representation, and we
				515	// must restart from scratch.
				516	// In this case, it means we must make sure we are prepared to handle
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	517	// the, potentially, different subject (the string can switch between
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	518	// being internal and external, and even between being ASCII and UC16,
				519	// but the characters are always the same).
				520	IrregexpPrepare(regexp, subject);
				521	} while (true);
				522	UNREACHABLE();
				523	return RE_EXCEPTION;
				524	#else // V8_INTERPRETED_REGEXP
				525
				526	ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
				527	bool is_ascii = subject->IsAsciiRepresentation();
				528	// We must have done EnsureCompiledIrregexp, so we can get the number of
				529	// registers.
				530	int* register_vector = output.start();
				531	int number_of_capture_registers =
				532	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				533	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				534	register_vector[i] = -1;
				535	}
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	536	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	537
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	538	if (IrregexpInterpreter::Match(isolate,
				539	byte_codes,
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	540	subject,
				541	register_vector,
				542	index)) {
				543	return RE_SUCCESS;
				544	}
				545	return RE_FAILURE;
				546	#endif // V8_INTERPRETED_REGEXP
				547	}
				548
				549
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	550	Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
				551	Handle<String> subject,
				552	int previous_index,
				553	Handle<JSArray> last_match_info) {
				554	ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
				555
				556	// Prepare space for the return values.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	557	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	558	#ifdef DEBUG
				559	if (FLAG_trace_regexp_bytecodes) {
				560	String* pattern = jsregexp->Pattern();
				561	PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
				562	PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
				563	}
				564	#endif
				565	#endif
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	566	int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
				567	if (required_registers < 0) {
				568	// Compiling failed with an exception.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	569	ASSERT(Isolate::Current()->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	570	return Handle<Object>::null();
				571	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	572
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	573	OffsetsVector registers(required_registers);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	574
Iain Merrick	7568138	2010-08-19 15:07:18 +0100	[diff] [blame]	575	IrregexpResult res = RegExpImpl::IrregexpExecOnce(
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	576	jsregexp, subject, previous_index, Vector<int>(registers.vector(),
				577	registers.length()));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	578	if (res == RE_SUCCESS) {
				579	int capture_register_count =
				580	(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
				581	last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
				582	AssertNoAllocation no_gc;
				583	int* register_vector = registers.vector();
				584	FixedArray* array = FixedArray::cast(last_match_info->elements());
				585	for (int i = 0; i < capture_register_count; i += 2) {
				586	SetCapture(array, i, register_vector[i]);
				587	SetCapture(array, i + 1, register_vector[i + 1]);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	588	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	589	SetLastCaptureCount(array, capture_register_count);
				590	SetLastSubject(array, *subject);
				591	SetLastInput(array, *subject);
				592	return last_match_info;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	593	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	594	if (res == RE_EXCEPTION) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	595	ASSERT(Isolate::Current()->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	596	return Handle<Object>::null();
				597	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	598	ASSERT(res == RE_FAILURE);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	599	return Isolate::Current()->factory()->null_value();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	600	}
				601
				602
				603	// -------------------------------------------------------------------
				604	// Implementation of the Irregexp regular expression engine.
				605	//
				606	// The Irregexp regular expression engine is intended to be a complete
				607	// implementation of ECMAScript regular expressions. It generates either
				608	// bytecodes or native code.
				609
				610	// The Irregexp regexp engine is structured in three steps.
				611	// 1) The parser generates an abstract syntax tree. See ast.cc.
				612	// 2) From the AST a node network is created. The nodes are all
				613	// subclasses of RegExpNode. The nodes represent states when
				614	// executing a regular expression. Several optimizations are
				615	// performed on the node network.
				616	// 3) From the nodes we generate either byte codes or native code
				617	// that can actually execute the regular expression (perform
				618	// the search). The code generation step is described in more
				619	// detail below.
				620
				621	// Code generation.
				622	//
				623	// The nodes are divided into four main categories.
				624	// * Choice nodes
				625	// These represent places where the regular expression can
				626	// match in more than one way. For example on entry to an
				627	// alternation (foo|bar) or a repetition (*, +, ? or {}).
				628	// * Action nodes
				629	// These represent places where some action should be
				630	// performed. Examples include recording the current position
				631	// in the input string to a register (in order to implement
				632	// captures) or other actions on register for example in order
				633	// to implement the counters needed for {} repetitions.
				634	// * Matching nodes
				635	// These attempt to match some element part of the input string.
				636	// Examples of elements include character classes, plain strings
				637	// or back references.
				638	// * End nodes
				639	// These are used to implement the actions required on finding
				640	// a successful match or failing to find a match.
				641	//
				642	// The code generated (whether as byte codes or native code) maintains
				643	// some state as it runs. This consists of the following elements:
				644	//
				645	// * The capture registers. Used for string captures.
				646	// * Other registers. Used for counters etc.
				647	// * The current position.
				648	// * The stack of backtracking information. Used when a matching node
				649	// fails to find a match and needs to try an alternative.
				650	//
				651	// Conceptual regular expression execution model:
				652	//
				653	// There is a simple conceptual model of regular expression execution
				654	// which will be presented first. The actual code generated is a more
				655	// efficient simulation of the simple conceptual model:
				656	//
				657	// * Choice nodes are implemented as follows:
				658	// For each choice except the last {
				659	// push current position
				660	// push backtrack code location
				661	// <generate code to test for choice>
				662	// backtrack code location:
				663	// pop current position
				664	// }
				665	// <generate code to test for last choice>
				666	//
				667	// * Action nodes are generated as follows:
				668	// <push affected registers on backtrack stack>
				669	// <generate code to perform action>
				670	// push backtrack code location
				671	// <generate code to test for following nodes>
				672	// backtrack code location:
				673	// <pop affected registers to restore their state>
				674	// <pop backtrack location from stack and go to it>
				675	//
				676	// * Matching nodes are generated as follows:
				677	// if input string matches at current position
				678	// update current position
				679	// <generate code to test for following nodes>
				680	// else
				681	// <pop backtrack location from stack and go to it>
				682	//
				683	// Thus it can be seen that the current position is saved and restored
				684	// by the choice nodes, whereas the registers are saved and restored by
				685	// the action nodes that manipulate them.
				686	//
				687	// The other interesting aspect of this model is that nodes are generated
				688	// at the point where they are needed by a recursive call to Emit(). If
				689	// the node has already been code generated then the Emit() call will
				690	// generate a jump to the previously generated code instead. In order to
				691	// limit recursion it is possible for the Emit() function to put the node
				692	// on a work list for later generation and instead generate a jump. The
				693	// destination of the jump is resolved later when the code is generated.
				694	//
				695	// Actual regular expression code generation.
				696	//
				697	// Code generation is actually more complicated than the above. In order
				698	// to improve the efficiency of the generated code some optimizations are
				699	// performed
				700	//
				701	// * Choice nodes have 1-character lookahead.
				702	// A choice node looks at the following character and eliminates some of
				703	// the choices immediately based on that character. This is not yet
				704	// implemented.
				705	// * Simple greedy loops store reduced backtracking information.
				706	// A quantifier like /.*foo/m will greedily match the whole input. It will
				707	// then need to backtrack to a point where it can match "foo". The naive
				708	// implementation of this would push each character position onto the
				709	// backtracking stack, then pop them off one by one. This would use space
				710	// proportional to the length of the input string. However since the "."
				711	// can only match in one way and always has a constant length (in this case
				712	// of 1) it suffices to store the current position on the top of the stack
				713	// once. Matching now becomes merely incrementing the current position and
				714	// backtracking becomes decrementing the current position and checking the
				715	// result against the stored current position. This is faster and saves
				716	// space.
				717	// * The current state is virtualized.
				718	// This is used to defer expensive operations until it is clear that they
				719	// are needed and to generate code for a node more than once, allowing
				720	// specialized and efficient versions of the code to be created. This is
				721	// explained in the section below.
				722	//
				723	// Execution state virtualization.
				724	//
				725	// Instead of emitting code, nodes that manipulate the state can record their
				726	// manipulation in an object called the Trace. The Trace object can record a
				727	// current position offset, an optional backtrack code location on the top of
				728	// the virtualized backtrack stack and some register changes. When a node is
				729	// to be emitted it can flush the Trace or update it. Flushing the Trace
				730	// will emit code to bring the actual state into line with the virtual state.
				731	// Avoiding flushing the state can postpone some work (eg updates of capture
				732	// registers). Postponing work can save time when executing the regular
				733	// expression since it may be found that the work never has to be done as a
				734	// failure to match can occur. In addition it is much faster to jump to a
				735	// known backtrack code location than it is to pop an unknown backtrack
				736	// location from the stack and jump there.
				737	//
				738	// The virtual state found in the Trace affects code generation. For example
				739	// the virtual state contains the difference between the actual current
				740	// position and the virtual current position, and matching code needs to use
				741	// this offset to attempt a match in the correct location of the input
				742	// string. Therefore code generated for a non-trivial trace is specialized
				743	// to that trace. The code generator therefore has the ability to generate
				744	// code for each node several times. In order to limit the size of the
				745	// generated code there is an arbitrary limit on how many specialized sets of
				746	// code may be generated for a given node. If the limit is reached, the
				747	// trace is flushed and a generic version of the code for a node is emitted.
				748	// This is subsequently used for that node. The code emitted for non-generic
				749	// trace is not recorded in the node and so it cannot currently be reused in
				750	// the event that code generation is requested for an identical trace.
				751
				752
				753	void RegExpTree::AppendToText(RegExpText* text) {
				754	UNREACHABLE();
				755	}
				756
				757
				758	void RegExpAtom::AppendToText(RegExpText* text) {
				759	text->AddElement(TextElement::Atom(this));
				760	}
				761
				762
				763	void RegExpCharacterClass::AppendToText(RegExpText* text) {
				764	text->AddElement(TextElement::CharClass(this));
				765	}
				766
				767
				768	void RegExpText::AppendToText(RegExpText* text) {
				769	for (int i = 0; i < elements()->length(); i++)
				770	text->AddElement(elements()->at(i));
				771	}
				772
				773
				774	TextElement TextElement::Atom(RegExpAtom* atom) {
				775	TextElement result = TextElement(ATOM);
				776	result.data.u_atom = atom;
				777	return result;
				778	}
				779
				780
				781	TextElement TextElement::CharClass(
				782	RegExpCharacterClass* char_class) {
				783	TextElement result = TextElement(CHAR_CLASS);
				784	result.data.u_char_class = char_class;
				785	return result;
				786	}
				787
				788
				789	int TextElement::length() {
				790	if (type == ATOM) {
				791	return data.u_atom->length();
				792	} else {
				793	ASSERT(type == CHAR_CLASS);
				794	return 1;
				795	}
				796	}
				797
				798
				799	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				800	if (table_ == NULL) {
				801	table_ = new DispatchTable();
				802	DispatchTableConstructor cons(table_, ignore_case);
				803	cons.BuildTable(this);
				804	}
				805	return table_;
				806	}
				807
				808
				809	class RegExpCompiler {
				810	public:
				811	RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
				812
				813	int AllocateRegister() {
				814	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				815	reg_exp_too_big_ = true;
				816	return next_register_;
				817	}
				818	return next_register_++;
				819	}
				820
				821	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				822	RegExpNode* start,
				823	int capture_count,
				824	Handle<String> pattern);
				825
				826	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
				827
				828	static const int kImplementationOffset = 0;
				829	static const int kNumberOfRegistersOffset = 0;
				830	static const int kCodeOffset = 1;
				831
				832	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				833	EndNode* accept() { return accept_; }
				834
				835	static const int kMaxRecursion = 100;
				836	inline int recursion_depth() { return recursion_depth_; }
				837	inline void IncrementRecursionDepth() { recursion_depth_++; }
				838	inline void DecrementRecursionDepth() { recursion_depth_--; }
				839
				840	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				841
				842	inline bool ignore_case() { return ignore_case_; }
				843	inline bool ascii() { return ascii_; }
				844
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	845	int current_expansion_factor() { return current_expansion_factor_; }
				846	void set_current_expansion_factor(int value) {
				847	current_expansion_factor_ = value;
				848	}
				849
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	850	static const int kNoRegister = -1;
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	851
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	852	private:
				853	EndNode* accept_;
				854	int next_register_;
				855	List<RegExpNode> work_list_;
				856	int recursion_depth_;
				857	RegExpMacroAssembler* macro_assembler_;
				858	bool ignore_case_;
				859	bool ascii_;
				860	bool reg_exp_too_big_;
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	861	int current_expansion_factor_;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	862	};
				863
				864
				865	class RecursionCheck {
				866	public:
				867	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				868	compiler->IncrementRecursionDepth();
				869	}
				870	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				871	private:
				872	RegExpCompiler* compiler_;
				873	};
				874
				875
				876	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
				877	return RegExpEngine::CompilationResult("RegExp too big");
				878	}
				879
				880
				881	// Attempts to compile the regexp using an Irregexp code generator. Returns
				882	// a fixed array or a null handle depending on whether it succeeded.
				883	RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
				884	: next_register_(2 * (capture_count + 1)),
				885	work_list_(NULL),
				886	recursion_depth_(0),
				887	ignore_case_(ignore_case),
				888	ascii_(ascii),
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	889	reg_exp_too_big_(false),
				890	current_expansion_factor_(1) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	891	accept_ = new EndNode(EndNode::ACCEPT);
				892	ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				893	}
				894
				895
				896	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				897	RegExpMacroAssembler* macro_assembler,
				898	RegExpNode* start,
				899	int capture_count,
				900	Handle<String> pattern) {
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	901	Heap* heap = pattern->GetHeap();
				902
				903	bool use_slow_safe_regexp_compiler = false;
				904	if (heap->total_regexp_code_generated() >
				905	RegExpImpl::kRegWxpCompiledLimit &&
				906	heap->isolate()->memory_allocator()->SizeExecutable() >
				907	RegExpImpl::kRegExpExecutableMemoryLimit) {
				908	use_slow_safe_regexp_compiler = true;
				909	}
				910
				911	macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
				912
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	913	#ifdef DEBUG
				914	if (FLAG_trace_regexp_assembler)
				915	macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
				916	else
				917	#endif
				918	macro_assembler_ = macro_assembler;
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	919
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	920	List <RegExpNode*> work_list(0);
				921	work_list_ = &work_list;
				922	Label fail;
				923	macro_assembler_->PushBacktrack(&fail);
				924	Trace new_trace;
				925	start->Emit(this, &new_trace);
				926	macro_assembler_->Bind(&fail);
				927	macro_assembler_->Fail();
				928	while (!work_list.is_empty()) {
				929	work_list.RemoveLast()->Emit(this, &new_trace);
				930	}
				931	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
				932
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	933	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				934	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	935	work_list_ = NULL;
				936	#ifdef DEBUG
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	937	if (FLAG_print_code) {
				938	Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
				939	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	940	if (FLAG_trace_regexp_assembler) {
				941	delete macro_assembler_;
				942	}
				943	#endif
				944	return RegExpEngine::CompilationResult(*code, next_register_);
				945	}
				946
				947
				948	bool Trace::DeferredAction::Mentions(int that) {
				949	if (type() == ActionNode::CLEAR_CAPTURES) {
				950	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				951	return range.Contains(that);
				952	} else {
				953	return reg() == that;
				954	}
				955	}
				956
				957
				958	bool Trace::mentions_reg(int reg) {
				959	for (DeferredAction* action = actions_;
				960	action != NULL;
				961	action = action->next()) {
				962	if (action->Mentions(reg))
				963	return true;
				964	}
				965	return false;
				966	}
				967
				968
				969	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				970	ASSERT_EQ(0, *cp_offset);
				971	for (DeferredAction* action = actions_;
				972	action != NULL;
				973	action = action->next()) {
				974	if (action->Mentions(reg)) {
				975	if (action->type() == ActionNode::STORE_POSITION) {
				976	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				977	return true;
				978	} else {
				979	return false;
				980	}
				981	}
				982	}
				983	return false;
				984	}
				985
				986
				987	int Trace::FindAffectedRegisters(OutSet* affected_registers) {
				988	int max_register = RegExpCompiler::kNoRegister;
				989	for (DeferredAction* action = actions_;
				990	action != NULL;
				991	action = action->next()) {
				992	if (action->type() == ActionNode::CLEAR_CAPTURES) {
				993	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				994	for (int i = range.from(); i <= range.to(); i++)
				995	affected_registers->Set(i);
				996	if (range.to() > max_register) max_register = range.to();
				997	} else {
				998	affected_registers->Set(action->reg());
				999	if (action->reg() > max_register) max_register = action->reg();
				1000	}
				1001	}
				1002	return max_register;
				1003	}
				1004
				1005
				1006	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				1007	int max_register,
				1008	OutSet& registers_to_pop,
				1009	OutSet& registers_to_clear) {
				1010	for (int reg = max_register; reg >= 0; reg--) {
				1011	if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
				1012	else if (registers_to_clear.Get(reg)) {
				1013	int clear_to = reg;
				1014	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				1015	reg--;
				1016	}
				1017	assembler->ClearRegisters(reg, clear_to);
				1018	}
				1019	}
				1020	}
				1021
				1022
				1023	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1024	int max_register,
				1025	OutSet& affected_registers,
				1026	OutSet* registers_to_pop,
				1027	OutSet* registers_to_clear) {
				1028	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1029	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1030
				1031	// Count pushes performed to force a stack limit check occasionally.
				1032	int pushes = 0;
				1033
				1034	for (int reg = 0; reg <= max_register; reg++) {
				1035	if (!affected_registers.Get(reg)) {
				1036	continue;
				1037	}
				1038
				1039	// The chronologically first deferred action in the trace
				1040	// is used to infer the action needed to restore a register
				1041	// to its previous state (or not, if it's safe to ignore it).
				1042	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1043	DeferredActionUndoType undo_action = IGNORE;
				1044
				1045	int value = 0;
				1046	bool absolute = false;
				1047	bool clear = false;
				1048	int store_position = -1;
				1049	// This is a little tricky because we are scanning the actions in reverse
				1050	// historical order (newest first).
				1051	for (DeferredAction* action = actions_;
				1052	action != NULL;
				1053	action = action->next()) {
				1054	if (action->Mentions(reg)) {
				1055	switch (action->type()) {
				1056	case ActionNode::SET_REGISTER: {
				1057	Trace::DeferredSetRegister* psr =
				1058	static_cast<Trace::DeferredSetRegister*>(action);
				1059	if (!absolute) {
				1060	value += psr->value();
				1061	absolute = true;
				1062	}
				1063	// SET_REGISTER is currently only used for newly introduced loop
				1064	// counters. They can have a significant previous value if they
				1065	// occour in a loop. TODO(lrn): Propagate this information, so
				1066	// we can set undo_action to IGNORE if we know there is no value to
				1067	// restore.
				1068	undo_action = RESTORE;
				1069	ASSERT_EQ(store_position, -1);
				1070	ASSERT(!clear);
				1071	break;
				1072	}
				1073	case ActionNode::INCREMENT_REGISTER:
				1074	if (!absolute) {
				1075	value++;
				1076	}
				1077	ASSERT_EQ(store_position, -1);
				1078	ASSERT(!clear);
				1079	undo_action = RESTORE;
				1080	break;
				1081	case ActionNode::STORE_POSITION: {
				1082	Trace::DeferredCapture* pc =
				1083	static_cast<Trace::DeferredCapture*>(action);
				1084	if (!clear && store_position == -1) {
				1085	store_position = pc->cp_offset();
				1086	}
				1087
				1088	// For captures we know that stores and clears alternate.
				1089	// Other register, are never cleared, and if the occur
				1090	// inside a loop, they might be assigned more than once.
				1091	if (reg <= 1) {
				1092	// Registers zero and one, aka "capture zero", is
				1093	// always set correctly if we succeed. There is no
				1094	// need to undo a setting on backtrack, because we
				1095	// will set it again or fail.
				1096	undo_action = IGNORE;
				1097	} else {
				1098	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1099	}
				1100	ASSERT(!absolute);
				1101	ASSERT_EQ(value, 0);
				1102	break;
				1103	}
				1104	case ActionNode::CLEAR_CAPTURES: {
				1105	// Since we're scanning in reverse order, if we've already
				1106	// set the position we have to ignore historically earlier
				1107	// clearing operations.
				1108	if (store_position == -1) {
				1109	clear = true;
				1110	}
				1111	undo_action = RESTORE;
				1112	ASSERT(!absolute);
				1113	ASSERT_EQ(value, 0);
				1114	break;
				1115	}
				1116	default:
				1117	UNREACHABLE();
				1118	break;
				1119	}
				1120	}
				1121	}
				1122	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1123	if (undo_action == RESTORE) {
				1124	pushes++;
				1125	RegExpMacroAssembler::StackCheckFlag stack_check =
				1126	RegExpMacroAssembler::kNoStackLimitCheck;
				1127	if (pushes == push_limit) {
				1128	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1129	pushes = 0;
				1130	}
				1131
				1132	assembler->PushRegister(reg, stack_check);
				1133	registers_to_pop->Set(reg);
				1134	} else if (undo_action == CLEAR) {
				1135	registers_to_clear->Set(reg);
				1136	}
				1137	// Perform the chronologically last action (or accumulated increment)
				1138	// for the register.
				1139	if (store_position != -1) {
				1140	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1141	} else if (clear) {
				1142	assembler->ClearRegisters(reg, reg);
				1143	} else if (absolute) {
				1144	assembler->SetRegister(reg, value);
				1145	} else if (value != 0) {
				1146	assembler->AdvanceRegister(reg, value);
				1147	}
				1148	}
				1149	}
				1150
				1151
				1152	// This is called as we come into a loop choice node and some other tricky
				1153	// nodes. It normalizes the state of the code generator to ensure we can
				1154	// generate generic code.
				1155	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1156	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1157
				1158	ASSERT(!is_trivial());
				1159
				1160	if (actions_ == NULL && backtrack() == NULL) {
				1161	// Here we just have some deferred cp advances to fix and we are back to
				1162	// a normal situation. We may also have to forget some information gained
				1163	// through a quick check that was already performed.
				1164	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1165	// Create a new trivial state and generate the node with that.
				1166	Trace new_state;
				1167	successor->Emit(compiler, &new_state);
				1168	return;
				1169	}
				1170
				1171	// Generate deferred actions here along with code to undo them again.
				1172	OutSet affected_registers;
				1173
				1174	if (backtrack() != NULL) {
				1175	// Here we have a concrete backtrack location. These are set up by choice
				1176	// nodes and so they indicate that we have a deferred save of the current
				1177	// position which we may need to emit here.
				1178	assembler->PushCurrentPosition();
				1179	}
				1180
				1181	int max_register = FindAffectedRegisters(&affected_registers);
				1182	OutSet registers_to_pop;
				1183	OutSet registers_to_clear;
				1184	PerformDeferredActions(assembler,
				1185	max_register,
				1186	affected_registers,
				1187	&registers_to_pop,
				1188	&registers_to_clear);
				1189	if (cp_offset_ != 0) {
				1190	assembler->AdvanceCurrentPosition(cp_offset_);
				1191	}
				1192
				1193	// Create a new trivial state and generate the node with that.
				1194	Label undo;
				1195	assembler->PushBacktrack(&undo);
				1196	Trace new_state;
				1197	successor->Emit(compiler, &new_state);
				1198
				1199	// On backtrack we need to restore state.
				1200	assembler->Bind(&undo);
				1201	RestoreAffectedRegisters(assembler,
				1202	max_register,
				1203	registers_to_pop,
				1204	registers_to_clear);
				1205	if (backtrack() == NULL) {
				1206	assembler->Backtrack();
				1207	} else {
				1208	assembler->PopCurrentPosition();
				1209	assembler->GoTo(backtrack());
				1210	}
				1211	}
				1212
				1213
				1214	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1215	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1216
				1217	// Omit flushing the trace. We discard the entire stack frame anyway.
				1218
				1219	if (!label()->is_bound()) {
				1220	// We are completely independent of the trace, since we ignore it,
				1221	// so this code can be used as the generic version.
				1222	assembler->Bind(label());
				1223	}
				1224
				1225	// Throw away everything on the backtrack stack since the start
				1226	// of the negative submatch and restore the character position.
				1227	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1228	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1229	if (clear_capture_count_ > 0) {
				1230	// Clear any captures that might have been performed during the success
				1231	// of the body of the negative look-ahead.
				1232	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1233	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1234	}
				1235	// Now that we have unwound the stack we find at the top of the stack the
				1236	// backtrack that the BeginSubmatch node got.
				1237	assembler->Backtrack();
				1238	}
				1239
				1240
				1241	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1242	if (!trace->is_trivial()) {
				1243	trace->Flush(compiler, this);
				1244	return;
				1245	}
				1246	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1247	if (!label()->is_bound()) {
				1248	assembler->Bind(label());
				1249	}
				1250	switch (action_) {
				1251	case ACCEPT:
				1252	assembler->Succeed();
				1253	return;
				1254	case BACKTRACK:
				1255	assembler->GoTo(trace->backtrack());
				1256	return;
				1257	case NEGATIVE_SUBMATCH_SUCCESS:
				1258	// This case is handled in a different virtual method.
				1259	UNREACHABLE();
				1260	}
				1261	UNIMPLEMENTED();
				1262	}
				1263
				1264
				1265	void GuardedAlternative::AddGuard(Guard* guard) {
				1266	if (guards_ == NULL)
				1267	guards_ = new ZoneList<Guard*>(1);
				1268	guards_->Add(guard);
				1269	}
				1270
				1271
				1272	ActionNode* ActionNode::SetRegister(int reg,
				1273	int val,
				1274	RegExpNode* on_success) {
				1275	ActionNode* result = new ActionNode(SET_REGISTER, on_success);
				1276	result->data_.u_store_register.reg = reg;
				1277	result->data_.u_store_register.value = val;
				1278	return result;
				1279	}
				1280
				1281
				1282	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1283	ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
				1284	result->data_.u_increment_register.reg = reg;
				1285	return result;
				1286	}
				1287
				1288
				1289	ActionNode* ActionNode::StorePosition(int reg,
				1290	bool is_capture,
				1291	RegExpNode* on_success) {
				1292	ActionNode* result = new ActionNode(STORE_POSITION, on_success);
				1293	result->data_.u_position_register.reg = reg;
				1294	result->data_.u_position_register.is_capture = is_capture;
				1295	return result;
				1296	}
				1297
				1298
				1299	ActionNode* ActionNode::ClearCaptures(Interval range,
				1300	RegExpNode* on_success) {
				1301	ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
				1302	result->data_.u_clear_captures.range_from = range.from();
				1303	result->data_.u_clear_captures.range_to = range.to();
				1304	return result;
				1305	}
				1306
				1307
				1308	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1309	int position_reg,
				1310	RegExpNode* on_success) {
				1311	ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
				1312	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1313	result->data_.u_submatch.current_position_register = position_reg;
				1314	return result;
				1315	}
				1316
				1317
				1318	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1319	int position_reg,
				1320	int clear_register_count,
				1321	int clear_register_from,
				1322	RegExpNode* on_success) {
				1323	ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1324	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1325	result->data_.u_submatch.current_position_register = position_reg;
				1326	result->data_.u_submatch.clear_register_count = clear_register_count;
				1327	result->data_.u_submatch.clear_register_from = clear_register_from;
				1328	return result;
				1329	}
				1330
				1331
				1332	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1333	int repetition_register,
				1334	int repetition_limit,
				1335	RegExpNode* on_success) {
				1336	ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
				1337	result->data_.u_empty_match_check.start_register = start_register;
				1338	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1339	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1340	return result;
				1341	}
				1342
				1343
				1344	#define DEFINE_ACCEPT(Type) \
				1345	void Type##Node::Accept(NodeVisitor* visitor) { \
				1346	visitor->Visit##Type(this); \
				1347	}
				1348	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1349	#undef DEFINE_ACCEPT
				1350
				1351
				1352	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1353	visitor->VisitLoopChoice(this);
				1354	}
				1355
				1356
				1357	// -------------------------------------------------------------------
				1358	// Emit code.
				1359
				1360
				1361	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1362	Guard* guard,
				1363	Trace* trace) {
				1364	switch (guard->op()) {
				1365	case Guard::LT:
				1366	ASSERT(!trace->mentions_reg(guard->reg()));
				1367	macro_assembler->IfRegisterGE(guard->reg(),
				1368	guard->value(),
				1369	trace->backtrack());
				1370	break;
				1371	case Guard::GEQ:
				1372	ASSERT(!trace->mentions_reg(guard->reg()));
				1373	macro_assembler->IfRegisterLT(guard->reg(),
				1374	guard->value(),
				1375	trace->backtrack());
				1376	break;
				1377	}
				1378	}
				1379
				1380
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1381	// Returns the number of characters in the equivalence class, omitting those
				1382	// that cannot occur in the source string because it is ASCII.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1383	static int GetCaseIndependentLetters(Isolate* isolate,
				1384	uc16 character,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1385	bool ascii_subject,
				1386	unibrow::uchar* letters) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1387	int length =
				1388	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	1389	// Unibrow returns 0 or 1 for characters where case independence is
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1390	// trivial.
				1391	if (length == 0) {
				1392	letters[0] = character;
				1393	length = 1;
				1394	}
				1395	if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {
				1396	return length;
				1397	}
				1398	// The standard requires that non-ASCII characters cannot have ASCII
				1399	// character codes in their equivalence class.
				1400	return 0;
				1401	}
				1402
				1403
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1404	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1405	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1406	uc16 c,
				1407	Label* on_failure,
				1408	int cp_offset,
				1409	bool check,
				1410	bool preloaded) {
				1411	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1412	bool bound_checked = false;
				1413	if (!preloaded) {
				1414	assembler->LoadCurrentCharacter(
				1415	cp_offset,
				1416	on_failure,
				1417	check);
				1418	bound_checked = true;
				1419	}
				1420	assembler->CheckNotCharacter(c, on_failure);
				1421	return bound_checked;
				1422	}
				1423
				1424
				1425	// Only emits non-letters (things that don't have case). Only used for case
				1426	// independent matches.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1427	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1428	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1429	uc16 c,
				1430	Label* on_failure,
				1431	int cp_offset,
				1432	bool check,
				1433	bool preloaded) {
				1434	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1435	bool ascii = compiler->ascii();
				1436	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1437	int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1438	if (length < 1) {
				1439	// This can't match. Must be an ASCII subject and a non-ASCII character.
				1440	// We do not need to do anything since the ASCII pass already handled this.
				1441	return false; // Bounds not checked.
				1442	}
				1443	bool checked = false;
				1444	// We handle the length > 1 case in a later pass.
				1445	if (length == 1) {
				1446	if (ascii && c > String::kMaxAsciiCharCodeU) {
				1447	// Can't match - see above.
				1448	return false; // Bounds not checked.
				1449	}
				1450	if (!preloaded) {
				1451	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1452	checked = check;
				1453	}
				1454	macro_assembler->CheckNotCharacter(c, on_failure);
				1455	}
				1456	return checked;
				1457	}
				1458
				1459
				1460	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1461	bool ascii,
				1462	uc16 c1,
				1463	uc16 c2,
				1464	Label* on_failure) {
				1465	uc16 char_mask;
				1466	if (ascii) {
				1467	char_mask = String::kMaxAsciiCharCode;
				1468	} else {
				1469	char_mask = String::kMaxUC16CharCode;
				1470	}
				1471	uc16 exor = c1 ^ c2;
				1472	// Check whether exor has only one bit set.
				1473	if (((exor - 1) & exor) == 0) {
				1474	// If c1 and c2 differ only by one bit.
				1475	// Ecma262UnCanonicalize always gives the highest number last.
				1476	ASSERT(c2 > c1);
				1477	uc16 mask = char_mask ^ exor;
				1478	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1479	return true;
				1480	}
				1481	ASSERT(c2 > c1);
				1482	uc16 diff = c2 - c1;
				1483	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1484	// If the characters differ by 2^n but don't differ by one bit then
				1485	// subtract the difference from the found character, then do the or
				1486	// trick. We avoid the theoretical case where negative numbers are
				1487	// involved in order to simplify code generation.
				1488	uc16 mask = char_mask ^ diff;
				1489	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1490	diff,
				1491	mask,
				1492	on_failure);
				1493	return true;
				1494	}
				1495	return false;
				1496	}
				1497
				1498
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1499	typedef bool EmitCharacterFunction(Isolate* isolate,
				1500	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1501	uc16 c,
				1502	Label* on_failure,
				1503	int cp_offset,
				1504	bool check,
				1505	bool preloaded);
				1506
				1507	// Only emits letters (things that have case). Only used for case independent
				1508	// matches.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1509	static inline bool EmitAtomLetter(Isolate* isolate,
				1510	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1511	uc16 c,
				1512	Label* on_failure,
				1513	int cp_offset,
				1514	bool check,
				1515	bool preloaded) {
				1516	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1517	bool ascii = compiler->ascii();
				1518	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1519	int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1520	if (length <= 1) return false;
				1521	// We may not need to check against the end of the input string
				1522	// if this character lies before a character that matched.
				1523	if (!preloaded) {
				1524	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1525	}
				1526	Label ok;
				1527	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1528	switch (length) {
				1529	case 2: {
				1530	if (ShortCutEmitCharacterPair(macro_assembler,
				1531	ascii,
				1532	chars[0],
				1533	chars[1],
				1534	on_failure)) {
				1535	} else {
				1536	macro_assembler->CheckCharacter(chars[0], &ok);
				1537	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1538	macro_assembler->Bind(&ok);
				1539	}
				1540	break;
				1541	}
				1542	case 4:
				1543	macro_assembler->CheckCharacter(chars[3], &ok);
				1544	// Fall through!
				1545	case 3:
				1546	macro_assembler->CheckCharacter(chars[0], &ok);
				1547	macro_assembler->CheckCharacter(chars[1], &ok);
				1548	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1549	macro_assembler->Bind(&ok);
				1550	break;
				1551	default:
				1552	UNREACHABLE();
				1553	break;
				1554	}
				1555	return true;
				1556	}
				1557
				1558
				1559	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				1560	RegExpCharacterClass* cc,
				1561	bool ascii,
				1562	Label* on_failure,
				1563	int cp_offset,
				1564	bool check_offset,
				1565	bool preloaded) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1566	ZoneList<CharacterRange>* ranges = cc->ranges();
				1567	int max_char;
				1568	if (ascii) {
				1569	max_char = String::kMaxAsciiCharCode;
				1570	} else {
				1571	max_char = String::kMaxUC16CharCode;
				1572	}
				1573
				1574	Label success;
				1575
				1576	Label* char_is_in_class =
				1577	cc->is_negated() ? on_failure : &success;
				1578
				1579	int range_count = ranges->length();
				1580
				1581	int last_valid_range = range_count - 1;
				1582	while (last_valid_range >= 0) {
				1583	CharacterRange& range = ranges->at(last_valid_range);
				1584	if (range.from() <= max_char) {
				1585	break;
				1586	}
				1587	last_valid_range--;
				1588	}
				1589
				1590	if (last_valid_range < 0) {
				1591	if (!cc->is_negated()) {
				1592	// TODO(plesner): We can remove this when the node level does our
				1593	// ASCII optimizations for us.
				1594	macro_assembler->GoTo(on_failure);
				1595	}
				1596	if (check_offset) {
				1597	macro_assembler->CheckPosition(cp_offset, on_failure);
				1598	}
				1599	return;
				1600	}
				1601
				1602	if (last_valid_range == 0 &&
				1603	!cc->is_negated() &&
				1604	ranges->at(0).IsEverything(max_char)) {
				1605	// This is a common case hit by non-anchored expressions.
				1606	if (check_offset) {
				1607	macro_assembler->CheckPosition(cp_offset, on_failure);
				1608	}
				1609	return;
				1610	}
				1611
				1612	if (!preloaded) {
				1613	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				1614	}
				1615
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1616	if (cc->is_standard() &&
				1617	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				1618	on_failure)) {
				1619	return;
				1620	}
				1621
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1622	for (int i = 0; i < last_valid_range; i++) {
				1623	CharacterRange& range = ranges->at(i);
				1624	Label next_range;
				1625	uc16 from = range.from();
				1626	uc16 to = range.to();
				1627	if (from > max_char) {
				1628	continue;
				1629	}
				1630	if (to > max_char) to = max_char;
				1631	if (to == from) {
				1632	macro_assembler->CheckCharacter(to, char_is_in_class);
				1633	} else {
				1634	if (from != 0) {
				1635	macro_assembler->CheckCharacterLT(from, &next_range);
				1636	}
				1637	if (to != max_char) {
				1638	macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
				1639	} else {
				1640	macro_assembler->GoTo(char_is_in_class);
				1641	}
				1642	}
				1643	macro_assembler->Bind(&next_range);
				1644	}
				1645
				1646	CharacterRange& range = ranges->at(last_valid_range);
				1647	uc16 from = range.from();
				1648	uc16 to = range.to();
				1649
				1650	if (to > max_char) to = max_char;
				1651	ASSERT(to >= from);
				1652
				1653	if (to == from) {
				1654	if (cc->is_negated()) {
				1655	macro_assembler->CheckCharacter(to, on_failure);
				1656	} else {
				1657	macro_assembler->CheckNotCharacter(to, on_failure);
				1658	}
				1659	} else {
				1660	if (from != 0) {
				1661	if (cc->is_negated()) {
				1662	macro_assembler->CheckCharacterLT(from, &success);
				1663	} else {
				1664	macro_assembler->CheckCharacterLT(from, on_failure);
				1665	}
				1666	}
				1667	if (to != String::kMaxUC16CharCode) {
				1668	if (cc->is_negated()) {
				1669	macro_assembler->CheckCharacterLT(to + 1, on_failure);
				1670	} else {
				1671	macro_assembler->CheckCharacterGT(to, on_failure);
				1672	}
				1673	} else {
				1674	if (cc->is_negated()) {
				1675	macro_assembler->GoTo(on_failure);
				1676	}
				1677	}
				1678	}
				1679	macro_assembler->Bind(&success);
				1680	}
				1681
				1682
				1683	RegExpNode::~RegExpNode() {
				1684	}
				1685
				1686
				1687	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				1688	Trace* trace) {
				1689	// If we are generating a greedy loop then don't stop and don't reuse code.
				1690	if (trace->stop_node() != NULL) {
				1691	return CONTINUE;
				1692	}
				1693
				1694	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1695	if (trace->is_trivial()) {
				1696	if (label_.is_bound()) {
				1697	// We are being asked to generate a generic version, but that's already
				1698	// been done so just go to it.
				1699	macro_assembler->GoTo(&label_);
				1700	return DONE;
				1701	}
				1702	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
				1703	// To avoid too deep recursion we push the node to the work queue and just
				1704	// generate a goto here.
				1705	compiler->AddWork(this);
				1706	macro_assembler->GoTo(&label_);
				1707	return DONE;
				1708	}
				1709	// Generate generic version of the node and bind the label for later use.
				1710	macro_assembler->Bind(&label_);
				1711	return CONTINUE;
				1712	}
				1713
				1714	// We are being asked to make a non-generic version. Keep track of how many
				1715	// non-generic versions we generate so as not to overdo it.
				1716	trace_count_++;
				1717	if (FLAG_regexp_optimization &&
				1718	trace_count_ < kMaxCopiesCodeGenerated &&
				1719	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
				1720	return CONTINUE;
				1721	}
				1722
				1723	// If we get here code has been generated for this node too many times or
				1724	// recursion is too deep. Time to switch to a generic version. The code for
				1725	// generic versions above can handle deep recursion properly.
				1726	trace->Flush(compiler, this);
				1727	return DONE;
				1728	}
				1729
				1730
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1731	int ActionNode::EatsAtLeast(int still_to_find,
				1732	int recursion_depth,
				1733	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1734	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1735	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1736	return on_success()->EatsAtLeast(still_to_find,
				1737	recursion_depth + 1,
				1738	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1739	}
				1740
				1741
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1742	int AssertionNode::EatsAtLeast(int still_to_find,
				1743	int recursion_depth,
				1744	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1745	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1746	// If we know we are not at the start and we are asked "how many characters
				1747	// will you match if you succeed?" then we can answer anything since false
				1748	// implies false. So lets just return the max answer (still_to_find) since
				1749	// that won't prevent us from preloading a lot of characters for the other
				1750	// branches in the node graph.
				1751	if (type() == AT_START && not_at_start) return still_to_find;
				1752	return on_success()->EatsAtLeast(still_to_find,
				1753	recursion_depth + 1,
				1754	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1755	}
				1756
				1757
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1758	int BackReferenceNode::EatsAtLeast(int still_to_find,
				1759	int recursion_depth,
				1760	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1761	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1762	return on_success()->EatsAtLeast(still_to_find,
				1763	recursion_depth + 1,
				1764	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1765	}
				1766
				1767
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1768	int TextNode::EatsAtLeast(int still_to_find,
				1769	int recursion_depth,
				1770	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1771	int answer = Length();
				1772	if (answer >= still_to_find) return answer;
				1773	if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1774	// We are not at start after this node so we set the last argument to 'true'.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1775	return answer + on_success()->EatsAtLeast(still_to_find - answer,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1776	recursion_depth + 1,
				1777	true);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1778	}
				1779
				1780
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1781	int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1782	int recursion_depth,
				1783	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1784	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1785	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1786	// afterwards.
				1787	RegExpNode* node = alternatives_->at(1).node();
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1788	return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1789	}
				1790
				1791
				1792	void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
				1793	QuickCheckDetails* details,
				1794	RegExpCompiler* compiler,
				1795	int filled_in,
				1796	bool not_at_start) {
				1797	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1798	// afterwards.
				1799	RegExpNode* node = alternatives_->at(1).node();
				1800	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				1801	}
				1802
				1803
				1804	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				1805	int recursion_depth,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1806	RegExpNode* ignore_this_node,
				1807	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1808	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1809	int min = 100;
				1810	int choice_count = alternatives_->length();
				1811	for (int i = 0; i < choice_count; i++) {
				1812	RegExpNode* node = alternatives_->at(i).node();
				1813	if (node == ignore_this_node) continue;
				1814	int node_eats_at_least = node->EatsAtLeast(still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1815	recursion_depth + 1,
				1816	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1817	if (node_eats_at_least < min) min = node_eats_at_least;
				1818	}
				1819	return min;
				1820	}
				1821
				1822
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1823	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				1824	int recursion_depth,
				1825	bool not_at_start) {
				1826	return EatsAtLeastHelper(still_to_find,
				1827	recursion_depth,
				1828	loop_node_,
				1829	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1830	}
				1831
				1832
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1833	int ChoiceNode::EatsAtLeast(int still_to_find,
				1834	int recursion_depth,
				1835	bool not_at_start) {
				1836	return EatsAtLeastHelper(still_to_find,
				1837	recursion_depth,
				1838	NULL,
				1839	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1840	}
				1841
				1842
				1843	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				1844	static inline uint32_t SmearBitsRight(uint32_t v) {
				1845	v \|= v >> 1;
				1846	v \|= v >> 2;
				1847	v \|= v >> 4;
				1848	v \|= v >> 8;
				1849	v \|= v >> 16;
				1850	return v;
				1851	}
				1852
				1853
				1854	bool QuickCheckDetails::Rationalize(bool asc) {
				1855	bool found_useful_op = false;
				1856	uint32_t char_mask;
				1857	if (asc) {
				1858	char_mask = String::kMaxAsciiCharCode;
				1859	} else {
				1860	char_mask = String::kMaxUC16CharCode;
				1861	}
				1862	mask_ = 0;
				1863	value_ = 0;
				1864	int char_shift = 0;
				1865	for (int i = 0; i < characters_; i++) {
				1866	Position* pos = &positions_[i];
				1867	if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
				1868	found_useful_op = true;
				1869	}
				1870	mask_ \|= (pos->mask & char_mask) << char_shift;
				1871	value_ \|= (pos->value & char_mask) << char_shift;
				1872	char_shift += asc ? 8 : 16;
				1873	}
				1874	return found_useful_op;
				1875	}
				1876
				1877
				1878	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				1879	Trace* trace,
				1880	bool preload_has_checked_bounds,
				1881	Label* on_possible_success,
				1882	QuickCheckDetails* details,
				1883	bool fall_through_on_failure) {
				1884	if (details->characters() == 0) return false;
				1885	GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
				1886	if (details->cannot_match()) return false;
				1887	if (!details->Rationalize(compiler->ascii())) return false;
				1888	ASSERT(details->characters() == 1 \|\|
				1889	compiler->macro_assembler()->CanReadUnaligned());
				1890	uint32_t mask = details->mask();
				1891	uint32_t value = details->value();
				1892
				1893	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1894
				1895	if (trace->characters_preloaded() != details->characters()) {
				1896	assembler->LoadCurrentCharacter(trace->cp_offset(),
				1897	trace->backtrack(),
				1898	!preload_has_checked_bounds,
				1899	details->characters());
				1900	}
				1901
				1902
				1903	bool need_mask = true;
				1904
				1905	if (details->characters() == 1) {
				1906	// If number of characters preloaded is 1 then we used a byte or 16 bit
				1907	// load so the value is already masked down.
				1908	uint32_t char_mask;
				1909	if (compiler->ascii()) {
				1910	char_mask = String::kMaxAsciiCharCode;
				1911	} else {
				1912	char_mask = String::kMaxUC16CharCode;
				1913	}
				1914	if ((mask & char_mask) == char_mask) need_mask = false;
				1915	mask &= char_mask;
				1916	} else {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1917	// For 2-character preloads in ASCII mode or 1-character preloads in
				1918	// TWO_BYTE mode we also use a 16 bit load with zero extend.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1919	if (details->characters() == 2 && compiler->ascii()) {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1920	if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
				1921	} else if (details->characters() == 1 && !compiler->ascii()) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1922	if ((mask & 0xffff) == 0xffff) need_mask = false;
				1923	} else {
				1924	if (mask == 0xffffffff) need_mask = false;
				1925	}
				1926	}
				1927
				1928	if (fall_through_on_failure) {
				1929	if (need_mask) {
				1930	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				1931	} else {
				1932	assembler->CheckCharacter(value, on_possible_success);
				1933	}
				1934	} else {
				1935	if (need_mask) {
				1936	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				1937	} else {
				1938	assembler->CheckNotCharacter(value, trace->backtrack());
				1939	}
				1940	}
				1941	return true;
				1942	}
				1943
				1944
				1945	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				1946	// super-class in the .h file).
				1947	//
				1948	// We iterate along the text object, building up for each character a
				1949	// mask and value that can be used to test for a quick failure to match.
				1950	// The masks and values for the positions will be combined into a single
				1951	// machine word for the current character width in order to be used in
				1952	// generating a quick check.
				1953	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				1954	RegExpCompiler* compiler,
				1955	int characters_filled_in,
				1956	bool not_at_start) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1957	Isolate* isolate = Isolate::Current();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1958	ASSERT(characters_filled_in < details->characters());
				1959	int characters = details->characters();
				1960	int char_mask;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1961	if (compiler->ascii()) {
				1962	char_mask = String::kMaxAsciiCharCode;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1963	} else {
				1964	char_mask = String::kMaxUC16CharCode;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1965	}
				1966	for (int k = 0; k < elms_->length(); k++) {
				1967	TextElement elm = elms_->at(k);
				1968	if (elm.type == TextElement::ATOM) {
				1969	Vector<const uc16> quarks = elm.data.u_atom->data();
				1970	for (int i = 0; i < characters && i < quarks.length(); i++) {
				1971	QuickCheckDetails::Position* pos =
				1972	details->positions(characters_filled_in);
				1973	uc16 c = quarks[i];
				1974	if (c > char_mask) {
				1975	// If we expect a non-ASCII character from an ASCII string,
				1976	// there is no way we can match. Not even case independent
				1977	// matching can turn an ASCII character into non-ASCII or
				1978	// vice versa.
				1979	details->set_cannot_match();
				1980	pos->determines_perfectly = false;
				1981	return;
				1982	}
				1983	if (compiler->ignore_case()) {
				1984	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1985	int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
				1986	chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1987	ASSERT(length != 0); // Can only happen if c > char_mask (see above).
				1988	if (length == 1) {
				1989	// This letter has no case equivalents, so it's nice and simple
				1990	// and the mask-compare will determine definitely whether we have
				1991	// a match at this character position.
				1992	pos->mask = char_mask;
				1993	pos->value = c;
				1994	pos->determines_perfectly = true;
				1995	} else {
				1996	uint32_t common_bits = char_mask;
				1997	uint32_t bits = chars[0];
				1998	for (int j = 1; j < length; j++) {
				1999	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				2000	common_bits ^= differing_bits;
				2001	bits &= common_bits;
				2002	}
				2003	// If length is 2 and common bits has only one zero in it then
				2004	// our mask and compare instruction will determine definitely
				2005	// whether we have a match at this character position. Otherwise
				2006	// it can only be an approximate check.
				2007	uint32_t one_zero = (common_bits \| ~char_mask);
				2008	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				2009	pos->determines_perfectly = true;
				2010	}
				2011	pos->mask = common_bits;
				2012	pos->value = bits;
				2013	}
				2014	} else {
				2015	// Don't ignore case. Nice simple case where the mask-compare will
				2016	// determine definitely whether we have a match at this character
				2017	// position.
				2018	pos->mask = char_mask;
				2019	pos->value = c;
				2020	pos->determines_perfectly = true;
				2021	}
				2022	characters_filled_in++;
				2023	ASSERT(characters_filled_in <= details->characters());
				2024	if (characters_filled_in == details->characters()) {
				2025	return;
				2026	}
				2027	}
				2028	} else {
				2029	QuickCheckDetails::Position* pos =
				2030	details->positions(characters_filled_in);
				2031	RegExpCharacterClass* tree = elm.data.u_char_class;
				2032	ZoneList<CharacterRange>* ranges = tree->ranges();
				2033	if (tree->is_negated()) {
				2034	// A quick check uses multi-character mask and compare. There is no
				2035	// useful way to incorporate a negative char class into this scheme
				2036	// so we just conservatively create a mask and value that will always
				2037	// succeed.
				2038	pos->mask = 0;
				2039	pos->value = 0;
				2040	} else {
				2041	int first_range = 0;
				2042	while (ranges->at(first_range).from() > char_mask) {
				2043	first_range++;
				2044	if (first_range == ranges->length()) {
				2045	details->set_cannot_match();
				2046	pos->determines_perfectly = false;
				2047	return;
				2048	}
				2049	}
				2050	CharacterRange range = ranges->at(first_range);
				2051	uc16 from = range.from();
				2052	uc16 to = range.to();
				2053	if (to > char_mask) {
				2054	to = char_mask;
				2055	}
				2056	uint32_t differing_bits = (from ^ to);
				2057	// A mask and compare is only perfect if the differing bits form a
				2058	// number like 00011111 with one single block of trailing 1s.
				2059	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2060	from + differing_bits == to) {
				2061	pos->determines_perfectly = true;
				2062	}
				2063	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2064	uint32_t bits = (from & common_bits);
				2065	for (int i = first_range + 1; i < ranges->length(); i++) {
				2066	CharacterRange range = ranges->at(i);
				2067	uc16 from = range.from();
				2068	uc16 to = range.to();
				2069	if (from > char_mask) continue;
				2070	if (to > char_mask) to = char_mask;
				2071	// Here we are combining more ranges into the mask and compare
				2072	// value. With each new range the mask becomes more sparse and
				2073	// so the chances of a false positive rise. A character class
				2074	// with multiple ranges is assumed never to be equivalent to a
				2075	// mask and compare operation.
				2076	pos->determines_perfectly = false;
				2077	uint32_t new_common_bits = (from ^ to);
				2078	new_common_bits = ~SmearBitsRight(new_common_bits);
				2079	common_bits &= new_common_bits;
				2080	bits &= new_common_bits;
				2081	uint32_t differing_bits = (from & common_bits) ^ bits;
				2082	common_bits ^= differing_bits;
				2083	bits &= common_bits;
				2084	}
				2085	pos->mask = common_bits;
				2086	pos->value = bits;
				2087	}
				2088	characters_filled_in++;
				2089	ASSERT(characters_filled_in <= details->characters());
				2090	if (characters_filled_in == details->characters()) {
				2091	return;
				2092	}
				2093	}
				2094	}
				2095	ASSERT(characters_filled_in != details->characters());
				2096	on_success()-> GetQuickCheckDetails(details,
				2097	compiler,
				2098	characters_filled_in,
				2099	true);
				2100	}
				2101
				2102
				2103	void QuickCheckDetails::Clear() {
				2104	for (int i = 0; i < characters_; i++) {
				2105	positions_[i].mask = 0;
				2106	positions_[i].value = 0;
				2107	positions_[i].determines_perfectly = false;
				2108	}
				2109	characters_ = 0;
				2110	}
				2111
				2112
				2113	void QuickCheckDetails::Advance(int by, bool ascii) {
				2114	ASSERT(by >= 0);
				2115	if (by >= characters_) {
				2116	Clear();
				2117	return;
				2118	}
				2119	for (int i = 0; i < characters_ - by; i++) {
				2120	positions_[i] = positions_[by + i];
				2121	}
				2122	for (int i = characters_ - by; i < characters_; i++) {
				2123	positions_[i].mask = 0;
				2124	positions_[i].value = 0;
				2125	positions_[i].determines_perfectly = false;
				2126	}
				2127	characters_ -= by;
				2128	// We could change mask_ and value_ here but we would never advance unless
				2129	// they had already been used in a check and they won't be used again because
				2130	// it would gain us nothing. So there's no point.
				2131	}
				2132
				2133
				2134	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2135	ASSERT(characters_ == other->characters_);
				2136	if (other->cannot_match_) {
				2137	return;
				2138	}
				2139	if (cannot_match_) {
				2140	this = other;
				2141	return;
				2142	}
				2143	for (int i = from_index; i < characters_; i++) {
				2144	QuickCheckDetails::Position* pos = positions(i);
				2145	QuickCheckDetails::Position* other_pos = other->positions(i);
				2146	if (pos->mask != other_pos->mask \|\|
				2147	pos->value != other_pos->value \|\|
				2148	!other_pos->determines_perfectly) {
				2149	// Our mask-compare operation will be approximate unless we have the
				2150	// exact same operation on both sides of the alternation.
				2151	pos->determines_perfectly = false;
				2152	}
				2153	pos->mask &= other_pos->mask;
				2154	pos->value &= pos->mask;
				2155	other_pos->value &= pos->mask;
				2156	uc16 differing_bits = (pos->value ^ other_pos->value);
				2157	pos->mask &= ~differing_bits;
				2158	pos->value &= pos->mask;
				2159	}
				2160	}
				2161
				2162
				2163	class VisitMarker {
				2164	public:
				2165	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2166	ASSERT(!info->visited);
				2167	info->visited = true;
				2168	}
				2169	~VisitMarker() {
				2170	info_->visited = false;
				2171	}
				2172	private:
				2173	NodeInfo* info_;
				2174	};
				2175
				2176
				2177	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2178	RegExpCompiler* compiler,
				2179	int characters_filled_in,
				2180	bool not_at_start) {
				2181	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2182	VisitMarker marker(info());
				2183	return ChoiceNode::GetQuickCheckDetails(details,
				2184	compiler,
				2185	characters_filled_in,
				2186	not_at_start);
				2187	}
				2188
				2189
				2190	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2191	RegExpCompiler* compiler,
				2192	int characters_filled_in,
				2193	bool not_at_start) {
				2194	not_at_start = (not_at_start \|\| not_at_start_);
				2195	int choice_count = alternatives_->length();
				2196	ASSERT(choice_count > 0);
				2197	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2198	compiler,
				2199	characters_filled_in,
				2200	not_at_start);
				2201	for (int i = 1; i < choice_count; i++) {
				2202	QuickCheckDetails new_details(details->characters());
				2203	RegExpNode* node = alternatives_->at(i).node();
				2204	node->GetQuickCheckDetails(&new_details, compiler,
				2205	characters_filled_in,
				2206	not_at_start);
				2207	// Here we merge the quick match details of the two branches.
				2208	details->Merge(&new_details, characters_filled_in);
				2209	}
				2210	}
				2211
				2212
				2213	// Check for [0-9A-Z_a-z].
				2214	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2215	Label* word,
				2216	Label* non_word,
				2217	bool fall_through_on_word) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2218	if (assembler->CheckSpecialCharacterClass(
				2219	fall_through_on_word ? 'w' : 'W',
				2220	fall_through_on_word ? non_word : word)) {
				2221	// Optimized implementation available.
				2222	return;
				2223	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2224	assembler->CheckCharacterGT('z', non_word);
				2225	assembler->CheckCharacterLT('0', non_word);
				2226	assembler->CheckCharacterGT('a' - 1, word);
				2227	assembler->CheckCharacterLT('9' + 1, word);
				2228	assembler->CheckCharacterLT('A', non_word);
				2229	assembler->CheckCharacterLT('Z' + 1, word);
				2230	if (fall_through_on_word) {
				2231	assembler->CheckNotCharacter('_', non_word);
				2232	} else {
				2233	assembler->CheckCharacter('_', word);
				2234	}
				2235	}
				2236
				2237
				2238	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				2239	// that matches newline or the start of input).
				2240	static void EmitHat(RegExpCompiler* compiler,
				2241	RegExpNode* on_success,
				2242	Trace* trace) {
				2243	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2244	// We will be loading the previous character into the current character
				2245	// register.
				2246	Trace new_trace(*trace);
				2247	new_trace.InvalidateCurrentCharacter();
				2248
				2249	Label ok;
				2250	if (new_trace.cp_offset() == 0) {
				2251	// The start of input counts as a newline in this context, so skip to
				2252	// ok if we are at the start.
				2253	assembler->CheckAtStart(&ok);
				2254	}
				2255	// We already checked that we are not at the start of input so it must be
				2256	// OK to load the previous character.
				2257	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				2258	new_trace.backtrack(),
				2259	false);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2260	if (!assembler->CheckSpecialCharacterClass('n',
				2261	new_trace.backtrack())) {
				2262	// Newline means \n, \r, 0x2028 or 0x2029.
				2263	if (!compiler->ascii()) {
				2264	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				2265	}
				2266	assembler->CheckCharacter('\n', &ok);
				2267	assembler->CheckNotCharacter('\r', new_trace.backtrack());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2268	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2269	assembler->Bind(&ok);
				2270	on_success->Emit(compiler, &new_trace);
				2271	}
				2272
				2273
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2274	// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
				2275	// when we know whether the next character must be a word character or not.
				2276	static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
				2277	RegExpCompiler* compiler,
				2278	RegExpNode* on_success,
				2279	Trace* trace) {
				2280	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2281	Label done;
				2282
				2283	Trace new_trace(*trace);
				2284
				2285	bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
				2286	Label* on_word = expect_word_character ? &done : new_trace.backtrack();
				2287	Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
				2288
				2289	// Check whether previous character was a word character.
				2290	switch (trace->at_start()) {
				2291	case Trace::TRUE:
				2292	if (expect_word_character) {
				2293	assembler->GoTo(on_non_word);
				2294	}
				2295	break;
				2296	case Trace::UNKNOWN:
				2297	ASSERT_EQ(0, trace->cp_offset());
				2298	assembler->CheckAtStart(on_non_word);
				2299	// Fall through.
				2300	case Trace::FALSE:
				2301	int prev_char_offset = trace->cp_offset() - 1;
				2302	assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
				2303	EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
				2304	// We may or may not have loaded the previous character.
				2305	new_trace.InvalidateCurrentCharacter();
				2306	}
				2307
				2308	assembler->Bind(&done);
				2309
				2310	on_success->Emit(compiler, &new_trace);
				2311	}
				2312
				2313
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2314	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				2315	static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
				2316	RegExpCompiler* compiler,
				2317	RegExpNode* on_success,
				2318	Trace* trace) {
				2319	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2320	Label before_non_word;
				2321	Label before_word;
				2322	if (trace->characters_preloaded() != 1) {
				2323	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				2324	}
				2325	// Fall through on non-word.
				2326	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				2327
				2328	// We will be loading the previous character into the current character
				2329	// register.
				2330	Trace new_trace(*trace);
				2331	new_trace.InvalidateCurrentCharacter();
				2332
				2333	Label ok;
				2334	Label* boundary;
				2335	Label* not_boundary;
				2336	if (type == AssertionNode::AT_BOUNDARY) {
				2337	boundary = &ok;
				2338	not_boundary = new_trace.backtrack();
				2339	} else {
				2340	not_boundary = &ok;
				2341	boundary = new_trace.backtrack();
				2342	}
				2343
				2344	// Next character is not a word character.
				2345	assembler->Bind(&before_non_word);
				2346	if (new_trace.cp_offset() == 0) {
				2347	// The start of input counts as a non-word character, so the question is
				2348	// decided if we are at the start.
				2349	assembler->CheckAtStart(not_boundary);
				2350	}
				2351	// We already checked that we are not at the start of input so it must be
				2352	// OK to load the previous character.
				2353	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2354	&ok, // Unused dummy label in this call.
				2355	false);
				2356	// Fall through on non-word.
				2357	EmitWordCheck(assembler, boundary, not_boundary, false);
				2358	assembler->GoTo(not_boundary);
				2359
				2360	// Next character is a word character.
				2361	assembler->Bind(&before_word);
				2362	if (new_trace.cp_offset() == 0) {
				2363	// The start of input counts as a non-word character, so the question is
				2364	// decided if we are at the start.
				2365	assembler->CheckAtStart(boundary);
				2366	}
				2367	// We already checked that we are not at the start of input so it must be
				2368	// OK to load the previous character.
				2369	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2370	&ok, // Unused dummy label in this call.
				2371	false);
				2372	bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
				2373	EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
				2374
				2375	assembler->Bind(&ok);
				2376
				2377	on_success->Emit(compiler, &new_trace);
				2378	}
				2379
				2380
				2381	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2382	RegExpCompiler* compiler,
				2383	int filled_in,
				2384	bool not_at_start) {
				2385	if (type_ == AT_START && not_at_start) {
				2386	details->set_cannot_match();
				2387	return;
				2388	}
				2389	return on_success()->GetQuickCheckDetails(details,
				2390	compiler,
				2391	filled_in,
				2392	not_at_start);
				2393	}
				2394
				2395
				2396	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2397	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2398	switch (type_) {
				2399	case AT_END: {
				2400	Label ok;
				2401	assembler->CheckPosition(trace->cp_offset(), &ok);
				2402	assembler->GoTo(trace->backtrack());
				2403	assembler->Bind(&ok);
				2404	break;
				2405	}
				2406	case AT_START: {
				2407	if (trace->at_start() == Trace::FALSE) {
				2408	assembler->GoTo(trace->backtrack());
				2409	return;
				2410	}
				2411	if (trace->at_start() == Trace::UNKNOWN) {
				2412	assembler->CheckNotAtStart(trace->backtrack());
				2413	Trace at_start_trace = *trace;
				2414	at_start_trace.set_at_start(true);
				2415	on_success()->Emit(compiler, &at_start_trace);
				2416	return;
				2417	}
				2418	}
				2419	break;
				2420	case AFTER_NEWLINE:
				2421	EmitHat(compiler, on_success(), trace);
				2422	return;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2423	case AT_BOUNDARY:
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2424	case AT_NON_BOUNDARY: {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2425	EmitBoundaryCheck(type_, compiler, on_success(), trace);
				2426	return;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2427	}
				2428	case AFTER_WORD_CHARACTER:
				2429	case AFTER_NONWORD_CHARACTER: {
				2430	EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
				2431	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2432	}
				2433	on_success()->Emit(compiler, trace);
				2434	}
				2435
				2436
				2437	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				2438	if (quick_check == NULL) return false;
				2439	if (offset >= quick_check->characters()) return false;
				2440	return quick_check->positions(offset)->determines_perfectly;
				2441	}
				2442
				2443
				2444	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				2445	if (index > *checked_up_to) {
				2446	*checked_up_to = index;
				2447	}
				2448	}
				2449
				2450
				2451	// We call this repeatedly to generate code for each pass over the text node.
				2452	// The passes are in increasing order of difficulty because we hope one
				2453	// of the first passes will fail in which case we are saved the work of the
				2454	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				2455	// we will check the '%' in the first pass, the case independent 'a' in the
				2456	// second pass and the character class in the last pass.
				2457	//
				2458	// The passes are done from right to left, so for example to test for /bar/
				2459	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				2460	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				2461	// bounds check most of the time. In the example we only need to check for
				2462	// end-of-input when loading the putative 'r'.
				2463	//
				2464	// A slight complication involves the fact that the first character may already
				2465	// be fetched into a register by the previous node. In this case we want to
				2466	// do the test for that character first. We do this in separate passes. The
				2467	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				2468	// pass has been performed then subsequent passes will have true in
				2469	// first_element_checked to indicate that that character does not need to be
				2470	// checked again.
				2471	//
				2472	// In addition to all this we are passed a Trace, which can
				2473	// contain an AlternativeGeneration object. In this AlternativeGeneration
				2474	// object we can see details of any quick check that was already passed in
				2475	// order to get to the code we are now generating. The quick check can involve
				2476	// loading characters, which means we do not need to recheck the bounds
				2477	// up to the limit the quick check already checked. In addition the quick
				2478	// check can have involved a mask and compare operation which may simplify
				2479	// or obviate the need for further checks at some character positions.
				2480	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				2481	TextEmitPassType pass,
				2482	bool preloaded,
				2483	Trace* trace,
				2484	bool first_element_checked,
				2485	int* checked_up_to) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	2486	Isolate* isolate = Isolate::Current();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2487	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2488	bool ascii = compiler->ascii();
				2489	Label* backtrack = trace->backtrack();
				2490	QuickCheckDetails* quick_check = trace->quick_check_performed();
				2491	int element_count = elms_->length();
				2492	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				2493	TextElement elm = elms_->at(i);
				2494	int cp_offset = trace->cp_offset() + elm.cp_offset;
				2495	if (elm.type == TextElement::ATOM) {
				2496	Vector<const uc16> quarks = elm.data.u_atom->data();
				2497	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				2498	if (first_element_checked && i == 0 && j == 0) continue;
				2499	if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
				2500	EmitCharacterFunction* emit_function = NULL;
				2501	switch (pass) {
				2502	case NON_ASCII_MATCH:
				2503	ASSERT(ascii);
				2504	if (quarks[j] > String::kMaxAsciiCharCode) {
				2505	assembler->GoTo(backtrack);
				2506	return;
				2507	}
				2508	break;
				2509	case NON_LETTER_CHARACTER_MATCH:
				2510	emit_function = &EmitAtomNonLetter;
				2511	break;
				2512	case SIMPLE_CHARACTER_MATCH:
				2513	emit_function = &EmitSimpleCharacter;
				2514	break;
				2515	case CASE_CHARACTER_MATCH:
				2516	emit_function = &EmitAtomLetter;
				2517	break;
				2518	default:
				2519	break;
				2520	}
				2521	if (emit_function != NULL) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	2522	bool bound_checked = emit_function(isolate,
				2523	compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2524	quarks[j],
				2525	backtrack,
				2526	cp_offset + j,
				2527	*checked_up_to < cp_offset + j,
				2528	preloaded);
				2529	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				2530	}
				2531	}
				2532	} else {
				2533	ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
				2534	if (pass == CHARACTER_CLASS_MATCH) {
				2535	if (first_element_checked && i == 0) continue;
				2536	if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
				2537	RegExpCharacterClass* cc = elm.data.u_char_class;
				2538	EmitCharClass(assembler,
				2539	cc,
				2540	ascii,
				2541	backtrack,
				2542	cp_offset,
				2543	*checked_up_to < cp_offset,
				2544	preloaded);
				2545	UpdateBoundsCheck(cp_offset, checked_up_to);
				2546	}
				2547	}
				2548	}
				2549	}
				2550
				2551
				2552	int TextNode::Length() {
				2553	TextElement elm = elms_->last();
				2554	ASSERT(elm.cp_offset >= 0);
				2555	if (elm.type == TextElement::ATOM) {
				2556	return elm.cp_offset + elm.data.u_atom->data().length();
				2557	} else {
				2558	return elm.cp_offset + 1;
				2559	}
				2560	}
				2561
				2562
				2563	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				2564	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				2565	if (ignore_case) {
				2566	return pass == SIMPLE_CHARACTER_MATCH;
				2567	} else {
				2568	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				2569	}
				2570	}
				2571
				2572
				2573	// This generates the code to match a text node. A text node can contain
				2574	// straight character sequences (possibly to be matched in a case-independent
				2575	// way) and character classes. For efficiency we do not do this in a single
				2576	// pass from left to right. Instead we pass over the text node several times,
				2577	// emitting code for some character positions every time. See the comment on
				2578	// TextEmitPass for details.
				2579	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2580	LimitResult limit_result = LimitVersions(compiler, trace);
				2581	if (limit_result == DONE) return;
				2582	ASSERT(limit_result == CONTINUE);
				2583
				2584	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				2585	compiler->SetRegExpTooBig();
				2586	return;
				2587	}
				2588
				2589	if (compiler->ascii()) {
				2590	int dummy = 0;
				2591	TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
				2592	}
				2593
				2594	bool first_elt_done = false;
				2595	int bound_checked_to = trace->cp_offset() - 1;
				2596	bound_checked_to += trace->bound_checked_up_to();
				2597
				2598	// If a character is preloaded into the current character register then
				2599	// check that now.
				2600	if (trace->characters_preloaded() == 1) {
				2601	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2602	if (!SkipPass(pass, compiler->ignore_case())) {
				2603	TextEmitPass(compiler,
				2604	static_cast<TextEmitPassType>(pass),
				2605	true,
				2606	trace,
				2607	false,
				2608	&bound_checked_to);
				2609	}
				2610	}
				2611	first_elt_done = true;
				2612	}
				2613
				2614	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2615	if (!SkipPass(pass, compiler->ignore_case())) {
				2616	TextEmitPass(compiler,
				2617	static_cast<TextEmitPassType>(pass),
				2618	false,
				2619	trace,
				2620	first_elt_done,
				2621	&bound_checked_to);
				2622	}
				2623	}
				2624
				2625	Trace successor_trace(*trace);
				2626	successor_trace.set_at_start(false);
				2627	successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
				2628	RecursionCheck rc(compiler);
				2629	on_success()->Emit(compiler, &successor_trace);
				2630	}
				2631
				2632
				2633	void Trace::InvalidateCurrentCharacter() {
				2634	characters_preloaded_ = 0;
				2635	}
				2636
				2637
				2638	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				2639	ASSERT(by > 0);
				2640	// We don't have an instruction for shifting the current character register
				2641	// down or for using a shifted value for anything so lets just forget that
				2642	// we preloaded any characters into it.
				2643	characters_preloaded_ = 0;
				2644	// Adjust the offsets of the quick check performed information. This
				2645	// information is used to find out what we already determined about the
				2646	// characters by means of mask and compare.
				2647	quick_check_performed_.Advance(by, compiler->ascii());
				2648	cp_offset_ += by;
				2649	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				2650	compiler->SetRegExpTooBig();
				2651	cp_offset_ = 0;
				2652	}
				2653	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				2654	}
				2655
				2656
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2657	void TextNode::MakeCaseIndependent(bool is_ascii) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2658	int element_count = elms_->length();
				2659	for (int i = 0; i < element_count; i++) {
				2660	TextElement elm = elms_->at(i);
				2661	if (elm.type == TextElement::CHAR_CLASS) {
				2662	RegExpCharacterClass* cc = elm.data.u_char_class;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2663	// None of the standard character classses is different in the case
				2664	// independent case and it slows us down if we don't know that.
				2665	if (cc->is_standard()) continue;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2666	ZoneList<CharacterRange>* ranges = cc->ranges();
				2667	int range_count = ranges->length();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2668	for (int j = 0; j < range_count; j++) {
				2669	ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2670	}
				2671	}
				2672	}
				2673	}
				2674
				2675
				2676	int TextNode::GreedyLoopTextLength() {
				2677	TextElement elm = elms_->at(elms_->length() - 1);
				2678	if (elm.type == TextElement::CHAR_CLASS) {
				2679	return elm.cp_offset + 1;
				2680	} else {
				2681	return elm.cp_offset + elm.data.u_atom->data().length();
				2682	}
				2683	}
				2684
				2685
				2686	// Finds the fixed match length of a sequence of nodes that goes from
				2687	// this alternative and back to this choice node. If there are variable
				2688	// length nodes or other complications in the way then return a sentinel
				2689	// value indicating that a greedy loop cannot be constructed.
				2690	int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
				2691	int length = 0;
				2692	RegExpNode* node = alternative->node();
				2693	// Later we will generate code for all these text nodes using recursion
				2694	// so we have to limit the max number.
				2695	int recursion_depth = 0;
				2696	while (node != this) {
				2697	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				2698	return kNodeIsTooComplexForGreedyLoops;
				2699	}
				2700	int node_length = node->GreedyLoopTextLength();
				2701	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				2702	return kNodeIsTooComplexForGreedyLoops;
				2703	}
				2704	length += node_length;
				2705	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				2706	node = seq_node->on_success();
				2707	}
				2708	return length;
				2709	}
				2710
				2711
				2712	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				2713	ASSERT_EQ(loop_node_, NULL);
				2714	AddAlternative(alt);
				2715	loop_node_ = alt.node();
				2716	}
				2717
				2718
				2719	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				2720	ASSERT_EQ(continue_node_, NULL);
				2721	AddAlternative(alt);
				2722	continue_node_ = alt.node();
				2723	}
				2724
				2725
				2726	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2727	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2728	if (trace->stop_node() == this) {
				2729	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2730	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
				2731	// Update the counter-based backtracking info on the stack. This is an
				2732	// optimization for greedy loops (see below).
				2733	ASSERT(trace->cp_offset() == text_length);
				2734	macro_assembler->AdvanceCurrentPosition(text_length);
				2735	macro_assembler->GoTo(trace->loop_label());
				2736	return;
				2737	}
				2738	ASSERT(trace->stop_node() == NULL);
				2739	if (!trace->is_trivial()) {
				2740	trace->Flush(compiler, this);
				2741	return;
				2742	}
				2743	ChoiceNode::Emit(compiler, trace);
				2744	}
				2745
				2746
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2747	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				2748	bool not_at_start) {
				2749	int preload_characters = EatsAtLeast(4, 0, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2750	if (compiler->macro_assembler()->CanReadUnaligned()) {
				2751	bool ascii = compiler->ascii();
				2752	if (ascii) {
				2753	if (preload_characters > 4) preload_characters = 4;
				2754	// We can't preload 3 characters because there is no machine instruction
				2755	// to do that. We can't just load 4 because we could be reading
				2756	// beyond the end of the string, which could cause a memory fault.
				2757	if (preload_characters == 3) preload_characters = 2;
				2758	} else {
				2759	if (preload_characters > 2) preload_characters = 2;
				2760	}
				2761	} else {
				2762	if (preload_characters > 1) preload_characters = 1;
				2763	}
				2764	return preload_characters;
				2765	}
				2766
				2767
				2768	// This class is used when generating the alternatives in a choice node. It
				2769	// records the way the alternative is being code generated.
				2770	class AlternativeGeneration: public Malloced {
				2771	public:
				2772	AlternativeGeneration()
				2773	: possible_success(),
				2774	expects_preload(false),
				2775	after(),
				2776	quick_check_details() { }
				2777	Label possible_success;
				2778	bool expects_preload;
				2779	Label after;
				2780	QuickCheckDetails quick_check_details;
				2781	};
				2782
				2783
				2784	// Creates a list of AlternativeGenerations. If the list has a reasonable
				2785	// size then it is on the stack, otherwise the excess is on the heap.
				2786	class AlternativeGenerationList {
				2787	public:
				2788	explicit AlternativeGenerationList(int count)
				2789	: alt_gens_(count) {
				2790	for (int i = 0; i < count && i < kAFew; i++) {
				2791	alt_gens_.Add(a_few_alt_gens_ + i);
				2792	}
				2793	for (int i = kAFew; i < count; i++) {
				2794	alt_gens_.Add(new AlternativeGeneration());
				2795	}
				2796	}
				2797	~AlternativeGenerationList() {
				2798	for (int i = kAFew; i < alt_gens_.length(); i++) {
				2799	delete alt_gens_[i];
				2800	alt_gens_[i] = NULL;
				2801	}
				2802	}
				2803
				2804	AlternativeGeneration* at(int i) {
				2805	return alt_gens_[i];
				2806	}
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	2807
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2808	private:
				2809	static const int kAFew = 10;
				2810	ZoneList<AlternativeGeneration*> alt_gens_;
				2811	AlternativeGeneration a_few_alt_gens_[kAFew];
				2812	};
				2813
				2814
/* Code generation for choice nodes.
 *
 * We generate quick checks that do a mask and compare to eliminate a
 * choice.  If the quick check succeeds then it jumps to the continuation to
 * do slow checks and check subsequent nodes.  If it fails (the common case)
 * it falls through to the next choice.
 *
 * Here is the desired flow graph.  Nodes directly below each other imply
 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
 * 3 doesn't have a quick check so we have to call the slow check.
 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
 * regexp continuation is generated directly after the Sn node, up to the
 * next GoTo if we decide to reuse some already generated code.  Some
 * nodes expect preload_characters to be preloaded into the current
 * character register.  R nodes do this preloading.  Vertices are marked
 * F for failures and S for success (possible success in the case of quick
 * nodes).  L, V, < and > are used as arrow heads.
 *
 * ----------> R
 *             |
 *             V
 *            Q1 -----> S1
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            Q2 -----> S2
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            S3
 *             |
 *            F|
 *             |
 *             R
 *             |
 * backtrack   V
 * <----------Q4
 *   \    F    |
 *    \        |S
 *     \   F   V
 *      \-----S4
 *
 * For greedy loops we reverse our expectation and expect to match rather
 * than fail.  Therefore we want the loop code to look like this (U is the
 * unwind code that steps back in the greedy loop).  The following
 * alternatives look the same as above.
 *              _____
 *             /     \
 *             V     |
 * ----------> S1    |
 *            /|     |
 *           / |S    |
 *         F/  \_____/
 *         /
 *        |<-----------
 *        |            \
 *        V             \
 *        Q2 ---> S2     \
 *        |  S   /       |
 *       F|     /        |
 *        |   F/         |
 *        |   /          |
 *        |  R           |
 *        | /            |
 *   F    VL             |
 * <------U              |
 * back   |S             |
 *        \______________/
 */
				2892
				2893
				2894	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2895	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2896	int choice_count = alternatives_->length();
				2897	#ifdef DEBUG
				2898	for (int i = 0; i < choice_count - 1; i++) {
				2899	GuardedAlternative alternative = alternatives_->at(i);
				2900	ZoneList<Guard> guards = alternative.guards();
				2901	int guard_count = (guards == NULL) ? 0 : guards->length();
				2902	for (int j = 0; j < guard_count; j++) {
				2903	ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
				2904	}
				2905	}
				2906	#endif
				2907
				2908	LimitResult limit_result = LimitVersions(compiler, trace);
				2909	if (limit_result == DONE) return;
				2910	ASSERT(limit_result == CONTINUE);
				2911
				2912	int new_flush_budget = trace->flush_budget() / choice_count;
				2913	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				2914	trace->Flush(compiler, this);
				2915	return;
				2916	}
				2917
				2918	RecursionCheck rc(compiler);
				2919
				2920	Trace* current_trace = trace;
				2921
				2922	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2923	bool greedy_loop = false;
				2924	Label greedy_loop_label;
				2925	Trace counter_backtrack_trace;
				2926	counter_backtrack_trace.set_backtrack(&greedy_loop_label);
				2927	if (not_at_start()) counter_backtrack_trace.set_at_start(false);
				2928
				2929	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				2930	// Here we have special handling for greedy loops containing only text nodes
				2931	// and other simple nodes. These are handled by pushing the current
				2932	// position on the stack and then incrementing the current position each
				2933	// time around the switch. On backtrack we decrement the current position
				2934	// and check it against the pushed value. This avoids pushing backtrack
				2935	// information for each iteration of the loop, which could take up a lot of
				2936	// space.
				2937	greedy_loop = true;
				2938	ASSERT(trace->stop_node() == NULL);
				2939	macro_assembler->PushCurrentPosition();
				2940	current_trace = &counter_backtrack_trace;
				2941	Label greedy_match_failed;
				2942	Trace greedy_match_trace;
				2943	if (not_at_start()) greedy_match_trace.set_at_start(false);
				2944	greedy_match_trace.set_backtrack(&greedy_match_failed);
				2945	Label loop_label;
				2946	macro_assembler->Bind(&loop_label);
				2947	greedy_match_trace.set_stop_node(this);
				2948	greedy_match_trace.set_loop_label(&loop_label);
				2949	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				2950	macro_assembler->Bind(&greedy_match_failed);
				2951	}
				2952
				2953	Label second_choice; // For use in greedy matches.
				2954	macro_assembler->Bind(&second_choice);
				2955
				2956	int first_normal_choice = greedy_loop ? 1 : 0;
				2957
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2958	int preload_characters =
				2959	CalculatePreloadCharacters(compiler,
				2960	current_trace->at_start() == Trace::FALSE);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2961	bool preload_is_current =
				2962	(current_trace->characters_preloaded() == preload_characters);
				2963	bool preload_has_checked_bounds = preload_is_current;
				2964
				2965	AlternativeGenerationList alt_gens(choice_count);
				2966
				2967	// For now we just call all choices one after the other. The idea ultimately
				2968	// is to use the Dispatch table to try only the relevant ones.
				2969	for (int i = first_normal_choice; i < choice_count; i++) {
				2970	GuardedAlternative alternative = alternatives_->at(i);
				2971	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2972	alt_gen->quick_check_details.set_characters(preload_characters);
				2973	ZoneList<Guard> guards = alternative.guards();
				2974	int guard_count = (guards == NULL) ? 0 : guards->length();
				2975	Trace new_trace(*current_trace);
				2976	new_trace.set_characters_preloaded(preload_is_current ?
				2977	preload_characters :
				2978	0);
				2979	if (preload_has_checked_bounds) {
				2980	new_trace.set_bound_checked_up_to(preload_characters);
				2981	}
				2982	new_trace.quick_check_performed()->Clear();
				2983	if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
				2984	alt_gen->expects_preload = preload_is_current;
				2985	bool generate_full_check_inline = false;
				2986	if (FLAG_regexp_optimization &&
				2987	try_to_emit_quick_check_for_alternative(i) &&
				2988	alternative.node()->EmitQuickCheck(compiler,
				2989	&new_trace,
				2990	preload_has_checked_bounds,
				2991	&alt_gen->possible_success,
				2992	&alt_gen->quick_check_details,
				2993	i < choice_count - 1)) {
				2994	// Quick check was generated for this choice.
				2995	preload_is_current = true;
				2996	preload_has_checked_bounds = true;
				2997	// On the last choice in the ChoiceNode we generated the quick
				2998	// check to fall through on possible success. So now we need to
				2999	// generate the full check inline.
				3000	if (i == choice_count - 1) {
				3001	macro_assembler->Bind(&alt_gen->possible_success);
				3002	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				3003	new_trace.set_characters_preloaded(preload_characters);
				3004	new_trace.set_bound_checked_up_to(preload_characters);
				3005	generate_full_check_inline = true;
				3006	}
				3007	} else if (alt_gen->quick_check_details.cannot_match()) {
				3008	if (i == choice_count - 1 && !greedy_loop) {
				3009	macro_assembler->GoTo(trace->backtrack());
				3010	}
				3011	continue;
				3012	} else {
				3013	// No quick check was generated. Put the full code here.
				3014	// If this is not the first choice then there could be slow checks from
				3015	// previous cases that go here when they fail. There's no reason to
				3016	// insist that they preload characters since the slow check we are about
				3017	// to generate probably can't use it.
				3018	if (i != first_normal_choice) {
				3019	alt_gen->expects_preload = false;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3020	new_trace.InvalidateCurrentCharacter();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3021	}
				3022	if (i < choice_count - 1) {
				3023	new_trace.set_backtrack(&alt_gen->after);
				3024	}
				3025	generate_full_check_inline = true;
				3026	}
				3027	if (generate_full_check_inline) {
				3028	if (new_trace.actions() != NULL) {
				3029	new_trace.set_flush_budget(new_flush_budget);
				3030	}
				3031	for (int j = 0; j < guard_count; j++) {
				3032	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				3033	}
				3034	alternative.node()->Emit(compiler, &new_trace);
				3035	preload_is_current = false;
				3036	}
				3037	macro_assembler->Bind(&alt_gen->after);
				3038	}
				3039	if (greedy_loop) {
				3040	macro_assembler->Bind(&greedy_loop_label);
				3041	// If we have unwound to the bottom then backtrack.
				3042	macro_assembler->CheckGreedyLoop(trace->backtrack());
				3043	// Otherwise try the second priority at an earlier position.
				3044	macro_assembler->AdvanceCurrentPosition(-text_length);
				3045	macro_assembler->GoTo(&second_choice);
				3046	}
				3047
				3048	// At this point we need to generate slow checks for the alternatives where
				3049	// the quick check was inlined. We can recognize these because the associated
				3050	// label was bound.
				3051	for (int i = first_normal_choice; i < choice_count - 1; i++) {
				3052	AlternativeGeneration* alt_gen = alt_gens.at(i);
				3053	Trace new_trace(*current_trace);
				3054	// If there are actions to be flushed we have to limit how many times
				3055	// they are flushed. Take the budget of the parent trace and distribute
				3056	// it fairly amongst the children.
				3057	if (new_trace.actions() != NULL) {
				3058	new_trace.set_flush_budget(new_flush_budget);
				3059	}
				3060	EmitOutOfLineContinuation(compiler,
				3061	&new_trace,
				3062	alternatives_->at(i),
				3063	alt_gen,
				3064	preload_characters,
				3065	alt_gens.at(i + 1)->expects_preload);
				3066	}
				3067	}
				3068
				3069
				3070	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				3071	Trace* trace,
				3072	GuardedAlternative alternative,
				3073	AlternativeGeneration* alt_gen,
				3074	int preload_characters,
				3075	bool next_expects_preload) {
				3076	if (!alt_gen->possible_success.is_linked()) return;
				3077
				3078	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3079	macro_assembler->Bind(&alt_gen->possible_success);
				3080	Trace out_of_line_trace(*trace);
				3081	out_of_line_trace.set_characters_preloaded(preload_characters);
				3082	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				3083	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
				3084	ZoneList<Guard> guards = alternative.guards();
				3085	int guard_count = (guards == NULL) ? 0 : guards->length();
				3086	if (next_expects_preload) {
				3087	Label reload_current_char;
				3088	out_of_line_trace.set_backtrack(&reload_current_char);
				3089	for (int j = 0; j < guard_count; j++) {
				3090	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3091	}
				3092	alternative.node()->Emit(compiler, &out_of_line_trace);
				3093	macro_assembler->Bind(&reload_current_char);
				3094	// Reload the current character, since the next quick check expects that.
				3095	// We don't need to check bounds here because we only get into this
				3096	// code through a quick check which already did the checked load.
				3097	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				3098	NULL,
				3099	false,
				3100	preload_characters);
				3101	macro_assembler->GoTo(&(alt_gen->after));
				3102	} else {
				3103	out_of_line_trace.set_backtrack(&(alt_gen->after));
				3104	for (int j = 0; j < guard_count; j++) {
				3105	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3106	}
				3107	alternative.node()->Emit(compiler, &out_of_line_trace);
				3108	}
				3109	}
				3110
				3111
				3112	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3113	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3114	LimitResult limit_result = LimitVersions(compiler, trace);
				3115	if (limit_result == DONE) return;
				3116	ASSERT(limit_result == CONTINUE);
				3117
				3118	RecursionCheck rc(compiler);
				3119
				3120	switch (type_) {
				3121	case STORE_POSITION: {
				3122	Trace::DeferredCapture
				3123	new_capture(data_.u_position_register.reg,
				3124	data_.u_position_register.is_capture,
				3125	trace);
				3126	Trace new_trace = *trace;
				3127	new_trace.add_action(&new_capture);
				3128	on_success()->Emit(compiler, &new_trace);
				3129	break;
				3130	}
				3131	case INCREMENT_REGISTER: {
				3132	Trace::DeferredIncrementRegister
				3133	new_increment(data_.u_increment_register.reg);
				3134	Trace new_trace = *trace;
				3135	new_trace.add_action(&new_increment);
				3136	on_success()->Emit(compiler, &new_trace);
				3137	break;
				3138	}
				3139	case SET_REGISTER: {
				3140	Trace::DeferredSetRegister
				3141	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				3142	Trace new_trace = *trace;
				3143	new_trace.add_action(&new_set);
				3144	on_success()->Emit(compiler, &new_trace);
				3145	break;
				3146	}
				3147	case CLEAR_CAPTURES: {
				3148	Trace::DeferredClearCaptures
				3149	new_capture(Interval(data_.u_clear_captures.range_from,
				3150	data_.u_clear_captures.range_to));
				3151	Trace new_trace = *trace;
				3152	new_trace.add_action(&new_capture);
				3153	on_success()->Emit(compiler, &new_trace);
				3154	break;
				3155	}
				3156	case BEGIN_SUBMATCH:
				3157	if (!trace->is_trivial()) {
				3158	trace->Flush(compiler, this);
				3159	} else {
				3160	assembler->WriteCurrentPositionToRegister(
				3161	data_.u_submatch.current_position_register, 0);
				3162	assembler->WriteStackPointerToRegister(
				3163	data_.u_submatch.stack_pointer_register);
				3164	on_success()->Emit(compiler, trace);
				3165	}
				3166	break;
				3167	case EMPTY_MATCH_CHECK: {
				3168	int start_pos_reg = data_.u_empty_match_check.start_register;
				3169	int stored_pos = 0;
				3170	int rep_reg = data_.u_empty_match_check.repetition_register;
				3171	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				3172	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				3173	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				3174	// If we know we haven't advanced and there is no minimum we
				3175	// can just backtrack immediately.
				3176	assembler->GoTo(trace->backtrack());
				3177	} else if (know_dist && stored_pos < trace->cp_offset()) {
				3178	// If we know we've advanced we can generate the continuation
				3179	// immediately.
				3180	on_success()->Emit(compiler, trace);
				3181	} else if (!trace->is_trivial()) {
				3182	trace->Flush(compiler, this);
				3183	} else {
				3184	Label skip_empty_check;
				3185	// If we have a minimum number of repetitions we check the current
				3186	// number first and skip the empty check if it's not enough.
				3187	if (has_minimum) {
				3188	int limit = data_.u_empty_match_check.repetition_limit;
				3189	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				3190	}
				3191	// If the match is empty we bail out, otherwise we fall through
				3192	// to the on-success continuation.
				3193	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				3194	trace->backtrack());
				3195	assembler->Bind(&skip_empty_check);
				3196	on_success()->Emit(compiler, trace);
				3197	}
				3198	break;
				3199	}
				3200	case POSITIVE_SUBMATCH_SUCCESS: {
				3201	if (!trace->is_trivial()) {
				3202	trace->Flush(compiler, this);
				3203	return;
				3204	}
				3205	assembler->ReadCurrentPositionFromRegister(
				3206	data_.u_submatch.current_position_register);
				3207	assembler->ReadStackPointerFromRegister(
				3208	data_.u_submatch.stack_pointer_register);
				3209	int clear_register_count = data_.u_submatch.clear_register_count;
				3210	if (clear_register_count == 0) {
				3211	on_success()->Emit(compiler, trace);
				3212	return;
				3213	}
				3214	int clear_registers_from = data_.u_submatch.clear_register_from;
				3215	Label clear_registers_backtrack;
				3216	Trace new_trace = *trace;
				3217	new_trace.set_backtrack(&clear_registers_backtrack);
				3218	on_success()->Emit(compiler, &new_trace);
				3219
				3220	assembler->Bind(&clear_registers_backtrack);
				3221	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				3222	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				3223
				3224	ASSERT(trace->backtrack() == NULL);
				3225	assembler->Backtrack();
				3226	return;
				3227	}
				3228	default:
				3229	UNREACHABLE();
				3230	}
				3231	}
				3232
				3233
				3234	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3235	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3236	if (!trace->is_trivial()) {
				3237	trace->Flush(compiler, this);
				3238	return;
				3239	}
				3240
				3241	LimitResult limit_result = LimitVersions(compiler, trace);
				3242	if (limit_result == DONE) return;
				3243	ASSERT(limit_result == CONTINUE);
				3244
				3245	RecursionCheck rc(compiler);
				3246
				3247	ASSERT_EQ(start_reg_ + 1, end_reg_);
				3248	if (compiler->ignore_case()) {
				3249	assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
				3250	trace->backtrack());
				3251	} else {
				3252	assembler->CheckNotBackReference(start_reg_, trace->backtrack());
				3253	}
				3254	on_success()->Emit(compiler, trace);
				3255	}
				3256
				3257
				3258	// -------------------------------------------------------------------
				3259	// Dot/dotty output
				3260
				3261
				3262	#ifdef DEBUG
				3263
				3264
				3265	class DotPrinter: public NodeVisitor {
				3266	public:
				3267	explicit DotPrinter(bool ignore_case)
				3268	: ignore_case_(ignore_case),
				3269	stream_(&alloc_) { }
				3270	void PrintNode(const char* label, RegExpNode* node);
				3271	void Visit(RegExpNode* node);
				3272	void PrintAttributes(RegExpNode* from);
				3273	StringStream* stream() { return &stream_; }
				3274	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				3275	#define DECLARE_VISIT(Type) \
				3276	virtual void Visit##Type(Type##Node* that);
				3277	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				3278	#undef DECLARE_VISIT
				3279	private:
				3280	bool ignore_case_;
				3281	HeapStringAllocator alloc_;
				3282	StringStream stream_;
				3283	};
				3284
				3285
				3286	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				3287	stream()->Add("digraph G {\n graph [label=\"");
				3288	for (int i = 0; label[i]; i++) {
				3289	switch (label[i]) {
				3290	case '\\':
				3291	stream()->Add("\\\\");
				3292	break;
				3293	case '"':
				3294	stream()->Add("\"");
				3295	break;
				3296	default:
				3297	stream()->Put(label[i]);
				3298	break;
				3299	}
				3300	}
				3301	stream()->Add("\"];\n");
				3302	Visit(node);
				3303	stream()->Add("}\n");
				3304	printf("%s", *(stream()->ToCString()));
				3305	}
				3306
				3307
				3308	void DotPrinter::Visit(RegExpNode* node) {
				3309	if (node->info()->visited) return;
				3310	node->info()->visited = true;
				3311	node->Accept(this);
				3312	}
				3313
				3314
				3315	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				3316	stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
				3317	Visit(on_failure);
				3318	}
				3319
				3320
				3321	class TableEntryBodyPrinter {
				3322	public:
				3323	TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
				3324	: stream_(stream), choice_(choice) { }
				3325	void Call(uc16 from, DispatchTable::Entry entry) {
				3326	OutSet* out_set = entry.out_set();
				3327	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3328	if (out_set->Get(i)) {
				3329	stream()->Add(" n%p:s%io%i -> n%p;\n",
				3330	choice(),
				3331	from,
				3332	i,
				3333	choice()->alternatives()->at(i).node());
				3334	}
				3335	}
				3336	}
				3337	private:
				3338	StringStream* stream() { return stream_; }
				3339	ChoiceNode* choice() { return choice_; }
				3340	StringStream* stream_;
				3341	ChoiceNode* choice_;
				3342	};
				3343
				3344
				3345	class TableEntryHeaderPrinter {
				3346	public:
				3347	explicit TableEntryHeaderPrinter(StringStream* stream)
				3348	: first_(true), stream_(stream) { }
				3349	void Call(uc16 from, DispatchTable::Entry entry) {
				3350	if (first_) {
				3351	first_ = false;
				3352	} else {
				3353	stream()->Add("\|");
				3354	}
				3355	stream()->Add("{\\%k-\\%k\|{", from, entry.to());
				3356	OutSet* out_set = entry.out_set();
				3357	int priority = 0;
				3358	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3359	if (out_set->Get(i)) {
				3360	if (priority > 0) stream()->Add("\|");
				3361	stream()->Add("<s%io%i> %i", from, i, priority);
				3362	priority++;
				3363	}
				3364	}
				3365	stream()->Add("}}");
				3366	}
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame^]	3367
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3368	private:
				3369	bool first_;
				3370	StringStream* stream() { return stream_; }
				3371	StringStream* stream_;
				3372	};
				3373
				3374
				3375	class AttributePrinter {
				3376	public:
				3377	explicit AttributePrinter(DotPrinter* out)
				3378	: out_(out), first_(true) { }
				3379	void PrintSeparator() {
				3380	if (first_) {
				3381	first_ = false;
				3382	} else {
				3383	out_->stream()->Add("\|");
				3384	}
				3385	}
				3386	void PrintBit(const char* name, bool value) {
				3387	if (!value) return;
				3388	PrintSeparator();
				3389	out_->stream()->Add("{%s}", name);
				3390	}
				3391	void PrintPositive(const char* name, int value) {
				3392	if (value < 0) return;
				3393	PrintSeparator();
				3394	out_->stream()->Add("{%s\|%x}", name, value);
				3395	}
				3396	private:
				3397	DotPrinter* out_;
				3398	bool first_;
				3399	};
				3400
				3401
				3402	void DotPrinter::PrintAttributes(RegExpNode* that) {
				3403	stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
				3404	"margin=0.1, fontsize=10, label=\"{",
				3405	that);
				3406	AttributePrinter printer(this);
				3407	NodeInfo* info = that->info();
				3408	printer.PrintBit("NI", info->follows_newline_interest);
				3409	printer.PrintBit("WI", info->follows_word_interest);
				3410	printer.PrintBit("SI", info->follows_start_interest);
				3411	Label* label = that->label();
				3412	if (label->is_bound())
				3413	printer.PrintPositive("@", label->pos());
				3414	stream()->Add("}\"];\n");
				3415	stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
				3416	"arrowhead=none];\n", that, that);
				3417	}
				3418
				3419
				3420	static const bool kPrintDispatchTable = false;
				3421	void DotPrinter::VisitChoice(ChoiceNode* that) {
				3422	if (kPrintDispatchTable) {
				3423	stream()->Add(" n%p [shape=Mrecord, label=\"", that);
				3424	TableEntryHeaderPrinter header_printer(stream());
				3425	that->GetTable(ignore_case_)->ForEach(&header_printer);
				3426	stream()->Add("\"]\n", that);
				3427	PrintAttributes(that);
				3428	TableEntryBodyPrinter body_printer(stream(), that);
				3429	that->GetTable(ignore_case_)->ForEach(&body_printer);
				3430	} else {
				3431	stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
				3432	for (int i = 0; i < that->alternatives()->length(); i++) {
				3433	GuardedAlternative alt = that->alternatives()->at(i);
				3434	stream()->Add(" n%p -> n%p;\n", that, alt.node());
				3435	}
				3436	}
				3437	for (int i = 0; i < that->alternatives()->length(); i++) {
				3438	GuardedAlternative alt = that->alternatives()->at(i);
				3439	alt.node()->Accept(this);
				3440	}
				3441	}
				3442
				3443
				3444	void DotPrinter::VisitText(TextNode* that) {
				3445	stream()->Add(" n%p [label=\"", that);
				3446	for (int i = 0; i < that->elements()->length(); i++) {
				3447	if (i > 0) stream()->Add(" ");
				3448	TextElement elm = that->elements()->at(i);
				3449	switch (elm.type) {
				3450	case TextElement::ATOM: {
				3451	stream()->Add("'%w'", elm.data.u_atom->data());
				3452	break;
				3453	}
				3454	case TextElement::CHAR_CLASS: {
				3455	RegExpCharacterClass* node = elm.data.u_char_class;
				3456	stream()->Add("[");
				3457	if (node->is_negated())
				3458	stream()->Add("^");
				3459	for (int j = 0; j < node->ranges()->length(); j++) {
				3460	CharacterRange range = node->ranges()->at(j);
				3461	stream()->Add("%k-%k", range.from(), range.to());
				3462	}
				3463	stream()->Add("]");
				3464	break;
				3465	}
				3466	default:
				3467	UNREACHABLE();
				3468	}
				3469	}
				3470	stream()->Add("\", shape=box, peripheries=2];\n");
				3471	PrintAttributes(that);
				3472	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3473	Visit(that->on_success());
				3474	}
				3475
				3476
				3477	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				3478	stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
				3479	that,
				3480	that->start_register(),
				3481	that->end_register());
				3482	PrintAttributes(that);
				3483	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3484	Visit(that->on_success());
				3485	}
				3486
				3487
				3488	void DotPrinter::VisitEnd(EndNode* that) {
				3489	stream()->Add(" n%p [style=bold, shape=point];\n", that);
				3490	PrintAttributes(that);
				3491	}
				3492
				3493
				3494	void DotPrinter::VisitAssertion(AssertionNode* that) {
				3495	stream()->Add(" n%p [", that);
				3496	switch (that->type()) {
				3497	case AssertionNode::AT_END:
				3498	stream()->Add("label=\"$\", shape=septagon");
				3499	break;
				3500	case AssertionNode::AT_START:
				3501	stream()->Add("label=\"^\", shape=septagon");
				3502	break;
				3503	case AssertionNode::AT_BOUNDARY:
				3504	stream()->Add("label=\"\\b\", shape=septagon");
				3505	break;
				3506	case AssertionNode::AT_NON_BOUNDARY:
				3507	stream()->Add("label=\"\\B\", shape=septagon");
				3508	break;
				3509	case AssertionNode::AFTER_NEWLINE:
				3510	stream()->Add("label=\"(?<=\\n)\", shape=septagon");
				3511	break;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3512	case AssertionNode::AFTER_WORD_CHARACTER:
				3513	stream()->Add("label=\"(?<=\\w)\", shape=septagon");
				3514	break;
				3515	case AssertionNode::AFTER_NONWORD_CHARACTER:
				3516	stream()->Add("label=\"(?<=\\W)\", shape=septagon");
				3517	break;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3518	}
				3519	stream()->Add("];\n");
				3520	PrintAttributes(that);
				3521	RegExpNode* successor = that->on_success();
				3522	stream()->Add(" n%p -> n%p;\n", that, successor);
				3523	Visit(successor);
				3524	}
				3525
				3526
				3527	void DotPrinter::VisitAction(ActionNode* that) {
				3528	stream()->Add(" n%p [", that);
				3529	switch (that->type_) {
				3530	case ActionNode::SET_REGISTER:
				3531	stream()->Add("label=\"$%i:=%i\", shape=octagon",
				3532	that->data_.u_store_register.reg,
				3533	that->data_.u_store_register.value);
				3534	break;
				3535	case ActionNode::INCREMENT_REGISTER:
				3536	stream()->Add("label=\"$%i++\", shape=octagon",
				3537	that->data_.u_increment_register.reg);
				3538	break;
				3539	case ActionNode::STORE_POSITION:
				3540	stream()->Add("label=\"$%i:=$pos\", shape=octagon",
				3541	that->data_.u_position_register.reg);
				3542	break;
				3543	case ActionNode::BEGIN_SUBMATCH:
				3544	stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
				3545	that->data_.u_submatch.current_position_register);
				3546	break;
				3547	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				3548	stream()->Add("label=\"escape\", shape=septagon");
				3549	break;
				3550	case ActionNode::EMPTY_MATCH_CHECK:
				3551	stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
				3552	that->data_.u_empty_match_check.start_register,
				3553	that->data_.u_empty_match_check.repetition_register,
				3554	that->data_.u_empty_match_check.repetition_limit);
				3555	break;
				3556	case ActionNode::CLEAR_CAPTURES: {
				3557	stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
				3558	that->data_.u_clear_captures.range_from,
				3559	that->data_.u_clear_captures.range_to);
				3560	break;
				3561	}
				3562	}
				3563	stream()->Add("];\n");
				3564	PrintAttributes(that);
				3565	RegExpNode* successor = that->on_success();
				3566	stream()->Add(" n%p -> n%p;\n", that, successor);
				3567	Visit(successor);
				3568	}
				3569
				3570
				3571	class DispatchTableDumper {
				3572	public:
				3573	explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
				3574	void Call(uc16 key, DispatchTable::Entry entry);
				3575	StringStream* stream() { return stream_; }
				3576	private:
				3577	StringStream* stream_;
				3578	};
				3579
				3580
				3581	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				3582	stream()->Add("[%k-%k]: {", key, entry.to());
				3583	OutSet* set = entry.out_set();
				3584	bool first = true;
				3585	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3586	if (set->Get(i)) {
				3587	if (first) {
				3588	first = false;
				3589	} else {
				3590	stream()->Add(", ");
				3591	}
				3592	stream()->Add("%i", i);
				3593	}
				3594	}
				3595	stream()->Add("}\n");
				3596	}
				3597
				3598
				3599	void DispatchTable::Dump() {
				3600	HeapStringAllocator alloc;
				3601	StringStream stream(&alloc);
				3602	DispatchTableDumper dumper(&stream);
				3603	tree()->ForEach(&dumper);
				3604	OS::PrintError("%s", *stream.ToCString());
				3605	}
				3606
				3607
				3608	void RegExpEngine::DotPrint(const char* label,
				3609	RegExpNode* node,
				3610	bool ignore_case) {
				3611	DotPrinter printer(ignore_case);
				3612	printer.PrintNode(label, node);
				3613	}
				3614
				3615
				3616	#endif // DEBUG
				3617
				3618
				3619	// -------------------------------------------------------------------
				3620	// Tree to graph conversion
				3621
				3622	static const int kSpaceRangeCount = 20;
				3623	static const int kSpaceRangeAsciiCount = 4;
				3624	static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
				3625	0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
				3626	0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
				3627
				3628	static const int kWordRangeCount = 8;
				3629	static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
				3630	'_', 'a', 'z' };
				3631
				3632	static const int kDigitRangeCount = 2;
				3633	static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
				3634
				3635	static const int kLineTerminatorRangeCount = 6;
				3636	static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
				3637	0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
				3638
				3639	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				3640	RegExpNode* on_success) {
				3641	ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
				3642	elms->Add(TextElement::Atom(this));
				3643	return new TextNode(elms, on_success);
				3644	}
				3645
				3646
				3647	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				3648	RegExpNode* on_success) {
				3649	return new TextNode(elements(), on_success);
				3650	}
				3651
				3652	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				3653	const uc16* special_class,
				3654	int length) {
				3655	ASSERT(ranges->length() != 0);
				3656	ASSERT(length != 0);
				3657	ASSERT(special_class[0] != 0);
				3658	if (ranges->length() != (length >> 1) + 1) {
				3659	return false;
				3660	}
				3661	CharacterRange range = ranges->at(0);
				3662	if (range.from() != 0) {
				3663	return false;
				3664	}
				3665	for (int i = 0; i < length; i += 2) {
				3666	if (special_class[i] != (range.to() + 1)) {
				3667	return false;
				3668	}
				3669	range = ranges->at((i >> 1) + 1);
				3670	if (special_class[i+1] != range.from() - 1) {
				3671	return false;
				3672	}
				3673	}
				3674	if (range.to() != 0xffff) {
				3675	return false;
				3676	}
				3677	return true;
				3678	}
				3679
				3680
				3681	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				3682	const uc16* special_class,
				3683	int length) {
				3684	if (ranges->length() * 2 != length) {
				3685	return false;
				3686	}
				3687	for (int i = 0; i < length; i += 2) {
				3688	CharacterRange range = ranges->at(i >> 1);
				3689	if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {
				3690	return false;
				3691	}
				3692	}
				3693	return true;
				3694	}
				3695
				3696
				3697	bool RegExpCharacterClass::is_standard() {
				3698	// TODO(lrn): Remove need for this function, by not throwing away information
				3699	// along the way.
				3700	if (is_negated_) {
				3701	return false;
				3702	}
				3703	if (set_.is_standard()) {
				3704	return true;
				3705	}
				3706	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3707	set_.set_standard_set_type('s');
				3708	return true;
				3709	}
				3710	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3711	set_.set_standard_set_type('S');
				3712	return true;
				3713	}
				3714	if (CompareInverseRanges(set_.ranges(),
				3715	kLineTerminatorRanges,
				3716	kLineTerminatorRangeCount)) {
				3717	set_.set_standard_set_type('.');
				3718	return true;
				3719	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3720	if (CompareRanges(set_.ranges(),
				3721	kLineTerminatorRanges,
				3722	kLineTerminatorRangeCount)) {
				3723	set_.set_standard_set_type('n');
				3724	return true;
				3725	}
				3726	if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3727	set_.set_standard_set_type('w');
				3728	return true;
				3729	}
				3730	if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3731	set_.set_standard_set_type('W');
				3732	return true;
				3733	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3734	return false;
				3735	}
				3736
				3737
				3738	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				3739	RegExpNode* on_success) {
				3740	return new TextNode(this, on_success);
				3741	}
				3742
				3743
				3744	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				3745	RegExpNode* on_success) {
				3746	ZoneList<RegExpTree> alternatives = this->alternatives();
				3747	int length = alternatives->length();
				3748	ChoiceNode* result = new ChoiceNode(length);
				3749	for (int i = 0; i < length; i++) {
				3750	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				3751	on_success));
				3752	result->AddAlternative(alternative);
				3753	}
				3754	return result;
				3755	}
				3756
				3757
				3758	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				3759	RegExpNode* on_success) {
				3760	return ToNode(min(),
				3761	max(),
				3762	is_greedy(),
				3763	body(),
				3764	compiler,
				3765	on_success);
				3766	}
				3767
				3768
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3769	// Scoped object to keep track of how much we unroll quantifier loops in the
				3770	// regexp graph generator.
				3771	class RegExpExpansionLimiter {
				3772	public:
				3773	static const int kMaxExpansionFactor = 6;
				3774	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				3775	: compiler_(compiler),
				3776	saved_expansion_factor_(compiler->current_expansion_factor()),
				3777	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				3778	ASSERT(factor > 0);
				3779	if (ok_to_expand_) {
				3780	if (factor > kMaxExpansionFactor) {
				3781	// Avoid integer overflow of the current expansion factor.
				3782	ok_to_expand_ = false;
				3783	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				3784	} else {
				3785	int new_factor = saved_expansion_factor_ * factor;
				3786	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				3787	compiler->set_current_expansion_factor(new_factor);
				3788	}
				3789	}
				3790	}
				3791
				3792	~RegExpExpansionLimiter() {
				3793	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				3794	}
				3795
				3796	bool ok_to_expand() { return ok_to_expand_; }
				3797
				3798	private:
				3799	RegExpCompiler* compiler_;
				3800	int saved_expansion_factor_;
				3801	bool ok_to_expand_;
				3802
				3803	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				3804	};
				3805
				3806
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3807	RegExpNode* RegExpQuantifier::ToNode(int min,
				3808	int max,
				3809	bool is_greedy,
				3810	RegExpTree* body,
				3811	RegExpCompiler* compiler,
				3812	RegExpNode* on_success,
				3813	bool not_at_start) {
				3814	// x{f, t} becomes this:
				3815	//
				3816	// (r++)<-.
				3817	// \| `
				3818	// \| (x)
				3819	// v ^
				3820	// (r=0)-->(?)---/ [if r < t]
				3821	// \|
				3822	// [if r >= f] \----> ...
				3823	//
				3824
				3825	// 15.10.2.5 RepeatMatcher algorithm.
				3826	// The parser has already eliminated the case where max is 0. In the case
				3827	// where max_match is zero the parser has removed the quantifier if min was
				3828	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				3829
				3830	// If we know that we cannot match zero length then things are a little
				3831	// simpler since we don't need to make the special zero length match check
				3832	// from step 2.1. If the min and max are small we can unroll a little in
				3833	// this case.
				3834	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				3835	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				3836	if (max == 0) return on_success; // This can happen due to recursion.
				3837	bool body_can_be_empty = (body->min_match() == 0);
				3838	int body_start_reg = RegExpCompiler::kNoRegister;
				3839	Interval capture_registers = body->CaptureRegisters();
				3840	bool needs_capture_clearing = !capture_registers.is_empty();
				3841	if (body_can_be_empty) {
				3842	body_start_reg = compiler->AllocateRegister();
				3843	} else if (FLAG_regexp_optimization && !needs_capture_clearing) {
				3844	// Only unroll if there are no captures and the body can't be
				3845	// empty.
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3846	{
				3847	RegExpExpansionLimiter limiter(
				3848	compiler, min + ((max != min) ? 1 : 0));
				3849	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				3850	int new_max = (max == kInfinity) ? max : max - min;
				3851	// Recurse once to get the loop or optional matches after the fixed
				3852	// ones.
				3853	RegExpNode* answer = ToNode(
				3854	0, new_max, is_greedy, body, compiler, on_success, true);
				3855	// Unroll the forced matches from 0 to min. This can cause chains of
				3856	// TextNodes (which the parser does not generate). These should be
				3857	// combined if it turns out they hinder good code generation.
				3858	for (int i = 0; i < min; i++) {
				3859	answer = body->ToNode(compiler, answer);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3860	}
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3861	return answer;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3862	}
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3863	}
				3864	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				3865	ASSERT(max > 0); // Due to the 'if' above.
				3866	RegExpExpansionLimiter limiter(compiler, max);
				3867	if (limiter.ok_to_expand()) {
				3868	// Unroll the optional matches up to max.
				3869	RegExpNode* answer = on_success;
				3870	for (int i = 0; i < max; i++) {
				3871	ChoiceNode* alternation = new ChoiceNode(2);
				3872	if (is_greedy) {
				3873	alternation->AddAlternative(
				3874	GuardedAlternative(body->ToNode(compiler, answer)));
				3875	alternation->AddAlternative(GuardedAlternative(on_success));
				3876	} else {
				3877	alternation->AddAlternative(GuardedAlternative(on_success));
				3878	alternation->AddAlternative(
				3879	GuardedAlternative(body->ToNode(compiler, answer)));
				3880	}
				3881	answer = alternation;
				3882	if (not_at_start) alternation->set_not_at_start();
				3883	}
				3884	return answer;
				3885	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3886	}
				3887	}
				3888	bool has_min = min > 0;
				3889	bool has_max = max < RegExpTree::kInfinity;
				3890	bool needs_counter = has_min \|\| has_max;
				3891	int reg_ctr = needs_counter
				3892	? compiler->AllocateRegister()
				3893	: RegExpCompiler::kNoRegister;
				3894	LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
				3895	if (not_at_start) center->set_not_at_start();
				3896	RegExpNode* loop_return = needs_counter
				3897	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				3898	: static_cast<RegExpNode*>(center);
				3899	if (body_can_be_empty) {
				3900	// If the body can be empty we need to check if it was and then
				3901	// backtrack.
				3902	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				3903	reg_ctr,
				3904	min,
				3905	loop_return);
				3906	}
				3907	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				3908	if (body_can_be_empty) {
				3909	// If the body can be empty we need to store the start position
				3910	// so we can bail out if it was empty.
				3911	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				3912	}
				3913	if (needs_capture_clearing) {
				3914	// Before entering the body of this loop we need to clear captures.
				3915	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				3916	}
				3917	GuardedAlternative body_alt(body_node);
				3918	if (has_max) {
				3919	Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
				3920	body_alt.AddGuard(body_guard);
				3921	}
				3922	GuardedAlternative rest_alt(on_success);
				3923	if (has_min) {
				3924	Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
				3925	rest_alt.AddGuard(rest_guard);
				3926	}
				3927	if (is_greedy) {
				3928	center->AddLoopAlternative(body_alt);
				3929	center->AddContinueAlternative(rest_alt);
				3930	} else {
				3931	center->AddContinueAlternative(rest_alt);
				3932	center->AddLoopAlternative(body_alt);
				3933	}
				3934	if (needs_counter) {
				3935	return ActionNode::SetRegister(reg_ctr, 0, center);
				3936	} else {
				3937	return center;
				3938	}
				3939	}
				3940
				3941
				3942	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				3943	RegExpNode* on_success) {
				3944	NodeInfo info;
				3945	switch (type()) {
				3946	case START_OF_LINE:
				3947	return AssertionNode::AfterNewline(on_success);
				3948	case START_OF_INPUT:
				3949	return AssertionNode::AtStart(on_success);
				3950	case BOUNDARY:
				3951	return AssertionNode::AtBoundary(on_success);
				3952	case NON_BOUNDARY:
				3953	return AssertionNode::AtNonBoundary(on_success);
				3954	case END_OF_INPUT:
				3955	return AssertionNode::AtEnd(on_success);
				3956	case END_OF_LINE: {
				3957	// Compile $ in multiline regexps as an alternation with a positive
				3958	// lookahead in one side and an end-of-input on the other side.
				3959	// We need two registers for the lookahead.
				3960	int stack_pointer_register = compiler->AllocateRegister();
				3961	int position_register = compiler->AllocateRegister();
				3962	// The ChoiceNode to distinguish between a newline and end-of-input.
				3963	ChoiceNode* result = new ChoiceNode(2);
				3964	// Create a newline atom.
				3965	ZoneList<CharacterRange>* newline_ranges =
				3966	new ZoneList<CharacterRange>(3);
				3967	CharacterRange::AddClassEscape('n', newline_ranges);
				3968	RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
				3969	TextNode* newline_matcher = new TextNode(
				3970	newline_atom,
				3971	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3972	position_register,
				3973	0, // No captures inside.
				3974	-1, // Ignored if no captures.
				3975	on_success));
				3976	// Create an end-of-input matcher.
				3977	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				3978	stack_pointer_register,
				3979	position_register,
				3980	newline_matcher);
				3981	// Add the two alternatives to the ChoiceNode.
				3982	GuardedAlternative eol_alternative(end_of_line);
				3983	result->AddAlternative(eol_alternative);
				3984	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				3985	result->AddAlternative(end_alternative);
				3986	return result;
				3987	}
				3988	default:
				3989	UNREACHABLE();
				3990	}
				3991	return on_success;
				3992	}
				3993
				3994
				3995	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				3996	RegExpNode* on_success) {
				3997	return new BackReferenceNode(RegExpCapture::StartRegister(index()),
				3998	RegExpCapture::EndRegister(index()),
				3999	on_success);
				4000	}
				4001
				4002
				4003	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				4004	RegExpNode* on_success) {
				4005	return on_success;
				4006	}
				4007
				4008
				4009	RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
				4010	RegExpNode* on_success) {
				4011	int stack_pointer_register = compiler->AllocateRegister();
				4012	int position_register = compiler->AllocateRegister();
				4013
				4014	const int registers_per_capture = 2;
				4015	const int register_of_first_capture = 2;
				4016	int register_count = capture_count_ * registers_per_capture;
				4017	int register_start =
				4018	register_of_first_capture + capture_from_ * registers_per_capture;
				4019
				4020	RegExpNode* success;
				4021	if (is_positive()) {
				4022	RegExpNode* node = ActionNode::BeginSubmatch(
				4023	stack_pointer_register,
				4024	position_register,
				4025	body()->ToNode(
				4026	compiler,
				4027	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				4028	position_register,
				4029	register_count,
				4030	register_start,
				4031	on_success)));
				4032	return node;
				4033	} else {
				4034	// We use a ChoiceNode for a negative lookahead because it has most of
				4035	// the characteristics we need. It has the body of the lookahead as its
				4036	// first alternative and the expression after the lookahead of the second
				4037	// alternative. If the first alternative succeeds then the
				4038	// NegativeSubmatchSuccess will unwind the stack including everything the
				4039	// choice node set up and backtrack. If the first alternative fails then
				4040	// the second alternative is tried, which is exactly the desired result
				4041	// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
				4042	// ChoiceNode that knows to ignore the first exit when calculating quick
				4043	// checks.
				4044	GuardedAlternative body_alt(
				4045	body()->ToNode(
				4046	compiler,
				4047	success = new NegativeSubmatchSuccess(stack_pointer_register,
				4048	position_register,
				4049	register_count,
				4050	register_start)));
				4051	ChoiceNode* choice_node =
				4052	new NegativeLookaheadChoiceNode(body_alt,
				4053	GuardedAlternative(on_success));
				4054	return ActionNode::BeginSubmatch(stack_pointer_register,
				4055	position_register,
				4056	choice_node);
				4057	}
				4058	}
				4059
				4060
				4061	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				4062	RegExpNode* on_success) {
				4063	return ToNode(body(), index(), compiler, on_success);
				4064	}
				4065
				4066
				4067	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				4068	int index,
				4069	RegExpCompiler* compiler,
				4070	RegExpNode* on_success) {
				4071	int start_reg = RegExpCapture::StartRegister(index);
				4072	int end_reg = RegExpCapture::EndRegister(index);
				4073	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				4074	RegExpNode* body_node = body->ToNode(compiler, store_end);
				4075	return ActionNode::StorePosition(start_reg, true, body_node);
				4076	}
				4077
				4078
				4079	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				4080	RegExpNode* on_success) {
				4081	ZoneList<RegExpTree> children = nodes();
				4082	RegExpNode* current = on_success;
				4083	for (int i = children->length() - 1; i >= 0; i--) {
				4084	current = children->at(i)->ToNode(compiler, current);
				4085	}
				4086	return current;
				4087	}
				4088
				4089
				4090	static void AddClass(const uc16* elmv,
				4091	int elmc,
				4092	ZoneList<CharacterRange>* ranges) {
				4093	for (int i = 0; i < elmc; i += 2) {
				4094	ASSERT(elmv[i] <= elmv[i + 1]);
				4095	ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
				4096	}
				4097	}
				4098
				4099
				4100	static void AddClassNegated(const uc16 *elmv,
				4101	int elmc,
				4102	ZoneList<CharacterRange>* ranges) {
				4103	ASSERT(elmv[0] != 0x0000);
				4104	ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
				4105	uc16 last = 0x0000;
				4106	for (int i = 0; i < elmc; i += 2) {
				4107	ASSERT(last <= elmv[i] - 1);
				4108	ASSERT(elmv[i] <= elmv[i + 1]);
				4109	ranges->Add(CharacterRange(last, elmv[i] - 1));
				4110	last = elmv[i + 1] + 1;
				4111	}
				4112	ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
				4113	}
				4114
				4115
				4116	void CharacterRange::AddClassEscape(uc16 type,
				4117	ZoneList<CharacterRange>* ranges) {
				4118	switch (type) {
				4119	case 's':
				4120	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
				4121	break;
				4122	case 'S':
				4123	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
				4124	break;
				4125	case 'w':
				4126	AddClass(kWordRanges, kWordRangeCount, ranges);
				4127	break;
				4128	case 'W':
				4129	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
				4130	break;
				4131	case 'd':
				4132	AddClass(kDigitRanges, kDigitRangeCount, ranges);
				4133	break;
				4134	case 'D':
				4135	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
				4136	break;
				4137	case '.':
				4138	AddClassNegated(kLineTerminatorRanges,
				4139	kLineTerminatorRangeCount,
				4140	ranges);
				4141	break;
				4142	// This is not a character range as defined by the spec but a
				4143	// convenient shorthand for a character class that matches any
				4144	// character.
				4145	case '*':
				4146	ranges->Add(CharacterRange::Everything());
				4147	break;
				4148	// This is the set of characters matched by the $ and ^ symbols
				4149	// in multiline mode.
				4150	case 'n':
				4151	AddClass(kLineTerminatorRanges,
				4152	kLineTerminatorRangeCount,
				4153	ranges);
				4154	break;
				4155	default:
				4156	UNREACHABLE();
				4157	}
				4158	}
				4159
				4160
				4161	Vector<const uc16> CharacterRange::GetWordBounds() {
				4162	return Vector<const uc16>(kWordRanges, kWordRangeCount);
				4163	}
				4164
				4165
				4166	class CharacterRangeSplitter {
				4167	public:
				4168	CharacterRangeSplitter(ZoneList<CharacterRange>** included,
				4169	ZoneList<CharacterRange>** excluded)
				4170	: included_(included),
				4171	excluded_(excluded) { }
				4172	void Call(uc16 from, DispatchTable::Entry entry);
				4173
				4174	static const int kInBase = 0;
				4175	static const int kInOverlay = 1;
				4176
				4177	private:
				4178	ZoneList<CharacterRange>** included_;
				4179	ZoneList<CharacterRange>** excluded_;
				4180	};
				4181
				4182
				4183	void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
				4184	if (!entry.out_set()->Get(kInBase)) return;
				4185	ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
				4186	? included_
				4187	: excluded_;
				4188	if (target == NULL) target = new ZoneList<CharacterRange>(2);
				4189	(*target)->Add(CharacterRange(entry.from(), entry.to()));
				4190	}
				4191
				4192
				4193	void CharacterRange::Split(ZoneList<CharacterRange>* base,
				4194	Vector<const uc16> overlay,
				4195	ZoneList<CharacterRange>** included,
				4196	ZoneList<CharacterRange>** excluded) {
				4197	ASSERT_EQ(NULL, *included);
				4198	ASSERT_EQ(NULL, *excluded);
				4199	DispatchTable table;
				4200	for (int i = 0; i < base->length(); i++)
				4201	table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
				4202	for (int i = 0; i < overlay.length(); i += 2) {
				4203	table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
				4204	CharacterRangeSplitter::kInOverlay);
				4205	}
				4206	CharacterRangeSplitter callback(included, excluded);
				4207	table.ForEach(&callback);
				4208	}
				4209
				4210
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4211	void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
				4212	bool is_ascii) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4213	Isolate* isolate = Isolate::Current();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4214	uc16 bottom = from();
				4215	uc16 top = to();
				4216	if (is_ascii) {
				4217	if (bottom > String::kMaxAsciiCharCode) return;
				4218	if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
				4219	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4220	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4221	if (top == bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4222	// If this is a singleton we just expand the one character.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4223	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4224	for (int i = 0; i < length; i++) {
				4225	uc32 chr = chars[i];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4226	if (chr != bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4227	ranges->Add(CharacterRange::Singleton(chars[i]));
				4228	}
				4229	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4230	} else {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4231	// If this is a range we expand the characters block by block,
				4232	// expanding contiguous subranges (blocks) one at a time.
				4233	// The approach is as follows. For a given start character we
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4234	// look up the remainder of the block that contains it (represented
				4235	// by the end point), for instance we find 'z' if the character
				4236	// is 'c'. A block is characterized by the property
				4237	// that all characters uncanonicalize in the same way, except that
				4238	// each entry in the result is incremented by the distance from the first
				4239	// element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
				4240	// the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
				4241	// Once we've found the end point we look up its uncanonicalization
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4242	// and produce a range for each element. For instance for [c-f]
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4243	// we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4244	// add a range if it is not already contained in the input, so [c-f]
				4245	// will be skipped but [C-F] will be added. If this range is not
				4246	// completely contained in a block we do this for all the blocks
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4247	// covered by the range (handling characters that is not in a block
				4248	// as a "singleton block").
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4249	unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4250	int pos = bottom;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4251	while (pos < top) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4252	int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4253	uc16 block_end;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4254	if (length == 0) {
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4255	block_end = pos;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4256	} else {
				4257	ASSERT_EQ(1, length);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4258	block_end = range[0];
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4259	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4260	int end = (block_end > top) ? top : block_end;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4261	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4262	for (int i = 0; i < length; i++) {
				4263	uc32 c = range[i];
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4264	uc16 range_from = c - (block_end - pos);
				4265	uc16 range_to = c - (block_end - end);
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4266	if (!(bottom <= range_from && range_to <= top)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4267	ranges->Add(CharacterRange(range_from, range_to));
				4268	}
				4269	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4270	pos = end + 1;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4271	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4272	}
				4273	}
				4274
				4275
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4276	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				4277	ASSERT_NOT_NULL(ranges);
				4278	int n = ranges->length();
				4279	if (n <= 1) return true;
				4280	int max = ranges->at(0).to();
				4281	for (int i = 1; i < n; i++) {
				4282	CharacterRange next_range = ranges->at(i);
				4283	if (next_range.from() <= max + 1) return false;
				4284	max = next_range.to();
				4285	}
				4286	return true;
				4287	}
				4288
				4289	SetRelation CharacterRange::WordCharacterRelation(
				4290	ZoneList<CharacterRange>* range) {
				4291	ASSERT(IsCanonical(range));
				4292	int i = 0; // Word character range index.
				4293	int j = 0; // Argument range index.
				4294	ASSERT_NE(0, kWordRangeCount);
				4295	SetRelation result;
				4296	if (range->length() == 0) {
				4297	result.SetElementsInSecondSet();
				4298	return result;
				4299	}
				4300	CharacterRange argument_range = range->at(0);
				4301	CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
				4302	while (i < kWordRangeCount && j < range->length()) {
				4303	// Check the two ranges for the five cases:
				4304	// - no overlap.
				4305	// - partial overlap (there are elements in both ranges that isn't
				4306	// in the other, and there are also elements that are in both).
				4307	// - argument range entirely inside word range.
				4308	// - word range entirely inside argument range.
				4309	// - ranges are completely equal.
				4310
				4311	// First check for no overlap. The earlier range is not in the other set.
				4312	if (argument_range.from() > word_range.to()) {
				4313	// Ranges are disjoint. The earlier word range contains elements that
				4314	// cannot be in the argument set.
				4315	result.SetElementsInSecondSet();
				4316	} else if (word_range.from() > argument_range.to()) {
				4317	// Ranges are disjoint. The earlier argument range contains elements that
				4318	// cannot be in the word set.
				4319	result.SetElementsInFirstSet();
				4320	} else if (word_range.from() <= argument_range.from() &&
				4321	word_range.to() >= argument_range.from()) {
				4322	result.SetElementsInBothSets();
				4323	// argument range completely inside word range.
				4324	if (word_range.from() < argument_range.from() \|\|
				4325	word_range.to() > argument_range.from()) {
				4326	result.SetElementsInSecondSet();
				4327	}
				4328	} else if (word_range.from() >= argument_range.from() &&
				4329	word_range.to() <= argument_range.from()) {
				4330	result.SetElementsInBothSets();
				4331	result.SetElementsInFirstSet();
				4332	} else {
				4333	// There is overlap, and neither is a subrange of the other
				4334	result.SetElementsInFirstSet();
				4335	result.SetElementsInSecondSet();
				4336	result.SetElementsInBothSets();
				4337	}
				4338	if (result.NonTrivialIntersection()) {
				4339	// The result is as (im)precise as we can possibly make it.
				4340	return result;
				4341	}
				4342	// Progress the range(s) with minimal to-character.
				4343	uc16 word_to = word_range.to();
				4344	uc16 argument_to = argument_range.to();
				4345	if (argument_to <= word_to) {
				4346	j++;
				4347	if (j < range->length()) {
				4348	argument_range = range->at(j);
				4349	}
				4350	}
				4351	if (word_to <= argument_to) {
				4352	i += 2;
				4353	if (i < kWordRangeCount) {
				4354	word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
				4355	}
				4356	}
				4357	}
				4358	// Check if anything wasn't compared in the loop.
				4359	if (i < kWordRangeCount) {
				4360	// word range contains something not in argument range.
				4361	result.SetElementsInSecondSet();
				4362	} else if (j < range->length()) {
				4363	// Argument range contains something not in word range.
				4364	result.SetElementsInFirstSet();
				4365	}
				4366
				4367	return result;
				4368	}
				4369
				4370
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4371	ZoneList<CharacterRange>* CharacterSet::ranges() {
				4372	if (ranges_ == NULL) {
				4373	ranges_ = new ZoneList<CharacterRange>(2);
				4374	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
				4375	}
				4376	return ranges_;
				4377	}
				4378
				4379
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4380	// Move a number of elements in a zonelist to another position
				4381	// in the same list. Handles overlapping source and target areas.
				4382	static void MoveRanges(ZoneList<CharacterRange>* list,
				4383	int from,
				4384	int to,
				4385	int count) {
				4386	// Ranges are potentially overlapping.
				4387	if (from < to) {
				4388	for (int i = count - 1; i >= 0; i--) {
				4389	list->at(to + i) = list->at(from + i);
				4390	}
				4391	} else {
				4392	for (int i = 0; i < count; i++) {
				4393	list->at(to + i) = list->at(from + i);
				4394	}
				4395	}
				4396	}
				4397
				4398
				4399	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				4400	int count,
				4401	CharacterRange insert) {
				4402	// Inserts a range into list[0..count[, which must be sorted
				4403	// by from value and non-overlapping and non-adjacent, using at most
				4404	// list[0..count] for the result. Returns the number of resulting
				4405	// canonicalized ranges. Inserting a range may collapse existing ranges into
				4406	// fewer ranges, so the return value can be anything in the range 1..count+1.
				4407	uc16 from = insert.from();
				4408	uc16 to = insert.to();
				4409	int start_pos = 0;
				4410	int end_pos = count;
				4411	for (int i = count - 1; i >= 0; i--) {
				4412	CharacterRange current = list->at(i);
				4413	if (current.from() > to + 1) {
				4414	end_pos = i;
				4415	} else if (current.to() + 1 < from) {
				4416	start_pos = i + 1;
				4417	break;
				4418	}
				4419	}
				4420
				4421	// Inserted range overlaps, or is adjacent to, ranges at positions
				4422	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				4423	// not affected by the insertion.
				4424	// If start_pos == end_pos, the range must be inserted before start_pos.
				4425	// if start_pos < end_pos, the entire range from start_pos to end_pos
				4426	// must be merged with the insert range.
				4427
				4428	if (start_pos == end_pos) {
				4429	// Insert between existing ranges at position start_pos.
				4430	if (start_pos < count) {
				4431	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				4432	}
				4433	list->at(start_pos) = insert;
				4434	return count + 1;
				4435	}
				4436	if (start_pos + 1 == end_pos) {
				4437	// Replace single existing range at position start_pos.
				4438	CharacterRange to_replace = list->at(start_pos);
				4439	int new_from = Min(to_replace.from(), from);
				4440	int new_to = Max(to_replace.to(), to);
				4441	list->at(start_pos) = CharacterRange(new_from, new_to);
				4442	return count;
				4443	}
				4444	// Replace a number of existing ranges from start_pos to end_pos - 1.
				4445	// Move the remaining ranges down.
				4446
				4447	int new_from = Min(list->at(start_pos).from(), from);
				4448	int new_to = Max(list->at(end_pos - 1).to(), to);
				4449	if (end_pos < count) {
				4450	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				4451	}
				4452	list->at(start_pos) = CharacterRange(new_from, new_to);
				4453	return count - (end_pos - start_pos) + 1;
				4454	}
				4455
				4456
				4457	void CharacterSet::Canonicalize() {
				4458	// Special/default classes are always considered canonical. The result
				4459	// of calling ranges() will be sorted.
				4460	if (ranges_ == NULL) return;
				4461	CharacterRange::Canonicalize(ranges_);
				4462	}
				4463
				4464
				4465	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				4466	if (character_ranges->length() <= 1) return;
				4467	// Check whether ranges are already canonical (increasing, non-overlapping,
				4468	// non-adjacent).
				4469	int n = character_ranges->length();
				4470	int max = character_ranges->at(0).to();
				4471	int i = 1;
				4472	while (i < n) {
				4473	CharacterRange current = character_ranges->at(i);
				4474	if (current.from() <= max + 1) {
				4475	break;
				4476	}
				4477	max = current.to();
				4478	i++;
				4479	}
				4480	// Canonical until the i'th range. If that's all of them, we are done.
				4481	if (i == n) return;
				4482
				4483	// The ranges at index i and forward are not canonicalized. Make them so by
				4484	// doing the equivalent of insertion sort (inserting each into the previous
				4485	// list, in order).
				4486	// Notice that inserting a range can reduce the number of ranges in the
				4487	// result due to combining of adjacent and overlapping ranges.
				4488	int read = i; // Range to insert.
				4489	int num_canonical = i; // Length of canonicalized part of list.
				4490	do {
				4491	num_canonical = InsertRangeInCanonicalList(character_ranges,
				4492	num_canonical,
				4493	character_ranges->at(read));
				4494	read++;
				4495	} while (read < n);
				4496	character_ranges->Rewind(num_canonical);
				4497
				4498	ASSERT(CharacterRange::IsCanonical(character_ranges));
				4499	}
				4500
				4501
				4502	// Utility function for CharacterRange::Merge. Adds a range at the end of
				4503	// a canonicalized range list, if necessary merging the range with the last
				4504	// range of the list.
				4505	static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
				4506	if (set == NULL) return;
				4507	ASSERT(set->length() == 0 \|\| set->at(set->length() - 1).to() < range.from());
				4508	int n = set->length();
				4509	if (n > 0) {
				4510	CharacterRange lastRange = set->at(n - 1);
				4511	if (lastRange.to() == range.from() - 1) {
				4512	set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
				4513	return;
				4514	}
				4515	}
				4516	set->Add(range);
				4517	}
				4518
				4519
				4520	static void AddRangeToSelectedSet(int selector,
				4521	ZoneList<CharacterRange>* first_set,
				4522	ZoneList<CharacterRange>* second_set,
				4523	ZoneList<CharacterRange>* intersection_set,
				4524	CharacterRange range) {
				4525	switch (selector) {
				4526	case kInsideFirst:
				4527	AddRangeToSet(first_set, range);
				4528	break;
				4529	case kInsideSecond:
				4530	AddRangeToSet(second_set, range);
				4531	break;
				4532	case kInsideBoth:
				4533	AddRangeToSet(intersection_set, range);
				4534	break;
				4535	}
				4536	}
				4537
				4538
				4539
				4540	void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
				4541	ZoneList<CharacterRange>* second_set,
				4542	ZoneList<CharacterRange>* first_set_only_out,
				4543	ZoneList<CharacterRange>* second_set_only_out,
				4544	ZoneList<CharacterRange>* both_sets_out) {
				4545	// Inputs are canonicalized.
				4546	ASSERT(CharacterRange::IsCanonical(first_set));
				4547	ASSERT(CharacterRange::IsCanonical(second_set));
				4548	// Outputs are empty, if applicable.
				4549	ASSERT(first_set_only_out == NULL \|\| first_set_only_out->length() == 0);
				4550	ASSERT(second_set_only_out == NULL \|\| second_set_only_out->length() == 0);
				4551	ASSERT(both_sets_out == NULL \|\| both_sets_out->length() == 0);
				4552
				4553	// Merge sets by iterating through the lists in order of lowest "from" value,
				4554	// and putting intervals into one of three sets.
				4555
				4556	if (first_set->length() == 0) {
				4557	second_set_only_out->AddAll(*second_set);
				4558	return;
				4559	}
				4560	if (second_set->length() == 0) {
				4561	first_set_only_out->AddAll(*first_set);
				4562	return;
				4563	}
				4564	// Indices into input lists.
				4565	int i1 = 0;
				4566	int i2 = 0;
				4567	// Cache length of input lists.
				4568	int n1 = first_set->length();
				4569	int n2 = second_set->length();
				4570	// Current range. May be invalid if state is kInsideNone.
				4571	int from = 0;
				4572	int to = -1;
				4573	// Where current range comes from.
				4574	int state = kInsideNone;
				4575
				4576	while (i1 < n1 \|\| i2 < n2) {
				4577	CharacterRange next_range;
				4578	int range_source;
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4579	if (i2 == n2 \|\|
				4580	(i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
				4581	// Next smallest element is in first set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4582	next_range = first_set->at(i1++);
				4583	range_source = kInsideFirst;
				4584	} else {
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4585	// Next smallest element is in second set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4586	next_range = second_set->at(i2++);
				4587	range_source = kInsideSecond;
				4588	}
				4589	if (to < next_range.from()) {
				4590	// Ranges disjoint: \|current\| \|next\|
				4591	AddRangeToSelectedSet(state,
				4592	first_set_only_out,
				4593	second_set_only_out,
				4594	both_sets_out,
				4595	CharacterRange(from, to));
				4596	from = next_range.from();
				4597	to = next_range.to();
				4598	state = range_source;
				4599	} else {
				4600	if (from < next_range.from()) {
				4601	AddRangeToSelectedSet(state,
				4602	first_set_only_out,
				4603	second_set_only_out,
				4604	both_sets_out,
				4605	CharacterRange(from, next_range.from()-1));
				4606	}
				4607	if (to < next_range.to()) {
				4608	// Ranges overlap: \|current\|
				4609	// \|next\|
				4610	AddRangeToSelectedSet(state \| range_source,
				4611	first_set_only_out,
				4612	second_set_only_out,
				4613	both_sets_out,
				4614	CharacterRange(next_range.from(), to));
				4615	from = to + 1;
				4616	to = next_range.to();
				4617	state = range_source;
				4618	} else {
				4619	// Range included: \|current\| , possibly ending at same character.
				4620	// \|next\|
				4621	AddRangeToSelectedSet(
				4622	state \| range_source,
				4623	first_set_only_out,
				4624	second_set_only_out,
				4625	both_sets_out,
				4626	CharacterRange(next_range.from(), next_range.to()));
				4627	from = next_range.to() + 1;
				4628	// If ranges end at same character, both ranges are consumed completely.
				4629	if (next_range.to() == to) state = kInsideNone;
				4630	}
				4631	}
				4632	}
				4633	AddRangeToSelectedSet(state,
				4634	first_set_only_out,
				4635	second_set_only_out,
				4636	both_sets_out,
				4637	CharacterRange(from, to));
				4638	}
				4639
				4640
				4641	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				4642	ZoneList<CharacterRange>* negated_ranges) {
				4643	ASSERT(CharacterRange::IsCanonical(ranges));
				4644	ASSERT_EQ(0, negated_ranges->length());
				4645	int range_count = ranges->length();
				4646	uc16 from = 0;
				4647	int i = 0;
				4648	if (range_count > 0 && ranges->at(0).from() == 0) {
				4649	from = ranges->at(0).to();
				4650	i = 1;
				4651	}
				4652	while (i < range_count) {
				4653	CharacterRange range = ranges->at(i);
				4654	negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
				4655	from = range.to();
				4656	i++;
				4657	}
				4658	if (from < String::kMaxUC16CharCode) {
				4659	negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
				4660	}
				4661	}
				4662
				4663
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4664
				4665	// -------------------------------------------------------------------
				4666	// Interest propagation
				4667
				4668
				4669	RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
				4670	for (int i = 0; i < siblings_.length(); i++) {
				4671	RegExpNode* sibling = siblings_.Get(i);
				4672	if (sibling->info()->Matches(info))
				4673	return sibling;
				4674	}
				4675	return NULL;
				4676	}
				4677
				4678
				4679	RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
				4680	ASSERT_EQ(false, *cloned);
				4681	siblings_.Ensure(this);
				4682	RegExpNode* result = TryGetSibling(info);
				4683	if (result != NULL) return result;
				4684	result = this->Clone();
				4685	NodeInfo* new_info = result->info();
				4686	new_info->ResetCompilationState();
				4687	new_info->AddFromPreceding(info);
				4688	AddSibling(result);
				4689	*cloned = true;
				4690	return result;
				4691	}
				4692
				4693
				4694	template <class C>
				4695	static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
				4696	NodeInfo full_info(*node->info());
				4697	full_info.AddFromPreceding(info);
				4698	bool cloned = false;
				4699	return RegExpNode::EnsureSibling(node, &full_info, &cloned);
				4700	}
				4701
				4702
				4703	// -------------------------------------------------------------------
				4704	// Splay tree
				4705
				4706
				4707	OutSet* OutSet::Extend(unsigned value) {
				4708	if (Get(value))
				4709	return this;
				4710	if (successors() != NULL) {
				4711	for (int i = 0; i < successors()->length(); i++) {
				4712	OutSet* successor = successors()->at(i);
				4713	if (successor->Get(value))
				4714	return successor;
				4715	}
				4716	} else {
				4717	successors_ = new ZoneList<OutSet*>(2);
				4718	}
				4719	OutSet* result = new OutSet(first_, remaining_);
				4720	result->Set(value);
				4721	successors()->Add(result);
				4722	return result;
				4723	}
				4724
				4725
				4726	void OutSet::Set(unsigned value) {
				4727	if (value < kFirstLimit) {
				4728	first_ \|= (1 << value);
				4729	} else {
				4730	if (remaining_ == NULL)
				4731	remaining_ = new ZoneList<unsigned>(1);
				4732	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				4733	remaining_->Add(value);
				4734	}
				4735	}
				4736
				4737
				4738	bool OutSet::Get(unsigned value) {
				4739	if (value < kFirstLimit) {
				4740	return (first_ & (1 << value)) != 0;
				4741	} else if (remaining_ == NULL) {
				4742	return false;
				4743	} else {
				4744	return remaining_->Contains(value);
				4745	}
				4746	}
				4747
				4748
				// Out-of-class definitions of the static constants of DispatchTable::Config,
				// the configuration type with which ZoneSplayTree is instantiated below.
				// kNoKey is initialized from unibrow::Utf8::kBadChar; kNoValue is a
				// default-constructed Entry. NOTE(review): presumably these are the
				// "no key"/"no value" sentinels expected by the splay tree - confirm.
				4749	const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
				4750	const DispatchTable::Entry DispatchTable::Config::kNoValue;
				4751
				4752
				4753	void DispatchTable::AddRange(CharacterRange full_range, int value) {
				4754	CharacterRange current = full_range;
				4755	if (tree()->is_empty()) {
				4756	// If this is the first range we just insert into the table.
				4757	ZoneSplayTree<Config>::Locator loc;
				4758	ASSERT_RESULT(tree()->Insert(current.from(), &loc));
				4759	loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
				4760	return;
				4761	}
				4762	// First see if there is a range to the left of this one that
				4763	// overlaps.
				4764	ZoneSplayTree<Config>::Locator loc;
				4765	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				4766	Entry* entry = &loc.value();
				4767	// If we've found a range that overlaps with this one, and it
				4768	// starts strictly to the left of this one, we have to fix it
				4769	// because the following code only handles ranges that start on
				4770	// or after the start point of the range we're adding.
				4771	if (entry->from() < current.from() && entry->to() >= current.from()) {
				4772	// Snap the overlapping range in half around the start point of
				4773	// the range we're adding.
				4774	CharacterRange left(entry->from(), current.from() - 1);
				4775	CharacterRange right(current.from(), entry->to());
				4776	// The left part of the overlapping range doesn't overlap.
				4777	// Truncate the whole entry to be just the left part.
				4778	entry->set_to(left.to());
				4779	// The right part is the one that overlaps. We add this part
				4780	// to the map and let the next step deal with merging it with
				4781	// the range we're adding.
				4782	ZoneSplayTree<Config>::Locator loc;
				4783	ASSERT_RESULT(tree()->Insert(right.from(), &loc));
				4784	loc.set_value(Entry(right.from(),
				4785	right.to(),
				4786	entry->out_set()));
				4787	}
				4788	}
				4789	while (current.is_valid()) {
				4790	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				4791	(loc.value().from() <= current.to()) &&
				4792	(loc.value().to() >= current.from())) {
				4793	Entry* entry = &loc.value();
				4794	// We have overlap. If there is space between the start point of
				4795	// the range we're adding and where the overlapping range starts
				4796	// then we have to add a range covering just that space.
				4797	if (current.from() < entry->from()) {
				4798	ZoneSplayTree<Config>::Locator ins;
				4799	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4800	ins.set_value(Entry(current.from(),
				4801	entry->from() - 1,
				4802	empty()->Extend(value)));
				4803	current.set_from(entry->from());
				4804	}
				4805	ASSERT_EQ(current.from(), entry->from());
				4806	// If the overlapping range extends beyond the one we want to add
				4807	// we have to snap the right part off and add it separately.
				4808	if (entry->to() > current.to()) {
				4809	ZoneSplayTree<Config>::Locator ins;
				4810	ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
				4811	ins.set_value(Entry(current.to() + 1,
				4812	entry->to(),
				4813	entry->out_set()));
				4814	entry->set_to(current.to());
				4815	}
				4816	ASSERT(entry->to() <= current.to());
				4817	// The overlapping range is now completely contained by the range
				4818	// we're adding so we can just update it and move the start point
				4819	// of the range we're adding just past it.
				4820	entry->AddValue(value);
				4821	// Bail out if the last interval ended at 0xFFFF since otherwise
				4822	// adding 1 will wrap around to 0.
				4823	if (entry->to() == String::kMaxUC16CharCode)
				4824	break;
				4825	ASSERT(entry->to() + 1 > current.from());
				4826	current.set_from(entry->to() + 1);
				4827	} else {
				4828	// There is no overlap so we can just add the range
				4829	ZoneSplayTree<Config>::Locator ins;
				4830	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4831	ins.set_value(Entry(current.from(),
				4832	current.to(),
				4833	empty()->Extend(value)));
				4834	break;
				4835	}
				4836	}
				4837	}
				4838
				4839
				4840	OutSet* DispatchTable::Get(uc16 value) {
				4841	ZoneSplayTree<Config>::Locator loc;
				4842	if (!tree()->FindGreatestLessThan(value, &loc))
				4843	return empty();
				4844	Entry* entry = &loc.value();
				4845	if (value <= entry->to())
				4846	return entry->out_set();
				4847	else
				4848	return empty();
				4849	}
				4850
				4851
				4852	// -------------------------------------------------------------------
				4853	// Analysis
				4854
				4855
				4856	void Analysis::EnsureAnalyzed(RegExpNode* that) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4857	StackLimitCheck check(Isolate::Current());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4858	if (check.HasOverflowed()) {
				4859	fail("Stack overflow");
				4860	return;
				4861	}
				4862	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				4863	return;
				4864	that->info()->being_analyzed = true;
				4865	that->Accept(this);
				4866	that->info()->being_analyzed = false;
				4867	that->info()->been_analyzed = true;
				4868	}
				4869
				4870
				4871	void Analysis::VisitEnd(EndNode* that) {
				4872	// nothing to do
				4873	}
				4874
				4875
				4876	void TextNode::CalculateOffsets() {
				4877	int element_count = elements()->length();
				4878	// Set up the offsets of the elements relative to the start. This is a fixed
				4879	// quantity since a TextNode can only contain fixed-width things.
				4880	int cp_offset = 0;
				4881	for (int i = 0; i < element_count; i++) {
				4882	TextElement& elm = elements()->at(i);
				4883	elm.cp_offset = cp_offset;
				4884	if (elm.type == TextElement::ATOM) {
				4885	cp_offset += elm.data.u_atom->data().length();
				4886	} else {
				4887	cp_offset++;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4888	}
				4889	}
				4890	}
				4891
				4892
				4893	void Analysis::VisitText(TextNode* that) {
				4894	if (ignore_case_) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4895	that->MakeCaseIndependent(is_ascii_);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4896	}
				4897	EnsureAnalyzed(that->on_success());
				4898	if (!has_failed()) {
				4899	that->CalculateOffsets();
				4900	}
				4901	}
				4902
				4903
				4904	void Analysis::VisitAction(ActionNode* that) {
				4905	RegExpNode* target = that->on_success();
				4906	EnsureAnalyzed(target);
				4907	if (!has_failed()) {
				4908	// If the next node is interested in what it follows then this node
				4909	// has to be interested too so it can pass the information on.
				4910	that->info()->AddFromFollowing(target->info());
				4911	}
				4912	}
				4913
				4914
				4915	void Analysis::VisitChoice(ChoiceNode* that) {
				4916	NodeInfo* info = that->info();
				4917	for (int i = 0; i < that->alternatives()->length(); i++) {
				4918	RegExpNode* node = that->alternatives()->at(i).node();
				4919	EnsureAnalyzed(node);
				4920	if (has_failed()) return;
				4921	// Anything the following nodes need to know has to be known by
				4922	// this node also, so it can pass it on.
				4923	info->AddFromFollowing(node->info());
				4924	}
				4925	}
				4926
				4927
				4928	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				4929	NodeInfo* info = that->info();
				4930	for (int i = 0; i < that->alternatives()->length(); i++) {
				4931	RegExpNode* node = that->alternatives()->at(i).node();
				4932	if (node != that->loop_node()) {
				4933	EnsureAnalyzed(node);
				4934	if (has_failed()) return;
				4935	info->AddFromFollowing(node->info());
				4936	}
				4937	}
				4938	// Check the loop last since it may need the value of this node
				4939	// to get a correct result.
				4940	EnsureAnalyzed(that->loop_node());
				4941	if (!has_failed()) {
				4942	info->AddFromFollowing(that->loop_node()->info());
				4943	}
				4944	}
				4945
				4946
				4947	void Analysis::VisitBackReference(BackReferenceNode* that) {
				4948	EnsureAnalyzed(that->on_success());
				4949	}
				4950
				4951
				4952	void Analysis::VisitAssertion(AssertionNode* that) {
				4953	EnsureAnalyzed(that->on_success());
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4954	AssertionNode::AssertionNodeType type = that->type();
				4955	if (type == AssertionNode::AT_BOUNDARY \|\|
				4956	type == AssertionNode::AT_NON_BOUNDARY) {
				4957	// Check if the following character is known to be a word character
				4958	// or known to not be a word character.
				4959	ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
				4960
				4961	CharacterRange::Canonicalize(following_chars);
				4962
				4963	SetRelation word_relation =
				4964	CharacterRange::WordCharacterRelation(following_chars);
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4965	if (word_relation.Disjoint()) {
				4966	// Includes the case where following_chars is empty (e.g., end-of-input).
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4967	// Following character is definitely not a word character.
				4968	type = (type == AssertionNode::AT_BOUNDARY) ?
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4969	AssertionNode::AFTER_WORD_CHARACTER :
				4970	AssertionNode::AFTER_NONWORD_CHARACTER;
				4971	that->set_type(type);
				4972	} else if (word_relation.ContainedIn()) {
				4973	// Following character is definitely a word character.
				4974	type = (type == AssertionNode::AT_BOUNDARY) ?
				4975	AssertionNode::AFTER_NONWORD_CHARACTER :
				4976	AssertionNode::AFTER_WORD_CHARACTER;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4977	that->set_type(type);
				4978	}
				4979	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4980	}
				4981
				4982
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4983	ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
				4984	if (first_character_set_ == NULL) {
				4985	if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
				4986	// If we can't find an exact solution within the budget, we
				4987	// set the value to the set of every character, i.e., all characters
				4988	// are possible.
				4989	ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
				4990	all_set->Add(CharacterRange::Everything());
				4991	first_character_set_ = all_set;
				4992	}
				4993	}
				4994	return first_character_set_;
				4995	}
				4996
				4997
				4998	int RegExpNode::ComputeFirstCharacterSet(int budget) {
				4999	// Default behavior is to not be able to determine the first character.
				5000	return kComputeFirstCharacterSetFail;
				5001	}
				5002
				5003
				5004	int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
				5005	budget--;
				5006	if (budget >= 0) {
				5007	// Find loop min-iteration. It's the value of the guarded choice node
				5008	// with a GEQ guard, if any.
				5009	int min_repetition = 0;
				5010
				5011	for (int i = 0; i <= 1; i++) {
				5012	GuardedAlternative alternative = alternatives()->at(i);
				5013	ZoneList<Guard> guards = alternative.guards();
				5014	if (guards != NULL && guards->length() > 0) {
				5015	Guard* guard = guards->at(0);
				5016	if (guard->op() == Guard::GEQ) {
				5017	min_repetition = guard->value();
				5018	break;
				5019	}
				5020	}
				5021	}
				5022
				5023	budget = loop_node()->ComputeFirstCharacterSet(budget);
				5024	if (budget >= 0) {
				5025	ZoneList<CharacterRange>* character_set =
				5026	loop_node()->first_character_set();
				5027	if (body_can_be_zero_length() \|\| min_repetition == 0) {
				5028	budget = continue_node()->ComputeFirstCharacterSet(budget);
				5029	if (budget < 0) return budget;
				5030	ZoneList<CharacterRange>* body_set =
				5031	continue_node()->first_character_set();
				5032	ZoneList<CharacterRange>* union_set =
				5033	new ZoneList<CharacterRange>(Max(character_set->length(),
				5034	body_set->length()));
				5035	CharacterRange::Merge(character_set,
				5036	body_set,
				5037	union_set,
				5038	union_set,
				5039	union_set);
				5040	character_set = union_set;
				5041	}
				5042	set_first_character_set(character_set);
				5043	}
				5044	}
				5045	return budget;
				5046	}
				5047
				5048
				5049	int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
				5050	budget--;
				5051	if (budget >= 0) {
				5052	GuardedAlternative successor = this->alternatives()->at(1);
				5053	RegExpNode* successor_node = successor.node();
				5054	budget = successor_node->ComputeFirstCharacterSet(budget);
				5055	if (budget >= 0) {
				5056	set_first_character_set(successor_node->first_character_set());
				5057	}
				5058	}
				5059	return budget;
				5060	}
				5061
				5062
				5063	// The first character set of an EndNode is unknowable. Just use the
				5064	// default implementation that fails and returns all characters as possible.
				5065
				5066
				5067	int AssertionNode::ComputeFirstCharacterSet(int budget) {
				5068	budget -= 1;
				5069	if (budget >= 0) {
				5070	switch (type_) {
				5071	case AT_END: {
				5072	set_first_character_set(new ZoneList<CharacterRange>(0));
				5073	break;
				5074	}
				5075	case AT_START:
				5076	case AT_BOUNDARY:
				5077	case AT_NON_BOUNDARY:
				5078	case AFTER_NEWLINE:
				5079	case AFTER_NONWORD_CHARACTER:
				5080	case AFTER_WORD_CHARACTER: {
				5081	ASSERT_NOT_NULL(on_success());
				5082	budget = on_success()->ComputeFirstCharacterSet(budget);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5083	if (budget >= 0) {
				5084	set_first_character_set(on_success()->first_character_set());
				5085	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5086	break;
				5087	}
				5088	}
				5089	}
				5090	return budget;
				5091	}
				5092
				5093
				5094	int ActionNode::ComputeFirstCharacterSet(int budget) {
				5095	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
				5096	budget--;
				5097	if (budget >= 0) {
				5098	ASSERT_NOT_NULL(on_success());
				5099	budget = on_success()->ComputeFirstCharacterSet(budget);
				5100	if (budget >= 0) {
				5101	set_first_character_set(on_success()->first_character_set());
				5102	}
				5103	}
				5104	return budget;
				5105	}
				5106
				5107
				5108	int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
				5109	// We don't know anything about the first character of a backreference
				5110	// at this point.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5111	// The potential first characters are the first characters of the capture,
				5112	// and the first characters of the on_success node, depending on whether the
				5113	// capture can be empty and whether it is known to be participating or known
				5114	// not to be.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5115	return kComputeFirstCharacterSetFail;
				5116	}
				5117
				5118
				5119	int TextNode::ComputeFirstCharacterSet(int budget) {
				5120	budget--;
				5121	if (budget >= 0) {
				5122	ASSERT_NE(0, elements()->length());
				5123	TextElement text = elements()->at(0);
				5124	if (text.type == TextElement::ATOM) {
				5125	RegExpAtom* atom = text.data.u_atom;
				5126	ASSERT_NE(0, atom->length());
				5127	uc16 first_char = atom->data()[0];
				5128	ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
				5129	range->Add(CharacterRange(first_char, first_char));
				5130	set_first_character_set(range);
				5131	} else {
				5132	ASSERT(text.type == TextElement::CHAR_CLASS);
				5133	RegExpCharacterClass* char_class = text.data.u_char_class;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5134	ZoneList<CharacterRange>* ranges = char_class->ranges();
				5135	// TODO(lrn): Canonicalize ranges when they are created
				5136	// instead of waiting until now.
				5137	CharacterRange::Canonicalize(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5138	if (char_class->is_negated()) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5139	int length = ranges->length();
				5140	int new_length = length + 1;
				5141	if (length > 0) {
				5142	if (ranges->at(0).from() == 0) new_length--;
				5143	if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
				5144	new_length--;
				5145	}
				5146	}
				5147	ZoneList<CharacterRange>* negated_ranges =
				5148	new ZoneList<CharacterRange>(new_length);
				5149	CharacterRange::Negate(ranges, negated_ranges);
				5150	set_first_character_set(negated_ranges);
				5151	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5152	set_first_character_set(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5153	}
				5154	}
				5155	}
				5156	return budget;
				5157	}
				5158
				5159
				5160
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5161	// -------------------------------------------------------------------
				5162	// Dispatch table construction
				5163
				5164
				5165	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				5166	AddRange(CharacterRange::Everything());
				5167	}
				5168
				5169
				5170	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				5171	node->set_being_calculated(true);
				5172	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				5173	for (int i = 0; i < alternatives->length(); i++) {
				5174	set_choice_index(i);
				5175	alternatives->at(i).node()->Accept(this);
				5176	}
				5177	node->set_being_calculated(false);
				5178	}
				5179
				5180
				5181	class AddDispatchRange {
				5182	public:
				5183	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				5184	: constructor_(constructor) { }
				5185	void Call(uc32 from, DispatchTable::Entry entry);
				5186	private:
				5187	DispatchTableConstructor* constructor_;
				5188	};
				5189
				5190
				5191	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
				5192	CharacterRange range(from, entry.to());
				5193	constructor_->AddRange(range);
				5194	}
				5195
				5196
				5197	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				5198	if (node->being_calculated())
				5199	return;
				5200	DispatchTable* table = node->GetTable(ignore_case_);
				5201	AddDispatchRange adder(this);
				5202	table->ForEach(&adder);
				5203	}
				5204
				5205
				5206	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				5207	// TODO(160): Find the node that we refer back to and propagate its start
				5208	// set back to here. For now we just accept anything.
				5209	AddRange(CharacterRange::Everything());
				5210	}
				5211
				5212
				5213	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				5214	RegExpNode* target = that->on_success();
				5215	target->Accept(this);
				5216	}
				5217
				5218
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5219	static int CompareRangeByFrom(const CharacterRange* a,
				5220	const CharacterRange* b) {
				5221	return Compare<uc16>(a->from(), b->from());
				5222	}
				5223
				5224
				5225	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				5226	ranges->Sort(CompareRangeByFrom);
				5227	uc16 last = 0;
				5228	for (int i = 0; i < ranges->length(); i++) {
				5229	CharacterRange range = ranges->at(i);
				5230	if (last < range.from())
				5231	AddRange(CharacterRange(last, range.from() - 1));
				5232	if (range.to() >= last) {
				5233	if (range.to() == String::kMaxUC16CharCode) {
				5234	return;
				5235	} else {
				5236	last = range.to() + 1;
				5237	}
				5238	}
				5239	}
				5240	AddRange(CharacterRange(last, String::kMaxUC16CharCode));
				5241	}
				5242
				5243
				5244	void DispatchTableConstructor::VisitText(TextNode* that) {
				5245	TextElement elm = that->elements()->at(0);
				5246	switch (elm.type) {
				5247	case TextElement::ATOM: {
				5248	uc16 c = elm.data.u_atom->data()[0];
				5249	AddRange(CharacterRange(c, c));
				5250	break;
				5251	}
				5252	case TextElement::CHAR_CLASS: {
				5253	RegExpCharacterClass* tree = elm.data.u_char_class;
				5254	ZoneList<CharacterRange>* ranges = tree->ranges();
				5255	if (tree->is_negated()) {
				5256	AddInverse(ranges);
				5257	} else {
				5258	for (int i = 0; i < ranges->length(); i++)
				5259	AddRange(ranges->at(i));
				5260	}
				5261	break;
				5262	}
				5263	default: {
				5264	UNIMPLEMENTED();
				5265	}
				5266	}
				5267	}
				5268
				5269
				5270	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				5271	RegExpNode* target = that->on_success();
				5272	target->Accept(this);
				5273	}
				5274
				5275
				5276	RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
				5277	bool ignore_case,
				5278	bool is_multiline,
				5279	Handle<String> pattern,
				5280	bool is_ascii) {
				5281	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				5282	return IrregexpRegExpTooBig();
				5283	}
				5284	RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
				5285	// Wrap the body of the regexp in capture #0.
				5286	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				5287	0,
				5288	&compiler,
				5289	compiler.accept());
				5290	RegExpNode* node = captured_body;
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5291	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				5292	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				5293	int max_length = data->tree->max_match();
				5294	if (!is_start_anchored) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5295	// Add a .*? at the beginning, outside the body capture, unless
				5296	// this expression is anchored at the beginning.
				5297	RegExpNode* loop_node =
				5298	RegExpQuantifier::ToNode(0,
				5299	RegExpTree::kInfinity,
				5300	false,
				5301	new RegExpCharacterClass('*'),
				5302	&compiler,
				5303	captured_body,
				5304	data->contains_anchor);
				5305
				5306	if (data->contains_anchor) {
				5307	// Unroll loop once, to take care of the case that might start
				5308	// at the start of input.
				5309	ChoiceNode* first_step_node = new ChoiceNode(2);
				5310	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				5311	first_step_node->AddAlternative(GuardedAlternative(
				5312	new TextNode(new RegExpCharacterClass('*'), loop_node)));
				5313	node = first_step_node;
				5314	} else {
				5315	node = loop_node;
				5316	}
				5317	}
				5318	data->node = node;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	5319	Analysis analysis(ignore_case, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5320	analysis.EnsureAnalyzed(node);
				5321	if (analysis.has_failed()) {
				5322	const char* error_message = analysis.error_message();
				5323	return CompilationResult(error_message);
				5324	}
				5325
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5326	// Create the correct assembler for the architecture.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5327	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5328	// Native regexp implementation.
				5329
				5330	NativeRegExpMacroAssembler::Mode mode =
				5331	is_ascii ? NativeRegExpMacroAssembler::ASCII
				5332	: NativeRegExpMacroAssembler::UC16;
				5333
				5334	#if V8_TARGET_ARCH_IA32
				5335	RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
				5336	#elif V8_TARGET_ARCH_X64
				5337	RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
				5338	#elif V8_TARGET_ARCH_ARM
				5339	RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	5340	#elif V8_TARGET_ARCH_MIPS
				5341	RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5342	#endif
				5343
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5344	#else // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5345	// Interpreted regexp implementation.
				5346	EmbeddedVector<byte, 1024> codes;
				5347	RegExpMacroAssemblerIrregexp macro_assembler(codes);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5348	#endif // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5349
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5350	// Inserted here, instead of in Assembler, because it depends on information
				5351	// in the AST that isn't replicated in the Node structure.
				5352	static const int kMaxBacksearchLimit = 1024;
				5353	if (is_end_anchored &&
				5354	!is_start_anchored &&
				5355	max_length < kMaxBacksearchLimit) {
				5356	macro_assembler.SetCurrentPositionFromEnd(max_length);
				5357	}
				5358
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5359	return compiler.Assemble(&macro_assembler,
				5360	node,
				5361	data->capture_count,
				5362	pattern);
				5363	}
				5364
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5365
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5366	}} // namespace v8::internal