Blame - src/jsregexp.cc - fp2-dev/platform/external/v8

blob: 8ccbae49ce4d3c8a5d90f4734ce17945b7c5f70b [file] [log] [blame]

Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	1	// Copyright 2011 the V8 project authors. All rights reserved.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "compiler.h"
				32	#include "execution.h"
				33	#include "factory.h"
				34	#include "jsregexp.h"
				35	#include "platform.h"
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	36	#include "string-search.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	37	#include "runtime.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	38	#include "compilation-cache.h"
				39	#include "string-stream.h"
				40	#include "parser.h"
				41	#include "regexp-macro-assembler.h"
				42	#include "regexp-macro-assembler-tracer.h"
				43	#include "regexp-macro-assembler-irregexp.h"
				44	#include "regexp-stack.h"
				45
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	46	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	47	#if V8_TARGET_ARCH_IA32
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	48	#include "ia32/regexp-macro-assembler-ia32.h"
				49	#elif V8_TARGET_ARCH_X64
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	50	#include "x64/regexp-macro-assembler-x64.h"
				51	#elif V8_TARGET_ARCH_ARM
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	52	#include "arm/regexp-macro-assembler-arm.h"
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	53	#elif V8_TARGET_ARCH_MIPS
				54	#include "mips/regexp-macro-assembler-mips.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	55	#else
				56	#error Unsupported target architecture.
				57	#endif
				58	#endif
				59
				60	#include "interpreter-irregexp.h"
				61
				62
				63	namespace v8 {
				64	namespace internal {
				65
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	66	Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
				67	Handle<String> pattern,
				68	Handle<String> flags,
				69	bool* has_pending_exception) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	70	// Call the construct code with 2 arguments.
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	71	Handle<Object> argv[] = { pattern, flags };
				72	return Execution::New(constructor, ARRAY_SIZE(argv), argv,
				73	has_pending_exception);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	74	}
				75
				76
				77	static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
				78	int flags = JSRegExp::NONE;
				79	for (int i = 0; i < str->length(); i++) {
				80	switch (str->Get(i)) {
				81	case 'i':
				82	flags \|= JSRegExp::IGNORE_CASE;
				83	break;
				84	case 'g':
				85	flags \|= JSRegExp::GLOBAL;
				86	break;
				87	case 'm':
				88	flags \|= JSRegExp::MULTILINE;
				89	break;
				90	}
				91	}
				92	return JSRegExp::Flags(flags);
				93	}
				94
				95
				96	static inline void ThrowRegExpException(Handle<JSRegExp> re,
				97	Handle<String> pattern,
				98	Handle<String> error_text,
				99	const char* message) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	100	Isolate* isolate = re->GetIsolate();
				101	Factory* factory = isolate->factory();
				102	Handle<FixedArray> elements = factory->NewFixedArray(2);
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame]	103	elements->set(0, *pattern);
				104	elements->set(1, *error_text);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	105	Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
				106	Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
				107	isolate->Throw(*regexp_err);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	108	}
				109
				110
				111	// Generic RegExp methods. Dispatches to implementation specific methods.
				112
				113
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	114	Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				115	Handle<String> pattern,
				116	Handle<String> flag_str) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	117	Isolate* isolate = re->GetIsolate();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	118	JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	119	CompilationCache* compilation_cache = isolate->compilation_cache();
				120	Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	121	bool in_cache = !cached.is_null();
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	122	LOG(isolate, RegExpCompileEvent(re, in_cache));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	123
				124	Handle<Object> result;
				125	if (in_cache) {
				126	re->set_data(*cached);
				127	return re;
				128	}
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	129	pattern = FlattenGetString(pattern);
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	130	ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	131	PostponeInterruptsScope postpone(isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	132	RegExpCompileData parse_result;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	133	FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	134	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				135	&parse_result)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	136	// Throw an exception if we fail to parse the pattern.
				137	ThrowRegExpException(re,
				138	pattern,
				139	parse_result.error,
				140	"malformed_regexp");
				141	return Handle<Object>::null();
				142	}
				143
				144	if (parse_result.simple && !flags.is_ignore_case()) {
				145	// Parse-tree is a single atom that is equal to the pattern.
				146	AtomCompile(re, pattern, flags, pattern);
				147	} else if (parse_result.tree->IsAtom() &&
				148	!flags.is_ignore_case() &&
				149	parse_result.capture_count == 0) {
				150	RegExpAtom* atom = parse_result.tree->AsAtom();
				151	Vector<const uc16> atom_pattern = atom->data();
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	152	Handle<String> atom_string =
				153	isolate->factory()->NewStringFromTwoByte(atom_pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	154	AtomCompile(re, pattern, flags, atom_string);
				155	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	156	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	157	}
				158	ASSERT(re->data()->IsFixedArray());
				159	// Compilation succeeded so the data is set on the regexp
				160	// and we can store it in the cache.
				161	Handle<FixedArray> data(FixedArray::cast(re->data()));
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	162	compilation_cache->PutRegExp(pattern, flags, data);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	163
				164	return re;
				165	}
				166
				167
				168	Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				169	Handle<String> subject,
				170	int index,
				171	Handle<JSArray> last_match_info) {
				172	switch (regexp->TypeTag()) {
				173	case JSRegExp::ATOM:
				174	return AtomExec(regexp, subject, index, last_match_info);
				175	case JSRegExp::IRREGEXP: {
				176	Handle<Object> result =
				177	IrregexpExec(regexp, subject, index, last_match_info);
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	178	ASSERT(!result.is_null() \|\|
				179	regexp->GetIsolate()->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	180	return result;
				181	}
				182	default:
				183	UNREACHABLE();
				184	return Handle<Object>::null();
				185	}
				186	}
				187
				188
				189	// RegExp Atom implementation: Simple string search using indexOf.
				190
				191
				192	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				193	Handle<String> pattern,
				194	JSRegExp::Flags flags,
				195	Handle<String> match_pattern) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	196	re->GetIsolate()->factory()->SetRegExpAtomData(re,
				197	JSRegExp::ATOM,
				198	pattern,
				199	flags,
				200	match_pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	201	}
				202
				203
				204	static void SetAtomLastCapture(FixedArray* array,
				205	String* subject,
				206	int from,
				207	int to) {
				208	NoHandleAllocation no_handles;
				209	RegExpImpl::SetLastCaptureCount(array, 2);
				210	RegExpImpl::SetLastSubject(array, subject);
				211	RegExpImpl::SetLastInput(array, subject);
				212	RegExpImpl::SetCapture(array, 0, from);
				213	RegExpImpl::SetCapture(array, 1, to);
				214	}
				215
				216
				217	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				218	Handle<String> subject,
				219	int index,
				220	Handle<JSArray> last_match_info) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	221	Isolate* isolate = re->GetIsolate();
				222
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	223	ASSERT(0 <= index);
				224	ASSERT(index <= subject->length());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	225
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	226	if (!subject->IsFlat()) FlattenString(subject);
				227	AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	228
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	229	String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
				230	int needle_len = needle->length();
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	231	ASSERT(needle->IsFlat());
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	232
				233	if (needle_len != 0) {
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	234	if (index + needle_len > subject->length()) {
				235	return isolate->factory()->null_value();
				236	}
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	237
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	238	String::FlatContent needle_content = needle->GetFlatContent();
				239	String::FlatContent subject_content = subject->GetFlatContent();
				240	ASSERT(needle_content.IsFlat());
				241	ASSERT(subject_content.IsFlat());
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	242	// dispatch on type of strings
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	243	index = (needle_content.IsAscii()
				244	? (subject_content.IsAscii()
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	245	? SearchString(isolate,
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	246	subject_content.ToAsciiVector(),
				247	needle_content.ToAsciiVector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	248	index)
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	249	: SearchString(isolate,
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	250	subject_content.ToUC16Vector(),
				251	needle_content.ToAsciiVector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	252	index))
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	253	: (subject_content.IsAscii()
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	254	? SearchString(isolate,
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	255	subject_content.ToAsciiVector(),
				256	needle_content.ToUC16Vector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	257	index)
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	258	: SearchString(isolate,
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	259	subject_content.ToUC16Vector(),
				260	needle_content.ToUC16Vector(),
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	261	index)));
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	262	if (index == -1) return isolate->factory()->null_value();
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	263	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	264	ASSERT(last_match_info->HasFastElements());
				265
				266	{
				267	NoHandleAllocation no_handles;
				268	FixedArray* array = FixedArray::cast(last_match_info->elements());
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	269	SetAtomLastCapture(array, *subject, index, index + needle_len);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	270	}
				271	return last_match_info;
				272	}
				273
				274
				275	// Irregexp implementation.
				276
				277	// Ensures that the regexp object contains a compiled version of the
				278	// source for either ASCII or non-ASCII strings.
				279	// If the compiled version doesn't already exist, it is compiled
				280	// from the source pattern.
				281	// If compilation fails, an exception is thrown and this function
				282	// returns false.
				283	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				284	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	285	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	286	if (compiled_code->IsByteArray()) return true;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	287	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				288	if (compiled_code->IsCode()) return true;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	289	#endif
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	290	// We could potentially have marked this as flushable, but have kept
				291	// a saved version if we did not flush it yet.
				292	Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
				293	if (saved_code->IsCode()) {
				294	// Reinstate the code in the original place.
				295	re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
				296	ASSERT(compiled_code->IsSmi());
				297	return true;
				298	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	299	return CompileIrregexp(re, is_ascii);
				300	}
				301
				302
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	303	static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
				304	bool is_ascii,
				305	Handle<String> error_message,
				306	Isolate* isolate) {
				307	Factory* factory = isolate->factory();
				308	Handle<FixedArray> elements = factory->NewFixedArray(2);
				309	elements->set(0, re->Pattern());
				310	elements->set(1, *error_message);
				311	Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
				312	Handle<Object> regexp_err =
				313	factory->NewSyntaxError("malformed_regexp", array);
				314	isolate->Throw(*regexp_err);
				315	return false;
				316	}
				317
				318
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	319	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				320	// Compile the RegExp.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	321	Isolate* isolate = re->GetIsolate();
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	322	ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	323	PostponeInterruptsScope postpone(isolate);
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	324	// If we had a compilation error the last time this is saved at the
				325	// saved code index.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	326	Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	327	// When arriving here entry can only be a smi, either representing an
				328	// uncompiled regexp, a previous compilation error, or code that has
				329	// been flushed.
				330	ASSERT(entry->IsSmi());
				331	int entry_value = Smi::cast(entry)->value();
				332	ASSERT(entry_value == JSRegExp::kUninitializedValue \|\|
				333	entry_value == JSRegExp::kCompilationErrorValue \|\|
				334	(entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
				335
				336	if (entry_value == JSRegExp::kCompilationErrorValue) {
				337	// A previous compilation failed and threw an error which we store in
				338	// the saved code index (we store the error message, not the actual
				339	// error). Recreate the error object and throw it.
				340	Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
				341	ASSERT(error_string->IsString());
				342	Handle<String> error_message(String::cast(error_string));
				343	CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	344	return false;
				345	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	346
				347	JSRegExp::Flags flags = re->GetFlags();
				348
				349	Handle<String> pattern(re->Pattern());
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	350	if (!pattern->IsFlat()) FlattenString(pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	351	RegExpCompileData compile_data;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	352	FlatStringReader reader(isolate, pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	353	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				354	&compile_data)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	355	// Throw an exception if we fail to parse the pattern.
				356	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				357	ThrowRegExpException(re,
				358	pattern,
				359	compile_data.error,
				360	"malformed_regexp");
				361	return false;
				362	}
				363	RegExpEngine::CompilationResult result =
				364	RegExpEngine::Compile(&compile_data,
				365	flags.is_ignore_case(),
				366	flags.is_multiline(),
				367	pattern,
				368	is_ascii);
				369	if (result.error_message != NULL) {
				370	// Unable to compile regexp.
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame]	371	Handle<String> error_message =
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	372	isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
				373	CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	374	return false;
				375	}
				376
				377	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				378	data->set(JSRegExp::code_index(is_ascii), result.code);
				379	int register_max = IrregexpMaxRegisterCount(*data);
				380	if (result.num_registers > register_max) {
				381	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				382	}
				383
				384	return true;
				385	}
				386
				387
				388	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				389	return Smi::cast(
				390	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				391	}
				392
				393
				394	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				395	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				396	}
				397
				398
				399	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				400	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				401	}
				402
				403
				404	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				405	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				406	}
				407
				408
				409	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
				410	return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
				411	}
				412
				413
				414	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
				415	return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
				416	}
				417
				418
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	419	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				420	Handle<String> pattern,
				421	JSRegExp::Flags flags,
				422	int capture_count) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	423	// Initialize compiled code entries to null.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	424	re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
				425	JSRegExp::IRREGEXP,
				426	pattern,
				427	flags,
				428	capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	429	}
				430
				431
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	432	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				433	Handle<String> subject) {
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	434	if (!subject->IsFlat()) FlattenString(subject);
				435
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	436	// Check the asciiness of the underlying storage.
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	437	bool is_ascii = subject->IsAsciiRepresentationUnderneath();
				438	if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
				439
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	440	#ifdef V8_INTERPRETED_REGEXP
				441	// Byte-code regexp needs space allocated for all its registers.
				442	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
				443	#else // V8_INTERPRETED_REGEXP
				444	// Native regexp only needs room to output captures. Registers are handled
				445	// internally.
				446	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				447	#endif // V8_INTERPRETED_REGEXP
				448	}
				449
				450
Steve Block	791712a	2010-08-27 10:21:07 +0100	[diff] [blame]	451	RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
				452	Handle<JSRegExp> regexp,
				453	Handle<String> subject,
				454	int index,
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	455	Vector<int> output) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	456	Isolate* isolate = regexp->GetIsolate();
				457
				458	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	459
				460	ASSERT(index >= 0);
				461	ASSERT(index <= subject->length());
				462	ASSERT(subject->IsFlat());
				463
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	464	bool is_ascii = subject->IsAsciiRepresentationUnderneath();
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	465
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	466	#ifndef V8_INTERPRETED_REGEXP
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	467	ASSERT(output.length() >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	468	do {
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	469	EnsureCompiledIrregexp(regexp, is_ascii);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	470	Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	471	NativeRegExpMacroAssembler::Result res =
				472	NativeRegExpMacroAssembler::Match(code,
				473	subject,
				474	output.start(),
				475	output.length(),
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	476	index,
				477	isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	478	if (res != NativeRegExpMacroAssembler::RETRY) {
				479	ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	480	isolate->has_pending_exception());
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	481	STATIC_ASSERT(
				482	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				483	STATIC_ASSERT(
				484	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				485	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				486	== RE_EXCEPTION);
				487	return static_cast<IrregexpResult>(res);
				488	}
				489	// If result is RETRY, the string has changed representation, and we
				490	// must restart from scratch.
				491	// In this case, it means we must make sure we are prepared to handle
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	492	// the, potentially, different subject (the string can switch between
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	493	// being internal and external, and even between being ASCII and UC16,
				494	// but the characters are always the same).
				495	IrregexpPrepare(regexp, subject);
Ben Murdoch	69a99ed	2011-11-30 16:03:39 +0000	[diff] [blame]	496	is_ascii = subject->IsAsciiRepresentationUnderneath();
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	497	} while (true);
				498	UNREACHABLE();
				499	return RE_EXCEPTION;
				500	#else // V8_INTERPRETED_REGEXP
				501
				502	ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	503	// We must have done EnsureCompiledIrregexp, so we can get the number of
				504	// registers.
				505	int* register_vector = output.start();
				506	int number_of_capture_registers =
				507	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				508	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				509	register_vector[i] = -1;
				510	}
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	511	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	512
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	513	IrregexpResult result = IrregexpInterpreter::Match(isolate,
				514	byte_codes,
				515	subject,
				516	register_vector,
				517	index);
				518	if (result == RE_EXCEPTION) {
				519	ASSERT(!isolate->has_pending_exception());
				520	isolate->StackOverflow();
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	521	}
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	522	return result;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	523	#endif // V8_INTERPRETED_REGEXP
				524	}
				525
				526
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	527	Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
				528	Handle<String> subject,
				529	int previous_index,
				530	Handle<JSArray> last_match_info) {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	531	Isolate* isolate = jsregexp->GetIsolate();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	532	ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
				533
				534	// Prepare space for the return values.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	535	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	536	#ifdef DEBUG
				537	if (FLAG_trace_regexp_bytecodes) {
				538	String* pattern = jsregexp->Pattern();
				539	PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
				540	PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
				541	}
				542	#endif
				543	#endif
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	544	int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
				545	if (required_registers < 0) {
				546	// Compiling failed with an exception.
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	547	ASSERT(isolate->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	548	return Handle<Object>::null();
				549	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	550
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	551	OffsetsVector registers(required_registers, isolate);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	552
Iain Merrick	7568138	2010-08-19 15:07:18 +0100	[diff] [blame]	553	IrregexpResult res = RegExpImpl::IrregexpExecOnce(
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	554	jsregexp, subject, previous_index, Vector<int>(registers.vector(),
				555	registers.length()));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	556	if (res == RE_SUCCESS) {
				557	int capture_register_count =
				558	(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
				559	last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
				560	AssertNoAllocation no_gc;
				561	int* register_vector = registers.vector();
				562	FixedArray* array = FixedArray::cast(last_match_info->elements());
				563	for (int i = 0; i < capture_register_count; i += 2) {
				564	SetCapture(array, i, register_vector[i]);
				565	SetCapture(array, i + 1, register_vector[i + 1]);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	566	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	567	SetLastCaptureCount(array, capture_register_count);
				568	SetLastSubject(array, *subject);
				569	SetLastInput(array, *subject);
				570	return last_match_info;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	571	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	572	if (res == RE_EXCEPTION) {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	573	ASSERT(isolate->has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	574	return Handle<Object>::null();
				575	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	576	ASSERT(res == RE_FAILURE);
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	577	return isolate->factory()->null_value();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	578	}
				579
				580
				581	// -------------------------------------------------------------------
				582	// Implementation of the Irregexp regular expression engine.
				583	//
				584	// The Irregexp regular expression engine is intended to be a complete
				585	// implementation of ECMAScript regular expressions. It generates either
				586	// bytecodes or native code.
				587
				588	// The Irregexp regexp engine is structured in three steps.
				589	// 1) The parser generates an abstract syntax tree. See ast.cc.
				590	// 2) From the AST a node network is created. The nodes are all
				591	// subclasses of RegExpNode. The nodes represent states when
				592	// executing a regular expression. Several optimizations are
				593	// performed on the node network.
				594	// 3) From the nodes we generate either byte codes or native code
				595	// that can actually execute the regular expression (perform
				596	// the search). The code generation step is described in more
				597	// detail below.
				598
				599	// Code generation.
				600	//
				601	// The nodes are divided into four main categories.
				602	// * Choice nodes
				603	// These represent places where the regular expression can
				604	// match in more than one way. For example on entry to an
				605	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				606	// * Action nodes
				607	// These represent places where some action should be
				608	// performed. Examples include recording the current position
				609	// in the input string to a register (in order to implement
				610	// captures) or other actions on register for example in order
				611	// to implement the counters needed for {} repetitions.
				612	// * Matching nodes
				613	// These attempt to match some element part of the input string.
				614	// Examples of elements include character classes, plain strings
				615	// or back references.
				616	// * End nodes
				617	// These are used to implement the actions required on finding
				618	// a successful match or failing to find a match.
				619	//
				620	// The code generated (whether as byte codes or native code) maintains
				621	// some state as it runs. This consists of the following elements:
				622	//
				623	// * The capture registers. Used for string captures.
				624	// * Other registers. Used for counters etc.
				625	// * The current position.
				626	// * The stack of backtracking information. Used when a matching node
				627	// fails to find a match and needs to try an alternative.
				628	//
				629	// Conceptual regular expression execution model:
				630	//
				631	// There is a simple conceptual model of regular expression execution
				632	// which will be presented first. The actual code generated is a more
				633	// efficient simulation of the simple conceptual model:
				634	//
				635	// * Choice nodes are implemented as follows:
				636	// For each choice except the last {
				637	// push current position
				638	// push backtrack code location
				639	// <generate code to test for choice>
				640	// backtrack code location:
				641	// pop current position
				642	// }
				643	// <generate code to test for last choice>
				644	//
				645	// * Actions nodes are generated as follows
				646	// <push affected registers on backtrack stack>
				647	// <generate code to perform action>
				648	// push backtrack code location
				649	// <generate code to test for following nodes>
				650	// backtrack code location:
				651	// <pop affected registers to restore their state>
				652	// <pop backtrack location from stack and go to it>
				653	//
				654	// * Matching nodes are generated as follows:
				655	// if input string matches at current position
				656	// update current position
				657	// <generate code to test for following nodes>
				658	// else
				659	// <pop backtrack location from stack and go to it>
				660	//
				661	// Thus it can be seen that the current position is saved and restored
				662	// by the choice nodes, whereas the registers are saved and restored by
				663	// by the action nodes that manipulate them.
				664	//
				665	// The other interesting aspect of this model is that nodes are generated
				666	// at the point where they are needed by a recursive call to Emit(). If
				667	// the node has already been code generated then the Emit() call will
				668	// generate a jump to the previously generated code instead. In order to
				669	// limit recursion it is possible for the Emit() function to put the node
				670	// on a work list for later generation and instead generate a jump. The
				671	// destination of the jump is resolved later when the code is generated.
				672	//
				673	// Actual regular expression code generation.
				674	//
				675	// Code generation is actually more complicated than the above. In order
				676	// to improve the efficiency of the generated code some optimizations are
				677	// performed
				678	//
				679	// * Choice nodes have 1-character lookahead.
				680	// A choice node looks at the following character and eliminates some of
				681	// the choices immediately based on that character. This is not yet
				682	// implemented.
				683	// * Simple greedy loops store reduced backtracking information.
				684	// A quantifier like /.*foo/m will greedily match the whole input. It will
				685	// then need to backtrack to a point where it can match "foo". The naive
				686	// implementation of this would push each character position onto the
				687	// backtracking stack, then pop them off one by one. This would use space
				688	// proportional to the length of the input string. However since the "."
				689	// can only match in one way and always has a constant length (in this case
				690	// of 1) it suffices to store the current position on the top of the stack
				691	// once. Matching now becomes merely incrementing the current position and
				692	// backtracking becomes decrementing the current position and checking the
				693	// result against the stored current position. This is faster and saves
				694	// space.
				695	// * The current state is virtualized.
				696	// This is used to defer expensive operations until it is clear that they
				697	// are needed and to generate code for a node more than once, allowing
				698	// specialized an efficient versions of the code to be created. This is
				699	// explained in the section below.
				700	//
				701	// Execution state virtualization.
				702	//
				703	// Instead of emitting code, nodes that manipulate the state can record their
				704	// manipulation in an object called the Trace. The Trace object can record a
				705	// current position offset, an optional backtrack code location on the top of
				706	// the virtualized backtrack stack and some register changes. When a node is
				707	// to be emitted it can flush the Trace or update it. Flushing the Trace
				708	// will emit code to bring the actual state into line with the virtual state.
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	709	// Avoiding flushing the state can postpone some work (e.g. updates of capture
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	710	// registers). Postponing work can save time when executing the regular
				711	// expression since it may be found that the work never has to be done as a
				712	// failure to match can occur. In addition it is much faster to jump to a
				713	// known backtrack code location than it is to pop an unknown backtrack
				714	// location from the stack and jump there.
				715	//
				716	// The virtual state found in the Trace affects code generation. For example
				717	// the virtual state contains the difference between the actual current
				718	// position and the virtual current position, and matching code needs to use
				719	// this offset to attempt a match in the correct location of the input
				720	// string. Therefore code generated for a non-trivial trace is specialized
				721	// to that trace. The code generator therefore has the ability to generate
				722	// code for each node several times. In order to limit the size of the
				723	// generated code there is an arbitrary limit on how many specialized sets of
				724	// code may be generated for a given node. If the limit is reached, the
				725	// trace is flushed and a generic version of the code for a node is emitted.
				726	// This is subsequently used for that node. The code emitted for non-generic
				727	// trace is not recorded in the node and so it cannot currently be reused in
				728	// the event that code generation is requested for an identical trace.
				729
				730
				731	void RegExpTree::AppendToText(RegExpText* text) {
				732	UNREACHABLE();
				733	}
				734
				735
				736	void RegExpAtom::AppendToText(RegExpText* text) {
				737	text->AddElement(TextElement::Atom(this));
				738	}
				739
				740
				741	void RegExpCharacterClass::AppendToText(RegExpText* text) {
				742	text->AddElement(TextElement::CharClass(this));
				743	}
				744
				745
				746	void RegExpText::AppendToText(RegExpText* text) {
				747	for (int i = 0; i < elements()->length(); i++)
				748	text->AddElement(elements()->at(i));
				749	}
				750
				751
				752	TextElement TextElement::Atom(RegExpAtom* atom) {
				753	TextElement result = TextElement(ATOM);
				754	result.data.u_atom = atom;
				755	return result;
				756	}
				757
				758
				759	TextElement TextElement::CharClass(
				760	RegExpCharacterClass* char_class) {
				761	TextElement result = TextElement(CHAR_CLASS);
				762	result.data.u_char_class = char_class;
				763	return result;
				764	}
				765
				766
				767	int TextElement::length() {
				768	if (type == ATOM) {
				769	return data.u_atom->length();
				770	} else {
				771	ASSERT(type == CHAR_CLASS);
				772	return 1;
				773	}
				774	}
				775
				776
				777	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				778	if (table_ == NULL) {
				779	table_ = new DispatchTable();
				780	DispatchTableConstructor cons(table_, ignore_case);
				781	cons.BuildTable(this);
				782	}
				783	return table_;
				784	}
				785
				786
				787	class RegExpCompiler {
				788	public:
				789	RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
				790
				791	int AllocateRegister() {
				792	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				793	reg_exp_too_big_ = true;
				794	return next_register_;
				795	}
				796	return next_register_++;
				797	}
				798
				799	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				800	RegExpNode* start,
				801	int capture_count,
				802	Handle<String> pattern);
				803
				804	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
				805
				806	static const int kImplementationOffset = 0;
				807	static const int kNumberOfRegistersOffset = 0;
				808	static const int kCodeOffset = 1;
				809
				810	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				811	EndNode* accept() { return accept_; }
				812
				813	static const int kMaxRecursion = 100;
				814	inline int recursion_depth() { return recursion_depth_; }
				815	inline void IncrementRecursionDepth() { recursion_depth_++; }
				816	inline void DecrementRecursionDepth() { recursion_depth_--; }
				817
				818	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				819
				820	inline bool ignore_case() { return ignore_case_; }
				821	inline bool ascii() { return ascii_; }
				822
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	823	int current_expansion_factor() { return current_expansion_factor_; }
				824	void set_current_expansion_factor(int value) {
				825	current_expansion_factor_ = value;
				826	}
				827
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	828	static const int kNoRegister = -1;
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	829
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	830	private:
				831	EndNode* accept_;
				832	int next_register_;
				833	List<RegExpNode> work_list_;
				834	int recursion_depth_;
				835	RegExpMacroAssembler* macro_assembler_;
				836	bool ignore_case_;
				837	bool ascii_;
				838	bool reg_exp_too_big_;
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	839	int current_expansion_factor_;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	840	};
				841
				842
				843	class RecursionCheck {
				844	public:
				845	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				846	compiler->IncrementRecursionDepth();
				847	}
				848	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				849	private:
				850	RegExpCompiler* compiler_;
				851	};
				852
				853
				854	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
				855	return RegExpEngine::CompilationResult("RegExp too big");
				856	}
				857
				858
				859	// Attempts to compile the regexp using an Irregexp code generator. Returns
				860	// a fixed array or a null handle depending on whether it succeeded.
				861	RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
				862	: next_register_(2 * (capture_count + 1)),
				863	work_list_(NULL),
				864	recursion_depth_(0),
				865	ignore_case_(ignore_case),
				866	ascii_(ascii),
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	867	reg_exp_too_big_(false),
				868	current_expansion_factor_(1) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	869	accept_ = new EndNode(EndNode::ACCEPT);
				870	ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				871	}
				872
				873
				874	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				875	RegExpMacroAssembler* macro_assembler,
				876	RegExpNode* start,
				877	int capture_count,
				878	Handle<String> pattern) {
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	879	Heap* heap = pattern->GetHeap();
				880
				881	bool use_slow_safe_regexp_compiler = false;
				882	if (heap->total_regexp_code_generated() >
				883	RegExpImpl::kRegWxpCompiledLimit &&
				884	heap->isolate()->memory_allocator()->SizeExecutable() >
				885	RegExpImpl::kRegExpExecutableMemoryLimit) {
				886	use_slow_safe_regexp_compiler = true;
				887	}
				888
				889	macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
				890
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	891	#ifdef DEBUG
				892	if (FLAG_trace_regexp_assembler)
				893	macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
				894	else
				895	#endif
				896	macro_assembler_ = macro_assembler;
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	897
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	898	List <RegExpNode*> work_list(0);
				899	work_list_ = &work_list;
				900	Label fail;
				901	macro_assembler_->PushBacktrack(&fail);
				902	Trace new_trace;
				903	start->Emit(this, &new_trace);
				904	macro_assembler_->Bind(&fail);
				905	macro_assembler_->Fail();
				906	while (!work_list.is_empty()) {
				907	work_list.RemoveLast()->Emit(this, &new_trace);
				908	}
				909	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
				910
Steve Block	053d10c	2011-06-13 19:13:29 +0100	[diff] [blame]	911	Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
				912	heap->IncreaseTotalRegexpCodeGenerated(code->Size());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	913	work_list_ = NULL;
				914	#ifdef DEBUG
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	915	if (FLAG_print_code) {
				916	Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
				917	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	918	if (FLAG_trace_regexp_assembler) {
				919	delete macro_assembler_;
				920	}
				921	#endif
				922	return RegExpEngine::CompilationResult(*code, next_register_);
				923	}
				924
				925
				926	bool Trace::DeferredAction::Mentions(int that) {
				927	if (type() == ActionNode::CLEAR_CAPTURES) {
				928	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				929	return range.Contains(that);
				930	} else {
				931	return reg() == that;
				932	}
				933	}
				934
				935
				936	bool Trace::mentions_reg(int reg) {
				937	for (DeferredAction* action = actions_;
				938	action != NULL;
				939	action = action->next()) {
				940	if (action->Mentions(reg))
				941	return true;
				942	}
				943	return false;
				944	}
				945
				946
				947	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				948	ASSERT_EQ(0, *cp_offset);
				949	for (DeferredAction* action = actions_;
				950	action != NULL;
				951	action = action->next()) {
				952	if (action->Mentions(reg)) {
				953	if (action->type() == ActionNode::STORE_POSITION) {
				954	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				955	return true;
				956	} else {
				957	return false;
				958	}
				959	}
				960	}
				961	return false;
				962	}
				963
				964
				965	int Trace::FindAffectedRegisters(OutSet* affected_registers) {
				966	int max_register = RegExpCompiler::kNoRegister;
				967	for (DeferredAction* action = actions_;
				968	action != NULL;
				969	action = action->next()) {
				970	if (action->type() == ActionNode::CLEAR_CAPTURES) {
				971	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				972	for (int i = range.from(); i <= range.to(); i++)
				973	affected_registers->Set(i);
				974	if (range.to() > max_register) max_register = range.to();
				975	} else {
				976	affected_registers->Set(action->reg());
				977	if (action->reg() > max_register) max_register = action->reg();
				978	}
				979	}
				980	return max_register;
				981	}
				982
				983
				984	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				985	int max_register,
				986	OutSet& registers_to_pop,
				987	OutSet& registers_to_clear) {
				988	for (int reg = max_register; reg >= 0; reg--) {
				989	if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
				990	else if (registers_to_clear.Get(reg)) {
				991	int clear_to = reg;
				992	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				993	reg--;
				994	}
				995	assembler->ClearRegisters(reg, clear_to);
				996	}
				997	}
				998	}
				999
				1000
				1001	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				1002	int max_register,
				1003	OutSet& affected_registers,
				1004	OutSet* registers_to_pop,
				1005	OutSet* registers_to_clear) {
				1006	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				1007	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				1008
				1009	// Count pushes performed to force a stack limit check occasionally.
				1010	int pushes = 0;
				1011
				1012	for (int reg = 0; reg <= max_register; reg++) {
				1013	if (!affected_registers.Get(reg)) {
				1014	continue;
				1015	}
				1016
				1017	// The chronologically first deferred action in the trace
				1018	// is used to infer the action needed to restore a register
				1019	// to its previous state (or not, if it's safe to ignore it).
				1020	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				1021	DeferredActionUndoType undo_action = IGNORE;
				1022
				1023	int value = 0;
				1024	bool absolute = false;
				1025	bool clear = false;
				1026	int store_position = -1;
				1027	// This is a little tricky because we are scanning the actions in reverse
				1028	// historical order (newest first).
				1029	for (DeferredAction* action = actions_;
				1030	action != NULL;
				1031	action = action->next()) {
				1032	if (action->Mentions(reg)) {
				1033	switch (action->type()) {
				1034	case ActionNode::SET_REGISTER: {
				1035	Trace::DeferredSetRegister* psr =
				1036	static_cast<Trace::DeferredSetRegister*>(action);
				1037	if (!absolute) {
				1038	value += psr->value();
				1039	absolute = true;
				1040	}
				1041	// SET_REGISTER is currently only used for newly introduced loop
				1042	// counters. They can have a significant previous value if they
				1043	// occour in a loop. TODO(lrn): Propagate this information, so
				1044	// we can set undo_action to IGNORE if we know there is no value to
				1045	// restore.
				1046	undo_action = RESTORE;
				1047	ASSERT_EQ(store_position, -1);
				1048	ASSERT(!clear);
				1049	break;
				1050	}
				1051	case ActionNode::INCREMENT_REGISTER:
				1052	if (!absolute) {
				1053	value++;
				1054	}
				1055	ASSERT_EQ(store_position, -1);
				1056	ASSERT(!clear);
				1057	undo_action = RESTORE;
				1058	break;
				1059	case ActionNode::STORE_POSITION: {
				1060	Trace::DeferredCapture* pc =
				1061	static_cast<Trace::DeferredCapture*>(action);
				1062	if (!clear && store_position == -1) {
				1063	store_position = pc->cp_offset();
				1064	}
				1065
				1066	// For captures we know that stores and clears alternate.
				1067	// Other register, are never cleared, and if the occur
				1068	// inside a loop, they might be assigned more than once.
				1069	if (reg <= 1) {
				1070	// Registers zero and one, aka "capture zero", is
				1071	// always set correctly if we succeed. There is no
				1072	// need to undo a setting on backtrack, because we
				1073	// will set it again or fail.
				1074	undo_action = IGNORE;
				1075	} else {
				1076	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1077	}
				1078	ASSERT(!absolute);
				1079	ASSERT_EQ(value, 0);
				1080	break;
				1081	}
				1082	case ActionNode::CLEAR_CAPTURES: {
				1083	// Since we're scanning in reverse order, if we've already
				1084	// set the position we have to ignore historically earlier
				1085	// clearing operations.
				1086	if (store_position == -1) {
				1087	clear = true;
				1088	}
				1089	undo_action = RESTORE;
				1090	ASSERT(!absolute);
				1091	ASSERT_EQ(value, 0);
				1092	break;
				1093	}
				1094	default:
				1095	UNREACHABLE();
				1096	break;
				1097	}
				1098	}
				1099	}
				1100	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1101	if (undo_action == RESTORE) {
				1102	pushes++;
				1103	RegExpMacroAssembler::StackCheckFlag stack_check =
				1104	RegExpMacroAssembler::kNoStackLimitCheck;
				1105	if (pushes == push_limit) {
				1106	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1107	pushes = 0;
				1108	}
				1109
				1110	assembler->PushRegister(reg, stack_check);
				1111	registers_to_pop->Set(reg);
				1112	} else if (undo_action == CLEAR) {
				1113	registers_to_clear->Set(reg);
				1114	}
				1115	// Perform the chronologically last action (or accumulated increment)
				1116	// for the register.
				1117	if (store_position != -1) {
				1118	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1119	} else if (clear) {
				1120	assembler->ClearRegisters(reg, reg);
				1121	} else if (absolute) {
				1122	assembler->SetRegister(reg, value);
				1123	} else if (value != 0) {
				1124	assembler->AdvanceRegister(reg, value);
				1125	}
				1126	}
				1127	}
				1128
				1129
				1130	// This is called as we come into a loop choice node and some other tricky
				1131	// nodes. It normalizes the state of the code generator to ensure we can
				1132	// generate generic code.
				1133	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1134	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1135
				1136	ASSERT(!is_trivial());
				1137
				1138	if (actions_ == NULL && backtrack() == NULL) {
				1139	// Here we just have some deferred cp advances to fix and we are back to
				1140	// a normal situation. We may also have to forget some information gained
				1141	// through a quick check that was already performed.
				1142	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1143	// Create a new trivial state and generate the node with that.
				1144	Trace new_state;
				1145	successor->Emit(compiler, &new_state);
				1146	return;
				1147	}
				1148
				1149	// Generate deferred actions here along with code to undo them again.
				1150	OutSet affected_registers;
				1151
				1152	if (backtrack() != NULL) {
				1153	// Here we have a concrete backtrack location. These are set up by choice
				1154	// nodes and so they indicate that we have a deferred save of the current
				1155	// position which we may need to emit here.
				1156	assembler->PushCurrentPosition();
				1157	}
				1158
				1159	int max_register = FindAffectedRegisters(&affected_registers);
				1160	OutSet registers_to_pop;
				1161	OutSet registers_to_clear;
				1162	PerformDeferredActions(assembler,
				1163	max_register,
				1164	affected_registers,
				1165	&registers_to_pop,
				1166	&registers_to_clear);
				1167	if (cp_offset_ != 0) {
				1168	assembler->AdvanceCurrentPosition(cp_offset_);
				1169	}
				1170
				1171	// Create a new trivial state and generate the node with that.
				1172	Label undo;
				1173	assembler->PushBacktrack(&undo);
				1174	Trace new_state;
				1175	successor->Emit(compiler, &new_state);
				1176
				1177	// On backtrack we need to restore state.
				1178	assembler->Bind(&undo);
				1179	RestoreAffectedRegisters(assembler,
				1180	max_register,
				1181	registers_to_pop,
				1182	registers_to_clear);
				1183	if (backtrack() == NULL) {
				1184	assembler->Backtrack();
				1185	} else {
				1186	assembler->PopCurrentPosition();
				1187	assembler->GoTo(backtrack());
				1188	}
				1189	}
				1190
				1191
				1192	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1193	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1194
				1195	// Omit flushing the trace. We discard the entire stack frame anyway.
				1196
				1197	if (!label()->is_bound()) {
				1198	// We are completely independent of the trace, since we ignore it,
				1199	// so this code can be used as the generic version.
				1200	assembler->Bind(label());
				1201	}
				1202
				1203	// Throw away everything on the backtrack stack since the start
				1204	// of the negative submatch and restore the character position.
				1205	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1206	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1207	if (clear_capture_count_ > 0) {
				1208	// Clear any captures that might have been performed during the success
				1209	// of the body of the negative look-ahead.
				1210	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1211	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1212	}
				1213	// Now that we have unwound the stack we find at the top of the stack the
				1214	// backtrack that the BeginSubmatch node got.
				1215	assembler->Backtrack();
				1216	}
				1217
				1218
				1219	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1220	if (!trace->is_trivial()) {
				1221	trace->Flush(compiler, this);
				1222	return;
				1223	}
				1224	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1225	if (!label()->is_bound()) {
				1226	assembler->Bind(label());
				1227	}
				1228	switch (action_) {
				1229	case ACCEPT:
				1230	assembler->Succeed();
				1231	return;
				1232	case BACKTRACK:
				1233	assembler->GoTo(trace->backtrack());
				1234	return;
				1235	case NEGATIVE_SUBMATCH_SUCCESS:
				1236	// This case is handled in a different virtual method.
				1237	UNREACHABLE();
				1238	}
				1239	UNIMPLEMENTED();
				1240	}
				1241
				1242
				1243	void GuardedAlternative::AddGuard(Guard* guard) {
				1244	if (guards_ == NULL)
				1245	guards_ = new ZoneList<Guard*>(1);
				1246	guards_->Add(guard);
				1247	}
				1248
				1249
				1250	ActionNode* ActionNode::SetRegister(int reg,
				1251	int val,
				1252	RegExpNode* on_success) {
				1253	ActionNode* result = new ActionNode(SET_REGISTER, on_success);
				1254	result->data_.u_store_register.reg = reg;
				1255	result->data_.u_store_register.value = val;
				1256	return result;
				1257	}
				1258
				1259
				1260	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1261	ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
				1262	result->data_.u_increment_register.reg = reg;
				1263	return result;
				1264	}
				1265
				1266
				1267	ActionNode* ActionNode::StorePosition(int reg,
				1268	bool is_capture,
				1269	RegExpNode* on_success) {
				1270	ActionNode* result = new ActionNode(STORE_POSITION, on_success);
				1271	result->data_.u_position_register.reg = reg;
				1272	result->data_.u_position_register.is_capture = is_capture;
				1273	return result;
				1274	}
				1275
				1276
				1277	ActionNode* ActionNode::ClearCaptures(Interval range,
				1278	RegExpNode* on_success) {
				1279	ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
				1280	result->data_.u_clear_captures.range_from = range.from();
				1281	result->data_.u_clear_captures.range_to = range.to();
				1282	return result;
				1283	}
				1284
				1285
				1286	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1287	int position_reg,
				1288	RegExpNode* on_success) {
				1289	ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
				1290	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1291	result->data_.u_submatch.current_position_register = position_reg;
				1292	return result;
				1293	}
				1294
				1295
				1296	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1297	int position_reg,
				1298	int clear_register_count,
				1299	int clear_register_from,
				1300	RegExpNode* on_success) {
				1301	ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1302	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1303	result->data_.u_submatch.current_position_register = position_reg;
				1304	result->data_.u_submatch.clear_register_count = clear_register_count;
				1305	result->data_.u_submatch.clear_register_from = clear_register_from;
				1306	return result;
				1307	}
				1308
				1309
				1310	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1311	int repetition_register,
				1312	int repetition_limit,
				1313	RegExpNode* on_success) {
				1314	ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
				1315	result->data_.u_empty_match_check.start_register = start_register;
				1316	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1317	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1318	return result;
				1319	}
				1320
				1321
				1322	#define DEFINE_ACCEPT(Type) \
				1323	void Type##Node::Accept(NodeVisitor* visitor) { \
				1324	visitor->Visit##Type(this); \
				1325	}
				1326	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1327	#undef DEFINE_ACCEPT
				1328
				1329
				1330	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1331	visitor->VisitLoopChoice(this);
				1332	}
				1333
				1334
				1335	// -------------------------------------------------------------------
				1336	// Emit code.
				1337
				1338
				1339	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1340	Guard* guard,
				1341	Trace* trace) {
				1342	switch (guard->op()) {
				1343	case Guard::LT:
				1344	ASSERT(!trace->mentions_reg(guard->reg()));
				1345	macro_assembler->IfRegisterGE(guard->reg(),
				1346	guard->value(),
				1347	trace->backtrack());
				1348	break;
				1349	case Guard::GEQ:
				1350	ASSERT(!trace->mentions_reg(guard->reg()));
				1351	macro_assembler->IfRegisterLT(guard->reg(),
				1352	guard->value(),
				1353	trace->backtrack());
				1354	break;
				1355	}
				1356	}
				1357
				1358
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1359	// Returns the number of characters in the equivalence class, omitting those
				1360	// that cannot occur in the source string because it is ASCII.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1361	static int GetCaseIndependentLetters(Isolate* isolate,
				1362	uc16 character,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1363	bool ascii_subject,
				1364	unibrow::uchar* letters) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1365	int length =
				1366	isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	1367	// Unibrow returns 0 or 1 for characters where case independence is
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1368	// trivial.
				1369	if (length == 0) {
				1370	letters[0] = character;
				1371	length = 1;
				1372	}
				1373	if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {
				1374	return length;
				1375	}
				1376	// The standard requires that non-ASCII characters cannot have ASCII
				1377	// character codes in their equivalence class.
				1378	return 0;
				1379	}
				1380
				1381
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1382	static inline bool EmitSimpleCharacter(Isolate* isolate,
				1383	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1384	uc16 c,
				1385	Label* on_failure,
				1386	int cp_offset,
				1387	bool check,
				1388	bool preloaded) {
				1389	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1390	bool bound_checked = false;
				1391	if (!preloaded) {
				1392	assembler->LoadCurrentCharacter(
				1393	cp_offset,
				1394	on_failure,
				1395	check);
				1396	bound_checked = true;
				1397	}
				1398	assembler->CheckNotCharacter(c, on_failure);
				1399	return bound_checked;
				1400	}
				1401
				1402
				1403	// Only emits non-letters (things that don't have case). Only used for case
				1404	// independent matches.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1405	static inline bool EmitAtomNonLetter(Isolate* isolate,
				1406	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1407	uc16 c,
				1408	Label* on_failure,
				1409	int cp_offset,
				1410	bool check,
				1411	bool preloaded) {
				1412	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1413	bool ascii = compiler->ascii();
				1414	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1415	int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1416	if (length < 1) {
				1417	// This can't match. Must be an ASCII subject and a non-ASCII character.
				1418	// We do not need to do anything since the ASCII pass already handled this.
				1419	return false; // Bounds not checked.
				1420	}
				1421	bool checked = false;
				1422	// We handle the length > 1 case in a later pass.
				1423	if (length == 1) {
				1424	if (ascii && c > String::kMaxAsciiCharCodeU) {
				1425	// Can't match - see above.
				1426	return false; // Bounds not checked.
				1427	}
				1428	if (!preloaded) {
				1429	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1430	checked = check;
				1431	}
				1432	macro_assembler->CheckNotCharacter(c, on_failure);
				1433	}
				1434	return checked;
				1435	}
				1436
				1437
				1438	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1439	bool ascii,
				1440	uc16 c1,
				1441	uc16 c2,
				1442	Label* on_failure) {
				1443	uc16 char_mask;
				1444	if (ascii) {
				1445	char_mask = String::kMaxAsciiCharCode;
				1446	} else {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1447	char_mask = String::kMaxUtf16CodeUnit;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1448	}
				1449	uc16 exor = c1 ^ c2;
				1450	// Check whether exor has only one bit set.
				1451	if (((exor - 1) & exor) == 0) {
				1452	// If c1 and c2 differ only by one bit.
				1453	// Ecma262UnCanonicalize always gives the highest number last.
				1454	ASSERT(c2 > c1);
				1455	uc16 mask = char_mask ^ exor;
				1456	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1457	return true;
				1458	}
				1459	ASSERT(c2 > c1);
				1460	uc16 diff = c2 - c1;
				1461	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1462	// If the characters differ by 2^n but don't differ by one bit then
				1463	// subtract the difference from the found character, then do the or
				1464	// trick. We avoid the theoretical case where negative numbers are
				1465	// involved in order to simplify code generation.
				1466	uc16 mask = char_mask ^ diff;
				1467	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1468	diff,
				1469	mask,
				1470	on_failure);
				1471	return true;
				1472	}
				1473	return false;
				1474	}
				1475
				1476
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1477	typedef bool EmitCharacterFunction(Isolate* isolate,
				1478	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1479	uc16 c,
				1480	Label* on_failure,
				1481	int cp_offset,
				1482	bool check,
				1483	bool preloaded);
				1484
				1485	// Only emits letters (things that have case). Only used for case independent
				1486	// matches.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1487	static inline bool EmitAtomLetter(Isolate* isolate,
				1488	RegExpCompiler* compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1489	uc16 c,
				1490	Label* on_failure,
				1491	int cp_offset,
				1492	bool check,
				1493	bool preloaded) {
				1494	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1495	bool ascii = compiler->ascii();
				1496	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1497	int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1498	if (length <= 1) return false;
				1499	// We may not need to check against the end of the input string
				1500	// if this character lies before a character that matched.
				1501	if (!preloaded) {
				1502	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1503	}
				1504	Label ok;
				1505	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1506	switch (length) {
				1507	case 2: {
				1508	if (ShortCutEmitCharacterPair(macro_assembler,
				1509	ascii,
				1510	chars[0],
				1511	chars[1],
				1512	on_failure)) {
				1513	} else {
				1514	macro_assembler->CheckCharacter(chars[0], &ok);
				1515	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1516	macro_assembler->Bind(&ok);
				1517	}
				1518	break;
				1519	}
				1520	case 4:
				1521	macro_assembler->CheckCharacter(chars[3], &ok);
				1522	// Fall through!
				1523	case 3:
				1524	macro_assembler->CheckCharacter(chars[0], &ok);
				1525	macro_assembler->CheckCharacter(chars[1], &ok);
				1526	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1527	macro_assembler->Bind(&ok);
				1528	break;
				1529	default:
				1530	UNREACHABLE();
				1531	break;
				1532	}
				1533	return true;
				1534	}
				1535
				1536
				1537	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				1538	RegExpCharacterClass* cc,
				1539	bool ascii,
				1540	Label* on_failure,
				1541	int cp_offset,
				1542	bool check_offset,
				1543	bool preloaded) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1544	ZoneList<CharacterRange>* ranges = cc->ranges();
				1545	int max_char;
				1546	if (ascii) {
				1547	max_char = String::kMaxAsciiCharCode;
				1548	} else {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1549	max_char = String::kMaxUtf16CodeUnit;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1550	}
				1551
				1552	Label success;
				1553
				1554	Label* char_is_in_class =
				1555	cc->is_negated() ? on_failure : &success;
				1556
				1557	int range_count = ranges->length();
				1558
				1559	int last_valid_range = range_count - 1;
				1560	while (last_valid_range >= 0) {
				1561	CharacterRange& range = ranges->at(last_valid_range);
				1562	if (range.from() <= max_char) {
				1563	break;
				1564	}
				1565	last_valid_range--;
				1566	}
				1567
				1568	if (last_valid_range < 0) {
				1569	if (!cc->is_negated()) {
				1570	// TODO(plesner): We can remove this when the node level does our
				1571	// ASCII optimizations for us.
				1572	macro_assembler->GoTo(on_failure);
				1573	}
				1574	if (check_offset) {
				1575	macro_assembler->CheckPosition(cp_offset, on_failure);
				1576	}
				1577	return;
				1578	}
				1579
				1580	if (last_valid_range == 0 &&
				1581	!cc->is_negated() &&
				1582	ranges->at(0).IsEverything(max_char)) {
				1583	// This is a common case hit by non-anchored expressions.
				1584	if (check_offset) {
				1585	macro_assembler->CheckPosition(cp_offset, on_failure);
				1586	}
				1587	return;
				1588	}
				1589
				1590	if (!preloaded) {
				1591	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				1592	}
				1593
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1594	if (cc->is_standard() &&
				1595	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				1596	on_failure)) {
				1597	return;
				1598	}
				1599
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1600	for (int i = 0; i < last_valid_range; i++) {
				1601	CharacterRange& range = ranges->at(i);
				1602	Label next_range;
				1603	uc16 from = range.from();
				1604	uc16 to = range.to();
				1605	if (from > max_char) {
				1606	continue;
				1607	}
				1608	if (to > max_char) to = max_char;
				1609	if (to == from) {
				1610	macro_assembler->CheckCharacter(to, char_is_in_class);
				1611	} else {
				1612	if (from != 0) {
				1613	macro_assembler->CheckCharacterLT(from, &next_range);
				1614	}
				1615	if (to != max_char) {
				1616	macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
				1617	} else {
				1618	macro_assembler->GoTo(char_is_in_class);
				1619	}
				1620	}
				1621	macro_assembler->Bind(&next_range);
				1622	}
				1623
				1624	CharacterRange& range = ranges->at(last_valid_range);
				1625	uc16 from = range.from();
				1626	uc16 to = range.to();
				1627
				1628	if (to > max_char) to = max_char;
				1629	ASSERT(to >= from);
				1630
				1631	if (to == from) {
				1632	if (cc->is_negated()) {
				1633	macro_assembler->CheckCharacter(to, on_failure);
				1634	} else {
				1635	macro_assembler->CheckNotCharacter(to, on_failure);
				1636	}
				1637	} else {
				1638	if (from != 0) {
				1639	if (cc->is_negated()) {
				1640	macro_assembler->CheckCharacterLT(from, &success);
				1641	} else {
				1642	macro_assembler->CheckCharacterLT(from, on_failure);
				1643	}
				1644	}
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1645	if (to != String::kMaxUtf16CodeUnit) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1646	if (cc->is_negated()) {
				1647	macro_assembler->CheckCharacterLT(to + 1, on_failure);
				1648	} else {
				1649	macro_assembler->CheckCharacterGT(to, on_failure);
				1650	}
				1651	} else {
				1652	if (cc->is_negated()) {
				1653	macro_assembler->GoTo(on_failure);
				1654	}
				1655	}
				1656	}
				1657	macro_assembler->Bind(&success);
				1658	}
				1659
				1660
				1661	RegExpNode::~RegExpNode() {
				1662	}
				1663
				1664
				1665	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				1666	Trace* trace) {
				1667	// If we are generating a greedy loop then don't stop and don't reuse code.
				1668	if (trace->stop_node() != NULL) {
				1669	return CONTINUE;
				1670	}
				1671
				1672	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1673	if (trace->is_trivial()) {
				1674	if (label_.is_bound()) {
				1675	// We are being asked to generate a generic version, but that's already
				1676	// been done so just go to it.
				1677	macro_assembler->GoTo(&label_);
				1678	return DONE;
				1679	}
				1680	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
				1681	// To avoid too deep recursion we push the node to the work queue and just
				1682	// generate a goto here.
				1683	compiler->AddWork(this);
				1684	macro_assembler->GoTo(&label_);
				1685	return DONE;
				1686	}
				1687	// Generate generic version of the node and bind the label for later use.
				1688	macro_assembler->Bind(&label_);
				1689	return CONTINUE;
				1690	}
				1691
				1692	// We are being asked to make a non-generic version. Keep track of how many
				1693	// non-generic versions we generate so as not to overdo it.
				1694	trace_count_++;
				1695	if (FLAG_regexp_optimization &&
				1696	trace_count_ < kMaxCopiesCodeGenerated &&
				1697	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
				1698	return CONTINUE;
				1699	}
				1700
				1701	// If we get here code has been generated for this node too many times or
				1702	// recursion is too deep. Time to switch to a generic version. The code for
				1703	// generic versions above can handle deep recursion properly.
				1704	trace->Flush(compiler, this);
				1705	return DONE;
				1706	}
				1707
				1708
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1709	int ActionNode::EatsAtLeast(int still_to_find,
				1710	int recursion_depth,
				1711	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1712	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1713	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1714	return on_success()->EatsAtLeast(still_to_find,
				1715	recursion_depth + 1,
				1716	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1717	}
				1718
				1719
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1720	int AssertionNode::EatsAtLeast(int still_to_find,
				1721	int recursion_depth,
				1722	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1723	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1724	// If we know we are not at the start and we are asked "how many characters
				1725	// will you match if you succeed?" then we can answer anything since false
				1726	// implies false. So lets just return the max answer (still_to_find) since
				1727	// that won't prevent us from preloading a lot of characters for the other
				1728	// branches in the node graph.
				1729	if (type() == AT_START && not_at_start) return still_to_find;
				1730	return on_success()->EatsAtLeast(still_to_find,
				1731	recursion_depth + 1,
				1732	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1733	}
				1734
				1735
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1736	int BackReferenceNode::EatsAtLeast(int still_to_find,
				1737	int recursion_depth,
				1738	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1739	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1740	return on_success()->EatsAtLeast(still_to_find,
				1741	recursion_depth + 1,
				1742	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1743	}
				1744
				1745
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1746	int TextNode::EatsAtLeast(int still_to_find,
				1747	int recursion_depth,
				1748	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1749	int answer = Length();
				1750	if (answer >= still_to_find) return answer;
				1751	if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1752	// We are not at start after this node so we set the last argument to 'true'.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1753	return answer + on_success()->EatsAtLeast(still_to_find - answer,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1754	recursion_depth + 1,
				1755	true);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1756	}
				1757
				1758
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1759	int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1760	int recursion_depth,
				1761	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1762	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1763	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1764	// afterwards.
				1765	RegExpNode* node = alternatives_->at(1).node();
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1766	return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1767	}
				1768
				1769
				1770	void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
				1771	QuickCheckDetails* details,
				1772	RegExpCompiler* compiler,
				1773	int filled_in,
				1774	bool not_at_start) {
				1775	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1776	// afterwards.
				1777	RegExpNode* node = alternatives_->at(1).node();
				1778	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				1779	}
				1780
				1781
				1782	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				1783	int recursion_depth,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1784	RegExpNode* ignore_this_node,
				1785	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1786	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1787	int min = 100;
				1788	int choice_count = alternatives_->length();
				1789	for (int i = 0; i < choice_count; i++) {
				1790	RegExpNode* node = alternatives_->at(i).node();
				1791	if (node == ignore_this_node) continue;
				1792	int node_eats_at_least = node->EatsAtLeast(still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1793	recursion_depth + 1,
				1794	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1795	if (node_eats_at_least < min) min = node_eats_at_least;
				1796	}
				1797	return min;
				1798	}
				1799
				1800
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1801	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				1802	int recursion_depth,
				1803	bool not_at_start) {
				1804	return EatsAtLeastHelper(still_to_find,
				1805	recursion_depth,
				1806	loop_node_,
				1807	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1808	}
				1809
				1810
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1811	int ChoiceNode::EatsAtLeast(int still_to_find,
				1812	int recursion_depth,
				1813	bool not_at_start) {
				1814	return EatsAtLeastHelper(still_to_find,
				1815	recursion_depth,
				1816	NULL,
				1817	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1818	}
				1819
				1820
				1821	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				1822	static inline uint32_t SmearBitsRight(uint32_t v) {
				1823	v \|= v >> 1;
				1824	v \|= v >> 2;
				1825	v \|= v >> 4;
				1826	v \|= v >> 8;
				1827	v \|= v >> 16;
				1828	return v;
				1829	}
				1830
				1831
				1832	bool QuickCheckDetails::Rationalize(bool asc) {
				1833	bool found_useful_op = false;
				1834	uint32_t char_mask;
				1835	if (asc) {
				1836	char_mask = String::kMaxAsciiCharCode;
				1837	} else {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1838	char_mask = String::kMaxUtf16CodeUnit;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1839	}
				1840	mask_ = 0;
				1841	value_ = 0;
				1842	int char_shift = 0;
				1843	for (int i = 0; i < characters_; i++) {
				1844	Position* pos = &positions_[i];
				1845	if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
				1846	found_useful_op = true;
				1847	}
				1848	mask_ \|= (pos->mask & char_mask) << char_shift;
				1849	value_ \|= (pos->value & char_mask) << char_shift;
				1850	char_shift += asc ? 8 : 16;
				1851	}
				1852	return found_useful_op;
				1853	}
				1854
				1855
				1856	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				1857	Trace* trace,
				1858	bool preload_has_checked_bounds,
				1859	Label* on_possible_success,
				1860	QuickCheckDetails* details,
				1861	bool fall_through_on_failure) {
				1862	if (details->characters() == 0) return false;
				1863	GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
				1864	if (details->cannot_match()) return false;
				1865	if (!details->Rationalize(compiler->ascii())) return false;
				1866	ASSERT(details->characters() == 1 \|\|
				1867	compiler->macro_assembler()->CanReadUnaligned());
				1868	uint32_t mask = details->mask();
				1869	uint32_t value = details->value();
				1870
				1871	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1872
				1873	if (trace->characters_preloaded() != details->characters()) {
				1874	assembler->LoadCurrentCharacter(trace->cp_offset(),
				1875	trace->backtrack(),
				1876	!preload_has_checked_bounds,
				1877	details->characters());
				1878	}
				1879
				1880
				1881	bool need_mask = true;
				1882
				1883	if (details->characters() == 1) {
				1884	// If number of characters preloaded is 1 then we used a byte or 16 bit
				1885	// load so the value is already masked down.
				1886	uint32_t char_mask;
				1887	if (compiler->ascii()) {
				1888	char_mask = String::kMaxAsciiCharCode;
				1889	} else {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1890	char_mask = String::kMaxUtf16CodeUnit;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1891	}
				1892	if ((mask & char_mask) == char_mask) need_mask = false;
				1893	mask &= char_mask;
				1894	} else {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1895	// For 2-character preloads in ASCII mode or 1-character preloads in
				1896	// TWO_BYTE mode we also use a 16 bit load with zero extend.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1897	if (details->characters() == 2 && compiler->ascii()) {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1898	if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
				1899	} else if (details->characters() == 1 && !compiler->ascii()) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1900	if ((mask & 0xffff) == 0xffff) need_mask = false;
				1901	} else {
				1902	if (mask == 0xffffffff) need_mask = false;
				1903	}
				1904	}
				1905
				1906	if (fall_through_on_failure) {
				1907	if (need_mask) {
				1908	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				1909	} else {
				1910	assembler->CheckCharacter(value, on_possible_success);
				1911	}
				1912	} else {
				1913	if (need_mask) {
				1914	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				1915	} else {
				1916	assembler->CheckNotCharacter(value, trace->backtrack());
				1917	}
				1918	}
				1919	return true;
				1920	}
				1921
				1922
				1923	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				1924	// super-class in the .h file).
				1925	//
				1926	// We iterate along the text object, building up for each character a
				1927	// mask and value that can be used to test for a quick failure to match.
				1928	// The masks and values for the positions will be combined into a single
				1929	// machine word for the current character width in order to be used in
				1930	// generating a quick check.
				1931	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				1932	RegExpCompiler* compiler,
				1933	int characters_filled_in,
				1934	bool not_at_start) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1935	Isolate* isolate = Isolate::Current();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1936	ASSERT(characters_filled_in < details->characters());
				1937	int characters = details->characters();
				1938	int char_mask;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1939	if (compiler->ascii()) {
				1940	char_mask = String::kMaxAsciiCharCode;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1941	} else {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	1942	char_mask = String::kMaxUtf16CodeUnit;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1943	}
				1944	for (int k = 0; k < elms_->length(); k++) {
				1945	TextElement elm = elms_->at(k);
				1946	if (elm.type == TextElement::ATOM) {
				1947	Vector<const uc16> quarks = elm.data.u_atom->data();
				1948	for (int i = 0; i < characters && i < quarks.length(); i++) {
				1949	QuickCheckDetails::Position* pos =
				1950	details->positions(characters_filled_in);
				1951	uc16 c = quarks[i];
				1952	if (c > char_mask) {
				1953	// If we expect a non-ASCII character from an ASCII string,
				1954	// there is no way we can match. Not even case independent
				1955	// matching can turn an ASCII character into non-ASCII or
				1956	// vice versa.
				1957	details->set_cannot_match();
				1958	pos->determines_perfectly = false;
				1959	return;
				1960	}
				1961	if (compiler->ignore_case()) {
				1962	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	1963	int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
				1964	chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1965	ASSERT(length != 0); // Can only happen if c > char_mask (see above).
				1966	if (length == 1) {
				1967	// This letter has no case equivalents, so it's nice and simple
				1968	// and the mask-compare will determine definitely whether we have
				1969	// a match at this character position.
				1970	pos->mask = char_mask;
				1971	pos->value = c;
				1972	pos->determines_perfectly = true;
				1973	} else {
				1974	uint32_t common_bits = char_mask;
				1975	uint32_t bits = chars[0];
				1976	for (int j = 1; j < length; j++) {
				1977	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				1978	common_bits ^= differing_bits;
				1979	bits &= common_bits;
				1980	}
				1981	// If length is 2 and common bits has only one zero in it then
				1982	// our mask and compare instruction will determine definitely
				1983	// whether we have a match at this character position. Otherwise
				1984	// it can only be an approximate check.
				1985	uint32_t one_zero = (common_bits \| ~char_mask);
				1986	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				1987	pos->determines_perfectly = true;
				1988	}
				1989	pos->mask = common_bits;
				1990	pos->value = bits;
				1991	}
				1992	} else {
				1993	// Don't ignore case. Nice simple case where the mask-compare will
				1994	// determine definitely whether we have a match at this character
				1995	// position.
				1996	pos->mask = char_mask;
				1997	pos->value = c;
				1998	pos->determines_perfectly = true;
				1999	}
				2000	characters_filled_in++;
				2001	ASSERT(characters_filled_in <= details->characters());
				2002	if (characters_filled_in == details->characters()) {
				2003	return;
				2004	}
				2005	}
				2006	} else {
				2007	QuickCheckDetails::Position* pos =
				2008	details->positions(characters_filled_in);
				2009	RegExpCharacterClass* tree = elm.data.u_char_class;
				2010	ZoneList<CharacterRange>* ranges = tree->ranges();
				2011	if (tree->is_negated()) {
				2012	// A quick check uses multi-character mask and compare. There is no
				2013	// useful way to incorporate a negative char class into this scheme
				2014	// so we just conservatively create a mask and value that will always
				2015	// succeed.
				2016	pos->mask = 0;
				2017	pos->value = 0;
				2018	} else {
				2019	int first_range = 0;
				2020	while (ranges->at(first_range).from() > char_mask) {
				2021	first_range++;
				2022	if (first_range == ranges->length()) {
				2023	details->set_cannot_match();
				2024	pos->determines_perfectly = false;
				2025	return;
				2026	}
				2027	}
				2028	CharacterRange range = ranges->at(first_range);
				2029	uc16 from = range.from();
				2030	uc16 to = range.to();
				2031	if (to > char_mask) {
				2032	to = char_mask;
				2033	}
				2034	uint32_t differing_bits = (from ^ to);
				2035	// A mask and compare is only perfect if the differing bits form a
				2036	// number like 00011111 with one single block of trailing 1s.
				2037	if ((differing_bits & (differing_bits + 1)) == 0 &&
				2038	from + differing_bits == to) {
				2039	pos->determines_perfectly = true;
				2040	}
				2041	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				2042	uint32_t bits = (from & common_bits);
				2043	for (int i = first_range + 1; i < ranges->length(); i++) {
				2044	CharacterRange range = ranges->at(i);
				2045	uc16 from = range.from();
				2046	uc16 to = range.to();
				2047	if (from > char_mask) continue;
				2048	if (to > char_mask) to = char_mask;
				2049	// Here we are combining more ranges into the mask and compare
				2050	// value. With each new range the mask becomes more sparse and
				2051	// so the chances of a false positive rise. A character class
				2052	// with multiple ranges is assumed never to be equivalent to a
				2053	// mask and compare operation.
				2054	pos->determines_perfectly = false;
				2055	uint32_t new_common_bits = (from ^ to);
				2056	new_common_bits = ~SmearBitsRight(new_common_bits);
				2057	common_bits &= new_common_bits;
				2058	bits &= new_common_bits;
				2059	uint32_t differing_bits = (from & common_bits) ^ bits;
				2060	common_bits ^= differing_bits;
				2061	bits &= common_bits;
				2062	}
				2063	pos->mask = common_bits;
				2064	pos->value = bits;
				2065	}
				2066	characters_filled_in++;
				2067	ASSERT(characters_filled_in <= details->characters());
				2068	if (characters_filled_in == details->characters()) {
				2069	return;
				2070	}
				2071	}
				2072	}
				2073	ASSERT(characters_filled_in != details->characters());
				2074	on_success()-> GetQuickCheckDetails(details,
				2075	compiler,
				2076	characters_filled_in,
				2077	true);
				2078	}
				2079
				2080
				2081	void QuickCheckDetails::Clear() {
				2082	for (int i = 0; i < characters_; i++) {
				2083	positions_[i].mask = 0;
				2084	positions_[i].value = 0;
				2085	positions_[i].determines_perfectly = false;
				2086	}
				2087	characters_ = 0;
				2088	}
				2089
				2090
				2091	void QuickCheckDetails::Advance(int by, bool ascii) {
				2092	ASSERT(by >= 0);
				2093	if (by >= characters_) {
				2094	Clear();
				2095	return;
				2096	}
				2097	for (int i = 0; i < characters_ - by; i++) {
				2098	positions_[i] = positions_[by + i];
				2099	}
				2100	for (int i = characters_ - by; i < characters_; i++) {
				2101	positions_[i].mask = 0;
				2102	positions_[i].value = 0;
				2103	positions_[i].determines_perfectly = false;
				2104	}
				2105	characters_ -= by;
				2106	// We could change mask_ and value_ here but we would never advance unless
				2107	// they had already been used in a check and they won't be used again because
				2108	// it would gain us nothing. So there's no point.
				2109	}
				2110
				2111
				2112	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2113	ASSERT(characters_ == other->characters_);
				2114	if (other->cannot_match_) {
				2115	return;
				2116	}
				2117	if (cannot_match_) {
				2118	this = other;
				2119	return;
				2120	}
				2121	for (int i = from_index; i < characters_; i++) {
				2122	QuickCheckDetails::Position* pos = positions(i);
				2123	QuickCheckDetails::Position* other_pos = other->positions(i);
				2124	if (pos->mask != other_pos->mask \|\|
				2125	pos->value != other_pos->value \|\|
				2126	!other_pos->determines_perfectly) {
				2127	// Our mask-compare operation will be approximate unless we have the
				2128	// exact same operation on both sides of the alternation.
				2129	pos->determines_perfectly = false;
				2130	}
				2131	pos->mask &= other_pos->mask;
				2132	pos->value &= pos->mask;
				2133	other_pos->value &= pos->mask;
				2134	uc16 differing_bits = (pos->value ^ other_pos->value);
				2135	pos->mask &= ~differing_bits;
				2136	pos->value &= pos->mask;
				2137	}
				2138	}
				2139
				2140
				2141	class VisitMarker {
				2142	public:
				2143	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2144	ASSERT(!info->visited);
				2145	info->visited = true;
				2146	}
				2147	~VisitMarker() {
				2148	info_->visited = false;
				2149	}
				2150	private:
				2151	NodeInfo* info_;
				2152	};
				2153
				2154
				2155	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2156	RegExpCompiler* compiler,
				2157	int characters_filled_in,
				2158	bool not_at_start) {
				2159	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2160	VisitMarker marker(info());
				2161	return ChoiceNode::GetQuickCheckDetails(details,
				2162	compiler,
				2163	characters_filled_in,
				2164	not_at_start);
				2165	}
				2166
				2167
				2168	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2169	RegExpCompiler* compiler,
				2170	int characters_filled_in,
				2171	bool not_at_start) {
				2172	not_at_start = (not_at_start \|\| not_at_start_);
				2173	int choice_count = alternatives_->length();
				2174	ASSERT(choice_count > 0);
				2175	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2176	compiler,
				2177	characters_filled_in,
				2178	not_at_start);
				2179	for (int i = 1; i < choice_count; i++) {
				2180	QuickCheckDetails new_details(details->characters());
				2181	RegExpNode* node = alternatives_->at(i).node();
				2182	node->GetQuickCheckDetails(&new_details, compiler,
				2183	characters_filled_in,
				2184	not_at_start);
				2185	// Here we merge the quick match details of the two branches.
				2186	details->Merge(&new_details, characters_filled_in);
				2187	}
				2188	}
				2189
				2190
				2191	// Check for [0-9A-Z_a-z].
				2192	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2193	Label* word,
				2194	Label* non_word,
				2195	bool fall_through_on_word) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2196	if (assembler->CheckSpecialCharacterClass(
				2197	fall_through_on_word ? 'w' : 'W',
				2198	fall_through_on_word ? non_word : word)) {
				2199	// Optimized implementation available.
				2200	return;
				2201	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2202	assembler->CheckCharacterGT('z', non_word);
				2203	assembler->CheckCharacterLT('0', non_word);
				2204	assembler->CheckCharacterGT('a' - 1, word);
				2205	assembler->CheckCharacterLT('9' + 1, word);
				2206	assembler->CheckCharacterLT('A', non_word);
				2207	assembler->CheckCharacterLT('Z' + 1, word);
				2208	if (fall_through_on_word) {
				2209	assembler->CheckNotCharacter('_', non_word);
				2210	} else {
				2211	assembler->CheckCharacter('_', word);
				2212	}
				2213	}
				2214
				2215
				2216	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				2217	// that matches newline or the start of input).
				2218	static void EmitHat(RegExpCompiler* compiler,
				2219	RegExpNode* on_success,
				2220	Trace* trace) {
				2221	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2222	// We will be loading the previous character into the current character
				2223	// register.
				2224	Trace new_trace(*trace);
				2225	new_trace.InvalidateCurrentCharacter();
				2226
				2227	Label ok;
				2228	if (new_trace.cp_offset() == 0) {
				2229	// The start of input counts as a newline in this context, so skip to
				2230	// ok if we are at the start.
				2231	assembler->CheckAtStart(&ok);
				2232	}
				2233	// We already checked that we are not at the start of input so it must be
				2234	// OK to load the previous character.
				2235	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				2236	new_trace.backtrack(),
				2237	false);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2238	if (!assembler->CheckSpecialCharacterClass('n',
				2239	new_trace.backtrack())) {
				2240	// Newline means \n, \r, 0x2028 or 0x2029.
				2241	if (!compiler->ascii()) {
				2242	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				2243	}
				2244	assembler->CheckCharacter('\n', &ok);
				2245	assembler->CheckNotCharacter('\r', new_trace.backtrack());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2246	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2247	assembler->Bind(&ok);
				2248	on_success->Emit(compiler, &new_trace);
				2249	}
				2250
				2251
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2252	// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
				2253	// when we know whether the next character must be a word character or not.
				2254	static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
				2255	RegExpCompiler* compiler,
				2256	RegExpNode* on_success,
				2257	Trace* trace) {
				2258	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2259	Label done;
				2260
				2261	Trace new_trace(*trace);
				2262
				2263	bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
				2264	Label* on_word = expect_word_character ? &done : new_trace.backtrack();
				2265	Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
				2266
				2267	// Check whether previous character was a word character.
				2268	switch (trace->at_start()) {
				2269	case Trace::TRUE:
				2270	if (expect_word_character) {
				2271	assembler->GoTo(on_non_word);
				2272	}
				2273	break;
				2274	case Trace::UNKNOWN:
				2275	ASSERT_EQ(0, trace->cp_offset());
				2276	assembler->CheckAtStart(on_non_word);
				2277	// Fall through.
				2278	case Trace::FALSE:
				2279	int prev_char_offset = trace->cp_offset() - 1;
				2280	assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
				2281	EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
				2282	// We may or may not have loaded the previous character.
				2283	new_trace.InvalidateCurrentCharacter();
				2284	}
				2285
				2286	assembler->Bind(&done);
				2287
				2288	on_success->Emit(compiler, &new_trace);
				2289	}
				2290
				2291
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2292	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				2293	static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
				2294	RegExpCompiler* compiler,
				2295	RegExpNode* on_success,
				2296	Trace* trace) {
				2297	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2298	Label before_non_word;
				2299	Label before_word;
				2300	if (trace->characters_preloaded() != 1) {
				2301	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				2302	}
				2303	// Fall through on non-word.
				2304	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				2305
				2306	// We will be loading the previous character into the current character
				2307	// register.
				2308	Trace new_trace(*trace);
				2309	new_trace.InvalidateCurrentCharacter();
				2310
				2311	Label ok;
				2312	Label* boundary;
				2313	Label* not_boundary;
				2314	if (type == AssertionNode::AT_BOUNDARY) {
				2315	boundary = &ok;
				2316	not_boundary = new_trace.backtrack();
				2317	} else {
				2318	not_boundary = &ok;
				2319	boundary = new_trace.backtrack();
				2320	}
				2321
				2322	// Next character is not a word character.
				2323	assembler->Bind(&before_non_word);
				2324	if (new_trace.cp_offset() == 0) {
				2325	// The start of input counts as a non-word character, so the question is
				2326	// decided if we are at the start.
				2327	assembler->CheckAtStart(not_boundary);
				2328	}
				2329	// We already checked that we are not at the start of input so it must be
				2330	// OK to load the previous character.
				2331	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2332	&ok, // Unused dummy label in this call.
				2333	false);
				2334	// Fall through on non-word.
				2335	EmitWordCheck(assembler, boundary, not_boundary, false);
				2336	assembler->GoTo(not_boundary);
				2337
				2338	// Next character is a word character.
				2339	assembler->Bind(&before_word);
				2340	if (new_trace.cp_offset() == 0) {
				2341	// The start of input counts as a non-word character, so the question is
				2342	// decided if we are at the start.
				2343	assembler->CheckAtStart(boundary);
				2344	}
				2345	// We already checked that we are not at the start of input so it must be
				2346	// OK to load the previous character.
				2347	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2348	&ok, // Unused dummy label in this call.
				2349	false);
				2350	bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
				2351	EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
				2352
				2353	assembler->Bind(&ok);
				2354
				2355	on_success->Emit(compiler, &new_trace);
				2356	}
				2357
				2358
				2359	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2360	RegExpCompiler* compiler,
				2361	int filled_in,
				2362	bool not_at_start) {
				2363	if (type_ == AT_START && not_at_start) {
				2364	details->set_cannot_match();
				2365	return;
				2366	}
				2367	return on_success()->GetQuickCheckDetails(details,
				2368	compiler,
				2369	filled_in,
				2370	not_at_start);
				2371	}
				2372
				2373
				2374	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2375	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2376	switch (type_) {
				2377	case AT_END: {
				2378	Label ok;
				2379	assembler->CheckPosition(trace->cp_offset(), &ok);
				2380	assembler->GoTo(trace->backtrack());
				2381	assembler->Bind(&ok);
				2382	break;
				2383	}
				2384	case AT_START: {
				2385	if (trace->at_start() == Trace::FALSE) {
				2386	assembler->GoTo(trace->backtrack());
				2387	return;
				2388	}
				2389	if (trace->at_start() == Trace::UNKNOWN) {
				2390	assembler->CheckNotAtStart(trace->backtrack());
				2391	Trace at_start_trace = *trace;
				2392	at_start_trace.set_at_start(true);
				2393	on_success()->Emit(compiler, &at_start_trace);
				2394	return;
				2395	}
				2396	}
				2397	break;
				2398	case AFTER_NEWLINE:
				2399	EmitHat(compiler, on_success(), trace);
				2400	return;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2401	case AT_BOUNDARY:
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2402	case AT_NON_BOUNDARY: {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2403	EmitBoundaryCheck(type_, compiler, on_success(), trace);
				2404	return;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2405	}
				2406	case AFTER_WORD_CHARACTER:
				2407	case AFTER_NONWORD_CHARACTER: {
				2408	EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
				2409	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2410	}
				2411	on_success()->Emit(compiler, trace);
				2412	}
				2413
				2414
				2415	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				2416	if (quick_check == NULL) return false;
				2417	if (offset >= quick_check->characters()) return false;
				2418	return quick_check->positions(offset)->determines_perfectly;
				2419	}
				2420
				2421
				2422	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				2423	if (index > *checked_up_to) {
				2424	*checked_up_to = index;
				2425	}
				2426	}
				2427
				2428
				2429	// We call this repeatedly to generate code for each pass over the text node.
				2430	// The passes are in increasing order of difficulty because we hope one
				2431	// of the first passes will fail in which case we are saved the work of the
				2432	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				2433	// we will check the '%' in the first pass, the case independent 'a' in the
				2434	// second pass and the character class in the last pass.
				2435	//
				2436	// The passes are done from right to left, so for example to test for /bar/
				2437	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				2438	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				2439	// bounds check most of the time. In the example we only need to check for
				2440	// end-of-input when loading the putative 'r'.
				2441	//
				2442	// A slight complication involves the fact that the first character may already
				2443	// be fetched into a register by the previous node. In this case we want to
				2444	// do the test for that character first. We do this in separate passes. The
				2445	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				2446	// pass has been performed then subsequent passes will have true in
				2447	// first_element_checked to indicate that that character does not need to be
				2448	// checked again.
				2449	//
				2450	// In addition to all this we are passed a Trace, which can
				2451	// contain an AlternativeGeneration object. In this AlternativeGeneration
				2452	// object we can see details of any quick check that was already passed in
				2453	// order to get to the code we are now generating. The quick check can involve
				2454	// loading characters, which means we do not need to recheck the bounds
				2455	// up to the limit the quick check already checked. In addition the quick
				2456	// check can have involved a mask and compare operation which may simplify
				2457	// or obviate the need for further checks at some character positions.
				2458	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				2459	TextEmitPassType pass,
				2460	bool preloaded,
				2461	Trace* trace,
				2462	bool first_element_checked,
				2463	int* checked_up_to) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	2464	Isolate* isolate = Isolate::Current();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2465	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2466	bool ascii = compiler->ascii();
				2467	Label* backtrack = trace->backtrack();
				2468	QuickCheckDetails* quick_check = trace->quick_check_performed();
				2469	int element_count = elms_->length();
				2470	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				2471	TextElement elm = elms_->at(i);
				2472	int cp_offset = trace->cp_offset() + elm.cp_offset;
				2473	if (elm.type == TextElement::ATOM) {
				2474	Vector<const uc16> quarks = elm.data.u_atom->data();
				2475	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				2476	if (first_element_checked && i == 0 && j == 0) continue;
				2477	if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
				2478	EmitCharacterFunction* emit_function = NULL;
				2479	switch (pass) {
				2480	case NON_ASCII_MATCH:
				2481	ASSERT(ascii);
				2482	if (quarks[j] > String::kMaxAsciiCharCode) {
				2483	assembler->GoTo(backtrack);
				2484	return;
				2485	}
				2486	break;
				2487	case NON_LETTER_CHARACTER_MATCH:
				2488	emit_function = &EmitAtomNonLetter;
				2489	break;
				2490	case SIMPLE_CHARACTER_MATCH:
				2491	emit_function = &EmitSimpleCharacter;
				2492	break;
				2493	case CASE_CHARACTER_MATCH:
				2494	emit_function = &EmitAtomLetter;
				2495	break;
				2496	default:
				2497	break;
				2498	}
				2499	if (emit_function != NULL) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	2500	bool bound_checked = emit_function(isolate,
				2501	compiler,
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2502	quarks[j],
				2503	backtrack,
				2504	cp_offset + j,
				2505	*checked_up_to < cp_offset + j,
				2506	preloaded);
				2507	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				2508	}
				2509	}
				2510	} else {
				2511	ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
				2512	if (pass == CHARACTER_CLASS_MATCH) {
				2513	if (first_element_checked && i == 0) continue;
				2514	if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
				2515	RegExpCharacterClass* cc = elm.data.u_char_class;
				2516	EmitCharClass(assembler,
				2517	cc,
				2518	ascii,
				2519	backtrack,
				2520	cp_offset,
				2521	*checked_up_to < cp_offset,
				2522	preloaded);
				2523	UpdateBoundsCheck(cp_offset, checked_up_to);
				2524	}
				2525	}
				2526	}
				2527	}
				2528
				2529
				2530	int TextNode::Length() {
				2531	TextElement elm = elms_->last();
				2532	ASSERT(elm.cp_offset >= 0);
				2533	if (elm.type == TextElement::ATOM) {
				2534	return elm.cp_offset + elm.data.u_atom->data().length();
				2535	} else {
				2536	return elm.cp_offset + 1;
				2537	}
				2538	}
				2539
				2540
				2541	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				2542	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				2543	if (ignore_case) {
				2544	return pass == SIMPLE_CHARACTER_MATCH;
				2545	} else {
				2546	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				2547	}
				2548	}
				2549
				2550
				2551	// This generates the code to match a text node. A text node can contain
				2552	// straight character sequences (possibly to be matched in a case-independent
				2553	// way) and character classes. For efficiency we do not do this in a single
				2554	// pass from left to right. Instead we pass over the text node several times,
				2555	// emitting code for some character positions every time. See the comment on
				2556	// TextEmitPass for details.
				2557	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2558	LimitResult limit_result = LimitVersions(compiler, trace);
				2559	if (limit_result == DONE) return;
				2560	ASSERT(limit_result == CONTINUE);
				2561
				2562	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				2563	compiler->SetRegExpTooBig();
				2564	return;
				2565	}
				2566
				2567	if (compiler->ascii()) {
				2568	int dummy = 0;
				2569	TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
				2570	}
				2571
				2572	bool first_elt_done = false;
				2573	int bound_checked_to = trace->cp_offset() - 1;
				2574	bound_checked_to += trace->bound_checked_up_to();
				2575
				2576	// If a character is preloaded into the current character register then
				2577	// check that now.
				2578	if (trace->characters_preloaded() == 1) {
				2579	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2580	if (!SkipPass(pass, compiler->ignore_case())) {
				2581	TextEmitPass(compiler,
				2582	static_cast<TextEmitPassType>(pass),
				2583	true,
				2584	trace,
				2585	false,
				2586	&bound_checked_to);
				2587	}
				2588	}
				2589	first_elt_done = true;
				2590	}
				2591
				2592	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2593	if (!SkipPass(pass, compiler->ignore_case())) {
				2594	TextEmitPass(compiler,
				2595	static_cast<TextEmitPassType>(pass),
				2596	false,
				2597	trace,
				2598	first_elt_done,
				2599	&bound_checked_to);
				2600	}
				2601	}
				2602
				2603	Trace successor_trace(*trace);
				2604	successor_trace.set_at_start(false);
				2605	successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
				2606	RecursionCheck rc(compiler);
				2607	on_success()->Emit(compiler, &successor_trace);
				2608	}
				2609
				2610
				2611	void Trace::InvalidateCurrentCharacter() {
				2612	characters_preloaded_ = 0;
				2613	}
				2614
				2615
				2616	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				2617	ASSERT(by > 0);
				2618	// We don't have an instruction for shifting the current character register
				2619	// down or for using a shifted value for anything so lets just forget that
				2620	// we preloaded any characters into it.
				2621	characters_preloaded_ = 0;
				2622	// Adjust the offsets of the quick check performed information. This
				2623	// information is used to find out what we already determined about the
				2624	// characters by means of mask and compare.
				2625	quick_check_performed_.Advance(by, compiler->ascii());
				2626	cp_offset_ += by;
				2627	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				2628	compiler->SetRegExpTooBig();
				2629	cp_offset_ = 0;
				2630	}
				2631	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				2632	}
				2633
				2634
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2635	void TextNode::MakeCaseIndependent(bool is_ascii) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2636	int element_count = elms_->length();
				2637	for (int i = 0; i < element_count; i++) {
				2638	TextElement elm = elms_->at(i);
				2639	if (elm.type == TextElement::CHAR_CLASS) {
				2640	RegExpCharacterClass* cc = elm.data.u_char_class;
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	2641	// None of the standard character classes is different in the case
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2642	// independent case and it slows us down if we don't know that.
				2643	if (cc->is_standard()) continue;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2644	ZoneList<CharacterRange>* ranges = cc->ranges();
				2645	int range_count = ranges->length();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2646	for (int j = 0; j < range_count; j++) {
				2647	ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2648	}
				2649	}
				2650	}
				2651	}
				2652
				2653
				2654	int TextNode::GreedyLoopTextLength() {
				2655	TextElement elm = elms_->at(elms_->length() - 1);
				2656	if (elm.type == TextElement::CHAR_CLASS) {
				2657	return elm.cp_offset + 1;
				2658	} else {
				2659	return elm.cp_offset + elm.data.u_atom->data().length();
				2660	}
				2661	}
				2662
				2663
				2664	// Finds the fixed match length of a sequence of nodes that goes from
				2665	// this alternative and back to this choice node. If there are variable
				2666	// length nodes or other complications in the way then return a sentinel
				2667	// value indicating that a greedy loop cannot be constructed.
Ben Murdoch	589d697	2011-11-30 16:04:58 +0000	[diff] [blame]	2668	int ChoiceNode::GreedyLoopTextLengthForAlternative(
				2669	GuardedAlternative* alternative) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2670	int length = 0;
				2671	RegExpNode* node = alternative->node();
				2672	// Later we will generate code for all these text nodes using recursion
				2673	// so we have to limit the max number.
				2674	int recursion_depth = 0;
				2675	while (node != this) {
				2676	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				2677	return kNodeIsTooComplexForGreedyLoops;
				2678	}
				2679	int node_length = node->GreedyLoopTextLength();
				2680	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				2681	return kNodeIsTooComplexForGreedyLoops;
				2682	}
				2683	length += node_length;
				2684	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				2685	node = seq_node->on_success();
				2686	}
				2687	return length;
				2688	}
				2689
				2690
				2691	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				2692	ASSERT_EQ(loop_node_, NULL);
				2693	AddAlternative(alt);
				2694	loop_node_ = alt.node();
				2695	}
				2696
				2697
				2698	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				2699	ASSERT_EQ(continue_node_, NULL);
				2700	AddAlternative(alt);
				2701	continue_node_ = alt.node();
				2702	}
				2703
				2704
				2705	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2706	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2707	if (trace->stop_node() == this) {
Ben Murdoch	589d697	2011-11-30 16:04:58 +0000	[diff] [blame]	2708	int text_length =
				2709	GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2710	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
				2711	// Update the counter-based backtracking info on the stack. This is an
				2712	// optimization for greedy loops (see below).
				2713	ASSERT(trace->cp_offset() == text_length);
				2714	macro_assembler->AdvanceCurrentPosition(text_length);
				2715	macro_assembler->GoTo(trace->loop_label());
				2716	return;
				2717	}
				2718	ASSERT(trace->stop_node() == NULL);
				2719	if (!trace->is_trivial()) {
				2720	trace->Flush(compiler, this);
				2721	return;
				2722	}
				2723	ChoiceNode::Emit(compiler, trace);
				2724	}
				2725
				2726
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2727	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				2728	bool not_at_start) {
				2729	int preload_characters = EatsAtLeast(4, 0, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2730	if (compiler->macro_assembler()->CanReadUnaligned()) {
				2731	bool ascii = compiler->ascii();
				2732	if (ascii) {
				2733	if (preload_characters > 4) preload_characters = 4;
				2734	// We can't preload 3 characters because there is no machine instruction
				2735	// to do that. We can't just load 4 because we could be reading
				2736	// beyond the end of the string, which could cause a memory fault.
				2737	if (preload_characters == 3) preload_characters = 2;
				2738	} else {
				2739	if (preload_characters > 2) preload_characters = 2;
				2740	}
				2741	} else {
				2742	if (preload_characters > 1) preload_characters = 1;
				2743	}
				2744	return preload_characters;
				2745	}
				2746
				2747
				2748	// This class is used when generating the alternatives in a choice node. It
				2749	// records the way the alternative is being code generated.
				2750	class AlternativeGeneration: public Malloced {
				2751	public:
				2752	AlternativeGeneration()
				2753	: possible_success(),
				2754	expects_preload(false),
				2755	after(),
				2756	quick_check_details() { }
				2757	Label possible_success;
				2758	bool expects_preload;
				2759	Label after;
				2760	QuickCheckDetails quick_check_details;
				2761	};
				2762
				2763
				2764	// Creates a list of AlternativeGenerations. If the list has a reasonable
				2765	// size then it is on the stack, otherwise the excess is on the heap.
				2766	class AlternativeGenerationList {
				2767	public:
				2768	explicit AlternativeGenerationList(int count)
				2769	: alt_gens_(count) {
				2770	for (int i = 0; i < count && i < kAFew; i++) {
				2771	alt_gens_.Add(a_few_alt_gens_ + i);
				2772	}
				2773	for (int i = kAFew; i < count; i++) {
				2774	alt_gens_.Add(new AlternativeGeneration());
				2775	}
				2776	}
				2777	~AlternativeGenerationList() {
				2778	for (int i = kAFew; i < alt_gens_.length(); i++) {
				2779	delete alt_gens_[i];
				2780	alt_gens_[i] = NULL;
				2781	}
				2782	}
				2783
				2784	AlternativeGeneration* at(int i) {
				2785	return alt_gens_[i];
				2786	}
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	2787
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2788	private:
				2789	static const int kAFew = 10;
				2790	ZoneList<AlternativeGeneration*> alt_gens_;
				2791	AlternativeGeneration a_few_alt_gens_[kAFew];
				2792	};
				2793
				2794
				2795	/* Code generation for choice nodes.
				2796	*
				2797	* We generate quick checks that do a mask and compare to eliminate a
				2798	* choice. If the quick check succeeds then it jumps to the continuation to
				2799	* do slow checks and check subsequent nodes. If it fails (the common case)
				2800	* it falls through to the next choice.
				2801	*
				2802	* Here is the desired flow graph. Nodes directly below each other imply
				2803	* fallthrough. Alternatives 1 and 2 have quick checks. Alternative
				2804	* 3 doesn't have a quick check so we have to call the slow check.
				2805	* Nodes are marked Qn for quick checks and Sn for slow checks. The entire
				2806	* regexp continuation is generated directly after the Sn node, up to the
				2807	* next GoTo if we decide to reuse some already generated code. Some
				2808	* nodes expect preload_characters to be preloaded into the current
				2809	* character register. R nodes do this preloading. Vertices are marked
				2810	* F for failures and S for success (possible success in the case of quick
				2811	* nodes). L, V, < and > are used as arrow heads.
				2812	*
				2813	* ----------> R
				2814	* \|
				2815	* V
				2816	* Q1 -----> S1
				2817	* \| S /
				2818	* F\| /
				2819	* \| F/
				2820	* \| /
				2821	* \| R
				2822	* \| /
				2823	* V L
				2824	* Q2 -----> S2
				2825	* \| S /
				2826	* F\| /
				2827	* \| F/
				2828	* \| /
				2829	* \| R
				2830	* \| /
				2831	* V L
				2832	* S3
				2833	* \|
				2834	* F\|
				2835	* \|
				2836	* R
				2837	* \|
				2838	* backtrack V
				2839	* <----------Q4
				2840	* \ F \|
				2841	* \ \|S
				2842	* \ F V
				2843	* \-----S4
				2844	*
				2845	* For greedy loops we reverse our expectation and expect to match rather
				2846	* than fail. Therefore we want the loop code to look like this (U is the
				2847	* unwind code that steps back in the greedy loop). The following alternatives
				2848	* look the same as above.
				2849	* _____
				2850	* / \
				2851	* V \|
				2852	* ----------> S1 \|
				2853	* /\| \|
				2854	* / \|S \|
				2855	* F/ \_____/
				2856	* /
				2857	* \|<-----------
				2858	* \| \
				2859	* V \
				2860	* Q2 ---> S2 \
				2861	* \| S / \|
				2862	* F\| / \|
				2863	* \| F/ \|
				2864	* \| / \|
				2865	* \| R \|
				2866	* \| / \|
				2867	* F VL \|
				2868	* <------U \|
				2869	* back \|S \|
				2870	* \______________/
				2871	*/
				2872
				2873
				2874	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2875	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2876	int choice_count = alternatives_->length();
				2877	#ifdef DEBUG
				2878	for (int i = 0; i < choice_count - 1; i++) {
				2879	GuardedAlternative alternative = alternatives_->at(i);
				2880	ZoneList<Guard> guards = alternative.guards();
				2881	int guard_count = (guards == NULL) ? 0 : guards->length();
				2882	for (int j = 0; j < guard_count; j++) {
				2883	ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
				2884	}
				2885	}
				2886	#endif
				2887
				2888	LimitResult limit_result = LimitVersions(compiler, trace);
				2889	if (limit_result == DONE) return;
				2890	ASSERT(limit_result == CONTINUE);
				2891
				2892	int new_flush_budget = trace->flush_budget() / choice_count;
				2893	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				2894	trace->Flush(compiler, this);
				2895	return;
				2896	}
				2897
				2898	RecursionCheck rc(compiler);
				2899
				2900	Trace* current_trace = trace;
				2901
Ben Murdoch	589d697	2011-11-30 16:04:58 +0000	[diff] [blame]	2902	int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2903	bool greedy_loop = false;
				2904	Label greedy_loop_label;
				2905	Trace counter_backtrack_trace;
				2906	counter_backtrack_trace.set_backtrack(&greedy_loop_label);
				2907	if (not_at_start()) counter_backtrack_trace.set_at_start(false);
				2908
				2909	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				2910	// Here we have special handling for greedy loops containing only text nodes
				2911	// and other simple nodes. These are handled by pushing the current
				2912	// position on the stack and then incrementing the current position each
				2913	// time around the switch. On backtrack we decrement the current position
				2914	// and check it against the pushed value. This avoids pushing backtrack
				2915	// information for each iteration of the loop, which could take up a lot of
				2916	// space.
				2917	greedy_loop = true;
				2918	ASSERT(trace->stop_node() == NULL);
				2919	macro_assembler->PushCurrentPosition();
				2920	current_trace = &counter_backtrack_trace;
				2921	Label greedy_match_failed;
				2922	Trace greedy_match_trace;
				2923	if (not_at_start()) greedy_match_trace.set_at_start(false);
				2924	greedy_match_trace.set_backtrack(&greedy_match_failed);
				2925	Label loop_label;
				2926	macro_assembler->Bind(&loop_label);
				2927	greedy_match_trace.set_stop_node(this);
				2928	greedy_match_trace.set_loop_label(&loop_label);
				2929	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				2930	macro_assembler->Bind(&greedy_match_failed);
				2931	}
				2932
				2933	Label second_choice; // For use in greedy matches.
				2934	macro_assembler->Bind(&second_choice);
				2935
				2936	int first_normal_choice = greedy_loop ? 1 : 0;
				2937
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2938	int preload_characters =
				2939	CalculatePreloadCharacters(compiler,
				2940	current_trace->at_start() == Trace::FALSE);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2941	bool preload_is_current =
				2942	(current_trace->characters_preloaded() == preload_characters);
				2943	bool preload_has_checked_bounds = preload_is_current;
				2944
				2945	AlternativeGenerationList alt_gens(choice_count);
				2946
				2947	// For now we just call all choices one after the other. The idea ultimately
				2948	// is to use the Dispatch table to try only the relevant ones.
				2949	for (int i = first_normal_choice; i < choice_count; i++) {
				2950	GuardedAlternative alternative = alternatives_->at(i);
				2951	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2952	alt_gen->quick_check_details.set_characters(preload_characters);
				2953	ZoneList<Guard> guards = alternative.guards();
				2954	int guard_count = (guards == NULL) ? 0 : guards->length();
				2955	Trace new_trace(*current_trace);
				2956	new_trace.set_characters_preloaded(preload_is_current ?
				2957	preload_characters :
				2958	0);
				2959	if (preload_has_checked_bounds) {
				2960	new_trace.set_bound_checked_up_to(preload_characters);
				2961	}
				2962	new_trace.quick_check_performed()->Clear();
				2963	if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
				2964	alt_gen->expects_preload = preload_is_current;
				2965	bool generate_full_check_inline = false;
				2966	if (FLAG_regexp_optimization &&
				2967	try_to_emit_quick_check_for_alternative(i) &&
				2968	alternative.node()->EmitQuickCheck(compiler,
				2969	&new_trace,
				2970	preload_has_checked_bounds,
				2971	&alt_gen->possible_success,
				2972	&alt_gen->quick_check_details,
				2973	i < choice_count - 1)) {
				2974	// Quick check was generated for this choice.
				2975	preload_is_current = true;
				2976	preload_has_checked_bounds = true;
				2977	// On the last choice in the ChoiceNode we generated the quick
				2978	// check to fall through on possible success. So now we need to
				2979	// generate the full check inline.
				2980	if (i == choice_count - 1) {
				2981	macro_assembler->Bind(&alt_gen->possible_success);
				2982	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2983	new_trace.set_characters_preloaded(preload_characters);
				2984	new_trace.set_bound_checked_up_to(preload_characters);
				2985	generate_full_check_inline = true;
				2986	}
				2987	} else if (alt_gen->quick_check_details.cannot_match()) {
				2988	if (i == choice_count - 1 && !greedy_loop) {
				2989	macro_assembler->GoTo(trace->backtrack());
				2990	}
				2991	continue;
				2992	} else {
				2993	// No quick check was generated. Put the full code here.
				2994	// If this is not the first choice then there could be slow checks from
				2995	// previous cases that go here when they fail. There's no reason to
				2996	// insist that they preload characters since the slow check we are about
				2997	// to generate probably can't use it.
				2998	if (i != first_normal_choice) {
				2999	alt_gen->expects_preload = false;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3000	new_trace.InvalidateCurrentCharacter();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3001	}
				3002	if (i < choice_count - 1) {
				3003	new_trace.set_backtrack(&alt_gen->after);
				3004	}
				3005	generate_full_check_inline = true;
				3006	}
				3007	if (generate_full_check_inline) {
				3008	if (new_trace.actions() != NULL) {
				3009	new_trace.set_flush_budget(new_flush_budget);
				3010	}
				3011	for (int j = 0; j < guard_count; j++) {
				3012	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				3013	}
				3014	alternative.node()->Emit(compiler, &new_trace);
				3015	preload_is_current = false;
				3016	}
				3017	macro_assembler->Bind(&alt_gen->after);
				3018	}
				3019	if (greedy_loop) {
				3020	macro_assembler->Bind(&greedy_loop_label);
				3021	// If we have unwound to the bottom then backtrack.
				3022	macro_assembler->CheckGreedyLoop(trace->backtrack());
				3023	// Otherwise try the second priority at an earlier position.
				3024	macro_assembler->AdvanceCurrentPosition(-text_length);
				3025	macro_assembler->GoTo(&second_choice);
				3026	}
				3027
				3028	// At this point we need to generate slow checks for the alternatives where
				3029	// the quick check was inlined. We can recognize these because the associated
				3030	// label was bound.
				3031	for (int i = first_normal_choice; i < choice_count - 1; i++) {
				3032	AlternativeGeneration* alt_gen = alt_gens.at(i);
				3033	Trace new_trace(*current_trace);
				3034	// If there are actions to be flushed we have to limit how many times
				3035	// they are flushed. Take the budget of the parent trace and distribute
				3036	// it fairly amongst the children.
				3037	if (new_trace.actions() != NULL) {
				3038	new_trace.set_flush_budget(new_flush_budget);
				3039	}
				3040	EmitOutOfLineContinuation(compiler,
				3041	&new_trace,
				3042	alternatives_->at(i),
				3043	alt_gen,
				3044	preload_characters,
				3045	alt_gens.at(i + 1)->expects_preload);
				3046	}
				3047	}
				3048
				3049
				3050	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				3051	Trace* trace,
				3052	GuardedAlternative alternative,
				3053	AlternativeGeneration* alt_gen,
				3054	int preload_characters,
				3055	bool next_expects_preload) {
				3056	if (!alt_gen->possible_success.is_linked()) return;
				3057
				3058	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3059	macro_assembler->Bind(&alt_gen->possible_success);
				3060	Trace out_of_line_trace(*trace);
				3061	out_of_line_trace.set_characters_preloaded(preload_characters);
				3062	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				3063	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
				3064	ZoneList<Guard> guards = alternative.guards();
				3065	int guard_count = (guards == NULL) ? 0 : guards->length();
				3066	if (next_expects_preload) {
				3067	Label reload_current_char;
				3068	out_of_line_trace.set_backtrack(&reload_current_char);
				3069	for (int j = 0; j < guard_count; j++) {
				3070	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3071	}
				3072	alternative.node()->Emit(compiler, &out_of_line_trace);
				3073	macro_assembler->Bind(&reload_current_char);
				3074	// Reload the current character, since the next quick check expects that.
				3075	// We don't need to check bounds here because we only get into this
				3076	// code through a quick check which already did the checked load.
				3077	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				3078	NULL,
				3079	false,
				3080	preload_characters);
				3081	macro_assembler->GoTo(&(alt_gen->after));
				3082	} else {
				3083	out_of_line_trace.set_backtrack(&(alt_gen->after));
				3084	for (int j = 0; j < guard_count; j++) {
				3085	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3086	}
				3087	alternative.node()->Emit(compiler, &out_of_line_trace);
				3088	}
				3089	}
				3090
				3091
				3092	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3093	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3094	LimitResult limit_result = LimitVersions(compiler, trace);
				3095	if (limit_result == DONE) return;
				3096	ASSERT(limit_result == CONTINUE);
				3097
				3098	RecursionCheck rc(compiler);
				3099
				3100	switch (type_) {
				3101	case STORE_POSITION: {
				3102	Trace::DeferredCapture
				3103	new_capture(data_.u_position_register.reg,
				3104	data_.u_position_register.is_capture,
				3105	trace);
				3106	Trace new_trace = *trace;
				3107	new_trace.add_action(&new_capture);
				3108	on_success()->Emit(compiler, &new_trace);
				3109	break;
				3110	}
				3111	case INCREMENT_REGISTER: {
				3112	Trace::DeferredIncrementRegister
				3113	new_increment(data_.u_increment_register.reg);
				3114	Trace new_trace = *trace;
				3115	new_trace.add_action(&new_increment);
				3116	on_success()->Emit(compiler, &new_trace);
				3117	break;
				3118	}
				3119	case SET_REGISTER: {
				3120	Trace::DeferredSetRegister
				3121	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				3122	Trace new_trace = *trace;
				3123	new_trace.add_action(&new_set);
				3124	on_success()->Emit(compiler, &new_trace);
				3125	break;
				3126	}
				3127	case CLEAR_CAPTURES: {
				3128	Trace::DeferredClearCaptures
				3129	new_capture(Interval(data_.u_clear_captures.range_from,
				3130	data_.u_clear_captures.range_to));
				3131	Trace new_trace = *trace;
				3132	new_trace.add_action(&new_capture);
				3133	on_success()->Emit(compiler, &new_trace);
				3134	break;
				3135	}
				3136	case BEGIN_SUBMATCH:
				3137	if (!trace->is_trivial()) {
				3138	trace->Flush(compiler, this);
				3139	} else {
				3140	assembler->WriteCurrentPositionToRegister(
				3141	data_.u_submatch.current_position_register, 0);
				3142	assembler->WriteStackPointerToRegister(
				3143	data_.u_submatch.stack_pointer_register);
				3144	on_success()->Emit(compiler, trace);
				3145	}
				3146	break;
				3147	case EMPTY_MATCH_CHECK: {
				3148	int start_pos_reg = data_.u_empty_match_check.start_register;
				3149	int stored_pos = 0;
				3150	int rep_reg = data_.u_empty_match_check.repetition_register;
				3151	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				3152	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				3153	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				3154	// If we know we haven't advanced and there is no minimum we
				3155	// can just backtrack immediately.
				3156	assembler->GoTo(trace->backtrack());
				3157	} else if (know_dist && stored_pos < trace->cp_offset()) {
				3158	// If we know we've advanced we can generate the continuation
				3159	// immediately.
				3160	on_success()->Emit(compiler, trace);
				3161	} else if (!trace->is_trivial()) {
				3162	trace->Flush(compiler, this);
				3163	} else {
				3164	Label skip_empty_check;
				3165	// If we have a minimum number of repetitions we check the current
				3166	// number first and skip the empty check if it's not enough.
				3167	if (has_minimum) {
				3168	int limit = data_.u_empty_match_check.repetition_limit;
				3169	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				3170	}
				3171	// If the match is empty we bail out, otherwise we fall through
				3172	// to the on-success continuation.
				3173	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				3174	trace->backtrack());
				3175	assembler->Bind(&skip_empty_check);
				3176	on_success()->Emit(compiler, trace);
				3177	}
				3178	break;
				3179	}
				3180	case POSITIVE_SUBMATCH_SUCCESS: {
				3181	if (!trace->is_trivial()) {
				3182	trace->Flush(compiler, this);
				3183	return;
				3184	}
				3185	assembler->ReadCurrentPositionFromRegister(
				3186	data_.u_submatch.current_position_register);
				3187	assembler->ReadStackPointerFromRegister(
				3188	data_.u_submatch.stack_pointer_register);
				3189	int clear_register_count = data_.u_submatch.clear_register_count;
				3190	if (clear_register_count == 0) {
				3191	on_success()->Emit(compiler, trace);
				3192	return;
				3193	}
				3194	int clear_registers_from = data_.u_submatch.clear_register_from;
				3195	Label clear_registers_backtrack;
				3196	Trace new_trace = *trace;
				3197	new_trace.set_backtrack(&clear_registers_backtrack);
				3198	on_success()->Emit(compiler, &new_trace);
				3199
				3200	assembler->Bind(&clear_registers_backtrack);
				3201	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				3202	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				3203
				3204	ASSERT(trace->backtrack() == NULL);
				3205	assembler->Backtrack();
				3206	return;
				3207	}
				3208	default:
				3209	UNREACHABLE();
				3210	}
				3211	}
				3212
				3213
				3214	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3215	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3216	if (!trace->is_trivial()) {
				3217	trace->Flush(compiler, this);
				3218	return;
				3219	}
				3220
				3221	LimitResult limit_result = LimitVersions(compiler, trace);
				3222	if (limit_result == DONE) return;
				3223	ASSERT(limit_result == CONTINUE);
				3224
				3225	RecursionCheck rc(compiler);
				3226
				3227	ASSERT_EQ(start_reg_ + 1, end_reg_);
				3228	if (compiler->ignore_case()) {
				3229	assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
				3230	trace->backtrack());
				3231	} else {
				3232	assembler->CheckNotBackReference(start_reg_, trace->backtrack());
				3233	}
				3234	on_success()->Emit(compiler, trace);
				3235	}
				3236
				3237
				3238	// -------------------------------------------------------------------
				3239	// Dot/dotty output
				3240
				3241
				3242	#ifdef DEBUG
				3243
				3244
				3245	class DotPrinter: public NodeVisitor {
				3246	public:
				3247	explicit DotPrinter(bool ignore_case)
				3248	: ignore_case_(ignore_case),
				3249	stream_(&alloc_) { }
				3250	void PrintNode(const char* label, RegExpNode* node);
				3251	void Visit(RegExpNode* node);
				3252	void PrintAttributes(RegExpNode* from);
				3253	StringStream* stream() { return &stream_; }
				3254	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				3255	#define DECLARE_VISIT(Type) \
				3256	virtual void Visit##Type(Type##Node* that);
				3257	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				3258	#undef DECLARE_VISIT
				3259	private:
				3260	bool ignore_case_;
				3261	HeapStringAllocator alloc_;
				3262	StringStream stream_;
				3263	};
				3264
				3265
				3266	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				3267	stream()->Add("digraph G {\n graph [label=\"");
				3268	for (int i = 0; label[i]; i++) {
				3269	switch (label[i]) {
				3270	case '\\':
				3271	stream()->Add("\\\\");
				3272	break;
				3273	case '"':
				3274	stream()->Add("\"");
				3275	break;
				3276	default:
				3277	stream()->Put(label[i]);
				3278	break;
				3279	}
				3280	}
				3281	stream()->Add("\"];\n");
				3282	Visit(node);
				3283	stream()->Add("}\n");
				3284	printf("%s", *(stream()->ToCString()));
				3285	}
				3286
				3287
				3288	void DotPrinter::Visit(RegExpNode* node) {
				3289	if (node->info()->visited) return;
				3290	node->info()->visited = true;
				3291	node->Accept(this);
				3292	}
				3293
				3294
				3295	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				3296	stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
				3297	Visit(on_failure);
				3298	}
				3299
				3300
				3301	class TableEntryBodyPrinter {
				3302	public:
				3303	TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
				3304	: stream_(stream), choice_(choice) { }
				3305	void Call(uc16 from, DispatchTable::Entry entry) {
				3306	OutSet* out_set = entry.out_set();
				3307	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3308	if (out_set->Get(i)) {
				3309	stream()->Add(" n%p:s%io%i -> n%p;\n",
				3310	choice(),
				3311	from,
				3312	i,
				3313	choice()->alternatives()->at(i).node());
				3314	}
				3315	}
				3316	}
				3317	private:
				3318	StringStream* stream() { return stream_; }
				3319	ChoiceNode* choice() { return choice_; }
				3320	StringStream* stream_;
				3321	ChoiceNode* choice_;
				3322	};
				3323
				3324
				3325	class TableEntryHeaderPrinter {
				3326	public:
				3327	explicit TableEntryHeaderPrinter(StringStream* stream)
				3328	: first_(true), stream_(stream) { }
				3329	void Call(uc16 from, DispatchTable::Entry entry) {
				3330	if (first_) {
				3331	first_ = false;
				3332	} else {
				3333	stream()->Add("\|");
				3334	}
				3335	stream()->Add("{\\%k-\\%k\|{", from, entry.to());
				3336	OutSet* out_set = entry.out_set();
				3337	int priority = 0;
				3338	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3339	if (out_set->Get(i)) {
				3340	if (priority > 0) stream()->Add("\|");
				3341	stream()->Add("<s%io%i> %i", from, i, priority);
				3342	priority++;
				3343	}
				3344	}
				3345	stream()->Add("}}");
				3346	}
Ben Murdoch	3fb3ca8	2011-12-02 17:19:32 +0000	[diff] [blame]	3347
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3348	private:
				3349	bool first_;
				3350	StringStream* stream() { return stream_; }
				3351	StringStream* stream_;
				3352	};
				3353
				3354
				3355	class AttributePrinter {
				3356	public:
				3357	explicit AttributePrinter(DotPrinter* out)
				3358	: out_(out), first_(true) { }
				3359	void PrintSeparator() {
				3360	if (first_) {
				3361	first_ = false;
				3362	} else {
				3363	out_->stream()->Add("\|");
				3364	}
				3365	}
				3366	void PrintBit(const char* name, bool value) {
				3367	if (!value) return;
				3368	PrintSeparator();
				3369	out_->stream()->Add("{%s}", name);
				3370	}
				3371	void PrintPositive(const char* name, int value) {
				3372	if (value < 0) return;
				3373	PrintSeparator();
				3374	out_->stream()->Add("{%s\|%x}", name, value);
				3375	}
				3376	private:
				3377	DotPrinter* out_;
				3378	bool first_;
				3379	};
				3380
				3381
				3382	void DotPrinter::PrintAttributes(RegExpNode* that) {
				3383	stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
				3384	"margin=0.1, fontsize=10, label=\"{",
				3385	that);
				3386	AttributePrinter printer(this);
				3387	NodeInfo* info = that->info();
				3388	printer.PrintBit("NI", info->follows_newline_interest);
				3389	printer.PrintBit("WI", info->follows_word_interest);
				3390	printer.PrintBit("SI", info->follows_start_interest);
				3391	Label* label = that->label();
				3392	if (label->is_bound())
				3393	printer.PrintPositive("@", label->pos());
				3394	stream()->Add("}\"];\n");
				3395	stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
				3396	"arrowhead=none];\n", that, that);
				3397	}
				3398
				3399
				3400	static const bool kPrintDispatchTable = false;
				3401	void DotPrinter::VisitChoice(ChoiceNode* that) {
				3402	if (kPrintDispatchTable) {
				3403	stream()->Add(" n%p [shape=Mrecord, label=\"", that);
				3404	TableEntryHeaderPrinter header_printer(stream());
				3405	that->GetTable(ignore_case_)->ForEach(&header_printer);
				3406	stream()->Add("\"]\n", that);
				3407	PrintAttributes(that);
				3408	TableEntryBodyPrinter body_printer(stream(), that);
				3409	that->GetTable(ignore_case_)->ForEach(&body_printer);
				3410	} else {
				3411	stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
				3412	for (int i = 0; i < that->alternatives()->length(); i++) {
				3413	GuardedAlternative alt = that->alternatives()->at(i);
				3414	stream()->Add(" n%p -> n%p;\n", that, alt.node());
				3415	}
				3416	}
				3417	for (int i = 0; i < that->alternatives()->length(); i++) {
				3418	GuardedAlternative alt = that->alternatives()->at(i);
				3419	alt.node()->Accept(this);
				3420	}
				3421	}
				3422
				3423
				3424	void DotPrinter::VisitText(TextNode* that) {
				3425	stream()->Add(" n%p [label=\"", that);
				3426	for (int i = 0; i < that->elements()->length(); i++) {
				3427	if (i > 0) stream()->Add(" ");
				3428	TextElement elm = that->elements()->at(i);
				3429	switch (elm.type) {
				3430	case TextElement::ATOM: {
				3431	stream()->Add("'%w'", elm.data.u_atom->data());
				3432	break;
				3433	}
				3434	case TextElement::CHAR_CLASS: {
				3435	RegExpCharacterClass* node = elm.data.u_char_class;
				3436	stream()->Add("[");
				3437	if (node->is_negated())
				3438	stream()->Add("^");
				3439	for (int j = 0; j < node->ranges()->length(); j++) {
				3440	CharacterRange range = node->ranges()->at(j);
				3441	stream()->Add("%k-%k", range.from(), range.to());
				3442	}
				3443	stream()->Add("]");
				3444	break;
				3445	}
				3446	default:
				3447	UNREACHABLE();
				3448	}
				3449	}
				3450	stream()->Add("\", shape=box, peripheries=2];\n");
				3451	PrintAttributes(that);
				3452	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3453	Visit(that->on_success());
				3454	}
				3455
				3456
				3457	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				3458	stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
				3459	that,
				3460	that->start_register(),
				3461	that->end_register());
				3462	PrintAttributes(that);
				3463	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3464	Visit(that->on_success());
				3465	}
				3466
				3467
				3468	void DotPrinter::VisitEnd(EndNode* that) {
				3469	stream()->Add(" n%p [style=bold, shape=point];\n", that);
				3470	PrintAttributes(that);
				3471	}
				3472
				3473
				3474	void DotPrinter::VisitAssertion(AssertionNode* that) {
				3475	stream()->Add(" n%p [", that);
				3476	switch (that->type()) {
				3477	case AssertionNode::AT_END:
				3478	stream()->Add("label=\"$\", shape=septagon");
				3479	break;
				3480	case AssertionNode::AT_START:
				3481	stream()->Add("label=\"^\", shape=septagon");
				3482	break;
				3483	case AssertionNode::AT_BOUNDARY:
				3484	stream()->Add("label=\"\\b\", shape=septagon");
				3485	break;
				3486	case AssertionNode::AT_NON_BOUNDARY:
				3487	stream()->Add("label=\"\\B\", shape=septagon");
				3488	break;
				3489	case AssertionNode::AFTER_NEWLINE:
				3490	stream()->Add("label=\"(?<=\\n)\", shape=septagon");
				3491	break;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3492	case AssertionNode::AFTER_WORD_CHARACTER:
				3493	stream()->Add("label=\"(?<=\\w)\", shape=septagon");
				3494	break;
				3495	case AssertionNode::AFTER_NONWORD_CHARACTER:
				3496	stream()->Add("label=\"(?<=\\W)\", shape=septagon");
				3497	break;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3498	}
				3499	stream()->Add("];\n");
				3500	PrintAttributes(that);
				3501	RegExpNode* successor = that->on_success();
				3502	stream()->Add(" n%p -> n%p;\n", that, successor);
				3503	Visit(successor);
				3504	}
				3505
				3506
				3507	void DotPrinter::VisitAction(ActionNode* that) {
				3508	stream()->Add(" n%p [", that);
				3509	switch (that->type_) {
				3510	case ActionNode::SET_REGISTER:
				3511	stream()->Add("label=\"$%i:=%i\", shape=octagon",
				3512	that->data_.u_store_register.reg,
				3513	that->data_.u_store_register.value);
				3514	break;
				3515	case ActionNode::INCREMENT_REGISTER:
				3516	stream()->Add("label=\"$%i++\", shape=octagon",
				3517	that->data_.u_increment_register.reg);
				3518	break;
				3519	case ActionNode::STORE_POSITION:
				3520	stream()->Add("label=\"$%i:=$pos\", shape=octagon",
				3521	that->data_.u_position_register.reg);
				3522	break;
				3523	case ActionNode::BEGIN_SUBMATCH:
				3524	stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
				3525	that->data_.u_submatch.current_position_register);
				3526	break;
				3527	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				3528	stream()->Add("label=\"escape\", shape=septagon");
				3529	break;
				3530	case ActionNode::EMPTY_MATCH_CHECK:
				3531	stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
				3532	that->data_.u_empty_match_check.start_register,
				3533	that->data_.u_empty_match_check.repetition_register,
				3534	that->data_.u_empty_match_check.repetition_limit);
				3535	break;
				3536	case ActionNode::CLEAR_CAPTURES: {
				3537	stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
				3538	that->data_.u_clear_captures.range_from,
				3539	that->data_.u_clear_captures.range_to);
				3540	break;
				3541	}
				3542	}
				3543	stream()->Add("];\n");
				3544	PrintAttributes(that);
				3545	RegExpNode* successor = that->on_success();
				3546	stream()->Add(" n%p -> n%p;\n", that, successor);
				3547	Visit(successor);
				3548	}
				3549
				3550
				3551	class DispatchTableDumper {
				3552	public:
				3553	explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
				3554	void Call(uc16 key, DispatchTable::Entry entry);
				3555	StringStream* stream() { return stream_; }
				3556	private:
				3557	StringStream* stream_;
				3558	};
				3559
				3560
				3561	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				3562	stream()->Add("[%k-%k]: {", key, entry.to());
				3563	OutSet* set = entry.out_set();
				3564	bool first = true;
				3565	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3566	if (set->Get(i)) {
				3567	if (first) {
				3568	first = false;
				3569	} else {
				3570	stream()->Add(", ");
				3571	}
				3572	stream()->Add("%i", i);
				3573	}
				3574	}
				3575	stream()->Add("}\n");
				3576	}
				3577
				3578
				3579	void DispatchTable::Dump() {
				3580	HeapStringAllocator alloc;
				3581	StringStream stream(&alloc);
				3582	DispatchTableDumper dumper(&stream);
				3583	tree()->ForEach(&dumper);
				3584	OS::PrintError("%s", *stream.ToCString());
				3585	}
				3586
				3587
				3588	void RegExpEngine::DotPrint(const char* label,
				3589	RegExpNode* node,
				3590	bool ignore_case) {
				3591	DotPrinter printer(ignore_case);
				3592	printer.PrintNode(label, node);
				3593	}
				3594
				3595
				3596	#endif // DEBUG
				3597
				3598
				3599	// -------------------------------------------------------------------
				3600	// Tree to graph conversion
				3601
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	3602	static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,
				3603	0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
				3604	0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };
				3605	static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3606
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	3607	static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
				3608	static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3609
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	3610	static const uc16 kDigitRanges[] = { '0', '9' };
				3611	static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3612
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	3613	static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,
				3614	0x2028, 0x2029 };
				3615	static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3616
				3617	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				3618	RegExpNode* on_success) {
				3619	ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
				3620	elms->Add(TextElement::Atom(this));
				3621	return new TextNode(elms, on_success);
				3622	}
				3623
				3624
				3625	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				3626	RegExpNode* on_success) {
				3627	return new TextNode(elements(), on_success);
				3628	}
				3629
				3630	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				3631	const uc16* special_class,
				3632	int length) {
				3633	ASSERT(ranges->length() != 0);
				3634	ASSERT(length != 0);
				3635	ASSERT(special_class[0] != 0);
				3636	if (ranges->length() != (length >> 1) + 1) {
				3637	return false;
				3638	}
				3639	CharacterRange range = ranges->at(0);
				3640	if (range.from() != 0) {
				3641	return false;
				3642	}
				3643	for (int i = 0; i < length; i += 2) {
				3644	if (special_class[i] != (range.to() + 1)) {
				3645	return false;
				3646	}
				3647	range = ranges->at((i >> 1) + 1);
				3648	if (special_class[i+1] != range.from() - 1) {
				3649	return false;
				3650	}
				3651	}
				3652	if (range.to() != 0xffff) {
				3653	return false;
				3654	}
				3655	return true;
				3656	}
				3657
				3658
				3659	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				3660	const uc16* special_class,
				3661	int length) {
				3662	if (ranges->length() * 2 != length) {
				3663	return false;
				3664	}
				3665	for (int i = 0; i < length; i += 2) {
				3666	CharacterRange range = ranges->at(i >> 1);
				3667	if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {
				3668	return false;
				3669	}
				3670	}
				3671	return true;
				3672	}
				3673
				3674
				3675	bool RegExpCharacterClass::is_standard() {
				3676	// TODO(lrn): Remove need for this function, by not throwing away information
				3677	// along the way.
				3678	if (is_negated_) {
				3679	return false;
				3680	}
				3681	if (set_.is_standard()) {
				3682	return true;
				3683	}
				3684	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3685	set_.set_standard_set_type('s');
				3686	return true;
				3687	}
				3688	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3689	set_.set_standard_set_type('S');
				3690	return true;
				3691	}
				3692	if (CompareInverseRanges(set_.ranges(),
				3693	kLineTerminatorRanges,
				3694	kLineTerminatorRangeCount)) {
				3695	set_.set_standard_set_type('.');
				3696	return true;
				3697	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3698	if (CompareRanges(set_.ranges(),
				3699	kLineTerminatorRanges,
				3700	kLineTerminatorRangeCount)) {
				3701	set_.set_standard_set_type('n');
				3702	return true;
				3703	}
				3704	if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3705	set_.set_standard_set_type('w');
				3706	return true;
				3707	}
				3708	if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3709	set_.set_standard_set_type('W');
				3710	return true;
				3711	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3712	return false;
				3713	}
				3714
				3715
				3716	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				3717	RegExpNode* on_success) {
				3718	return new TextNode(this, on_success);
				3719	}
				3720
				3721
				3722	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				3723	RegExpNode* on_success) {
				3724	ZoneList<RegExpTree> alternatives = this->alternatives();
				3725	int length = alternatives->length();
				3726	ChoiceNode* result = new ChoiceNode(length);
				3727	for (int i = 0; i < length; i++) {
				3728	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				3729	on_success));
				3730	result->AddAlternative(alternative);
				3731	}
				3732	return result;
				3733	}
				3734
				3735
				3736	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				3737	RegExpNode* on_success) {
				3738	return ToNode(min(),
				3739	max(),
				3740	is_greedy(),
				3741	body(),
				3742	compiler,
				3743	on_success);
				3744	}
				3745
				3746
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3747	// Scoped object to keep track of how much we unroll quantifier loops in the
				3748	// regexp graph generator.
				3749	class RegExpExpansionLimiter {
				3750	public:
				3751	static const int kMaxExpansionFactor = 6;
				3752	RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
				3753	: compiler_(compiler),
				3754	saved_expansion_factor_(compiler->current_expansion_factor()),
				3755	ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
				3756	ASSERT(factor > 0);
				3757	if (ok_to_expand_) {
				3758	if (factor > kMaxExpansionFactor) {
				3759	// Avoid integer overflow of the current expansion factor.
				3760	ok_to_expand_ = false;
				3761	compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
				3762	} else {
				3763	int new_factor = saved_expansion_factor_ * factor;
				3764	ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
				3765	compiler->set_current_expansion_factor(new_factor);
				3766	}
				3767	}
				3768	}
				3769
				3770	~RegExpExpansionLimiter() {
				3771	compiler_->set_current_expansion_factor(saved_expansion_factor_);
				3772	}
				3773
				3774	bool ok_to_expand() { return ok_to_expand_; }
				3775
				3776	private:
				3777	RegExpCompiler* compiler_;
				3778	int saved_expansion_factor_;
				3779	bool ok_to_expand_;
				3780
				3781	DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
				3782	};
				3783
				3784
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3785	RegExpNode* RegExpQuantifier::ToNode(int min,
				3786	int max,
				3787	bool is_greedy,
				3788	RegExpTree* body,
				3789	RegExpCompiler* compiler,
				3790	RegExpNode* on_success,
				3791	bool not_at_start) {
				3792	// x{f, t} becomes this:
				3793	//
				3794	// (r++)<-.
				3795	// \| `
				3796	// \| (x)
				3797	// v ^
				3798	// (r=0)-->(?)---/ [if r < t]
				3799	// \|
				3800	// [if r >= f] \----> ...
				3801	//
				3802
				3803	// 15.10.2.5 RepeatMatcher algorithm.
				3804	// The parser has already eliminated the case where max is 0. In the case
				3805	// where max_match is zero the parser has removed the quantifier if min was
				3806	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				3807
				3808	// If we know that we cannot match zero length then things are a little
				3809	// simpler since we don't need to make the special zero length match check
				3810	// from step 2.1. If the min and max are small we can unroll a little in
				3811	// this case.
				3812	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				3813	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				3814	if (max == 0) return on_success; // This can happen due to recursion.
				3815	bool body_can_be_empty = (body->min_match() == 0);
				3816	int body_start_reg = RegExpCompiler::kNoRegister;
				3817	Interval capture_registers = body->CaptureRegisters();
				3818	bool needs_capture_clearing = !capture_registers.is_empty();
				3819	if (body_can_be_empty) {
				3820	body_start_reg = compiler->AllocateRegister();
				3821	} else if (FLAG_regexp_optimization && !needs_capture_clearing) {
				3822	// Only unroll if there are no captures and the body can't be
				3823	// empty.
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3824	{
				3825	RegExpExpansionLimiter limiter(
				3826	compiler, min + ((max != min) ? 1 : 0));
				3827	if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
				3828	int new_max = (max == kInfinity) ? max : max - min;
				3829	// Recurse once to get the loop or optional matches after the fixed
				3830	// ones.
				3831	RegExpNode* answer = ToNode(
				3832	0, new_max, is_greedy, body, compiler, on_success, true);
				3833	// Unroll the forced matches from 0 to min. This can cause chains of
				3834	// TextNodes (which the parser does not generate). These should be
				3835	// combined if it turns out they hinder good code generation.
				3836	for (int i = 0; i < min; i++) {
				3837	answer = body->ToNode(compiler, answer);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3838	}
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3839	return answer;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3840	}
Ben Murdoch	257744e	2011-11-30 15:57:28 +0000	[diff] [blame]	3841	}
				3842	if (max <= kMaxUnrolledMaxMatches && min == 0) {
				3843	ASSERT(max > 0); // Due to the 'if' above.
				3844	RegExpExpansionLimiter limiter(compiler, max);
				3845	if (limiter.ok_to_expand()) {
				3846	// Unroll the optional matches up to max.
				3847	RegExpNode* answer = on_success;
				3848	for (int i = 0; i < max; i++) {
				3849	ChoiceNode* alternation = new ChoiceNode(2);
				3850	if (is_greedy) {
				3851	alternation->AddAlternative(
				3852	GuardedAlternative(body->ToNode(compiler, answer)));
				3853	alternation->AddAlternative(GuardedAlternative(on_success));
				3854	} else {
				3855	alternation->AddAlternative(GuardedAlternative(on_success));
				3856	alternation->AddAlternative(
				3857	GuardedAlternative(body->ToNode(compiler, answer)));
				3858	}
				3859	answer = alternation;
				3860	if (not_at_start) alternation->set_not_at_start();
				3861	}
				3862	return answer;
				3863	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3864	}
				3865	}
				3866	bool has_min = min > 0;
				3867	bool has_max = max < RegExpTree::kInfinity;
				3868	bool needs_counter = has_min \|\| has_max;
				3869	int reg_ctr = needs_counter
				3870	? compiler->AllocateRegister()
				3871	: RegExpCompiler::kNoRegister;
				3872	LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
				3873	if (not_at_start) center->set_not_at_start();
				3874	RegExpNode* loop_return = needs_counter
				3875	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				3876	: static_cast<RegExpNode*>(center);
				3877	if (body_can_be_empty) {
				3878	// If the body can be empty we need to check if it was and then
				3879	// backtrack.
				3880	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				3881	reg_ctr,
				3882	min,
				3883	loop_return);
				3884	}
				3885	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				3886	if (body_can_be_empty) {
				3887	// If the body can be empty we need to store the start position
				3888	// so we can bail out if it was empty.
				3889	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				3890	}
				3891	if (needs_capture_clearing) {
				3892	// Before entering the body of this loop we need to clear captures.
				3893	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				3894	}
				3895	GuardedAlternative body_alt(body_node);
				3896	if (has_max) {
				3897	Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
				3898	body_alt.AddGuard(body_guard);
				3899	}
				3900	GuardedAlternative rest_alt(on_success);
				3901	if (has_min) {
				3902	Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
				3903	rest_alt.AddGuard(rest_guard);
				3904	}
				3905	if (is_greedy) {
				3906	center->AddLoopAlternative(body_alt);
				3907	center->AddContinueAlternative(rest_alt);
				3908	} else {
				3909	center->AddContinueAlternative(rest_alt);
				3910	center->AddLoopAlternative(body_alt);
				3911	}
				3912	if (needs_counter) {
				3913	return ActionNode::SetRegister(reg_ctr, 0, center);
				3914	} else {
				3915	return center;
				3916	}
				3917	}
				3918
				3919
				3920	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				3921	RegExpNode* on_success) {
				3922	NodeInfo info;
				3923	switch (type()) {
				3924	case START_OF_LINE:
				3925	return AssertionNode::AfterNewline(on_success);
				3926	case START_OF_INPUT:
				3927	return AssertionNode::AtStart(on_success);
				3928	case BOUNDARY:
				3929	return AssertionNode::AtBoundary(on_success);
				3930	case NON_BOUNDARY:
				3931	return AssertionNode::AtNonBoundary(on_success);
				3932	case END_OF_INPUT:
				3933	return AssertionNode::AtEnd(on_success);
				3934	case END_OF_LINE: {
				3935	// Compile $ in multiline regexps as an alternation with a positive
				3936	// lookahead in one side and an end-of-input on the other side.
				3937	// We need two registers for the lookahead.
				3938	int stack_pointer_register = compiler->AllocateRegister();
				3939	int position_register = compiler->AllocateRegister();
				3940	// The ChoiceNode to distinguish between a newline and end-of-input.
				3941	ChoiceNode* result = new ChoiceNode(2);
				3942	// Create a newline atom.
				3943	ZoneList<CharacterRange>* newline_ranges =
				3944	new ZoneList<CharacterRange>(3);
				3945	CharacterRange::AddClassEscape('n', newline_ranges);
				3946	RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
				3947	TextNode* newline_matcher = new TextNode(
				3948	newline_atom,
				3949	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3950	position_register,
				3951	0, // No captures inside.
				3952	-1, // Ignored if no captures.
				3953	on_success));
				3954	// Create an end-of-input matcher.
				3955	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				3956	stack_pointer_register,
				3957	position_register,
				3958	newline_matcher);
				3959	// Add the two alternatives to the ChoiceNode.
				3960	GuardedAlternative eol_alternative(end_of_line);
				3961	result->AddAlternative(eol_alternative);
				3962	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				3963	result->AddAlternative(end_alternative);
				3964	return result;
				3965	}
				3966	default:
				3967	UNREACHABLE();
				3968	}
				3969	return on_success;
				3970	}
				3971
				3972
				3973	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				3974	RegExpNode* on_success) {
				3975	return new BackReferenceNode(RegExpCapture::StartRegister(index()),
				3976	RegExpCapture::EndRegister(index()),
				3977	on_success);
				3978	}
				3979
				3980
				3981	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				3982	RegExpNode* on_success) {
				3983	return on_success;
				3984	}
				3985
				3986
				3987	RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
				3988	RegExpNode* on_success) {
				3989	int stack_pointer_register = compiler->AllocateRegister();
				3990	int position_register = compiler->AllocateRegister();
				3991
				3992	const int registers_per_capture = 2;
				3993	const int register_of_first_capture = 2;
				3994	int register_count = capture_count_ * registers_per_capture;
				3995	int register_start =
				3996	register_of_first_capture + capture_from_ * registers_per_capture;
				3997
				3998	RegExpNode* success;
				3999	if (is_positive()) {
				4000	RegExpNode* node = ActionNode::BeginSubmatch(
				4001	stack_pointer_register,
				4002	position_register,
				4003	body()->ToNode(
				4004	compiler,
				4005	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				4006	position_register,
				4007	register_count,
				4008	register_start,
				4009	on_success)));
				4010	return node;
				4011	} else {
				4012	// We use a ChoiceNode for a negative lookahead because it has most of
				4013	// the characteristics we need. It has the body of the lookahead as its
				4014	// first alternative and the expression after the lookahead of the second
				4015	// alternative. If the first alternative succeeds then the
				4016	// NegativeSubmatchSuccess will unwind the stack including everything the
				4017	// choice node set up and backtrack. If the first alternative fails then
				4018	// the second alternative is tried, which is exactly the desired result
				4019	// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
				4020	// ChoiceNode that knows to ignore the first exit when calculating quick
				4021	// checks.
				4022	GuardedAlternative body_alt(
				4023	body()->ToNode(
				4024	compiler,
				4025	success = new NegativeSubmatchSuccess(stack_pointer_register,
				4026	position_register,
				4027	register_count,
				4028	register_start)));
				4029	ChoiceNode* choice_node =
				4030	new NegativeLookaheadChoiceNode(body_alt,
				4031	GuardedAlternative(on_success));
				4032	return ActionNode::BeginSubmatch(stack_pointer_register,
				4033	position_register,
				4034	choice_node);
				4035	}
				4036	}
				4037
				4038
				4039	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				4040	RegExpNode* on_success) {
				4041	return ToNode(body(), index(), compiler, on_success);
				4042	}
				4043
				4044
				4045	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				4046	int index,
				4047	RegExpCompiler* compiler,
				4048	RegExpNode* on_success) {
				4049	int start_reg = RegExpCapture::StartRegister(index);
				4050	int end_reg = RegExpCapture::EndRegister(index);
				4051	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				4052	RegExpNode* body_node = body->ToNode(compiler, store_end);
				4053	return ActionNode::StorePosition(start_reg, true, body_node);
				4054	}
				4055
				4056
				4057	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				4058	RegExpNode* on_success) {
				4059	ZoneList<RegExpTree> children = nodes();
				4060	RegExpNode* current = on_success;
				4061	for (int i = children->length() - 1; i >= 0; i--) {
				4062	current = children->at(i)->ToNode(compiler, current);
				4063	}
				4064	return current;
				4065	}
				4066
				4067
				4068	static void AddClass(const uc16* elmv,
				4069	int elmc,
				4070	ZoneList<CharacterRange>* ranges) {
				4071	for (int i = 0; i < elmc; i += 2) {
				4072	ASSERT(elmv[i] <= elmv[i + 1]);
				4073	ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
				4074	}
				4075	}
				4076
				4077
				4078	static void AddClassNegated(const uc16 *elmv,
				4079	int elmc,
				4080	ZoneList<CharacterRange>* ranges) {
				4081	ASSERT(elmv[0] != 0x0000);
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	4082	ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4083	uc16 last = 0x0000;
				4084	for (int i = 0; i < elmc; i += 2) {
				4085	ASSERT(last <= elmv[i] - 1);
				4086	ASSERT(elmv[i] <= elmv[i + 1]);
				4087	ranges->Add(CharacterRange(last, elmv[i] - 1));
				4088	last = elmv[i + 1] + 1;
				4089	}
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	4090	ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4091	}
				4092
				4093
				4094	void CharacterRange::AddClassEscape(uc16 type,
				4095	ZoneList<CharacterRange>* ranges) {
				4096	switch (type) {
				4097	case 's':
				4098	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
				4099	break;
				4100	case 'S':
				4101	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
				4102	break;
				4103	case 'w':
				4104	AddClass(kWordRanges, kWordRangeCount, ranges);
				4105	break;
				4106	case 'W':
				4107	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
				4108	break;
				4109	case 'd':
				4110	AddClass(kDigitRanges, kDigitRangeCount, ranges);
				4111	break;
				4112	case 'D':
				4113	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
				4114	break;
				4115	case '.':
				4116	AddClassNegated(kLineTerminatorRanges,
				4117	kLineTerminatorRangeCount,
				4118	ranges);
				4119	break;
				4120	// This is not a character range as defined by the spec but a
				4121	// convenient shorthand for a character class that matches any
				4122	// character.
				4123	case '*':
				4124	ranges->Add(CharacterRange::Everything());
				4125	break;
				4126	// This is the set of characters matched by the $ and ^ symbols
				4127	// in multiline mode.
				4128	case 'n':
				4129	AddClass(kLineTerminatorRanges,
				4130	kLineTerminatorRangeCount,
				4131	ranges);
				4132	break;
				4133	default:
				4134	UNREACHABLE();
				4135	}
				4136	}
				4137
				4138
				4139	Vector<const uc16> CharacterRange::GetWordBounds() {
				4140	return Vector<const uc16>(kWordRanges, kWordRangeCount);
				4141	}
				4142
				4143
				4144	class CharacterRangeSplitter {
				4145	public:
				4146	CharacterRangeSplitter(ZoneList<CharacterRange>** included,
				4147	ZoneList<CharacterRange>** excluded)
				4148	: included_(included),
				4149	excluded_(excluded) { }
				4150	void Call(uc16 from, DispatchTable::Entry entry);
				4151
				4152	static const int kInBase = 0;
				4153	static const int kInOverlay = 1;
				4154
				4155	private:
				4156	ZoneList<CharacterRange>** included_;
				4157	ZoneList<CharacterRange>** excluded_;
				4158	};
				4159
				4160
				4161	void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
				4162	if (!entry.out_set()->Get(kInBase)) return;
				4163	ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
				4164	? included_
				4165	: excluded_;
				4166	if (target == NULL) target = new ZoneList<CharacterRange>(2);
				4167	(*target)->Add(CharacterRange(entry.from(), entry.to()));
				4168	}
				4169
				4170
				4171	void CharacterRange::Split(ZoneList<CharacterRange>* base,
				4172	Vector<const uc16> overlay,
				4173	ZoneList<CharacterRange>** included,
				4174	ZoneList<CharacterRange>** excluded) {
				4175	ASSERT_EQ(NULL, *included);
				4176	ASSERT_EQ(NULL, *excluded);
				4177	DispatchTable table;
				4178	for (int i = 0; i < base->length(); i++)
				4179	table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
				4180	for (int i = 0; i < overlay.length(); i += 2) {
				4181	table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
				4182	CharacterRangeSplitter::kInOverlay);
				4183	}
				4184	CharacterRangeSplitter callback(included, excluded);
				4185	table.ForEach(&callback);
				4186	}
				4187
				4188
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4189	void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
				4190	bool is_ascii) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4191	Isolate* isolate = Isolate::Current();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4192	uc16 bottom = from();
				4193	uc16 top = to();
				4194	if (is_ascii) {
				4195	if (bottom > String::kMaxAsciiCharCode) return;
				4196	if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
				4197	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4198	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4199	if (top == bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4200	// If this is a singleton we just expand the one character.
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4201	int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4202	for (int i = 0; i < length; i++) {
				4203	uc32 chr = chars[i];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4204	if (chr != bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4205	ranges->Add(CharacterRange::Singleton(chars[i]));
				4206	}
				4207	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4208	} else {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4209	// If this is a range we expand the characters block by block,
				4210	// expanding contiguous subranges (blocks) one at a time.
				4211	// The approach is as follows. For a given start character we
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4212	// look up the remainder of the block that contains it (represented
				4213	// by the end point), for instance we find 'z' if the character
				4214	// is 'c'. A block is characterized by the property
				4215	// that all characters uncanonicalize in the same way, except that
				4216	// each entry in the result is incremented by the distance from the first
				4217	// element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
				4218	// the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
				4219	// Once we've found the end point we look up its uncanonicalization
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4220	// and produce a range for each element. For instance for [c-f]
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4221	// we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4222	// add a range if it is not already contained in the input, so [c-f]
				4223	// will be skipped but [C-F] will be added. If this range is not
				4224	// completely contained in a block we do this for all the blocks
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4225	// covered by the range (handling characters that is not in a block
				4226	// as a "singleton block").
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4227	unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4228	int pos = bottom;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4229	while (pos < top) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4230	int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4231	uc16 block_end;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4232	if (length == 0) {
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4233	block_end = pos;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4234	} else {
				4235	ASSERT_EQ(1, length);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4236	block_end = range[0];
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4237	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4238	int end = (block_end > top) ? top : block_end;
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4239	length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4240	for (int i = 0; i < length; i++) {
				4241	uc32 c = range[i];
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4242	uc16 range_from = c - (block_end - pos);
				4243	uc16 range_to = c - (block_end - end);
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4244	if (!(bottom <= range_from && range_to <= top)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4245	ranges->Add(CharacterRange(range_from, range_to));
				4246	}
				4247	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4248	pos = end + 1;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4249	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4250	}
				4251	}
				4252
				4253
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4254	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				4255	ASSERT_NOT_NULL(ranges);
				4256	int n = ranges->length();
				4257	if (n <= 1) return true;
				4258	int max = ranges->at(0).to();
				4259	for (int i = 1; i < n; i++) {
				4260	CharacterRange next_range = ranges->at(i);
				4261	if (next_range.from() <= max + 1) return false;
				4262	max = next_range.to();
				4263	}
				4264	return true;
				4265	}
				4266
				4267	SetRelation CharacterRange::WordCharacterRelation(
				4268	ZoneList<CharacterRange>* range) {
				4269	ASSERT(IsCanonical(range));
				4270	int i = 0; // Word character range index.
				4271	int j = 0; // Argument range index.
				4272	ASSERT_NE(0, kWordRangeCount);
				4273	SetRelation result;
				4274	if (range->length() == 0) {
				4275	result.SetElementsInSecondSet();
				4276	return result;
				4277	}
				4278	CharacterRange argument_range = range->at(0);
				4279	CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
				4280	while (i < kWordRangeCount && j < range->length()) {
				4281	// Check the two ranges for the five cases:
				4282	// - no overlap.
				4283	// - partial overlap (there are elements in both ranges that isn't
				4284	// in the other, and there are also elements that are in both).
				4285	// - argument range entirely inside word range.
				4286	// - word range entirely inside argument range.
				4287	// - ranges are completely equal.
				4288
				4289	// First check for no overlap. The earlier range is not in the other set.
				4290	if (argument_range.from() > word_range.to()) {
				4291	// Ranges are disjoint. The earlier word range contains elements that
				4292	// cannot be in the argument set.
				4293	result.SetElementsInSecondSet();
				4294	} else if (word_range.from() > argument_range.to()) {
				4295	// Ranges are disjoint. The earlier argument range contains elements that
				4296	// cannot be in the word set.
				4297	result.SetElementsInFirstSet();
				4298	} else if (word_range.from() <= argument_range.from() &&
				4299	word_range.to() >= argument_range.from()) {
				4300	result.SetElementsInBothSets();
				4301	// argument range completely inside word range.
				4302	if (word_range.from() < argument_range.from() \|\|
				4303	word_range.to() > argument_range.from()) {
				4304	result.SetElementsInSecondSet();
				4305	}
				4306	} else if (word_range.from() >= argument_range.from() &&
				4307	word_range.to() <= argument_range.from()) {
				4308	result.SetElementsInBothSets();
				4309	result.SetElementsInFirstSet();
				4310	} else {
				4311	// There is overlap, and neither is a subrange of the other
				4312	result.SetElementsInFirstSet();
				4313	result.SetElementsInSecondSet();
				4314	result.SetElementsInBothSets();
				4315	}
				4316	if (result.NonTrivialIntersection()) {
				4317	// The result is as (im)precise as we can possibly make it.
				4318	return result;
				4319	}
				4320	// Progress the range(s) with minimal to-character.
				4321	uc16 word_to = word_range.to();
				4322	uc16 argument_to = argument_range.to();
				4323	if (argument_to <= word_to) {
				4324	j++;
				4325	if (j < range->length()) {
				4326	argument_range = range->at(j);
				4327	}
				4328	}
				4329	if (word_to <= argument_to) {
				4330	i += 2;
				4331	if (i < kWordRangeCount) {
				4332	word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
				4333	}
				4334	}
				4335	}
				4336	// Check if anything wasn't compared in the loop.
				4337	if (i < kWordRangeCount) {
				4338	// word range contains something not in argument range.
				4339	result.SetElementsInSecondSet();
				4340	} else if (j < range->length()) {
				4341	// Argument range contains something not in word range.
				4342	result.SetElementsInFirstSet();
				4343	}
				4344
				4345	return result;
				4346	}
				4347
				4348
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4349	ZoneList<CharacterRange>* CharacterSet::ranges() {
				4350	if (ranges_ == NULL) {
				4351	ranges_ = new ZoneList<CharacterRange>(2);
				4352	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
				4353	}
				4354	return ranges_;
				4355	}
				4356
				4357
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4358	// Move a number of elements in a zonelist to another position
				4359	// in the same list. Handles overlapping source and target areas.
				4360	static void MoveRanges(ZoneList<CharacterRange>* list,
				4361	int from,
				4362	int to,
				4363	int count) {
				4364	// Ranges are potentially overlapping.
				4365	if (from < to) {
				4366	for (int i = count - 1; i >= 0; i--) {
				4367	list->at(to + i) = list->at(from + i);
				4368	}
				4369	} else {
				4370	for (int i = 0; i < count; i++) {
				4371	list->at(to + i) = list->at(from + i);
				4372	}
				4373	}
				4374	}
				4375
				4376
				4377	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				4378	int count,
				4379	CharacterRange insert) {
				4380	// Inserts a range into list[0..count[, which must be sorted
				4381	// by from value and non-overlapping and non-adjacent, using at most
				4382	// list[0..count] for the result. Returns the number of resulting
				4383	// canonicalized ranges. Inserting a range may collapse existing ranges into
				4384	// fewer ranges, so the return value can be anything in the range 1..count+1.
				4385	uc16 from = insert.from();
				4386	uc16 to = insert.to();
				4387	int start_pos = 0;
				4388	int end_pos = count;
				4389	for (int i = count - 1; i >= 0; i--) {
				4390	CharacterRange current = list->at(i);
				4391	if (current.from() > to + 1) {
				4392	end_pos = i;
				4393	} else if (current.to() + 1 < from) {
				4394	start_pos = i + 1;
				4395	break;
				4396	}
				4397	}
				4398
				4399	// Inserted range overlaps, or is adjacent to, ranges at positions
				4400	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				4401	// not affected by the insertion.
				4402	// If start_pos == end_pos, the range must be inserted before start_pos.
				4403	// if start_pos < end_pos, the entire range from start_pos to end_pos
				4404	// must be merged with the insert range.
				4405
				4406	if (start_pos == end_pos) {
				4407	// Insert between existing ranges at position start_pos.
				4408	if (start_pos < count) {
				4409	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				4410	}
				4411	list->at(start_pos) = insert;
				4412	return count + 1;
				4413	}
				4414	if (start_pos + 1 == end_pos) {
				4415	// Replace single existing range at position start_pos.
				4416	CharacterRange to_replace = list->at(start_pos);
				4417	int new_from = Min(to_replace.from(), from);
				4418	int new_to = Max(to_replace.to(), to);
				4419	list->at(start_pos) = CharacterRange(new_from, new_to);
				4420	return count;
				4421	}
				4422	// Replace a number of existing ranges from start_pos to end_pos - 1.
				4423	// Move the remaining ranges down.
				4424
				4425	int new_from = Min(list->at(start_pos).from(), from);
				4426	int new_to = Max(list->at(end_pos - 1).to(), to);
				4427	if (end_pos < count) {
				4428	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				4429	}
				4430	list->at(start_pos) = CharacterRange(new_from, new_to);
				4431	return count - (end_pos - start_pos) + 1;
				4432	}
				4433
				4434
				4435	void CharacterSet::Canonicalize() {
				4436	// Special/default classes are always considered canonical. The result
				4437	// of calling ranges() will be sorted.
				4438	if (ranges_ == NULL) return;
				4439	CharacterRange::Canonicalize(ranges_);
				4440	}
				4441
				4442
				4443	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				4444	if (character_ranges->length() <= 1) return;
				4445	// Check whether ranges are already canonical (increasing, non-overlapping,
				4446	// non-adjacent).
				4447	int n = character_ranges->length();
				4448	int max = character_ranges->at(0).to();
				4449	int i = 1;
				4450	while (i < n) {
				4451	CharacterRange current = character_ranges->at(i);
				4452	if (current.from() <= max + 1) {
				4453	break;
				4454	}
				4455	max = current.to();
				4456	i++;
				4457	}
				4458	// Canonical until the i'th range. If that's all of them, we are done.
				4459	if (i == n) return;
				4460
				4461	// The ranges at index i and forward are not canonicalized. Make them so by
				4462	// doing the equivalent of insertion sort (inserting each into the previous
				4463	// list, in order).
				4464	// Notice that inserting a range can reduce the number of ranges in the
				4465	// result due to combining of adjacent and overlapping ranges.
				4466	int read = i; // Range to insert.
				4467	int num_canonical = i; // Length of canonicalized part of list.
				4468	do {
				4469	num_canonical = InsertRangeInCanonicalList(character_ranges,
				4470	num_canonical,
				4471	character_ranges->at(read));
				4472	read++;
				4473	} while (read < n);
				4474	character_ranges->Rewind(num_canonical);
				4475
				4476	ASSERT(CharacterRange::IsCanonical(character_ranges));
				4477	}
				4478
				4479
				4480	// Utility function for CharacterRange::Merge. Adds a range at the end of
				4481	// a canonicalized range list, if necessary merging the range with the last
				4482	// range of the list.
				4483	static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
				4484	if (set == NULL) return;
				4485	ASSERT(set->length() == 0 \|\| set->at(set->length() - 1).to() < range.from());
				4486	int n = set->length();
				4487	if (n > 0) {
				4488	CharacterRange lastRange = set->at(n - 1);
				4489	if (lastRange.to() == range.from() - 1) {
				4490	set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
				4491	return;
				4492	}
				4493	}
				4494	set->Add(range);
				4495	}
				4496
				4497
				4498	static void AddRangeToSelectedSet(int selector,
				4499	ZoneList<CharacterRange>* first_set,
				4500	ZoneList<CharacterRange>* second_set,
				4501	ZoneList<CharacterRange>* intersection_set,
				4502	CharacterRange range) {
				4503	switch (selector) {
				4504	case kInsideFirst:
				4505	AddRangeToSet(first_set, range);
				4506	break;
				4507	case kInsideSecond:
				4508	AddRangeToSet(second_set, range);
				4509	break;
				4510	case kInsideBoth:
				4511	AddRangeToSet(intersection_set, range);
				4512	break;
				4513	}
				4514	}
				4515
				4516
				4517
				4518	void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
				4519	ZoneList<CharacterRange>* second_set,
				4520	ZoneList<CharacterRange>* first_set_only_out,
				4521	ZoneList<CharacterRange>* second_set_only_out,
				4522	ZoneList<CharacterRange>* both_sets_out) {
				4523	// Inputs are canonicalized.
				4524	ASSERT(CharacterRange::IsCanonical(first_set));
				4525	ASSERT(CharacterRange::IsCanonical(second_set));
				4526	// Outputs are empty, if applicable.
				4527	ASSERT(first_set_only_out == NULL \|\| first_set_only_out->length() == 0);
				4528	ASSERT(second_set_only_out == NULL \|\| second_set_only_out->length() == 0);
				4529	ASSERT(both_sets_out == NULL \|\| both_sets_out->length() == 0);
				4530
				4531	// Merge sets by iterating through the lists in order of lowest "from" value,
				4532	// and putting intervals into one of three sets.
				4533
				4534	if (first_set->length() == 0) {
				4535	second_set_only_out->AddAll(*second_set);
				4536	return;
				4537	}
				4538	if (second_set->length() == 0) {
				4539	first_set_only_out->AddAll(*first_set);
				4540	return;
				4541	}
				4542	// Indices into input lists.
				4543	int i1 = 0;
				4544	int i2 = 0;
				4545	// Cache length of input lists.
				4546	int n1 = first_set->length();
				4547	int n2 = second_set->length();
				4548	// Current range. May be invalid if state is kInsideNone.
				4549	int from = 0;
				4550	int to = -1;
				4551	// Where current range comes from.
				4552	int state = kInsideNone;
				4553
				4554	while (i1 < n1 \|\| i2 < n2) {
				4555	CharacterRange next_range;
				4556	int range_source;
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4557	if (i2 == n2 \|\|
				4558	(i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
				4559	// Next smallest element is in first set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4560	next_range = first_set->at(i1++);
				4561	range_source = kInsideFirst;
				4562	} else {
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4563	// Next smallest element is in second set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4564	next_range = second_set->at(i2++);
				4565	range_source = kInsideSecond;
				4566	}
				4567	if (to < next_range.from()) {
				4568	// Ranges disjoint: \|current\| \|next\|
				4569	AddRangeToSelectedSet(state,
				4570	first_set_only_out,
				4571	second_set_only_out,
				4572	both_sets_out,
				4573	CharacterRange(from, to));
				4574	from = next_range.from();
				4575	to = next_range.to();
				4576	state = range_source;
				4577	} else {
				4578	if (from < next_range.from()) {
				4579	AddRangeToSelectedSet(state,
				4580	first_set_only_out,
				4581	second_set_only_out,
				4582	both_sets_out,
				4583	CharacterRange(from, next_range.from()-1));
				4584	}
				4585	if (to < next_range.to()) {
				4586	// Ranges overlap: \|current\|
				4587	// \|next\|
				4588	AddRangeToSelectedSet(state \| range_source,
				4589	first_set_only_out,
				4590	second_set_only_out,
				4591	both_sets_out,
				4592	CharacterRange(next_range.from(), to));
				4593	from = to + 1;
				4594	to = next_range.to();
				4595	state = range_source;
				4596	} else {
				4597	// Range included: \|current\| , possibly ending at same character.
				4598	// \|next\|
				4599	AddRangeToSelectedSet(
				4600	state \| range_source,
				4601	first_set_only_out,
				4602	second_set_only_out,
				4603	both_sets_out,
				4604	CharacterRange(next_range.from(), next_range.to()));
				4605	from = next_range.to() + 1;
				4606	// If ranges end at same character, both ranges are consumed completely.
				4607	if (next_range.to() == to) state = kInsideNone;
				4608	}
				4609	}
				4610	}
				4611	AddRangeToSelectedSet(state,
				4612	first_set_only_out,
				4613	second_set_only_out,
				4614	both_sets_out,
				4615	CharacterRange(from, to));
				4616	}
				4617
				4618
				4619	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				4620	ZoneList<CharacterRange>* negated_ranges) {
				4621	ASSERT(CharacterRange::IsCanonical(ranges));
				4622	ASSERT_EQ(0, negated_ranges->length());
				4623	int range_count = ranges->length();
				4624	uc16 from = 0;
				4625	int i = 0;
				4626	if (range_count > 0 && ranges->at(0).from() == 0) {
				4627	from = ranges->at(0).to();
				4628	i = 1;
				4629	}
				4630	while (i < range_count) {
				4631	CharacterRange range = ranges->at(i);
				4632	negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
				4633	from = range.to();
				4634	i++;
				4635	}
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	4636	if (from < String::kMaxUtf16CodeUnit) {
				4637	negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4638	}
				4639	}
				4640
				4641
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4642
				4643	// -------------------------------------------------------------------
				4644	// Interest propagation
				4645
				4646
				4647	RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
				4648	for (int i = 0; i < siblings_.length(); i++) {
				4649	RegExpNode* sibling = siblings_.Get(i);
				4650	if (sibling->info()->Matches(info))
				4651	return sibling;
				4652	}
				4653	return NULL;
				4654	}
				4655
				4656
				4657	RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
				4658	ASSERT_EQ(false, *cloned);
				4659	siblings_.Ensure(this);
				4660	RegExpNode* result = TryGetSibling(info);
				4661	if (result != NULL) return result;
				4662	result = this->Clone();
				4663	NodeInfo* new_info = result->info();
				4664	new_info->ResetCompilationState();
				4665	new_info->AddFromPreceding(info);
				4666	AddSibling(result);
				4667	*cloned = true;
				4668	return result;
				4669	}
				4670
				4671
				4672	template <class C>
				4673	static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
				4674	NodeInfo full_info(*node->info());
				4675	full_info.AddFromPreceding(info);
				4676	bool cloned = false;
				4677	return RegExpNode::EnsureSibling(node, &full_info, &cloned);
				4678	}
				4679
				4680
				4681	// -------------------------------------------------------------------
				4682	// Splay tree
				4683
				4684
				4685	OutSet* OutSet::Extend(unsigned value) {
				4686	if (Get(value))
				4687	return this;
				4688	if (successors() != NULL) {
				4689	for (int i = 0; i < successors()->length(); i++) {
				4690	OutSet* successor = successors()->at(i);
				4691	if (successor->Get(value))
				4692	return successor;
				4693	}
				4694	} else {
				4695	successors_ = new ZoneList<OutSet*>(2);
				4696	}
				4697	OutSet* result = new OutSet(first_, remaining_);
				4698	result->Set(value);
				4699	successors()->Add(result);
				4700	return result;
				4701	}
				4702
				4703
				4704	void OutSet::Set(unsigned value) {
				4705	if (value < kFirstLimit) {
				4706	first_ \|= (1 << value);
				4707	} else {
				4708	if (remaining_ == NULL)
				4709	remaining_ = new ZoneList<unsigned>(1);
				4710	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				4711	remaining_->Add(value);
				4712	}
				4713	}
				4714
				4715
				4716	bool OutSet::Get(unsigned value) {
				4717	if (value < kFirstLimit) {
				4718	return (first_ & (1 << value)) != 0;
				4719	} else if (remaining_ == NULL) {
				4720	return false;
				4721	} else {
				4722	return remaining_->Contains(value);
				4723	}
				4724	}
				4725
				4726
				4727	const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4728
				4729
				4730	void DispatchTable::AddRange(CharacterRange full_range, int value) {
				4731	CharacterRange current = full_range;
				4732	if (tree()->is_empty()) {
				4733	// If this is the first range we just insert into the table.
				4734	ZoneSplayTree<Config>::Locator loc;
				4735	ASSERT_RESULT(tree()->Insert(current.from(), &loc));
				4736	loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
				4737	return;
				4738	}
				4739	// First see if there is a range to the left of this one that
				4740	// overlaps.
				4741	ZoneSplayTree<Config>::Locator loc;
				4742	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				4743	Entry* entry = &loc.value();
				4744	// If we've found a range that overlaps with this one, and it
				4745	// starts strictly to the left of this one, we have to fix it
				4746	// because the following code only handles ranges that start on
				4747	// or after the start point of the range we're adding.
				4748	if (entry->from() < current.from() && entry->to() >= current.from()) {
				4749	// Snap the overlapping range in half around the start point of
				4750	// the range we're adding.
				4751	CharacterRange left(entry->from(), current.from() - 1);
				4752	CharacterRange right(current.from(), entry->to());
				4753	// The left part of the overlapping range doesn't overlap.
				4754	// Truncate the whole entry to be just the left part.
				4755	entry->set_to(left.to());
				4756	// The right part is the one that overlaps. We add this part
				4757	// to the map and let the next step deal with merging it with
				4758	// the range we're adding.
				4759	ZoneSplayTree<Config>::Locator loc;
				4760	ASSERT_RESULT(tree()->Insert(right.from(), &loc));
				4761	loc.set_value(Entry(right.from(),
				4762	right.to(),
				4763	entry->out_set()));
				4764	}
				4765	}
				4766	while (current.is_valid()) {
				4767	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				4768	(loc.value().from() <= current.to()) &&
				4769	(loc.value().to() >= current.from())) {
				4770	Entry* entry = &loc.value();
				4771	// We have overlap. If there is space between the start point of
				4772	// the range we're adding and where the overlapping range starts
				4773	// then we have to add a range covering just that space.
				4774	if (current.from() < entry->from()) {
				4775	ZoneSplayTree<Config>::Locator ins;
				4776	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4777	ins.set_value(Entry(current.from(),
				4778	entry->from() - 1,
				4779	empty()->Extend(value)));
				4780	current.set_from(entry->from());
				4781	}
				4782	ASSERT_EQ(current.from(), entry->from());
				4783	// If the overlapping range extends beyond the one we want to add
				4784	// we have to snap the right part off and add it separately.
				4785	if (entry->to() > current.to()) {
				4786	ZoneSplayTree<Config>::Locator ins;
				4787	ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
				4788	ins.set_value(Entry(current.to() + 1,
				4789	entry->to(),
				4790	entry->out_set()));
				4791	entry->set_to(current.to());
				4792	}
				4793	ASSERT(entry->to() <= current.to());
				4794	// The overlapping range is now completely contained by the range
				4795	// we're adding so we can just update it and move the start point
				4796	// of the range we're adding just past it.
				4797	entry->AddValue(value);
				4798	// Bail out if the last interval ended at 0xFFFF since otherwise
				4799	// adding 1 will wrap around to 0.
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	4800	if (entry->to() == String::kMaxUtf16CodeUnit)
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4801	break;
				4802	ASSERT(entry->to() + 1 > current.from());
				4803	current.set_from(entry->to() + 1);
				4804	} else {
				4805	// There is no overlap so we can just add the range
				4806	ZoneSplayTree<Config>::Locator ins;
				4807	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4808	ins.set_value(Entry(current.from(),
				4809	current.to(),
				4810	empty()->Extend(value)));
				4811	break;
				4812	}
				4813	}
				4814	}
				4815
				4816
				4817	OutSet* DispatchTable::Get(uc16 value) {
				4818	ZoneSplayTree<Config>::Locator loc;
				4819	if (!tree()->FindGreatestLessThan(value, &loc))
				4820	return empty();
				4821	Entry* entry = &loc.value();
				4822	if (value <= entry->to())
				4823	return entry->out_set();
				4824	else
				4825	return empty();
				4826	}
				4827
				4828
				4829	// -------------------------------------------------------------------
				4830	// Analysis
				4831
				4832
				4833	void Analysis::EnsureAnalyzed(RegExpNode* that) {
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	4834	StackLimitCheck check(Isolate::Current());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4835	if (check.HasOverflowed()) {
				4836	fail("Stack overflow");
				4837	return;
				4838	}
				4839	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				4840	return;
				4841	that->info()->being_analyzed = true;
				4842	that->Accept(this);
				4843	that->info()->being_analyzed = false;
				4844	that->info()->been_analyzed = true;
				4845	}
				4846
				4847
				4848	void Analysis::VisitEnd(EndNode* that) {
				4849	// nothing to do
				4850	}
				4851
				4852
				4853	void TextNode::CalculateOffsets() {
				4854	int element_count = elements()->length();
				4855	// Set up the offsets of the elements relative to the start. This is a fixed
				4856	// quantity since a TextNode can only contain fixed-width things.
				4857	int cp_offset = 0;
				4858	for (int i = 0; i < element_count; i++) {
				4859	TextElement& elm = elements()->at(i);
				4860	elm.cp_offset = cp_offset;
				4861	if (elm.type == TextElement::ATOM) {
				4862	cp_offset += elm.data.u_atom->data().length();
				4863	} else {
				4864	cp_offset++;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4865	}
				4866	}
				4867	}
				4868
				4869
				4870	void Analysis::VisitText(TextNode* that) {
				4871	if (ignore_case_) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4872	that->MakeCaseIndependent(is_ascii_);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4873	}
				4874	EnsureAnalyzed(that->on_success());
				4875	if (!has_failed()) {
				4876	that->CalculateOffsets();
				4877	}
				4878	}
				4879
				4880
				4881	void Analysis::VisitAction(ActionNode* that) {
				4882	RegExpNode* target = that->on_success();
				4883	EnsureAnalyzed(target);
				4884	if (!has_failed()) {
				4885	// If the next node is interested in what it follows then this node
				4886	// has to be interested too so it can pass the information on.
				4887	that->info()->AddFromFollowing(target->info());
				4888	}
				4889	}
				4890
				4891
				4892	void Analysis::VisitChoice(ChoiceNode* that) {
				4893	NodeInfo* info = that->info();
				4894	for (int i = 0; i < that->alternatives()->length(); i++) {
				4895	RegExpNode* node = that->alternatives()->at(i).node();
				4896	EnsureAnalyzed(node);
				4897	if (has_failed()) return;
				4898	// Anything the following nodes need to know has to be known by
				4899	// this node also, so it can pass it on.
				4900	info->AddFromFollowing(node->info());
				4901	}
				4902	}
				4903
				4904
				4905	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				4906	NodeInfo* info = that->info();
				4907	for (int i = 0; i < that->alternatives()->length(); i++) {
				4908	RegExpNode* node = that->alternatives()->at(i).node();
				4909	if (node != that->loop_node()) {
				4910	EnsureAnalyzed(node);
				4911	if (has_failed()) return;
				4912	info->AddFromFollowing(node->info());
				4913	}
				4914	}
				4915	// Check the loop last since it may need the value of this node
				4916	// to get a correct result.
				4917	EnsureAnalyzed(that->loop_node());
				4918	if (!has_failed()) {
				4919	info->AddFromFollowing(that->loop_node()->info());
				4920	}
				4921	}
				4922
				4923
				4924	void Analysis::VisitBackReference(BackReferenceNode* that) {
				4925	EnsureAnalyzed(that->on_success());
				4926	}
				4927
				4928
				4929	void Analysis::VisitAssertion(AssertionNode* that) {
				4930	EnsureAnalyzed(that->on_success());
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4931	AssertionNode::AssertionNodeType type = that->type();
				4932	if (type == AssertionNode::AT_BOUNDARY \|\|
				4933	type == AssertionNode::AT_NON_BOUNDARY) {
				4934	// Check if the following character is known to be a word character
				4935	// or known to not be a word character.
				4936	ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
				4937
				4938	CharacterRange::Canonicalize(following_chars);
				4939
				4940	SetRelation word_relation =
				4941	CharacterRange::WordCharacterRelation(following_chars);
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4942	if (word_relation.Disjoint()) {
				4943	// Includes the case where following_chars is empty (e.g., end-of-input).
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4944	// Following character is definitely not a word character.
				4945	type = (type == AssertionNode::AT_BOUNDARY) ?
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4946	AssertionNode::AFTER_WORD_CHARACTER :
				4947	AssertionNode::AFTER_NONWORD_CHARACTER;
				4948	that->set_type(type);
				4949	} else if (word_relation.ContainedIn()) {
				4950	// Following character is definitely a word character.
				4951	type = (type == AssertionNode::AT_BOUNDARY) ?
				4952	AssertionNode::AFTER_NONWORD_CHARACTER :
				4953	AssertionNode::AFTER_WORD_CHARACTER;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4954	that->set_type(type);
				4955	}
				4956	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4957	}
				4958
				4959
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4960	ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
				4961	if (first_character_set_ == NULL) {
				4962	if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
				4963	// If we can't find an exact solution within the budget, we
				4964	// set the value to the set of every character, i.e., all characters
				4965	// are possible.
				4966	ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
				4967	all_set->Add(CharacterRange::Everything());
				4968	first_character_set_ = all_set;
				4969	}
				4970	}
				4971	return first_character_set_;
				4972	}
				4973
				4974
				4975	int RegExpNode::ComputeFirstCharacterSet(int budget) {
				4976	// Default behavior is to not be able to determine the first character.
				4977	return kComputeFirstCharacterSetFail;
				4978	}
				4979
				4980
				4981	int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
				4982	budget--;
				4983	if (budget >= 0) {
				4984	// Find loop min-iteration. It's the value of the guarded choice node
				4985	// with a GEQ guard, if any.
				4986	int min_repetition = 0;
				4987
				4988	for (int i = 0; i <= 1; i++) {
				4989	GuardedAlternative alternative = alternatives()->at(i);
				4990	ZoneList<Guard> guards = alternative.guards();
				4991	if (guards != NULL && guards->length() > 0) {
				4992	Guard* guard = guards->at(0);
				4993	if (guard->op() == Guard::GEQ) {
				4994	min_repetition = guard->value();
				4995	break;
				4996	}
				4997	}
				4998	}
				4999
				5000	budget = loop_node()->ComputeFirstCharacterSet(budget);
				5001	if (budget >= 0) {
				5002	ZoneList<CharacterRange>* character_set =
				5003	loop_node()->first_character_set();
				5004	if (body_can_be_zero_length() \|\| min_repetition == 0) {
				5005	budget = continue_node()->ComputeFirstCharacterSet(budget);
				5006	if (budget < 0) return budget;
				5007	ZoneList<CharacterRange>* body_set =
				5008	continue_node()->first_character_set();
				5009	ZoneList<CharacterRange>* union_set =
				5010	new ZoneList<CharacterRange>(Max(character_set->length(),
				5011	body_set->length()));
				5012	CharacterRange::Merge(character_set,
				5013	body_set,
				5014	union_set,
				5015	union_set,
				5016	union_set);
				5017	character_set = union_set;
				5018	}
				5019	set_first_character_set(character_set);
				5020	}
				5021	}
				5022	return budget;
				5023	}
				5024
				5025
				5026	int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
				5027	budget--;
				5028	if (budget >= 0) {
				5029	GuardedAlternative successor = this->alternatives()->at(1);
				5030	RegExpNode* successor_node = successor.node();
				5031	budget = successor_node->ComputeFirstCharacterSet(budget);
				5032	if (budget >= 0) {
				5033	set_first_character_set(successor_node->first_character_set());
				5034	}
				5035	}
				5036	return budget;
				5037	}
				5038
				5039
				5040	// The first character set of an EndNode is unknowable. Just use the
				5041	// default implementation that fails and returns all characters as possible.
				5042
				5043
				5044	int AssertionNode::ComputeFirstCharacterSet(int budget) {
				5045	budget -= 1;
				5046	if (budget >= 0) {
				5047	switch (type_) {
				5048	case AT_END: {
				5049	set_first_character_set(new ZoneList<CharacterRange>(0));
				5050	break;
				5051	}
				5052	case AT_START:
				5053	case AT_BOUNDARY:
				5054	case AT_NON_BOUNDARY:
				5055	case AFTER_NEWLINE:
				5056	case AFTER_NONWORD_CHARACTER:
				5057	case AFTER_WORD_CHARACTER: {
				5058	ASSERT_NOT_NULL(on_success());
				5059	budget = on_success()->ComputeFirstCharacterSet(budget);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5060	if (budget >= 0) {
				5061	set_first_character_set(on_success()->first_character_set());
				5062	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5063	break;
				5064	}
				5065	}
				5066	}
				5067	return budget;
				5068	}
				5069
				5070
				5071	int ActionNode::ComputeFirstCharacterSet(int budget) {
				5072	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
				5073	budget--;
				5074	if (budget >= 0) {
				5075	ASSERT_NOT_NULL(on_success());
				5076	budget = on_success()->ComputeFirstCharacterSet(budget);
				5077	if (budget >= 0) {
				5078	set_first_character_set(on_success()->first_character_set());
				5079	}
				5080	}
				5081	return budget;
				5082	}
				5083
				5084
				5085	int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
				5086	// We don't know anything about the first character of a backreference
				5087	// at this point.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5088	// The potential first characters are the first characters of the capture,
				5089	// and the first characters of the on_success node, depending on whether the
				5090	// capture can be empty and whether it is known to be participating or known
				5091	// not to be.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5092	return kComputeFirstCharacterSetFail;
				5093	}
				5094
				5095
				5096	int TextNode::ComputeFirstCharacterSet(int budget) {
				5097	budget--;
				5098	if (budget >= 0) {
				5099	ASSERT_NE(0, elements()->length());
				5100	TextElement text = elements()->at(0);
				5101	if (text.type == TextElement::ATOM) {
				5102	RegExpAtom* atom = text.data.u_atom;
				5103	ASSERT_NE(0, atom->length());
				5104	uc16 first_char = atom->data()[0];
				5105	ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
				5106	range->Add(CharacterRange(first_char, first_char));
				5107	set_first_character_set(range);
				5108	} else {
				5109	ASSERT(text.type == TextElement::CHAR_CLASS);
				5110	RegExpCharacterClass* char_class = text.data.u_char_class;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5111	ZoneList<CharacterRange>* ranges = char_class->ranges();
				5112	// TODO(lrn): Canonicalize ranges when they are created
				5113	// instead of waiting until now.
				5114	CharacterRange::Canonicalize(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5115	if (char_class->is_negated()) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5116	int length = ranges->length();
				5117	int new_length = length + 1;
				5118	if (length > 0) {
				5119	if (ranges->at(0).from() == 0) new_length--;
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	5120	if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5121	new_length--;
				5122	}
				5123	}
				5124	ZoneList<CharacterRange>* negated_ranges =
				5125	new ZoneList<CharacterRange>(new_length);
				5126	CharacterRange::Negate(ranges, negated_ranges);
				5127	set_first_character_set(negated_ranges);
				5128	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5129	set_first_character_set(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5130	}
				5131	}
				5132	}
				5133	return budget;
				5134	}
				5135
				5136
				5137
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5138	// -------------------------------------------------------------------
				5139	// Dispatch table construction
				5140
				5141
				5142	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				5143	AddRange(CharacterRange::Everything());
				5144	}
				5145
				5146
				5147	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				5148	node->set_being_calculated(true);
				5149	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				5150	for (int i = 0; i < alternatives->length(); i++) {
				5151	set_choice_index(i);
				5152	alternatives->at(i).node()->Accept(this);
				5153	}
				5154	node->set_being_calculated(false);
				5155	}
				5156
				5157
				5158	class AddDispatchRange {
				5159	public:
				5160	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				5161	: constructor_(constructor) { }
				5162	void Call(uc32 from, DispatchTable::Entry entry);
				5163	private:
				5164	DispatchTableConstructor* constructor_;
				5165	};
				5166
				5167
				5168	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
				5169	CharacterRange range(from, entry.to());
				5170	constructor_->AddRange(range);
				5171	}
				5172
				5173
				5174	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				5175	if (node->being_calculated())
				5176	return;
				5177	DispatchTable* table = node->GetTable(ignore_case_);
				5178	AddDispatchRange adder(this);
				5179	table->ForEach(&adder);
				5180	}
				5181
				5182
				5183	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				5184	// TODO(160): Find the node that we refer back to and propagate its start
				5185	// set back to here. For now we just accept anything.
				5186	AddRange(CharacterRange::Everything());
				5187	}
				5188
				5189
				5190	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				5191	RegExpNode* target = that->on_success();
				5192	target->Accept(this);
				5193	}
				5194
				5195
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5196	static int CompareRangeByFrom(const CharacterRange* a,
				5197	const CharacterRange* b) {
				5198	return Compare<uc16>(a->from(), b->from());
				5199	}
				5200
				5201
				5202	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				5203	ranges->Sort(CompareRangeByFrom);
				5204	uc16 last = 0;
				5205	for (int i = 0; i < ranges->length(); i++) {
				5206	CharacterRange range = ranges->at(i);
				5207	if (last < range.from())
				5208	AddRange(CharacterRange(last, range.from() - 1));
				5209	if (range.to() >= last) {
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	5210	if (range.to() == String::kMaxUtf16CodeUnit) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5211	return;
				5212	} else {
				5213	last = range.to() + 1;
				5214	}
				5215	}
				5216	}
Ben Murdoch	3ef787d	2012-04-12 10:51:47 +0100	[diff] [blame^]	5217	AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5218	}
				5219
				5220
				5221	void DispatchTableConstructor::VisitText(TextNode* that) {
				5222	TextElement elm = that->elements()->at(0);
				5223	switch (elm.type) {
				5224	case TextElement::ATOM: {
				5225	uc16 c = elm.data.u_atom->data()[0];
				5226	AddRange(CharacterRange(c, c));
				5227	break;
				5228	}
				5229	case TextElement::CHAR_CLASS: {
				5230	RegExpCharacterClass* tree = elm.data.u_char_class;
				5231	ZoneList<CharacterRange>* ranges = tree->ranges();
				5232	if (tree->is_negated()) {
				5233	AddInverse(ranges);
				5234	} else {
				5235	for (int i = 0; i < ranges->length(); i++)
				5236	AddRange(ranges->at(i));
				5237	}
				5238	break;
				5239	}
				5240	default: {
				5241	UNIMPLEMENTED();
				5242	}
				5243	}
				5244	}
				5245
				5246
				5247	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				5248	RegExpNode* target = that->on_success();
				5249	target->Accept(this);
				5250	}
				5251
				5252
				5253	RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
				5254	bool ignore_case,
				5255	bool is_multiline,
				5256	Handle<String> pattern,
				5257	bool is_ascii) {
				5258	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				5259	return IrregexpRegExpTooBig();
				5260	}
				5261	RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
				5262	// Wrap the body of the regexp in capture #0.
				5263	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				5264	0,
				5265	&compiler,
				5266	compiler.accept());
				5267	RegExpNode* node = captured_body;
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5268	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				5269	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				5270	int max_length = data->tree->max_match();
				5271	if (!is_start_anchored) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5272	// Add a .*? at the beginning, outside the body capture, unless
				5273	// this expression is anchored at the beginning.
				5274	RegExpNode* loop_node =
				5275	RegExpQuantifier::ToNode(0,
				5276	RegExpTree::kInfinity,
				5277	false,
				5278	new RegExpCharacterClass('*'),
				5279	&compiler,
				5280	captured_body,
				5281	data->contains_anchor);
				5282
				5283	if (data->contains_anchor) {
				5284	// Unroll loop once, to take care of the case that might start
				5285	// at the start of input.
				5286	ChoiceNode* first_step_node = new ChoiceNode(2);
				5287	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				5288	first_step_node->AddAlternative(GuardedAlternative(
				5289	new TextNode(new RegExpCharacterClass('*'), loop_node)));
				5290	node = first_step_node;
				5291	} else {
				5292	node = loop_node;
				5293	}
				5294	}
				5295	data->node = node;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	5296	Analysis analysis(ignore_case, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5297	analysis.EnsureAnalyzed(node);
				5298	if (analysis.has_failed()) {
				5299	const char* error_message = analysis.error_message();
				5300	return CompilationResult(error_message);
				5301	}
				5302
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5303	// Create the correct assembler for the architecture.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5304	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5305	// Native regexp implementation.
				5306
				5307	NativeRegExpMacroAssembler::Mode mode =
				5308	is_ascii ? NativeRegExpMacroAssembler::ASCII
				5309	: NativeRegExpMacroAssembler::UC16;
				5310
				5311	#if V8_TARGET_ARCH_IA32
				5312	RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
				5313	#elif V8_TARGET_ARCH_X64
				5314	RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
				5315	#elif V8_TARGET_ARCH_ARM
				5316	RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
Steve Block	44f0eee	2011-05-26 01:26:41 +0100	[diff] [blame]	5317	#elif V8_TARGET_ARCH_MIPS
				5318	RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5319	#endif
				5320
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5321	#else // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5322	// Interpreted regexp implementation.
				5323	EmbeddedVector<byte, 1024> codes;
				5324	RegExpMacroAssemblerIrregexp macro_assembler(codes);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5325	#endif // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5326
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5327	// Inserted here, instead of in Assembler, because it depends on information
				5328	// in the AST that isn't replicated in the Node structure.
				5329	static const int kMaxBacksearchLimit = 1024;
				5330	if (is_end_anchored &&
				5331	!is_start_anchored &&
				5332	max_length < kMaxBacksearchLimit) {
				5333	macro_assembler.SetCurrentPositionFromEnd(max_length);
				5334	}
				5335
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5336	return compiler.Assemble(&macro_assembler,
				5337	node,
				5338	data->capture_count,
				5339	pattern);
				5340	}
				5341
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5342
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5343	}} // namespace v8::internal