Blame - src/jsregexp.cc - platform/external/v8

blob: b271b027f9c111bcddd380edce7d29590e29a5ec [file] [log] [blame]

Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1	// Copyright 2006-2009 the V8 project authors. All rights reserved.
				2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "compiler.h"
				32	#include "execution.h"
				33	#include "factory.h"
				34	#include "jsregexp.h"
				35	#include "platform.h"
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	36	#include "string-search.h"
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	37	#include "runtime.h"
				38	#include "top.h"
				39	#include "compilation-cache.h"
				40	#include "string-stream.h"
				41	#include "parser.h"
				42	#include "regexp-macro-assembler.h"
				43	#include "regexp-macro-assembler-tracer.h"
				44	#include "regexp-macro-assembler-irregexp.h"
				45	#include "regexp-stack.h"
				46
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	47	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	48	#if V8_TARGET_ARCH_IA32
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	49	#include "ia32/regexp-macro-assembler-ia32.h"
				50	#elif V8_TARGET_ARCH_X64
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	51	#include "x64/regexp-macro-assembler-x64.h"
				52	#elif V8_TARGET_ARCH_ARM
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	53	#include "arm/regexp-macro-assembler-arm.h"
				54	#else
				55	#error Unsupported target architecture.
				56	#endif
				57	#endif
				58
				59	#include "interpreter-irregexp.h"
				60
				61
				62	namespace v8 {
				63	namespace internal {
				64
				65
				66	Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
				67	Handle<String> pattern,
				68	Handle<String> flags,
				69	bool* has_pending_exception) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	70	// Call the construct code with 2 arguments.
				71	Object** argv[2] = { Handle<Object>::cast(pattern).location(),
				72	Handle<Object>::cast(flags).location() };
				73	return Execution::New(constructor, 2, argv, has_pending_exception);
				74	}
				75
				76
				77	static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
				78	int flags = JSRegExp::NONE;
				79	for (int i = 0; i < str->length(); i++) {
				80	switch (str->Get(i)) {
				81	case 'i':
				82	flags \|= JSRegExp::IGNORE_CASE;
				83	break;
				84	case 'g':
				85	flags \|= JSRegExp::GLOBAL;
				86	break;
				87	case 'm':
				88	flags \|= JSRegExp::MULTILINE;
				89	break;
				90	}
				91	}
				92	return JSRegExp::Flags(flags);
				93	}
				94
				95
				96	static inline void ThrowRegExpException(Handle<JSRegExp> re,
				97	Handle<String> pattern,
				98	Handle<String> error_text,
				99	const char* message) {
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame^]	100	Handle<FixedArray> elements = Factory::NewFixedArray(2);
				101	elements->set(0, *pattern);
				102	elements->set(1, *error_text);
				103	Handle<JSArray> array = Factory::NewJSArrayWithElements(elements);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	104	Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
				105	Top::Throw(*regexp_err);
				106	}
				107
				108
				109	// Generic RegExp methods. Dispatches to implementation specific methods.
				110
				111
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	112	Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				113	Handle<String> pattern,
				114	Handle<String> flag_str) {
				115	JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
				116	Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
				117	bool in_cache = !cached.is_null();
				118	LOG(RegExpCompileEvent(re, in_cache));
				119
				120	Handle<Object> result;
				121	if (in_cache) {
				122	re->set_data(*cached);
				123	return re;
				124	}
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	125	pattern = FlattenGetString(pattern);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	126	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	127	PostponeInterruptsScope postpone;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	128	RegExpCompileData parse_result;
				129	FlatStringReader reader(pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	130	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				131	&parse_result)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	132	// Throw an exception if we fail to parse the pattern.
				133	ThrowRegExpException(re,
				134	pattern,
				135	parse_result.error,
				136	"malformed_regexp");
				137	return Handle<Object>::null();
				138	}
				139
				140	if (parse_result.simple && !flags.is_ignore_case()) {
				141	// Parse-tree is a single atom that is equal to the pattern.
				142	AtomCompile(re, pattern, flags, pattern);
				143	} else if (parse_result.tree->IsAtom() &&
				144	!flags.is_ignore_case() &&
				145	parse_result.capture_count == 0) {
				146	RegExpAtom* atom = parse_result.tree->AsAtom();
				147	Vector<const uc16> atom_pattern = atom->data();
				148	Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
				149	AtomCompile(re, pattern, flags, atom_string);
				150	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	151	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	152	}
				153	ASSERT(re->data()->IsFixedArray());
				154	// Compilation succeeded so the data is set on the regexp
				155	// and we can store it in the cache.
				156	Handle<FixedArray> data(FixedArray::cast(re->data()));
				157	CompilationCache::PutRegExp(pattern, flags, data);
				158
				159	return re;
				160	}
				161
				162
				163	Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				164	Handle<String> subject,
				165	int index,
				166	Handle<JSArray> last_match_info) {
				167	switch (regexp->TypeTag()) {
				168	case JSRegExp::ATOM:
				169	return AtomExec(regexp, subject, index, last_match_info);
				170	case JSRegExp::IRREGEXP: {
				171	Handle<Object> result =
				172	IrregexpExec(regexp, subject, index, last_match_info);
				173	ASSERT(!result.is_null() \|\| Top::has_pending_exception());
				174	return result;
				175	}
				176	default:
				177	UNREACHABLE();
				178	return Handle<Object>::null();
				179	}
				180	}
				181
				182
				183	// RegExp Atom implementation: Simple string search using indexOf.
				184
				185
				186	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				187	Handle<String> pattern,
				188	JSRegExp::Flags flags,
				189	Handle<String> match_pattern) {
				190	Factory::SetRegExpAtomData(re,
				191	JSRegExp::ATOM,
				192	pattern,
				193	flags,
				194	match_pattern);
				195	}
				196
				197
				198	static void SetAtomLastCapture(FixedArray* array,
				199	String* subject,
				200	int from,
				201	int to) {
				202	NoHandleAllocation no_handles;
				203	RegExpImpl::SetLastCaptureCount(array, 2);
				204	RegExpImpl::SetLastSubject(array, subject);
				205	RegExpImpl::SetLastInput(array, subject);
				206	RegExpImpl::SetCapture(array, 0, from);
				207	RegExpImpl::SetCapture(array, 1, to);
				208	}
				209
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	210	/* template <typename SubjectChar, typename PatternChar>
				211	static int ReStringMatch(Vector<const SubjectChar> sub_vector,
				212	Vector<const PatternChar> pat_vector,
				213	int start_index) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	214
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	215	int pattern_length = pat_vector.length();
				216	if (pattern_length == 0) return start_index;
				217
				218	int subject_length = sub_vector.length();
				219	if (start_index + pattern_length > subject_length) return -1;
				220	return SearchString(sub_vector, pat_vector, start_index);
				221	}
				222	*/
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	223	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				224	Handle<String> subject,
				225	int index,
				226	Handle<JSArray> last_match_info) {
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	227	ASSERT(0 <= index);
				228	ASSERT(index <= subject->length());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	229
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	230	if (!subject->IsFlat()) FlattenString(subject);
				231	AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
				232	// Extract flattened substrings of cons strings before determining asciiness.
				233	String* seq_sub = *subject;
				234	if (seq_sub->IsConsString()) seq_sub = ConsString::cast(seq_sub)->first();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	235
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	236	String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
				237	int needle_len = needle->length();
				238
				239	if (needle_len != 0) {
				240	if (index + needle_len > subject->length()) return Factory::null_value();
				241	// dispatch on type of strings
				242	index = (needle->IsAsciiRepresentation()
				243	? (seq_sub->IsAsciiRepresentation()
				244	? SearchString(seq_sub->ToAsciiVector(),
				245	needle->ToAsciiVector(),
				246	index)
				247	: SearchString(seq_sub->ToUC16Vector(),
				248	needle->ToAsciiVector(),
				249	index))
				250	: (seq_sub->IsAsciiRepresentation()
				251	? SearchString(seq_sub->ToAsciiVector(),
				252	needle->ToUC16Vector(),
				253	index)
				254	: SearchString(seq_sub->ToUC16Vector(),
				255	needle->ToUC16Vector(),
				256	index)));
				257	if (index == -1) return Factory::null_value();
				258	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	259	ASSERT(last_match_info->HasFastElements());
				260
				261	{
				262	NoHandleAllocation no_handles;
				263	FixedArray* array = FixedArray::cast(last_match_info->elements());
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	264	SetAtomLastCapture(array, *subject, index, index + needle_len);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	265	}
				266	return last_match_info;
				267	}
				268
				269
				270	// Irregexp implementation.
				271
				272	// Ensures that the regexp object contains a compiled version of the
				273	// source for either ASCII or non-ASCII strings.
				274	// If the compiled version doesn't already exist, it is compiled
				275	// from the source pattern.
				276	// If compilation fails, an exception is thrown and this function
				277	// returns false.
				278	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				279	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	280	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	281	if (compiled_code->IsByteArray()) return true;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	282	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				283	if (compiled_code->IsCode()) return true;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	284	#endif
				285	return CompileIrregexp(re, is_ascii);
				286	}
				287
				288
				289	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				290	// Compile the RegExp.
				291	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	292	PostponeInterruptsScope postpone;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	293	Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
				294	if (entry->IsJSObject()) {
				295	// If it's a JSObject, a previous compilation failed and threw this object.
				296	// Re-throw the object without trying again.
				297	Top::Throw(entry);
				298	return false;
				299	}
				300	ASSERT(entry->IsTheHole());
				301
				302	JSRegExp::Flags flags = re->GetFlags();
				303
				304	Handle<String> pattern(re->Pattern());
				305	if (!pattern->IsFlat()) {
				306	FlattenString(pattern);
				307	}
				308
				309	RegExpCompileData compile_data;
				310	FlatStringReader reader(pattern);
Teng-Hui Zhu	3e5fa29	2010-11-09 16:16:48 -0800	[diff] [blame]	311	if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
				312	&compile_data)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	313	// Throw an exception if we fail to parse the pattern.
				314	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				315	ThrowRegExpException(re,
				316	pattern,
				317	compile_data.error,
				318	"malformed_regexp");
				319	return false;
				320	}
				321	RegExpEngine::CompilationResult result =
				322	RegExpEngine::Compile(&compile_data,
				323	flags.is_ignore_case(),
				324	flags.is_multiline(),
				325	pattern,
				326	is_ascii);
				327	if (result.error_message != NULL) {
				328	// Unable to compile regexp.
Ben Murdoch	e0cee9b	2011-05-25 10:26:03 +0100	[diff] [blame^]	329	Handle<FixedArray> elements = Factory::NewFixedArray(2);
				330	elements->set(0, *pattern);
				331	Handle<String> error_message =
				332	Factory::NewStringFromUtf8(CStrVector(result.error_message));
				333	elements->set(1, *error_message);
				334	Handle<JSArray> array = Factory::NewJSArrayWithElements(elements);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	335	Handle<Object> regexp_err =
				336	Factory::NewSyntaxError("malformed_regexp", array);
				337	Top::Throw(*regexp_err);
				338	re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
				339	return false;
				340	}
				341
				342	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				343	data->set(JSRegExp::code_index(is_ascii), result.code);
				344	int register_max = IrregexpMaxRegisterCount(*data);
				345	if (result.num_registers > register_max) {
				346	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				347	}
				348
				349	return true;
				350	}
				351
				352
				353	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				354	return Smi::cast(
				355	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				356	}
				357
				358
				359	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				360	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				361	}
				362
				363
				364	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				365	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				366	}
				367
				368
				369	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				370	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				371	}
				372
				373
				374	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
				375	return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
				376	}
				377
				378
				379	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
				380	return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
				381	}
				382
				383
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	384	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				385	Handle<String> pattern,
				386	JSRegExp::Flags flags,
				387	int capture_count) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	388	// Initialize compiled code entries to null.
				389	Factory::SetRegExpIrregexpData(re,
				390	JSRegExp::IRREGEXP,
				391	pattern,
				392	flags,
				393	capture_count);
				394	}
				395
				396
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	397	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				398	Handle<String> subject) {
				399	if (!subject->IsFlat()) {
				400	FlattenString(subject);
				401	}
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	402	// Check the asciiness of the underlying storage.
				403	bool is_ascii;
				404	{
				405	AssertNoAllocation no_gc;
				406	String* sequential_string = *subject;
				407	if (subject->IsConsString()) {
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	408	sequential_string = ConsString::cast(*subject)->first();
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	409	}
				410	is_ascii = sequential_string->IsAsciiRepresentation();
				411	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	412	if (!EnsureCompiledIrregexp(regexp, is_ascii)) {
				413	return -1;
				414	}
				415	#ifdef V8_INTERPRETED_REGEXP
				416	// Byte-code regexp needs space allocated for all its registers.
				417	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
				418	#else // V8_INTERPRETED_REGEXP
				419	// Native regexp only needs room to output captures. Registers are handled
				420	// internally.
				421	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				422	#endif // V8_INTERPRETED_REGEXP
				423	}
				424
				425
Steve Block	791712a	2010-08-27 10:21:07 +0100	[diff] [blame]	426	RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
				427	Handle<JSRegExp> regexp,
				428	Handle<String> subject,
				429	int index,
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	430	Vector<int> output) {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	431	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()));
				432
				433	ASSERT(index >= 0);
				434	ASSERT(index <= subject->length());
				435	ASSERT(subject->IsFlat());
				436
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	437	// A flat ASCII string might have a two-byte first part.
				438	if (subject->IsConsString()) {
				439	subject = Handle<String>(ConsString::cast(*subject)->first());
				440	}
				441
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	442	#ifndef V8_INTERPRETED_REGEXP
				443	ASSERT(output.length() >=
				444	(IrregexpNumberOfCaptures(irregexp) + 1) 2);
				445	do {
				446	bool is_ascii = subject->IsAsciiRepresentation();
				447	Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii));
				448	NativeRegExpMacroAssembler::Result res =
				449	NativeRegExpMacroAssembler::Match(code,
				450	subject,
				451	output.start(),
				452	output.length(),
				453	index);
				454	if (res != NativeRegExpMacroAssembler::RETRY) {
				455	ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION \|\|
				456	Top::has_pending_exception());
				457	STATIC_ASSERT(
				458	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				459	STATIC_ASSERT(
				460	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				461	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				462	== RE_EXCEPTION);
				463	return static_cast<IrregexpResult>(res);
				464	}
				465	// If result is RETRY, the string has changed representation, and we
				466	// must restart from scratch.
				467	// In this case, it means we must make sure we are prepared to handle
Steve Block	8defd9f	2010-07-08 12:39:36 +0100	[diff] [blame]	468	// the, potentially, different subject (the string can switch between
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	469	// being internal and external, and even between being ASCII and UC16,
				470	// but the characters are always the same).
				471	IrregexpPrepare(regexp, subject);
				472	} while (true);
				473	UNREACHABLE();
				474	return RE_EXCEPTION;
				475	#else // V8_INTERPRETED_REGEXP
				476
				477	ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
				478	bool is_ascii = subject->IsAsciiRepresentation();
				479	// We must have done EnsureCompiledIrregexp, so we can get the number of
				480	// registers.
				481	int* register_vector = output.start();
				482	int number_of_capture_registers =
				483	(IrregexpNumberOfCaptures(irregexp) + 1) 2;
				484	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				485	register_vector[i] = -1;
				486	}
				487	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii));
				488
				489	if (IrregexpInterpreter::Match(byte_codes,
				490	subject,
				491	register_vector,
				492	index)) {
				493	return RE_SUCCESS;
				494	}
				495	return RE_FAILURE;
				496	#endif // V8_INTERPRETED_REGEXP
				497	}
				498
				499
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	500	Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
				501	Handle<String> subject,
				502	int previous_index,
				503	Handle<JSArray> last_match_info) {
				504	ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
				505
				506	// Prepare space for the return values.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	507	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	508	#ifdef DEBUG
				509	if (FLAG_trace_regexp_bytecodes) {
				510	String* pattern = jsregexp->Pattern();
				511	PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
				512	PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
				513	}
				514	#endif
				515	#endif
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	516	int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
				517	if (required_registers < 0) {
				518	// Compiling failed with an exception.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	519	ASSERT(Top::has_pending_exception());
				520	return Handle<Object>::null();
				521	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	522
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	523	OffsetsVector registers(required_registers);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	524
Iain Merrick	7568138	2010-08-19 15:07:18 +0100	[diff] [blame]	525	IrregexpResult res = RegExpImpl::IrregexpExecOnce(
Ben Murdoch	b8e0da2	2011-05-16 14:20:40 +0100	[diff] [blame]	526	jsregexp, subject, previous_index, Vector<int>(registers.vector(),
				527	registers.length()));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	528	if (res == RE_SUCCESS) {
				529	int capture_register_count =
				530	(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
				531	last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
				532	AssertNoAllocation no_gc;
				533	int* register_vector = registers.vector();
				534	FixedArray* array = FixedArray::cast(last_match_info->elements());
				535	for (int i = 0; i < capture_register_count; i += 2) {
				536	SetCapture(array, i, register_vector[i]);
				537	SetCapture(array, i + 1, register_vector[i + 1]);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	538	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	539	SetLastCaptureCount(array, capture_register_count);
				540	SetLastSubject(array, *subject);
				541	SetLastInput(array, *subject);
				542	return last_match_info;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	543	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	544	if (res == RE_EXCEPTION) {
				545	ASSERT(Top::has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	546	return Handle<Object>::null();
				547	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	548	ASSERT(res == RE_FAILURE);
				549	return Factory::null_value();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	550	}
				551
				552
				553	// -------------------------------------------------------------------
				554	// Implementation of the Irregexp regular expression engine.
				555	//
				556	// The Irregexp regular expression engine is intended to be a complete
				557	// implementation of ECMAScript regular expressions. It generates either
				558	// bytecodes or native code.
				559
				560	// The Irregexp regexp engine is structured in three steps.
				561	// 1) The parser generates an abstract syntax tree. See ast.cc.
				562	// 2) From the AST a node network is created. The nodes are all
				563	// subclasses of RegExpNode. The nodes represent states when
				564	// executing a regular expression. Several optimizations are
				565	// performed on the node network.
				566	// 3) From the nodes we generate either byte codes or native code
				567	// that can actually execute the regular expression (perform
				568	// the search). The code generation step is described in more
				569	// detail below.
				570
				571	// Code generation.
				572	//
				573	// The nodes are divided into four main categories.
				574	// * Choice nodes
				575	// These represent places where the regular expression can
				576	// match in more than one way. For example on entry to an
				577	// alternation (foo\|bar) or a repetition (*, +, ? or {}).
				578	// * Action nodes
				579	// These represent places where some action should be
				580	// performed. Examples include recording the current position
				581	// in the input string to a register (in order to implement
				582	// captures) or other actions on register for example in order
				583	// to implement the counters needed for {} repetitions.
				584	// * Matching nodes
				585	// These attempt to match some element part of the input string.
				586	// Examples of elements include character classes, plain strings
				587	// or back references.
				588	// * End nodes
				589	// These are used to implement the actions required on finding
				590	// a successful match or failing to find a match.
				591	//
				592	// The code generated (whether as byte codes or native code) maintains
				593	// some state as it runs. This consists of the following elements:
				594	//
				595	// * The capture registers. Used for string captures.
				596	// * Other registers. Used for counters etc.
				597	// * The current position.
				598	// * The stack of backtracking information. Used when a matching node
				599	// fails to find a match and needs to try an alternative.
				600	//
				601	// Conceptual regular expression execution model:
				602	//
				603	// There is a simple conceptual model of regular expression execution
				604	// which will be presented first. The actual code generated is a more
				605	// efficient simulation of the simple conceptual model:
				606	//
				607	// * Choice nodes are implemented as follows:
				608	// For each choice except the last {
				609	// push current position
				610	// push backtrack code location
				611	// <generate code to test for choice>
				612	// backtrack code location:
				613	// pop current position
				614	// }
				615	// <generate code to test for last choice>
				616	//
				617	// * Action nodes are generated as follows:
				618	// <push affected registers on backtrack stack>
				619	// <generate code to perform action>
				620	// push backtrack code location
				621	// <generate code to test for following nodes>
				622	// backtrack code location:
				623	// <pop affected registers to restore their state>
				624	// <pop backtrack location from stack and go to it>
				625	//
				626	// * Matching nodes are generated as follows:
				627	// if input string matches at current position
				628	// update current position
				629	// <generate code to test for following nodes>
				630	// else
				631	// <pop backtrack location from stack and go to it>
				632	//
				633	// Thus it can be seen that the current position is saved and restored
				634	// by the choice nodes, whereas the registers are saved and restored by
				635	// the action nodes that manipulate them.
				636	//
				637	// The other interesting aspect of this model is that nodes are generated
				638	// at the point where they are needed by a recursive call to Emit(). If
				639	// the node has already been code generated then the Emit() call will
				640	// generate a jump to the previously generated code instead. In order to
				641	// limit recursion it is possible for the Emit() function to put the node
				642	// on a work list for later generation and instead generate a jump. The
				643	// destination of the jump is resolved later when the code is generated.
				644	//
				645	// Actual regular expression code generation.
				646	//
				647	// Code generation is actually more complicated than the above. In order
				648	// to improve the efficiency of the generated code some optimizations are
				649	// performed
				650	//
				651	// * Choice nodes have 1-character lookahead.
				652	// A choice node looks at the following character and eliminates some of
				653	// the choices immediately based on that character. This is not yet
				654	// implemented.
				655	// * Simple greedy loops store reduced backtracking information.
				656	// A quantifier like /.*foo/m will greedily match the whole input. It will
				657	// then need to backtrack to a point where it can match "foo". The naive
				658	// implementation of this would push each character position onto the
				659	// backtracking stack, then pop them off one by one. This would use space
				660	// proportional to the length of the input string. However since the "."
				661	// can only match in one way and always has a constant length (in this case
				662	// of 1) it suffices to store the current position on the top of the stack
				663	// once. Matching now becomes merely incrementing the current position and
				664	// backtracking becomes decrementing the current position and checking the
				665	// result against the stored current position. This is faster and saves
				666	// space.
				667	// * The current state is virtualized.
				668	// This is used to defer expensive operations until it is clear that they
				669	// are needed and to generate code for a node more than once, allowing
				670	// specialized and efficient versions of the code to be created. This is
				671	// explained in the section below.
				672	//
				673	// Execution state virtualization.
				674	//
				675	// Instead of emitting code, nodes that manipulate the state can record their
				676	// manipulation in an object called the Trace. The Trace object can record a
				677	// current position offset, an optional backtrack code location on the top of
				678	// the virtualized backtrack stack and some register changes. When a node is
				679	// to be emitted it can flush the Trace or update it. Flushing the Trace
				680	// will emit code to bring the actual state into line with the virtual state.
				681	// Avoiding flushing the state can postpone some work (eg updates of capture
				682	// registers). Postponing work can save time when executing the regular
				683	// expression since it may be found that the work never has to be done as a
				684	// failure to match can occur. In addition it is much faster to jump to a
				685	// known backtrack code location than it is to pop an unknown backtrack
				686	// location from the stack and jump there.
				687	//
				688	// The virtual state found in the Trace affects code generation. For example
				689	// the virtual state contains the difference between the actual current
				690	// position and the virtual current position, and matching code needs to use
				691	// this offset to attempt a match in the correct location of the input
				692	// string. Therefore code generated for a non-trivial trace is specialized
				693	// to that trace. The code generator therefore has the ability to generate
				694	// code for each node several times. In order to limit the size of the
				695	// generated code there is an arbitrary limit on how many specialized sets of
				696	// code may be generated for a given node. If the limit is reached, the
				697	// trace is flushed and a generic version of the code for a node is emitted.
				698	// This is subsequently used for that node. The code emitted for non-generic
				699	// trace is not recorded in the node and so it cannot currently be reused in
				700	// the event that code generation is requested for an identical trace.
				701
				702
				703	void RegExpTree::AppendToText(RegExpText* text) {
				704	UNREACHABLE();
				705	}
				706
				707
				708	void RegExpAtom::AppendToText(RegExpText* text) {
				709	text->AddElement(TextElement::Atom(this));
				710	}
				711
				712
				713	void RegExpCharacterClass::AppendToText(RegExpText* text) {
				714	text->AddElement(TextElement::CharClass(this));
				715	}
				716
				717
				718	void RegExpText::AppendToText(RegExpText* text) {
				719	for (int i = 0; i < elements()->length(); i++)
				720	text->AddElement(elements()->at(i));
				721	}
				722
				723
				724	TextElement TextElement::Atom(RegExpAtom* atom) {
				725	TextElement result = TextElement(ATOM);
				726	result.data.u_atom = atom;
				727	return result;
				728	}
				729
				730
				731	TextElement TextElement::CharClass(
				732	RegExpCharacterClass* char_class) {
				733	TextElement result = TextElement(CHAR_CLASS);
				734	result.data.u_char_class = char_class;
				735	return result;
				736	}
				737
				738
				739	int TextElement::length() {
				740	if (type == ATOM) {
				741	return data.u_atom->length();
				742	} else {
				743	ASSERT(type == CHAR_CLASS);
				744	return 1;
				745	}
				746	}
				747
				748
				749	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				750	if (table_ == NULL) {
				751	table_ = new DispatchTable();
				752	DispatchTableConstructor cons(table_, ignore_case);
				753	cons.BuildTable(this);
				754	}
				755	return table_;
				756	}
				757
				758
				759	class RegExpCompiler {
				760	public:
				761	RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
				762
				763	int AllocateRegister() {
				764	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				765	reg_exp_too_big_ = true;
				766	return next_register_;
				767	}
				768	return next_register_++;
				769	}
				770
				771	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				772	RegExpNode* start,
				773	int capture_count,
				774	Handle<String> pattern);
				775
				776	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
				777
				778	static const int kImplementationOffset = 0;
				779	static const int kNumberOfRegistersOffset = 0;
				780	static const int kCodeOffset = 1;
				781
				782	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				783	EndNode* accept() { return accept_; }
				784
				785	static const int kMaxRecursion = 100;
				786	inline int recursion_depth() { return recursion_depth_; }
				787	inline void IncrementRecursionDepth() { recursion_depth_++; }
				788	inline void DecrementRecursionDepth() { recursion_depth_--; }
				789
				790	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				791
				792	inline bool ignore_case() { return ignore_case_; }
				793	inline bool ascii() { return ascii_; }
				794
				795	static const int kNoRegister = -1;
				796	private:
				797	EndNode* accept_;
				798	int next_register_;
				799	List<RegExpNode> work_list_;
				800	int recursion_depth_;
				801	RegExpMacroAssembler* macro_assembler_;
				802	bool ignore_case_;
				803	bool ascii_;
				804	bool reg_exp_too_big_;
				805	};
				806
				807
				808	class RecursionCheck {
				809	public:
				810	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				811	compiler->IncrementRecursionDepth();
				812	}
				813	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				814	private:
				815	RegExpCompiler* compiler_;
				816	};
				817
				818
				819	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
				820	return RegExpEngine::CompilationResult("RegExp too big");
				821	}
				822
				823
				824	// Attempts to compile the regexp using an Irregexp code generator. Returns
				825	// a fixed array or a null handle depending on whether it succeeded.
				826	RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
				827	: next_register_(2 * (capture_count + 1)),
				828	work_list_(NULL),
				829	recursion_depth_(0),
				830	ignore_case_(ignore_case),
				831	ascii_(ascii),
				832	reg_exp_too_big_(false) {
				833	accept_ = new EndNode(EndNode::ACCEPT);
				834	ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				835	}
				836
				837
				838	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				839	RegExpMacroAssembler* macro_assembler,
				840	RegExpNode* start,
				841	int capture_count,
				842	Handle<String> pattern) {
				843	#ifdef DEBUG
				844	if (FLAG_trace_regexp_assembler)
				845	macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
				846	else
				847	#endif
				848	macro_assembler_ = macro_assembler;
				849	List <RegExpNode*> work_list(0);
				850	work_list_ = &work_list;
				851	Label fail;
				852	macro_assembler_->PushBacktrack(&fail);
				853	Trace new_trace;
				854	start->Emit(this, &new_trace);
				855	macro_assembler_->Bind(&fail);
				856	macro_assembler_->Fail();
				857	while (!work_list.is_empty()) {
				858	work_list.RemoveLast()->Emit(this, &new_trace);
				859	}
				860	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
				861
				862	Handle<Object> code = macro_assembler_->GetCode(pattern);
				863
				864	work_list_ = NULL;
				865	#ifdef DEBUG
				866	if (FLAG_trace_regexp_assembler) {
				867	delete macro_assembler_;
				868	}
				869	#endif
				870	return RegExpEngine::CompilationResult(*code, next_register_);
				871	}
				872
				873
				874	bool Trace::DeferredAction::Mentions(int that) {
				875	if (type() == ActionNode::CLEAR_CAPTURES) {
				876	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				877	return range.Contains(that);
				878	} else {
				879	return reg() == that;
				880	}
				881	}
				882
				883
				884	bool Trace::mentions_reg(int reg) {
				885	for (DeferredAction* action = actions_;
				886	action != NULL;
				887	action = action->next()) {
				888	if (action->Mentions(reg))
				889	return true;
				890	}
				891	return false;
				892	}
				893
				894
				895	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				896	ASSERT_EQ(0, *cp_offset);
				897	for (DeferredAction* action = actions_;
				898	action != NULL;
				899	action = action->next()) {
				900	if (action->Mentions(reg)) {
				901	if (action->type() == ActionNode::STORE_POSITION) {
				902	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				903	return true;
				904	} else {
				905	return false;
				906	}
				907	}
				908	}
				909	return false;
				910	}
				911
				912
				913	int Trace::FindAffectedRegisters(OutSet* affected_registers) {
				914	int max_register = RegExpCompiler::kNoRegister;
				915	for (DeferredAction* action = actions_;
				916	action != NULL;
				917	action = action->next()) {
				918	if (action->type() == ActionNode::CLEAR_CAPTURES) {
				919	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				920	for (int i = range.from(); i <= range.to(); i++)
				921	affected_registers->Set(i);
				922	if (range.to() > max_register) max_register = range.to();
				923	} else {
				924	affected_registers->Set(action->reg());
				925	if (action->reg() > max_register) max_register = action->reg();
				926	}
				927	}
				928	return max_register;
				929	}
				930
				931
				932	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				933	int max_register,
				934	OutSet& registers_to_pop,
				935	OutSet& registers_to_clear) {
				936	for (int reg = max_register; reg >= 0; reg--) {
				937	if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
				938	else if (registers_to_clear.Get(reg)) {
				939	int clear_to = reg;
				940	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				941	reg--;
				942	}
				943	assembler->ClearRegisters(reg, clear_to);
				944	}
				945	}
				946	}
				947
				948
				949	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				950	int max_register,
				951	OutSet& affected_registers,
				952	OutSet* registers_to_pop,
				953	OutSet* registers_to_clear) {
				954	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				955	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				956
				957	// Count pushes performed to force a stack limit check occasionally.
				958	int pushes = 0;
				959
				960	for (int reg = 0; reg <= max_register; reg++) {
				961	if (!affected_registers.Get(reg)) {
				962	continue;
				963	}
				964
				965	// The chronologically first deferred action in the trace
				966	// is used to infer the action needed to restore a register
				967	// to its previous state (or not, if it's safe to ignore it).
				968	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				969	DeferredActionUndoType undo_action = IGNORE;
				970
				971	int value = 0;
				972	bool absolute = false;
				973	bool clear = false;
				974	int store_position = -1;
				975	// This is a little tricky because we are scanning the actions in reverse
				976	// historical order (newest first).
				977	for (DeferredAction* action = actions_;
				978	action != NULL;
				979	action = action->next()) {
				980	if (action->Mentions(reg)) {
				981	switch (action->type()) {
				982	case ActionNode::SET_REGISTER: {
				983	Trace::DeferredSetRegister* psr =
				984	static_cast<Trace::DeferredSetRegister*>(action);
				985	if (!absolute) {
				986	value += psr->value();
				987	absolute = true;
				988	}
				989	// SET_REGISTER is currently only used for newly introduced loop
				990	// counters. They can have a significant previous value if they
				991	// occour in a loop. TODO(lrn): Propagate this information, so
				992	// we can set undo_action to IGNORE if we know there is no value to
				993	// restore.
				994	undo_action = RESTORE;
				995	ASSERT_EQ(store_position, -1);
				996	ASSERT(!clear);
				997	break;
				998	}
				999	case ActionNode::INCREMENT_REGISTER:
				1000	if (!absolute) {
				1001	value++;
				1002	}
				1003	ASSERT_EQ(store_position, -1);
				1004	ASSERT(!clear);
				1005	undo_action = RESTORE;
				1006	break;
				1007	case ActionNode::STORE_POSITION: {
				1008	Trace::DeferredCapture* pc =
				1009	static_cast<Trace::DeferredCapture*>(action);
				1010	if (!clear && store_position == -1) {
				1011	store_position = pc->cp_offset();
				1012	}
				1013
				1014	// For captures we know that stores and clears alternate.
				1015	// Other register, are never cleared, and if the occur
				1016	// inside a loop, they might be assigned more than once.
				1017	if (reg <= 1) {
				1018	// Registers zero and one, aka "capture zero", is
				1019	// always set correctly if we succeed. There is no
				1020	// need to undo a setting on backtrack, because we
				1021	// will set it again or fail.
				1022	undo_action = IGNORE;
				1023	} else {
				1024	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				1025	}
				1026	ASSERT(!absolute);
				1027	ASSERT_EQ(value, 0);
				1028	break;
				1029	}
				1030	case ActionNode::CLEAR_CAPTURES: {
				1031	// Since we're scanning in reverse order, if we've already
				1032	// set the position we have to ignore historically earlier
				1033	// clearing operations.
				1034	if (store_position == -1) {
				1035	clear = true;
				1036	}
				1037	undo_action = RESTORE;
				1038	ASSERT(!absolute);
				1039	ASSERT_EQ(value, 0);
				1040	break;
				1041	}
				1042	default:
				1043	UNREACHABLE();
				1044	break;
				1045	}
				1046	}
				1047	}
				1048	// Prepare for the undo-action (e.g., push if it's going to be popped).
				1049	if (undo_action == RESTORE) {
				1050	pushes++;
				1051	RegExpMacroAssembler::StackCheckFlag stack_check =
				1052	RegExpMacroAssembler::kNoStackLimitCheck;
				1053	if (pushes == push_limit) {
				1054	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				1055	pushes = 0;
				1056	}
				1057
				1058	assembler->PushRegister(reg, stack_check);
				1059	registers_to_pop->Set(reg);
				1060	} else if (undo_action == CLEAR) {
				1061	registers_to_clear->Set(reg);
				1062	}
				1063	// Perform the chronologically last action (or accumulated increment)
				1064	// for the register.
				1065	if (store_position != -1) {
				1066	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1067	} else if (clear) {
				1068	assembler->ClearRegisters(reg, reg);
				1069	} else if (absolute) {
				1070	assembler->SetRegister(reg, value);
				1071	} else if (value != 0) {
				1072	assembler->AdvanceRegister(reg, value);
				1073	}
				1074	}
				1075	}
				1076
				1077
				1078	// This is called as we come into a loop choice node and some other tricky
				1079	// nodes. It normalizes the state of the code generator to ensure we can
				1080	// generate generic code.
				1081	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1082	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1083
				1084	ASSERT(!is_trivial());
				1085
				1086	if (actions_ == NULL && backtrack() == NULL) {
				1087	// Here we just have some deferred cp advances to fix and we are back to
				1088	// a normal situation. We may also have to forget some information gained
				1089	// through a quick check that was already performed.
				1090	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1091	// Create a new trivial state and generate the node with that.
				1092	Trace new_state;
				1093	successor->Emit(compiler, &new_state);
				1094	return;
				1095	}
				1096
				1097	// Generate deferred actions here along with code to undo them again.
				1098	OutSet affected_registers;
				1099
				1100	if (backtrack() != NULL) {
				1101	// Here we have a concrete backtrack location. These are set up by choice
				1102	// nodes and so they indicate that we have a deferred save of the current
				1103	// position which we may need to emit here.
				1104	assembler->PushCurrentPosition();
				1105	}
				1106
				1107	int max_register = FindAffectedRegisters(&affected_registers);
				1108	OutSet registers_to_pop;
				1109	OutSet registers_to_clear;
				1110	PerformDeferredActions(assembler,
				1111	max_register,
				1112	affected_registers,
				1113	&registers_to_pop,
				1114	&registers_to_clear);
				1115	if (cp_offset_ != 0) {
				1116	assembler->AdvanceCurrentPosition(cp_offset_);
				1117	}
				1118
				1119	// Create a new trivial state and generate the node with that.
				1120	Label undo;
				1121	assembler->PushBacktrack(&undo);
				1122	Trace new_state;
				1123	successor->Emit(compiler, &new_state);
				1124
				1125	// On backtrack we need to restore state.
				1126	assembler->Bind(&undo);
				1127	RestoreAffectedRegisters(assembler,
				1128	max_register,
				1129	registers_to_pop,
				1130	registers_to_clear);
				1131	if (backtrack() == NULL) {
				1132	assembler->Backtrack();
				1133	} else {
				1134	assembler->PopCurrentPosition();
				1135	assembler->GoTo(backtrack());
				1136	}
				1137	}
				1138
				1139
				1140	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1141	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1142
				1143	// Omit flushing the trace. We discard the entire stack frame anyway.
				1144
				1145	if (!label()->is_bound()) {
				1146	// We are completely independent of the trace, since we ignore it,
				1147	// so this code can be used as the generic version.
				1148	assembler->Bind(label());
				1149	}
				1150
				1151	// Throw away everything on the backtrack stack since the start
				1152	// of the negative submatch and restore the character position.
				1153	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1154	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1155	if (clear_capture_count_ > 0) {
				1156	// Clear any captures that might have been performed during the success
				1157	// of the body of the negative look-ahead.
				1158	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1159	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1160	}
				1161	// Now that we have unwound the stack we find at the top of the stack the
				1162	// backtrack that the BeginSubmatch node got.
				1163	assembler->Backtrack();
				1164	}
				1165
				1166
				1167	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1168	if (!trace->is_trivial()) {
				1169	trace->Flush(compiler, this);
				1170	return;
				1171	}
				1172	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1173	if (!label()->is_bound()) {
				1174	assembler->Bind(label());
				1175	}
				1176	switch (action_) {
				1177	case ACCEPT:
				1178	assembler->Succeed();
				1179	return;
				1180	case BACKTRACK:
				1181	assembler->GoTo(trace->backtrack());
				1182	return;
				1183	case NEGATIVE_SUBMATCH_SUCCESS:
				1184	// This case is handled in a different virtual method.
				1185	UNREACHABLE();
				1186	}
				1187	UNIMPLEMENTED();
				1188	}
				1189
				1190
				1191	void GuardedAlternative::AddGuard(Guard* guard) {
				1192	if (guards_ == NULL)
				1193	guards_ = new ZoneList<Guard*>(1);
				1194	guards_->Add(guard);
				1195	}
				1196
				1197
				1198	ActionNode* ActionNode::SetRegister(int reg,
				1199	int val,
				1200	RegExpNode* on_success) {
				1201	ActionNode* result = new ActionNode(SET_REGISTER, on_success);
				1202	result->data_.u_store_register.reg = reg;
				1203	result->data_.u_store_register.value = val;
				1204	return result;
				1205	}
				1206
				1207
				1208	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1209	ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
				1210	result->data_.u_increment_register.reg = reg;
				1211	return result;
				1212	}
				1213
				1214
				1215	ActionNode* ActionNode::StorePosition(int reg,
				1216	bool is_capture,
				1217	RegExpNode* on_success) {
				1218	ActionNode* result = new ActionNode(STORE_POSITION, on_success);
				1219	result->data_.u_position_register.reg = reg;
				1220	result->data_.u_position_register.is_capture = is_capture;
				1221	return result;
				1222	}
				1223
				1224
				1225	ActionNode* ActionNode::ClearCaptures(Interval range,
				1226	RegExpNode* on_success) {
				1227	ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
				1228	result->data_.u_clear_captures.range_from = range.from();
				1229	result->data_.u_clear_captures.range_to = range.to();
				1230	return result;
				1231	}
				1232
				1233
				1234	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1235	int position_reg,
				1236	RegExpNode* on_success) {
				1237	ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
				1238	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1239	result->data_.u_submatch.current_position_register = position_reg;
				1240	return result;
				1241	}
				1242
				1243
				1244	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1245	int position_reg,
				1246	int clear_register_count,
				1247	int clear_register_from,
				1248	RegExpNode* on_success) {
				1249	ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1250	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1251	result->data_.u_submatch.current_position_register = position_reg;
				1252	result->data_.u_submatch.clear_register_count = clear_register_count;
				1253	result->data_.u_submatch.clear_register_from = clear_register_from;
				1254	return result;
				1255	}
				1256
				1257
				1258	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1259	int repetition_register,
				1260	int repetition_limit,
				1261	RegExpNode* on_success) {
				1262	ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
				1263	result->data_.u_empty_match_check.start_register = start_register;
				1264	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1265	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1266	return result;
				1267	}
				1268
				1269
				1270	#define DEFINE_ACCEPT(Type) \
				1271	void Type##Node::Accept(NodeVisitor* visitor) { \
				1272	visitor->Visit##Type(this); \
				1273	}
				1274	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1275	#undef DEFINE_ACCEPT
				1276
				1277
				1278	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1279	visitor->VisitLoopChoice(this);
				1280	}
				1281
				1282
				1283	// -------------------------------------------------------------------
				1284	// Emit code.
				1285
				1286
				1287	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1288	Guard* guard,
				1289	Trace* trace) {
				1290	switch (guard->op()) {
				1291	case Guard::LT:
				1292	ASSERT(!trace->mentions_reg(guard->reg()));
				1293	macro_assembler->IfRegisterGE(guard->reg(),
				1294	guard->value(),
				1295	trace->backtrack());
				1296	break;
				1297	case Guard::GEQ:
				1298	ASSERT(!trace->mentions_reg(guard->reg()));
				1299	macro_assembler->IfRegisterLT(guard->reg(),
				1300	guard->value(),
				1301	trace->backtrack());
				1302	break;
				1303	}
				1304	}
				1305
				1306
				1307	static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
				1308	static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
				1309
				1310
				1311	// Returns the number of characters in the equivalence class, omitting those
				1312	// that cannot occur in the source string because it is ASCII.
				1313	static int GetCaseIndependentLetters(uc16 character,
				1314	bool ascii_subject,
				1315	unibrow::uchar* letters) {
				1316	int length = uncanonicalize.get(character, '\0', letters);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	1317	// Unibrow returns 0 or 1 for characters where case independence is
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1318	// trivial.
				1319	if (length == 0) {
				1320	letters[0] = character;
				1321	length = 1;
				1322	}
				1323	if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {
				1324	return length;
				1325	}
				1326	// The standard requires that non-ASCII characters cannot have ASCII
				1327	// character codes in their equivalence class.
				1328	return 0;
				1329	}
				1330
				1331
				1332	static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
				1333	uc16 c,
				1334	Label* on_failure,
				1335	int cp_offset,
				1336	bool check,
				1337	bool preloaded) {
				1338	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1339	bool bound_checked = false;
				1340	if (!preloaded) {
				1341	assembler->LoadCurrentCharacter(
				1342	cp_offset,
				1343	on_failure,
				1344	check);
				1345	bound_checked = true;
				1346	}
				1347	assembler->CheckNotCharacter(c, on_failure);
				1348	return bound_checked;
				1349	}
				1350
				1351
				1352	// Only emits non-letters (things that don't have case). Only used for case
				1353	// independent matches.
				1354	static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
				1355	uc16 c,
				1356	Label* on_failure,
				1357	int cp_offset,
				1358	bool check,
				1359	bool preloaded) {
				1360	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1361	bool ascii = compiler->ascii();
				1362	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1363	int length = GetCaseIndependentLetters(c, ascii, chars);
				1364	if (length < 1) {
				1365	// This can't match. Must be an ASCII subject and a non-ASCII character.
				1366	// We do not need to do anything since the ASCII pass already handled this.
				1367	return false; // Bounds not checked.
				1368	}
				1369	bool checked = false;
				1370	// We handle the length > 1 case in a later pass.
				1371	if (length == 1) {
				1372	if (ascii && c > String::kMaxAsciiCharCodeU) {
				1373	// Can't match - see above.
				1374	return false; // Bounds not checked.
				1375	}
				1376	if (!preloaded) {
				1377	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1378	checked = check;
				1379	}
				1380	macro_assembler->CheckNotCharacter(c, on_failure);
				1381	}
				1382	return checked;
				1383	}
				1384
				1385
				1386	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1387	bool ascii,
				1388	uc16 c1,
				1389	uc16 c2,
				1390	Label* on_failure) {
				1391	uc16 char_mask;
				1392	if (ascii) {
				1393	char_mask = String::kMaxAsciiCharCode;
				1394	} else {
				1395	char_mask = String::kMaxUC16CharCode;
				1396	}
				1397	uc16 exor = c1 ^ c2;
				1398	// Check whether exor has only one bit set.
				1399	if (((exor - 1) & exor) == 0) {
				1400	// If c1 and c2 differ only by one bit.
				1401	// Ecma262UnCanonicalize always gives the highest number last.
				1402	ASSERT(c2 > c1);
				1403	uc16 mask = char_mask ^ exor;
				1404	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1405	return true;
				1406	}
				1407	ASSERT(c2 > c1);
				1408	uc16 diff = c2 - c1;
				1409	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1410	// If the characters differ by 2^n but don't differ by one bit then
				1411	// subtract the difference from the found character, then do the or
				1412	// trick. We avoid the theoretical case where negative numbers are
				1413	// involved in order to simplify code generation.
				1414	uc16 mask = char_mask ^ diff;
				1415	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1416	diff,
				1417	mask,
				1418	on_failure);
				1419	return true;
				1420	}
				1421	return false;
				1422	}
				1423
				1424
				1425	typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
				1426	uc16 c,
				1427	Label* on_failure,
				1428	int cp_offset,
				1429	bool check,
				1430	bool preloaded);
				1431
				1432	// Only emits letters (things that have case). Only used for case independent
				1433	// matches.
				1434	static inline bool EmitAtomLetter(RegExpCompiler* compiler,
				1435	uc16 c,
				1436	Label* on_failure,
				1437	int cp_offset,
				1438	bool check,
				1439	bool preloaded) {
				1440	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1441	bool ascii = compiler->ascii();
				1442	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1443	int length = GetCaseIndependentLetters(c, ascii, chars);
				1444	if (length <= 1) return false;
				1445	// We may not need to check against the end of the input string
				1446	// if this character lies before a character that matched.
				1447	if (!preloaded) {
				1448	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1449	}
				1450	Label ok;
				1451	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1452	switch (length) {
				1453	case 2: {
				1454	if (ShortCutEmitCharacterPair(macro_assembler,
				1455	ascii,
				1456	chars[0],
				1457	chars[1],
				1458	on_failure)) {
				1459	} else {
				1460	macro_assembler->CheckCharacter(chars[0], &ok);
				1461	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1462	macro_assembler->Bind(&ok);
				1463	}
				1464	break;
				1465	}
				1466	case 4:
				1467	macro_assembler->CheckCharacter(chars[3], &ok);
				1468	// Fall through!
				1469	case 3:
				1470	macro_assembler->CheckCharacter(chars[0], &ok);
				1471	macro_assembler->CheckCharacter(chars[1], &ok);
				1472	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1473	macro_assembler->Bind(&ok);
				1474	break;
				1475	default:
				1476	UNREACHABLE();
				1477	break;
				1478	}
				1479	return true;
				1480	}
				1481
				1482
				1483	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				1484	RegExpCharacterClass* cc,
				1485	bool ascii,
				1486	Label* on_failure,
				1487	int cp_offset,
				1488	bool check_offset,
				1489	bool preloaded) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1490	ZoneList<CharacterRange>* ranges = cc->ranges();
				1491	int max_char;
				1492	if (ascii) {
				1493	max_char = String::kMaxAsciiCharCode;
				1494	} else {
				1495	max_char = String::kMaxUC16CharCode;
				1496	}
				1497
				1498	Label success;
				1499
				1500	Label* char_is_in_class =
				1501	cc->is_negated() ? on_failure : &success;
				1502
				1503	int range_count = ranges->length();
				1504
				1505	int last_valid_range = range_count - 1;
				1506	while (last_valid_range >= 0) {
				1507	CharacterRange& range = ranges->at(last_valid_range);
				1508	if (range.from() <= max_char) {
				1509	break;
				1510	}
				1511	last_valid_range--;
				1512	}
				1513
				1514	if (last_valid_range < 0) {
				1515	if (!cc->is_negated()) {
				1516	// TODO(plesner): We can remove this when the node level does our
				1517	// ASCII optimizations for us.
				1518	macro_assembler->GoTo(on_failure);
				1519	}
				1520	if (check_offset) {
				1521	macro_assembler->CheckPosition(cp_offset, on_failure);
				1522	}
				1523	return;
				1524	}
				1525
				1526	if (last_valid_range == 0 &&
				1527	!cc->is_negated() &&
				1528	ranges->at(0).IsEverything(max_char)) {
				1529	// This is a common case hit by non-anchored expressions.
				1530	if (check_offset) {
				1531	macro_assembler->CheckPosition(cp_offset, on_failure);
				1532	}
				1533	return;
				1534	}
				1535
				1536	if (!preloaded) {
				1537	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				1538	}
				1539
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1540	if (cc->is_standard() &&
				1541	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				1542	on_failure)) {
				1543	return;
				1544	}
				1545
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1546	for (int i = 0; i < last_valid_range; i++) {
				1547	CharacterRange& range = ranges->at(i);
				1548	Label next_range;
				1549	uc16 from = range.from();
				1550	uc16 to = range.to();
				1551	if (from > max_char) {
				1552	continue;
				1553	}
				1554	if (to > max_char) to = max_char;
				1555	if (to == from) {
				1556	macro_assembler->CheckCharacter(to, char_is_in_class);
				1557	} else {
				1558	if (from != 0) {
				1559	macro_assembler->CheckCharacterLT(from, &next_range);
				1560	}
				1561	if (to != max_char) {
				1562	macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
				1563	} else {
				1564	macro_assembler->GoTo(char_is_in_class);
				1565	}
				1566	}
				1567	macro_assembler->Bind(&next_range);
				1568	}
				1569
				1570	CharacterRange& range = ranges->at(last_valid_range);
				1571	uc16 from = range.from();
				1572	uc16 to = range.to();
				1573
				1574	if (to > max_char) to = max_char;
				1575	ASSERT(to >= from);
				1576
				1577	if (to == from) {
				1578	if (cc->is_negated()) {
				1579	macro_assembler->CheckCharacter(to, on_failure);
				1580	} else {
				1581	macro_assembler->CheckNotCharacter(to, on_failure);
				1582	}
				1583	} else {
				1584	if (from != 0) {
				1585	if (cc->is_negated()) {
				1586	macro_assembler->CheckCharacterLT(from, &success);
				1587	} else {
				1588	macro_assembler->CheckCharacterLT(from, on_failure);
				1589	}
				1590	}
				1591	if (to != String::kMaxUC16CharCode) {
				1592	if (cc->is_negated()) {
				1593	macro_assembler->CheckCharacterLT(to + 1, on_failure);
				1594	} else {
				1595	macro_assembler->CheckCharacterGT(to, on_failure);
				1596	}
				1597	} else {
				1598	if (cc->is_negated()) {
				1599	macro_assembler->GoTo(on_failure);
				1600	}
				1601	}
				1602	}
				1603	macro_assembler->Bind(&success);
				1604	}
				1605
				1606
				1607	RegExpNode::~RegExpNode() {
				1608	}
				1609
				1610
				1611	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				1612	Trace* trace) {
				1613	// If we are generating a greedy loop then don't stop and don't reuse code.
				1614	if (trace->stop_node() != NULL) {
				1615	return CONTINUE;
				1616	}
				1617
				1618	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1619	if (trace->is_trivial()) {
				1620	if (label_.is_bound()) {
				1621	// We are being asked to generate a generic version, but that's already
				1622	// been done so just go to it.
				1623	macro_assembler->GoTo(&label_);
				1624	return DONE;
				1625	}
				1626	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
				1627	// To avoid too deep recursion we push the node to the work queue and just
				1628	// generate a goto here.
				1629	compiler->AddWork(this);
				1630	macro_assembler->GoTo(&label_);
				1631	return DONE;
				1632	}
				1633	// Generate generic version of the node and bind the label for later use.
				1634	macro_assembler->Bind(&label_);
				1635	return CONTINUE;
				1636	}
				1637
				1638	// We are being asked to make a non-generic version. Keep track of how many
				1639	// non-generic versions we generate so as not to overdo it.
				1640	trace_count_++;
				1641	if (FLAG_regexp_optimization &&
				1642	trace_count_ < kMaxCopiesCodeGenerated &&
				1643	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
				1644	return CONTINUE;
				1645	}
				1646
				1647	// If we get here code has been generated for this node too many times or
				1648	// recursion is too deep. Time to switch to a generic version. The code for
				1649	// generic versions above can handle deep recursion properly.
				1650	trace->Flush(compiler, this);
				1651	return DONE;
				1652	}
				1653
				1654
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1655	int ActionNode::EatsAtLeast(int still_to_find,
				1656	int recursion_depth,
				1657	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1658	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1659	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1660	return on_success()->EatsAtLeast(still_to_find,
				1661	recursion_depth + 1,
				1662	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1663	}
				1664
				1665
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1666	int AssertionNode::EatsAtLeast(int still_to_find,
				1667	int recursion_depth,
				1668	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1669	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1670	// If we know we are not at the start and we are asked "how many characters
				1671	// will you match if you succeed?" then we can answer anything since false
				1672	// implies false. So lets just return the max answer (still_to_find) since
				1673	// that won't prevent us from preloading a lot of characters for the other
				1674	// branches in the node graph.
				1675	if (type() == AT_START && not_at_start) return still_to_find;
				1676	return on_success()->EatsAtLeast(still_to_find,
				1677	recursion_depth + 1,
				1678	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1679	}
				1680
				1681
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1682	int BackReferenceNode::EatsAtLeast(int still_to_find,
				1683	int recursion_depth,
				1684	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1685	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1686	return on_success()->EatsAtLeast(still_to_find,
				1687	recursion_depth + 1,
				1688	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1689	}
				1690
				1691
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1692	int TextNode::EatsAtLeast(int still_to_find,
				1693	int recursion_depth,
				1694	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1695	int answer = Length();
				1696	if (answer >= still_to_find) return answer;
				1697	if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1698	// We are not at start after this node so we set the last argument to 'true'.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1699	return answer + on_success()->EatsAtLeast(still_to_find - answer,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1700	recursion_depth + 1,
				1701	true);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1702	}
				1703
				1704
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1705	int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1706	int recursion_depth,
				1707	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1708	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1709	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1710	// afterwards.
				1711	RegExpNode* node = alternatives_->at(1).node();
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1712	return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1713	}
				1714
				1715
				1716	void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
				1717	QuickCheckDetails* details,
				1718	RegExpCompiler* compiler,
				1719	int filled_in,
				1720	bool not_at_start) {
				1721	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1722	// afterwards.
				1723	RegExpNode* node = alternatives_->at(1).node();
				1724	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				1725	}
				1726
				1727
				1728	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				1729	int recursion_depth,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1730	RegExpNode* ignore_this_node,
				1731	bool not_at_start) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1732	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1733	int min = 100;
				1734	int choice_count = alternatives_->length();
				1735	for (int i = 0; i < choice_count; i++) {
				1736	RegExpNode* node = alternatives_->at(i).node();
				1737	if (node == ignore_this_node) continue;
				1738	int node_eats_at_least = node->EatsAtLeast(still_to_find,
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1739	recursion_depth + 1,
				1740	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1741	if (node_eats_at_least < min) min = node_eats_at_least;
				1742	}
				1743	return min;
				1744	}
				1745
				1746
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1747	int LoopChoiceNode::EatsAtLeast(int still_to_find,
				1748	int recursion_depth,
				1749	bool not_at_start) {
				1750	return EatsAtLeastHelper(still_to_find,
				1751	recursion_depth,
				1752	loop_node_,
				1753	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1754	}
				1755
				1756
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	1757	int ChoiceNode::EatsAtLeast(int still_to_find,
				1758	int recursion_depth,
				1759	bool not_at_start) {
				1760	return EatsAtLeastHelper(still_to_find,
				1761	recursion_depth,
				1762	NULL,
				1763	not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1764	}
				1765
				1766
				1767	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				1768	static inline uint32_t SmearBitsRight(uint32_t v) {
				1769	v \|= v >> 1;
				1770	v \|= v >> 2;
				1771	v \|= v >> 4;
				1772	v \|= v >> 8;
				1773	v \|= v >> 16;
				1774	return v;
				1775	}
				1776
				1777
				1778	bool QuickCheckDetails::Rationalize(bool asc) {
				1779	bool found_useful_op = false;
				1780	uint32_t char_mask;
				1781	if (asc) {
				1782	char_mask = String::kMaxAsciiCharCode;
				1783	} else {
				1784	char_mask = String::kMaxUC16CharCode;
				1785	}
				1786	mask_ = 0;
				1787	value_ = 0;
				1788	int char_shift = 0;
				1789	for (int i = 0; i < characters_; i++) {
				1790	Position* pos = &positions_[i];
				1791	if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
				1792	found_useful_op = true;
				1793	}
				1794	mask_ \|= (pos->mask & char_mask) << char_shift;
				1795	value_ \|= (pos->value & char_mask) << char_shift;
				1796	char_shift += asc ? 8 : 16;
				1797	}
				1798	return found_useful_op;
				1799	}
				1800
				1801
				1802	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				1803	Trace* trace,
				1804	bool preload_has_checked_bounds,
				1805	Label* on_possible_success,
				1806	QuickCheckDetails* details,
				1807	bool fall_through_on_failure) {
				1808	if (details->characters() == 0) return false;
				1809	GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
				1810	if (details->cannot_match()) return false;
				1811	if (!details->Rationalize(compiler->ascii())) return false;
				1812	ASSERT(details->characters() == 1 \|\|
				1813	compiler->macro_assembler()->CanReadUnaligned());
				1814	uint32_t mask = details->mask();
				1815	uint32_t value = details->value();
				1816
				1817	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1818
				1819	if (trace->characters_preloaded() != details->characters()) {
				1820	assembler->LoadCurrentCharacter(trace->cp_offset(),
				1821	trace->backtrack(),
				1822	!preload_has_checked_bounds,
				1823	details->characters());
				1824	}
				1825
				1826
				1827	bool need_mask = true;
				1828
				1829	if (details->characters() == 1) {
				1830	// If number of characters preloaded is 1 then we used a byte or 16 bit
				1831	// load so the value is already masked down.
				1832	uint32_t char_mask;
				1833	if (compiler->ascii()) {
				1834	char_mask = String::kMaxAsciiCharCode;
				1835	} else {
				1836	char_mask = String::kMaxUC16CharCode;
				1837	}
				1838	if ((mask & char_mask) == char_mask) need_mask = false;
				1839	mask &= char_mask;
				1840	} else {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1841	// For 2-character preloads in ASCII mode or 1-character preloads in
				1842	// TWO_BYTE mode we also use a 16 bit load with zero extend.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1843	if (details->characters() == 2 && compiler->ascii()) {
Kristian Monsen	9dcf7e2	2010-06-28 14:14:28 +0100	[diff] [blame]	1844	if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
				1845	} else if (details->characters() == 1 && !compiler->ascii()) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1846	if ((mask & 0xffff) == 0xffff) need_mask = false;
				1847	} else {
				1848	if (mask == 0xffffffff) need_mask = false;
				1849	}
				1850	}
				1851
				1852	if (fall_through_on_failure) {
				1853	if (need_mask) {
				1854	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				1855	} else {
				1856	assembler->CheckCharacter(value, on_possible_success);
				1857	}
				1858	} else {
				1859	if (need_mask) {
				1860	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				1861	} else {
				1862	assembler->CheckNotCharacter(value, trace->backtrack());
				1863	}
				1864	}
				1865	return true;
				1866	}
				1867
				1868
				1869	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				1870	// super-class in the .h file).
				1871	//
				1872	// We iterate along the text object, building up for each character a
				1873	// mask and value that can be used to test for a quick failure to match.
				1874	// The masks and values for the positions will be combined into a single
				1875	// machine word for the current character width in order to be used in
				1876	// generating a quick check.
				1877	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				1878	RegExpCompiler* compiler,
				1879	int characters_filled_in,
				1880	bool not_at_start) {
				1881	ASSERT(characters_filled_in < details->characters());
				1882	int characters = details->characters();
				1883	int char_mask;
				1884	int char_shift;
				1885	if (compiler->ascii()) {
				1886	char_mask = String::kMaxAsciiCharCode;
				1887	char_shift = 8;
				1888	} else {
				1889	char_mask = String::kMaxUC16CharCode;
				1890	char_shift = 16;
				1891	}
				1892	for (int k = 0; k < elms_->length(); k++) {
				1893	TextElement elm = elms_->at(k);
				1894	if (elm.type == TextElement::ATOM) {
				1895	Vector<const uc16> quarks = elm.data.u_atom->data();
				1896	for (int i = 0; i < characters && i < quarks.length(); i++) {
				1897	QuickCheckDetails::Position* pos =
				1898	details->positions(characters_filled_in);
				1899	uc16 c = quarks[i];
				1900	if (c > char_mask) {
				1901	// If we expect a non-ASCII character from an ASCII string,
				1902	// there is no way we can match. Not even case independent
				1903	// matching can turn an ASCII character into non-ASCII or
				1904	// vice versa.
				1905	details->set_cannot_match();
				1906	pos->determines_perfectly = false;
				1907	return;
				1908	}
				1909	if (compiler->ignore_case()) {
				1910	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1911	int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
				1912	ASSERT(length != 0); // Can only happen if c > char_mask (see above).
				1913	if (length == 1) {
				1914	// This letter has no case equivalents, so it's nice and simple
				1915	// and the mask-compare will determine definitely whether we have
				1916	// a match at this character position.
				1917	pos->mask = char_mask;
				1918	pos->value = c;
				1919	pos->determines_perfectly = true;
				1920	} else {
				1921	uint32_t common_bits = char_mask;
				1922	uint32_t bits = chars[0];
				1923	for (int j = 1; j < length; j++) {
				1924	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				1925	common_bits ^= differing_bits;
				1926	bits &= common_bits;
				1927	}
				1928	// If length is 2 and common bits has only one zero in it then
				1929	// our mask and compare instruction will determine definitely
				1930	// whether we have a match at this character position. Otherwise
				1931	// it can only be an approximate check.
				1932	uint32_t one_zero = (common_bits \| ~char_mask);
				1933	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				1934	pos->determines_perfectly = true;
				1935	}
				1936	pos->mask = common_bits;
				1937	pos->value = bits;
				1938	}
				1939	} else {
				1940	// Don't ignore case. Nice simple case where the mask-compare will
				1941	// determine definitely whether we have a match at this character
				1942	// position.
				1943	pos->mask = char_mask;
				1944	pos->value = c;
				1945	pos->determines_perfectly = true;
				1946	}
				1947	characters_filled_in++;
				1948	ASSERT(characters_filled_in <= details->characters());
				1949	if (characters_filled_in == details->characters()) {
				1950	return;
				1951	}
				1952	}
				1953	} else {
				1954	QuickCheckDetails::Position* pos =
				1955	details->positions(characters_filled_in);
				1956	RegExpCharacterClass* tree = elm.data.u_char_class;
				1957	ZoneList<CharacterRange>* ranges = tree->ranges();
				1958	if (tree->is_negated()) {
				1959	// A quick check uses multi-character mask and compare. There is no
				1960	// useful way to incorporate a negative char class into this scheme
				1961	// so we just conservatively create a mask and value that will always
				1962	// succeed.
				1963	pos->mask = 0;
				1964	pos->value = 0;
				1965	} else {
				1966	int first_range = 0;
				1967	while (ranges->at(first_range).from() > char_mask) {
				1968	first_range++;
				1969	if (first_range == ranges->length()) {
				1970	details->set_cannot_match();
				1971	pos->determines_perfectly = false;
				1972	return;
				1973	}
				1974	}
				1975	CharacterRange range = ranges->at(first_range);
				1976	uc16 from = range.from();
				1977	uc16 to = range.to();
				1978	if (to > char_mask) {
				1979	to = char_mask;
				1980	}
				1981	uint32_t differing_bits = (from ^ to);
				1982	// A mask and compare is only perfect if the differing bits form a
				1983	// number like 00011111 with one single block of trailing 1s.
				1984	if ((differing_bits & (differing_bits + 1)) == 0 &&
				1985	from + differing_bits == to) {
				1986	pos->determines_perfectly = true;
				1987	}
				1988	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				1989	uint32_t bits = (from & common_bits);
				1990	for (int i = first_range + 1; i < ranges->length(); i++) {
				1991	CharacterRange range = ranges->at(i);
				1992	uc16 from = range.from();
				1993	uc16 to = range.to();
				1994	if (from > char_mask) continue;
				1995	if (to > char_mask) to = char_mask;
				1996	// Here we are combining more ranges into the mask and compare
				1997	// value. With each new range the mask becomes more sparse and
				1998	// so the chances of a false positive rise. A character class
				1999	// with multiple ranges is assumed never to be equivalent to a
				2000	// mask and compare operation.
				2001	pos->determines_perfectly = false;
				2002	uint32_t new_common_bits = (from ^ to);
				2003	new_common_bits = ~SmearBitsRight(new_common_bits);
				2004	common_bits &= new_common_bits;
				2005	bits &= new_common_bits;
				2006	uint32_t differing_bits = (from & common_bits) ^ bits;
				2007	common_bits ^= differing_bits;
				2008	bits &= common_bits;
				2009	}
				2010	pos->mask = common_bits;
				2011	pos->value = bits;
				2012	}
				2013	characters_filled_in++;
				2014	ASSERT(characters_filled_in <= details->characters());
				2015	if (characters_filled_in == details->characters()) {
				2016	return;
				2017	}
				2018	}
				2019	}
				2020	ASSERT(characters_filled_in != details->characters());
				2021	on_success()-> GetQuickCheckDetails(details,
				2022	compiler,
				2023	characters_filled_in,
				2024	true);
				2025	}
				2026
				2027
				2028	void QuickCheckDetails::Clear() {
				2029	for (int i = 0; i < characters_; i++) {
				2030	positions_[i].mask = 0;
				2031	positions_[i].value = 0;
				2032	positions_[i].determines_perfectly = false;
				2033	}
				2034	characters_ = 0;
				2035	}
				2036
				2037
				2038	void QuickCheckDetails::Advance(int by, bool ascii) {
				2039	ASSERT(by >= 0);
				2040	if (by >= characters_) {
				2041	Clear();
				2042	return;
				2043	}
				2044	for (int i = 0; i < characters_ - by; i++) {
				2045	positions_[i] = positions_[by + i];
				2046	}
				2047	for (int i = characters_ - by; i < characters_; i++) {
				2048	positions_[i].mask = 0;
				2049	positions_[i].value = 0;
				2050	positions_[i].determines_perfectly = false;
				2051	}
				2052	characters_ -= by;
				2053	// We could change mask_ and value_ here but we would never advance unless
				2054	// they had already been used in a check and they won't be used again because
				2055	// it would gain us nothing. So there's no point.
				2056	}
				2057
				2058
				2059	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				2060	ASSERT(characters_ == other->characters_);
				2061	if (other->cannot_match_) {
				2062	return;
				2063	}
				2064	if (cannot_match_) {
				2065	this = other;
				2066	return;
				2067	}
				2068	for (int i = from_index; i < characters_; i++) {
				2069	QuickCheckDetails::Position* pos = positions(i);
				2070	QuickCheckDetails::Position* other_pos = other->positions(i);
				2071	if (pos->mask != other_pos->mask \|\|
				2072	pos->value != other_pos->value \|\|
				2073	!other_pos->determines_perfectly) {
				2074	// Our mask-compare operation will be approximate unless we have the
				2075	// exact same operation on both sides of the alternation.
				2076	pos->determines_perfectly = false;
				2077	}
				2078	pos->mask &= other_pos->mask;
				2079	pos->value &= pos->mask;
				2080	other_pos->value &= pos->mask;
				2081	uc16 differing_bits = (pos->value ^ other_pos->value);
				2082	pos->mask &= ~differing_bits;
				2083	pos->value &= pos->mask;
				2084	}
				2085	}
				2086
				2087
				2088	class VisitMarker {
				2089	public:
				2090	explicit VisitMarker(NodeInfo* info) : info_(info) {
				2091	ASSERT(!info->visited);
				2092	info->visited = true;
				2093	}
				2094	~VisitMarker() {
				2095	info_->visited = false;
				2096	}
				2097	private:
				2098	NodeInfo* info_;
				2099	};
				2100
				2101
				2102	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2103	RegExpCompiler* compiler,
				2104	int characters_filled_in,
				2105	bool not_at_start) {
				2106	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2107	VisitMarker marker(info());
				2108	return ChoiceNode::GetQuickCheckDetails(details,
				2109	compiler,
				2110	characters_filled_in,
				2111	not_at_start);
				2112	}
				2113
				2114
				2115	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2116	RegExpCompiler* compiler,
				2117	int characters_filled_in,
				2118	bool not_at_start) {
				2119	not_at_start = (not_at_start \|\| not_at_start_);
				2120	int choice_count = alternatives_->length();
				2121	ASSERT(choice_count > 0);
				2122	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2123	compiler,
				2124	characters_filled_in,
				2125	not_at_start);
				2126	for (int i = 1; i < choice_count; i++) {
				2127	QuickCheckDetails new_details(details->characters());
				2128	RegExpNode* node = alternatives_->at(i).node();
				2129	node->GetQuickCheckDetails(&new_details, compiler,
				2130	characters_filled_in,
				2131	not_at_start);
				2132	// Here we merge the quick match details of the two branches.
				2133	details->Merge(&new_details, characters_filled_in);
				2134	}
				2135	}
				2136
				2137
				2138	// Check for [0-9A-Z_a-z].
				2139	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2140	Label* word,
				2141	Label* non_word,
				2142	bool fall_through_on_word) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2143	if (assembler->CheckSpecialCharacterClass(
				2144	fall_through_on_word ? 'w' : 'W',
				2145	fall_through_on_word ? non_word : word)) {
				2146	// Optimized implementation available.
				2147	return;
				2148	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2149	assembler->CheckCharacterGT('z', non_word);
				2150	assembler->CheckCharacterLT('0', non_word);
				2151	assembler->CheckCharacterGT('a' - 1, word);
				2152	assembler->CheckCharacterLT('9' + 1, word);
				2153	assembler->CheckCharacterLT('A', non_word);
				2154	assembler->CheckCharacterLT('Z' + 1, word);
				2155	if (fall_through_on_word) {
				2156	assembler->CheckNotCharacter('_', non_word);
				2157	} else {
				2158	assembler->CheckCharacter('_', word);
				2159	}
				2160	}
				2161
				2162
				2163	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				2164	// that matches newline or the start of input).
				2165	static void EmitHat(RegExpCompiler* compiler,
				2166	RegExpNode* on_success,
				2167	Trace* trace) {
				2168	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2169	// We will be loading the previous character into the current character
				2170	// register.
				2171	Trace new_trace(*trace);
				2172	new_trace.InvalidateCurrentCharacter();
				2173
				2174	Label ok;
				2175	if (new_trace.cp_offset() == 0) {
				2176	// The start of input counts as a newline in this context, so skip to
				2177	// ok if we are at the start.
				2178	assembler->CheckAtStart(&ok);
				2179	}
				2180	// We already checked that we are not at the start of input so it must be
				2181	// OK to load the previous character.
				2182	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				2183	new_trace.backtrack(),
				2184	false);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2185	if (!assembler->CheckSpecialCharacterClass('n',
				2186	new_trace.backtrack())) {
				2187	// Newline means \n, \r, 0x2028 or 0x2029.
				2188	if (!compiler->ascii()) {
				2189	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				2190	}
				2191	assembler->CheckCharacter('\n', &ok);
				2192	assembler->CheckNotCharacter('\r', new_trace.backtrack());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2193	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2194	assembler->Bind(&ok);
				2195	on_success->Emit(compiler, &new_trace);
				2196	}
				2197
				2198
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2199	// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
				2200	// when we know whether the next character must be a word character or not.
				2201	static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
				2202	RegExpCompiler* compiler,
				2203	RegExpNode* on_success,
				2204	Trace* trace) {
				2205	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2206	Label done;
				2207
				2208	Trace new_trace(*trace);
				2209
				2210	bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
				2211	Label* on_word = expect_word_character ? &done : new_trace.backtrack();
				2212	Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
				2213
				2214	// Check whether previous character was a word character.
				2215	switch (trace->at_start()) {
				2216	case Trace::TRUE:
				2217	if (expect_word_character) {
				2218	assembler->GoTo(on_non_word);
				2219	}
				2220	break;
				2221	case Trace::UNKNOWN:
				2222	ASSERT_EQ(0, trace->cp_offset());
				2223	assembler->CheckAtStart(on_non_word);
				2224	// Fall through.
				2225	case Trace::FALSE:
				2226	int prev_char_offset = trace->cp_offset() - 1;
				2227	assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
				2228	EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
				2229	// We may or may not have loaded the previous character.
				2230	new_trace.InvalidateCurrentCharacter();
				2231	}
				2232
				2233	assembler->Bind(&done);
				2234
				2235	on_success->Emit(compiler, &new_trace);
				2236	}
				2237
				2238
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2239	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				2240	static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
				2241	RegExpCompiler* compiler,
				2242	RegExpNode* on_success,
				2243	Trace* trace) {
				2244	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2245	Label before_non_word;
				2246	Label before_word;
				2247	if (trace->characters_preloaded() != 1) {
				2248	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				2249	}
				2250	// Fall through on non-word.
				2251	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				2252
				2253	// We will be loading the previous character into the current character
				2254	// register.
				2255	Trace new_trace(*trace);
				2256	new_trace.InvalidateCurrentCharacter();
				2257
				2258	Label ok;
				2259	Label* boundary;
				2260	Label* not_boundary;
				2261	if (type == AssertionNode::AT_BOUNDARY) {
				2262	boundary = &ok;
				2263	not_boundary = new_trace.backtrack();
				2264	} else {
				2265	not_boundary = &ok;
				2266	boundary = new_trace.backtrack();
				2267	}
				2268
				2269	// Next character is not a word character.
				2270	assembler->Bind(&before_non_word);
				2271	if (new_trace.cp_offset() == 0) {
				2272	// The start of input counts as a non-word character, so the question is
				2273	// decided if we are at the start.
				2274	assembler->CheckAtStart(not_boundary);
				2275	}
				2276	// We already checked that we are not at the start of input so it must be
				2277	// OK to load the previous character.
				2278	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2279	&ok, // Unused dummy label in this call.
				2280	false);
				2281	// Fall through on non-word.
				2282	EmitWordCheck(assembler, boundary, not_boundary, false);
				2283	assembler->GoTo(not_boundary);
				2284
				2285	// Next character is a word character.
				2286	assembler->Bind(&before_word);
				2287	if (new_trace.cp_offset() == 0) {
				2288	// The start of input counts as a non-word character, so the question is
				2289	// decided if we are at the start.
				2290	assembler->CheckAtStart(boundary);
				2291	}
				2292	// We already checked that we are not at the start of input so it must be
				2293	// OK to load the previous character.
				2294	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2295	&ok, // Unused dummy label in this call.
				2296	false);
				2297	bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
				2298	EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
				2299
				2300	assembler->Bind(&ok);
				2301
				2302	on_success->Emit(compiler, &new_trace);
				2303	}
				2304
				2305
				2306	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2307	RegExpCompiler* compiler,
				2308	int filled_in,
				2309	bool not_at_start) {
				2310	if (type_ == AT_START && not_at_start) {
				2311	details->set_cannot_match();
				2312	return;
				2313	}
				2314	return on_success()->GetQuickCheckDetails(details,
				2315	compiler,
				2316	filled_in,
				2317	not_at_start);
				2318	}
				2319
				2320
				2321	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2322	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2323	switch (type_) {
				2324	case AT_END: {
				2325	Label ok;
				2326	assembler->CheckPosition(trace->cp_offset(), &ok);
				2327	assembler->GoTo(trace->backtrack());
				2328	assembler->Bind(&ok);
				2329	break;
				2330	}
				2331	case AT_START: {
				2332	if (trace->at_start() == Trace::FALSE) {
				2333	assembler->GoTo(trace->backtrack());
				2334	return;
				2335	}
				2336	if (trace->at_start() == Trace::UNKNOWN) {
				2337	assembler->CheckNotAtStart(trace->backtrack());
				2338	Trace at_start_trace = *trace;
				2339	at_start_trace.set_at_start(true);
				2340	on_success()->Emit(compiler, &at_start_trace);
				2341	return;
				2342	}
				2343	}
				2344	break;
				2345	case AFTER_NEWLINE:
				2346	EmitHat(compiler, on_success(), trace);
				2347	return;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2348	case AT_BOUNDARY:
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2349	case AT_NON_BOUNDARY: {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2350	EmitBoundaryCheck(type_, compiler, on_success(), trace);
				2351	return;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2352	}
				2353	case AFTER_WORD_CHARACTER:
				2354	case AFTER_NONWORD_CHARACTER: {
				2355	EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
				2356	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2357	}
				2358	on_success()->Emit(compiler, trace);
				2359	}
				2360
				2361
				2362	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				2363	if (quick_check == NULL) return false;
				2364	if (offset >= quick_check->characters()) return false;
				2365	return quick_check->positions(offset)->determines_perfectly;
				2366	}
				2367
				2368
				// Raises *checked_up_to to |index| when |index| is larger; never lowers it.
				static void UpdateBoundsCheck(int index, int* checked_up_to) {
				  if (*checked_up_to < index) *checked_up_to = index;
				}
				2374
				2375
				2376	// We call this repeatedly to generate code for each pass over the text node.
				2377	// The passes are in increasing order of difficulty because we hope one
				2378	// of the first passes will fail in which case we are saved the work of the
				2379	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				2380	// we will check the '%' in the first pass, the case independent 'a' in the
				2381	// second pass and the character class in the last pass.
				2382	//
				2383	// The passes are done from right to left, so for example to test for /bar/
				2384	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				2385	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				2386	// bounds check most of the time. In the example we only need to check for
				2387	// end-of-input when loading the putative 'r'.
				2388	//
				2389	// A slight complication involves the fact that the first character may already
				2390	// be fetched into a register by the previous node. In this case we want to
				2391	// do the test for that character first. We do this in separate passes. The
				2392	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				2393	// pass has been performed then subsequent passes will have true in
				2394	// first_element_checked to indicate that that character does not need to be
				2395	// checked again.
				2396	//
				2397	// In addition to all this we are passed a Trace, which can
				2398	// contain an AlternativeGeneration object. In this AlternativeGeneration
				2399	// object we can see details of any quick check that was already passed in
				2400	// order to get to the code we are now generating. The quick check can involve
				2401	// loading characters, which means we do not need to recheck the bounds
				2402	// up to the limit the quick check already checked. In addition the quick
				2403	// check can have involved a mask and compare operation which may simplify
				2404	// or obviate the need for further checks at some character positions.
				2405	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				2406	TextEmitPassType pass,
				2407	bool preloaded,
				2408	Trace* trace,
				2409	bool first_element_checked,
				2410	int* checked_up_to) {
				2411	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2412	bool ascii = compiler->ascii();
				2413	Label* backtrack = trace->backtrack();
				2414	QuickCheckDetails* quick_check = trace->quick_check_performed();
				2415	int element_count = elms_->length();
				2416	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				2417	TextElement elm = elms_->at(i);
				2418	int cp_offset = trace->cp_offset() + elm.cp_offset;
				2419	if (elm.type == TextElement::ATOM) {
				2420	Vector<const uc16> quarks = elm.data.u_atom->data();
				2421	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				2422	if (first_element_checked && i == 0 && j == 0) continue;
				2423	if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
				2424	EmitCharacterFunction* emit_function = NULL;
				2425	switch (pass) {
				2426	case NON_ASCII_MATCH:
				2427	ASSERT(ascii);
				2428	if (quarks[j] > String::kMaxAsciiCharCode) {
				2429	assembler->GoTo(backtrack);
				2430	return;
				2431	}
				2432	break;
				2433	case NON_LETTER_CHARACTER_MATCH:
				2434	emit_function = &EmitAtomNonLetter;
				2435	break;
				2436	case SIMPLE_CHARACTER_MATCH:
				2437	emit_function = &EmitSimpleCharacter;
				2438	break;
				2439	case CASE_CHARACTER_MATCH:
				2440	emit_function = &EmitAtomLetter;
				2441	break;
				2442	default:
				2443	break;
				2444	}
				2445	if (emit_function != NULL) {
				2446	bool bound_checked = emit_function(compiler,
				2447	quarks[j],
				2448	backtrack,
				2449	cp_offset + j,
				2450	*checked_up_to < cp_offset + j,
				2451	preloaded);
				2452	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				2453	}
				2454	}
				2455	} else {
				2456	ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
				2457	if (pass == CHARACTER_CLASS_MATCH) {
				2458	if (first_element_checked && i == 0) continue;
				2459	if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
				2460	RegExpCharacterClass* cc = elm.data.u_char_class;
				2461	EmitCharClass(assembler,
				2462	cc,
				2463	ascii,
				2464	backtrack,
				2465	cp_offset,
				2466	*checked_up_to < cp_offset,
				2467	preloaded);
				2468	UpdateBoundsCheck(cp_offset, checked_up_to);
				2469	}
				2470	}
				2471	}
				2472	}
				2473
				2474
				2475	int TextNode::Length() {
				2476	TextElement elm = elms_->last();
				2477	ASSERT(elm.cp_offset >= 0);
				2478	if (elm.type == TextElement::ATOM) {
				2479	return elm.cp_offset + elm.data.u_atom->data().length();
				2480	} else {
				2481	return elm.cp_offset + 1;
				2482	}
				2483	}
				2484
				2485
				2486	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				2487	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				2488	if (ignore_case) {
				2489	return pass == SIMPLE_CHARACTER_MATCH;
				2490	} else {
				2491	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				2492	}
				2493	}
				2494
				2495
				2496	// This generates the code to match a text node. A text node can contain
				2497	// straight character sequences (possibly to be matched in a case-independent
				2498	// way) and character classes. For efficiency we do not do this in a single
				2499	// pass from left to right. Instead we pass over the text node several times,
				2500	// emitting code for some character positions every time. See the comment on
				2501	// TextEmitPass for details.
				2502	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2503	LimitResult limit_result = LimitVersions(compiler, trace);
				2504	if (limit_result == DONE) return;
				2505	ASSERT(limit_result == CONTINUE);
				2506
				2507	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				2508	compiler->SetRegExpTooBig();
				2509	return;
				2510	}
				2511
				2512	if (compiler->ascii()) {
				2513	int dummy = 0;
				2514	TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
				2515	}
				2516
				2517	bool first_elt_done = false;
				2518	int bound_checked_to = trace->cp_offset() - 1;
				2519	bound_checked_to += trace->bound_checked_up_to();
				2520
				2521	// If a character is preloaded into the current character register then
				2522	// check that now.
				2523	if (trace->characters_preloaded() == 1) {
				2524	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2525	if (!SkipPass(pass, compiler->ignore_case())) {
				2526	TextEmitPass(compiler,
				2527	static_cast<TextEmitPassType>(pass),
				2528	true,
				2529	trace,
				2530	false,
				2531	&bound_checked_to);
				2532	}
				2533	}
				2534	first_elt_done = true;
				2535	}
				2536
				2537	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2538	if (!SkipPass(pass, compiler->ignore_case())) {
				2539	TextEmitPass(compiler,
				2540	static_cast<TextEmitPassType>(pass),
				2541	false,
				2542	trace,
				2543	first_elt_done,
				2544	&bound_checked_to);
				2545	}
				2546	}
				2547
				2548	Trace successor_trace(*trace);
				2549	successor_trace.set_at_start(false);
				2550	successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
				2551	RecursionCheck rc(compiler);
				2552	on_success()->Emit(compiler, &successor_trace);
				2553	}
				2554
				2555
				2556	void Trace::InvalidateCurrentCharacter() {
				2557	characters_preloaded_ = 0;
				2558	}
				2559
				2560
				2561	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				2562	ASSERT(by > 0);
				2563	// We don't have an instruction for shifting the current character register
				2564	// down or for using a shifted value for anything so lets just forget that
				2565	// we preloaded any characters into it.
				2566	characters_preloaded_ = 0;
				2567	// Adjust the offsets of the quick check performed information. This
				2568	// information is used to find out what we already determined about the
				2569	// characters by means of mask and compare.
				2570	quick_check_performed_.Advance(by, compiler->ascii());
				2571	cp_offset_ += by;
				2572	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				2573	compiler->SetRegExpTooBig();
				2574	cp_offset_ = 0;
				2575	}
				2576	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				2577	}
				2578
				2579
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2580	void TextNode::MakeCaseIndependent(bool is_ascii) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2581	int element_count = elms_->length();
				2582	for (int i = 0; i < element_count; i++) {
				2583	TextElement elm = elms_->at(i);
				2584	if (elm.type == TextElement::CHAR_CLASS) {
				2585	RegExpCharacterClass* cc = elm.data.u_char_class;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2586	// None of the standard character classses is different in the case
				2587	// independent case and it slows us down if we don't know that.
				2588	if (cc->is_standard()) continue;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2589	ZoneList<CharacterRange>* ranges = cc->ranges();
				2590	int range_count = ranges->length();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2591	for (int j = 0; j < range_count; j++) {
				2592	ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2593	}
				2594	}
				2595	}
				2596	}
				2597
				2598
				2599	int TextNode::GreedyLoopTextLength() {
				2600	TextElement elm = elms_->at(elms_->length() - 1);
				2601	if (elm.type == TextElement::CHAR_CLASS) {
				2602	return elm.cp_offset + 1;
				2603	} else {
				2604	return elm.cp_offset + elm.data.u_atom->data().length();
				2605	}
				2606	}
				2607
				2608
				2609	// Finds the fixed match length of a sequence of nodes that goes from
				2610	// this alternative and back to this choice node. If there are variable
				2611	// length nodes or other complications in the way then return a sentinel
				2612	// value indicating that a greedy loop cannot be constructed.
				2613	int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
				2614	int length = 0;
				2615	RegExpNode* node = alternative->node();
				2616	// Later we will generate code for all these text nodes using recursion
				2617	// so we have to limit the max number.
				2618	int recursion_depth = 0;
				2619	while (node != this) {
				2620	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				2621	return kNodeIsTooComplexForGreedyLoops;
				2622	}
				2623	int node_length = node->GreedyLoopTextLength();
				2624	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				2625	return kNodeIsTooComplexForGreedyLoops;
				2626	}
				2627	length += node_length;
				2628	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				2629	node = seq_node->on_success();
				2630	}
				2631	return length;
				2632	}
				2633
				2634
				2635	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				2636	ASSERT_EQ(loop_node_, NULL);
				2637	AddAlternative(alt);
				2638	loop_node_ = alt.node();
				2639	}
				2640
				2641
				2642	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				2643	ASSERT_EQ(continue_node_, NULL);
				2644	AddAlternative(alt);
				2645	continue_node_ = alt.node();
				2646	}
				2647
				2648
				2649	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2650	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2651	if (trace->stop_node() == this) {
				2652	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2653	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
				2654	// Update the counter-based backtracking info on the stack. This is an
				2655	// optimization for greedy loops (see below).
				2656	ASSERT(trace->cp_offset() == text_length);
				2657	macro_assembler->AdvanceCurrentPosition(text_length);
				2658	macro_assembler->GoTo(trace->loop_label());
				2659	return;
				2660	}
				2661	ASSERT(trace->stop_node() == NULL);
				2662	if (!trace->is_trivial()) {
				2663	trace->Flush(compiler, this);
				2664	return;
				2665	}
				2666	ChoiceNode::Emit(compiler, trace);
				2667	}
				2668
				2669
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2670	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
				2671	bool not_at_start) {
				2672	int preload_characters = EatsAtLeast(4, 0, not_at_start);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2673	if (compiler->macro_assembler()->CanReadUnaligned()) {
				2674	bool ascii = compiler->ascii();
				2675	if (ascii) {
				2676	if (preload_characters > 4) preload_characters = 4;
				2677	// We can't preload 3 characters because there is no machine instruction
				2678	// to do that. We can't just load 4 because we could be reading
				2679	// beyond the end of the string, which could cause a memory fault.
				2680	if (preload_characters == 3) preload_characters = 2;
				2681	} else {
				2682	if (preload_characters > 2) preload_characters = 2;
				2683	}
				2684	} else {
				2685	if (preload_characters > 1) preload_characters = 1;
				2686	}
				2687	return preload_characters;
				2688	}
				2689
				2690
				2691	// This class is used when generating the alternatives in a choice node. It
				2692	// records the way the alternative is being code generated.
				2693	class AlternativeGeneration: public Malloced {
				2694	public:
				2695	AlternativeGeneration()
				2696	: possible_success(),
				2697	expects_preload(false),
				2698	after(),
				2699	quick_check_details() { }
				2700	Label possible_success;
				2701	bool expects_preload;
				2702	Label after;
				2703	QuickCheckDetails quick_check_details;
				2704	};
				2705
				2706
				2707	// Creates a list of AlternativeGenerations. If the list has a reasonable
				2708	// size then it is on the stack, otherwise the excess is on the heap.
				2709	class AlternativeGenerationList {
				2710	public:
				2711	explicit AlternativeGenerationList(int count)
				2712	: alt_gens_(count) {
				2713	for (int i = 0; i < count && i < kAFew; i++) {
				2714	alt_gens_.Add(a_few_alt_gens_ + i);
				2715	}
				2716	for (int i = kAFew; i < count; i++) {
				2717	alt_gens_.Add(new AlternativeGeneration());
				2718	}
				2719	}
				2720	~AlternativeGenerationList() {
				2721	for (int i = kAFew; i < alt_gens_.length(); i++) {
				2722	delete alt_gens_[i];
				2723	alt_gens_[i] = NULL;
				2724	}
				2725	}
				2726
				2727	AlternativeGeneration* at(int i) {
				2728	return alt_gens_[i];
				2729	}
				2730	private:
				2731	static const int kAFew = 10;
				2732	ZoneList<AlternativeGeneration*> alt_gens_;
				2733	AlternativeGeneration a_few_alt_gens_[kAFew];
				2734	};
				2735
				2736
				/* Code generation for choice nodes.
				 *
				 * We generate quick checks that do a mask and compare to eliminate a
				 * choice.  If the quick check succeeds then it jumps to the continuation to
				 * do slow checks and check subsequent nodes.  If it fails (the common case)
				 * it falls through to the next choice.
				 *
				 * Here is the desired flow graph.  Nodes directly below each other imply
				 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
				 * 3 doesn't have a quick check so we have to call the slow check.
				 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
				 * regexp continuation is generated directly after the Sn node, up to the
				 * next GoTo if we decide to reuse some already generated code.  Some
				 * nodes expect preload_characters to be preloaded into the current
				 * character register.  R nodes do this preloading.  Vertices are marked
				 * F for failures and S for success (possible success in the case of quick
				 * nodes).  L, V, < and > are used as arrow heads.
				 *
				 * ----------> R
				 *             |
				 *             V
				 *            Q1 -----> S1
				 *             |   S   /
				 *            F|      /
				 *             |    F/
				 *             |    /
				 *             |   R
				 *             |  /
				 *             V L
				 *            Q2 -----> S2
				 *             |   S   /
				 *            F|      /
				 *             |    F/
				 *             |    /
				 *             |   R
				 *             |  /
				 *             V L
				 *            S3
				 *             |
				 *            F|
				 *             |
				 *             R
				 *             |
				 * backtrack   V
				 * <----------Q4
				 *   \    F    |
				 *    \        |S
				 *     \   F   V
				 *      \-----S4
				 *
				 * For greedy loops we reverse our expectation and expect to match rather
				 * than fail.  Therefore we want the loop code to look like this (U is the
				 * unwind code that steps back in the greedy loop).  The following alternatives
				 * look the same as above.
				 *              _____
				 *             /     \
				 *             V     |
				 * ----------> S1    |
				 *            /|     |
				 *           / |S    |
				 *         F/  \_____/
				 *         /
				 *        |<-----------
				 *        |            \
				 *        V             \
				 *        Q2 ---> S2     \
				 *        |  S   /       |
				 *       F|     /        |
				 *        |   F/         |
				 *        |   /          |
				 *        |  R           |
				 *        | /            |
				 *   F    VL             |
				 * <------U              |
				 * back   |S             |
				 *        \______________/
				 */
				2814
				2815
				2816	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2817	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2818	int choice_count = alternatives_->length();
				2819	#ifdef DEBUG
				2820	for (int i = 0; i < choice_count - 1; i++) {
				2821	GuardedAlternative alternative = alternatives_->at(i);
				2822	ZoneList<Guard> guards = alternative.guards();
				2823	int guard_count = (guards == NULL) ? 0 : guards->length();
				2824	for (int j = 0; j < guard_count; j++) {
				2825	ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
				2826	}
				2827	}
				2828	#endif
				2829
				2830	LimitResult limit_result = LimitVersions(compiler, trace);
				2831	if (limit_result == DONE) return;
				2832	ASSERT(limit_result == CONTINUE);
				2833
				2834	int new_flush_budget = trace->flush_budget() / choice_count;
				2835	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				2836	trace->Flush(compiler, this);
				2837	return;
				2838	}
				2839
				2840	RecursionCheck rc(compiler);
				2841
				2842	Trace* current_trace = trace;
				2843
				2844	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2845	bool greedy_loop = false;
				2846	Label greedy_loop_label;
				2847	Trace counter_backtrack_trace;
				2848	counter_backtrack_trace.set_backtrack(&greedy_loop_label);
				2849	if (not_at_start()) counter_backtrack_trace.set_at_start(false);
				2850
				2851	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				2852	// Here we have special handling for greedy loops containing only text nodes
				2853	// and other simple nodes. These are handled by pushing the current
				2854	// position on the stack and then incrementing the current position each
				2855	// time around the switch. On backtrack we decrement the current position
				2856	// and check it against the pushed value. This avoids pushing backtrack
				2857	// information for each iteration of the loop, which could take up a lot of
				2858	// space.
				2859	greedy_loop = true;
				2860	ASSERT(trace->stop_node() == NULL);
				2861	macro_assembler->PushCurrentPosition();
				2862	current_trace = &counter_backtrack_trace;
				2863	Label greedy_match_failed;
				2864	Trace greedy_match_trace;
				2865	if (not_at_start()) greedy_match_trace.set_at_start(false);
				2866	greedy_match_trace.set_backtrack(&greedy_match_failed);
				2867	Label loop_label;
				2868	macro_assembler->Bind(&loop_label);
				2869	greedy_match_trace.set_stop_node(this);
				2870	greedy_match_trace.set_loop_label(&loop_label);
				2871	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				2872	macro_assembler->Bind(&greedy_match_failed);
				2873	}
				2874
				2875	Label second_choice; // For use in greedy matches.
				2876	macro_assembler->Bind(&second_choice);
				2877
				2878	int first_normal_choice = greedy_loop ? 1 : 0;
				2879
Ben Murdoch	b0fe162	2011-05-05 13:52:32 +0100	[diff] [blame]	2880	int preload_characters =
				2881	CalculatePreloadCharacters(compiler,
				2882	current_trace->at_start() == Trace::FALSE);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2883	bool preload_is_current =
				2884	(current_trace->characters_preloaded() == preload_characters);
				2885	bool preload_has_checked_bounds = preload_is_current;
				2886
				2887	AlternativeGenerationList alt_gens(choice_count);
				2888
				2889	// For now we just call all choices one after the other. The idea ultimately
				2890	// is to use the Dispatch table to try only the relevant ones.
				2891	for (int i = first_normal_choice; i < choice_count; i++) {
				2892	GuardedAlternative alternative = alternatives_->at(i);
				2893	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2894	alt_gen->quick_check_details.set_characters(preload_characters);
				2895	ZoneList<Guard> guards = alternative.guards();
				2896	int guard_count = (guards == NULL) ? 0 : guards->length();
				2897	Trace new_trace(*current_trace);
				2898	new_trace.set_characters_preloaded(preload_is_current ?
				2899	preload_characters :
				2900	0);
				2901	if (preload_has_checked_bounds) {
				2902	new_trace.set_bound_checked_up_to(preload_characters);
				2903	}
				2904	new_trace.quick_check_performed()->Clear();
				2905	if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
				2906	alt_gen->expects_preload = preload_is_current;
				2907	bool generate_full_check_inline = false;
				2908	if (FLAG_regexp_optimization &&
				2909	try_to_emit_quick_check_for_alternative(i) &&
				2910	alternative.node()->EmitQuickCheck(compiler,
				2911	&new_trace,
				2912	preload_has_checked_bounds,
				2913	&alt_gen->possible_success,
				2914	&alt_gen->quick_check_details,
				2915	i < choice_count - 1)) {
				2916	// Quick check was generated for this choice.
				2917	preload_is_current = true;
				2918	preload_has_checked_bounds = true;
				2919	// On the last choice in the ChoiceNode we generated the quick
				2920	// check to fall through on possible success. So now we need to
				2921	// generate the full check inline.
				2922	if (i == choice_count - 1) {
				2923	macro_assembler->Bind(&alt_gen->possible_success);
				2924	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2925	new_trace.set_characters_preloaded(preload_characters);
				2926	new_trace.set_bound_checked_up_to(preload_characters);
				2927	generate_full_check_inline = true;
				2928	}
				2929	} else if (alt_gen->quick_check_details.cannot_match()) {
				2930	if (i == choice_count - 1 && !greedy_loop) {
				2931	macro_assembler->GoTo(trace->backtrack());
				2932	}
				2933	continue;
				2934	} else {
				2935	// No quick check was generated. Put the full code here.
				2936	// If this is not the first choice then there could be slow checks from
				2937	// previous cases that go here when they fail. There's no reason to
				2938	// insist that they preload characters since the slow check we are about
				2939	// to generate probably can't use it.
				2940	if (i != first_normal_choice) {
				2941	alt_gen->expects_preload = false;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2942	new_trace.InvalidateCurrentCharacter();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2943	}
				2944	if (i < choice_count - 1) {
				2945	new_trace.set_backtrack(&alt_gen->after);
				2946	}
				2947	generate_full_check_inline = true;
				2948	}
				2949	if (generate_full_check_inline) {
				2950	if (new_trace.actions() != NULL) {
				2951	new_trace.set_flush_budget(new_flush_budget);
				2952	}
				2953	for (int j = 0; j < guard_count; j++) {
				2954	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				2955	}
				2956	alternative.node()->Emit(compiler, &new_trace);
				2957	preload_is_current = false;
				2958	}
				2959	macro_assembler->Bind(&alt_gen->after);
				2960	}
				2961	if (greedy_loop) {
				2962	macro_assembler->Bind(&greedy_loop_label);
				2963	// If we have unwound to the bottom then backtrack.
				2964	macro_assembler->CheckGreedyLoop(trace->backtrack());
				2965	// Otherwise try the second priority at an earlier position.
				2966	macro_assembler->AdvanceCurrentPosition(-text_length);
				2967	macro_assembler->GoTo(&second_choice);
				2968	}
				2969
				2970	// At this point we need to generate slow checks for the alternatives where
				2971	// the quick check was inlined. We can recognize these because the associated
				2972	// label was bound.
				2973	for (int i = first_normal_choice; i < choice_count - 1; i++) {
				2974	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2975	Trace new_trace(*current_trace);
				2976	// If there are actions to be flushed we have to limit how many times
				2977	// they are flushed. Take the budget of the parent trace and distribute
				2978	// it fairly amongst the children.
				2979	if (new_trace.actions() != NULL) {
				2980	new_trace.set_flush_budget(new_flush_budget);
				2981	}
				2982	EmitOutOfLineContinuation(compiler,
				2983	&new_trace,
				2984	alternatives_->at(i),
				2985	alt_gen,
				2986	preload_characters,
				2987	alt_gens.at(i + 1)->expects_preload);
				2988	}
				2989	}
				2990
				2991
				2992	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				2993	Trace* trace,
				2994	GuardedAlternative alternative,
				2995	AlternativeGeneration* alt_gen,
				2996	int preload_characters,
				2997	bool next_expects_preload) {
				2998	if (!alt_gen->possible_success.is_linked()) return;
				2999
				3000	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				3001	macro_assembler->Bind(&alt_gen->possible_success);
				3002	Trace out_of_line_trace(*trace);
				3003	out_of_line_trace.set_characters_preloaded(preload_characters);
				3004	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				3005	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
				3006	ZoneList<Guard> guards = alternative.guards();
				3007	int guard_count = (guards == NULL) ? 0 : guards->length();
				3008	if (next_expects_preload) {
				3009	Label reload_current_char;
				3010	out_of_line_trace.set_backtrack(&reload_current_char);
				3011	for (int j = 0; j < guard_count; j++) {
				3012	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3013	}
				3014	alternative.node()->Emit(compiler, &out_of_line_trace);
				3015	macro_assembler->Bind(&reload_current_char);
				3016	// Reload the current character, since the next quick check expects that.
				3017	// We don't need to check bounds here because we only get into this
				3018	// code through a quick check which already did the checked load.
				3019	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				3020	NULL,
				3021	false,
				3022	preload_characters);
				3023	macro_assembler->GoTo(&(alt_gen->after));
				3024	} else {
				3025	out_of_line_trace.set_backtrack(&(alt_gen->after));
				3026	for (int j = 0; j < guard_count; j++) {
				3027	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				3028	}
				3029	alternative.node()->Emit(compiler, &out_of_line_trace);
				3030	}
				3031	}
				3032
				3033
				3034	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3035	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3036	LimitResult limit_result = LimitVersions(compiler, trace);
				3037	if (limit_result == DONE) return;
				3038	ASSERT(limit_result == CONTINUE);
				3039
				3040	RecursionCheck rc(compiler);
				3041
				3042	switch (type_) {
				3043	case STORE_POSITION: {
				3044	Trace::DeferredCapture
				3045	new_capture(data_.u_position_register.reg,
				3046	data_.u_position_register.is_capture,
				3047	trace);
				3048	Trace new_trace = *trace;
				3049	new_trace.add_action(&new_capture);
				3050	on_success()->Emit(compiler, &new_trace);
				3051	break;
				3052	}
				3053	case INCREMENT_REGISTER: {
				3054	Trace::DeferredIncrementRegister
				3055	new_increment(data_.u_increment_register.reg);
				3056	Trace new_trace = *trace;
				3057	new_trace.add_action(&new_increment);
				3058	on_success()->Emit(compiler, &new_trace);
				3059	break;
				3060	}
				3061	case SET_REGISTER: {
				3062	Trace::DeferredSetRegister
				3063	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				3064	Trace new_trace = *trace;
				3065	new_trace.add_action(&new_set);
				3066	on_success()->Emit(compiler, &new_trace);
				3067	break;
				3068	}
				3069	case CLEAR_CAPTURES: {
				3070	Trace::DeferredClearCaptures
				3071	new_capture(Interval(data_.u_clear_captures.range_from,
				3072	data_.u_clear_captures.range_to));
				3073	Trace new_trace = *trace;
				3074	new_trace.add_action(&new_capture);
				3075	on_success()->Emit(compiler, &new_trace);
				3076	break;
				3077	}
				3078	case BEGIN_SUBMATCH:
				3079	if (!trace->is_trivial()) {
				3080	trace->Flush(compiler, this);
				3081	} else {
				3082	assembler->WriteCurrentPositionToRegister(
				3083	data_.u_submatch.current_position_register, 0);
				3084	assembler->WriteStackPointerToRegister(
				3085	data_.u_submatch.stack_pointer_register);
				3086	on_success()->Emit(compiler, trace);
				3087	}
				3088	break;
				3089	case EMPTY_MATCH_CHECK: {
				3090	int start_pos_reg = data_.u_empty_match_check.start_register;
				3091	int stored_pos = 0;
				3092	int rep_reg = data_.u_empty_match_check.repetition_register;
				3093	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				3094	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				3095	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				3096	// If we know we haven't advanced and there is no minimum we
				3097	// can just backtrack immediately.
				3098	assembler->GoTo(trace->backtrack());
				3099	} else if (know_dist && stored_pos < trace->cp_offset()) {
				3100	// If we know we've advanced we can generate the continuation
				3101	// immediately.
				3102	on_success()->Emit(compiler, trace);
				3103	} else if (!trace->is_trivial()) {
				3104	trace->Flush(compiler, this);
				3105	} else {
				3106	Label skip_empty_check;
				3107	// If we have a minimum number of repetitions we check the current
				3108	// number first and skip the empty check if it's not enough.
				3109	if (has_minimum) {
				3110	int limit = data_.u_empty_match_check.repetition_limit;
				3111	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				3112	}
				3113	// If the match is empty we bail out, otherwise we fall through
				3114	// to the on-success continuation.
				3115	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				3116	trace->backtrack());
				3117	assembler->Bind(&skip_empty_check);
				3118	on_success()->Emit(compiler, trace);
				3119	}
				3120	break;
				3121	}
				3122	case POSITIVE_SUBMATCH_SUCCESS: {
				3123	if (!trace->is_trivial()) {
				3124	trace->Flush(compiler, this);
				3125	return;
				3126	}
				3127	assembler->ReadCurrentPositionFromRegister(
				3128	data_.u_submatch.current_position_register);
				3129	assembler->ReadStackPointerFromRegister(
				3130	data_.u_submatch.stack_pointer_register);
				3131	int clear_register_count = data_.u_submatch.clear_register_count;
				3132	if (clear_register_count == 0) {
				3133	on_success()->Emit(compiler, trace);
				3134	return;
				3135	}
				3136	int clear_registers_from = data_.u_submatch.clear_register_from;
				3137	Label clear_registers_backtrack;
				3138	Trace new_trace = *trace;
				3139	new_trace.set_backtrack(&clear_registers_backtrack);
				3140	on_success()->Emit(compiler, &new_trace);
				3141
				3142	assembler->Bind(&clear_registers_backtrack);
				3143	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				3144	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				3145
				3146	ASSERT(trace->backtrack() == NULL);
				3147	assembler->Backtrack();
				3148	return;
				3149	}
				3150	default:
				3151	UNREACHABLE();
				3152	}
				3153	}
				3154
				3155
				3156	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3157	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3158	if (!trace->is_trivial()) {
				3159	trace->Flush(compiler, this);
				3160	return;
				3161	}
				3162
				3163	LimitResult limit_result = LimitVersions(compiler, trace);
				3164	if (limit_result == DONE) return;
				3165	ASSERT(limit_result == CONTINUE);
				3166
				3167	RecursionCheck rc(compiler);
				3168
				3169	ASSERT_EQ(start_reg_ + 1, end_reg_);
				3170	if (compiler->ignore_case()) {
				3171	assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
				3172	trace->backtrack());
				3173	} else {
				3174	assembler->CheckNotBackReference(start_reg_, trace->backtrack());
				3175	}
				3176	on_success()->Emit(compiler, trace);
				3177	}
				3178
				3179
				3180	// -------------------------------------------------------------------
				3181	// Dot/dotty output
				3182
				3183
				3184	#ifdef DEBUG
				3185
				3186
				3187	class DotPrinter: public NodeVisitor {
				3188	public:
				3189	explicit DotPrinter(bool ignore_case)
				3190	: ignore_case_(ignore_case),
				3191	stream_(&alloc_) { }
				3192	void PrintNode(const char* label, RegExpNode* node);
				3193	void Visit(RegExpNode* node);
				3194	void PrintAttributes(RegExpNode* from);
				3195	StringStream* stream() { return &stream_; }
				3196	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				3197	#define DECLARE_VISIT(Type) \
				3198	virtual void Visit##Type(Type##Node* that);
				3199	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				3200	#undef DECLARE_VISIT
				3201	private:
				3202	bool ignore_case_;
				3203	HeapStringAllocator alloc_;
				3204	StringStream stream_;
				3205	};
				3206
				3207
				3208	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				3209	stream()->Add("digraph G {\n graph [label=\"");
				3210	for (int i = 0; label[i]; i++) {
				3211	switch (label[i]) {
				3212	case '\\':
				3213	stream()->Add("\\\\");
				3214	break;
				3215	case '"':
				3216	stream()->Add("\"");
				3217	break;
				3218	default:
				3219	stream()->Put(label[i]);
				3220	break;
				3221	}
				3222	}
				3223	stream()->Add("\"];\n");
				3224	Visit(node);
				3225	stream()->Add("}\n");
				3226	printf("%s", *(stream()->ToCString()));
				3227	}
				3228
				3229
				3230	void DotPrinter::Visit(RegExpNode* node) {
				3231	if (node->info()->visited) return;
				3232	node->info()->visited = true;
				3233	node->Accept(this);
				3234	}
				3235
				3236
				3237	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				3238	stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
				3239	Visit(on_failure);
				3240	}
				3241
				3242
				3243	class TableEntryBodyPrinter {
				3244	public:
				3245	TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
				3246	: stream_(stream), choice_(choice) { }
				3247	void Call(uc16 from, DispatchTable::Entry entry) {
				3248	OutSet* out_set = entry.out_set();
				3249	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3250	if (out_set->Get(i)) {
				3251	stream()->Add(" n%p:s%io%i -> n%p;\n",
				3252	choice(),
				3253	from,
				3254	i,
				3255	choice()->alternatives()->at(i).node());
				3256	}
				3257	}
				3258	}
				3259	private:
				3260	StringStream* stream() { return stream_; }
				3261	ChoiceNode* choice() { return choice_; }
				3262	StringStream* stream_;
				3263	ChoiceNode* choice_;
				3264	};
				3265
				3266
				3267	class TableEntryHeaderPrinter {
				3268	public:
				3269	explicit TableEntryHeaderPrinter(StringStream* stream)
				3270	: first_(true), stream_(stream) { }
				3271	void Call(uc16 from, DispatchTable::Entry entry) {
				3272	if (first_) {
				3273	first_ = false;
				3274	} else {
				3275	stream()->Add("\|");
				3276	}
				3277	stream()->Add("{\\%k-\\%k\|{", from, entry.to());
				3278	OutSet* out_set = entry.out_set();
				3279	int priority = 0;
				3280	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3281	if (out_set->Get(i)) {
				3282	if (priority > 0) stream()->Add("\|");
				3283	stream()->Add("<s%io%i> %i", from, i, priority);
				3284	priority++;
				3285	}
				3286	}
				3287	stream()->Add("}}");
				3288	}
				3289	private:
				3290	bool first_;
				3291	StringStream* stream() { return stream_; }
				3292	StringStream* stream_;
				3293	};
				3294
				3295
				3296	class AttributePrinter {
				3297	public:
				3298	explicit AttributePrinter(DotPrinter* out)
				3299	: out_(out), first_(true) { }
				3300	void PrintSeparator() {
				3301	if (first_) {
				3302	first_ = false;
				3303	} else {
				3304	out_->stream()->Add("\|");
				3305	}
				3306	}
				3307	void PrintBit(const char* name, bool value) {
				3308	if (!value) return;
				3309	PrintSeparator();
				3310	out_->stream()->Add("{%s}", name);
				3311	}
				3312	void PrintPositive(const char* name, int value) {
				3313	if (value < 0) return;
				3314	PrintSeparator();
				3315	out_->stream()->Add("{%s\|%x}", name, value);
				3316	}
				3317	private:
				3318	DotPrinter* out_;
				3319	bool first_;
				3320	};
				3321
				3322
				3323	void DotPrinter::PrintAttributes(RegExpNode* that) {
				3324	stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
				3325	"margin=0.1, fontsize=10, label=\"{",
				3326	that);
				3327	AttributePrinter printer(this);
				3328	NodeInfo* info = that->info();
				3329	printer.PrintBit("NI", info->follows_newline_interest);
				3330	printer.PrintBit("WI", info->follows_word_interest);
				3331	printer.PrintBit("SI", info->follows_start_interest);
				3332	Label* label = that->label();
				3333	if (label->is_bound())
				3334	printer.PrintPositive("@", label->pos());
				3335	stream()->Add("}\"];\n");
				3336	stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
				3337	"arrowhead=none];\n", that, that);
				3338	}
				3339
				3340
				3341	static const bool kPrintDispatchTable = false;
				3342	void DotPrinter::VisitChoice(ChoiceNode* that) {
				3343	if (kPrintDispatchTable) {
				3344	stream()->Add(" n%p [shape=Mrecord, label=\"", that);
				3345	TableEntryHeaderPrinter header_printer(stream());
				3346	that->GetTable(ignore_case_)->ForEach(&header_printer);
				3347	stream()->Add("\"]\n", that);
				3348	PrintAttributes(that);
				3349	TableEntryBodyPrinter body_printer(stream(), that);
				3350	that->GetTable(ignore_case_)->ForEach(&body_printer);
				3351	} else {
				3352	stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
				3353	for (int i = 0; i < that->alternatives()->length(); i++) {
				3354	GuardedAlternative alt = that->alternatives()->at(i);
				3355	stream()->Add(" n%p -> n%p;\n", that, alt.node());
				3356	}
				3357	}
				3358	for (int i = 0; i < that->alternatives()->length(); i++) {
				3359	GuardedAlternative alt = that->alternatives()->at(i);
				3360	alt.node()->Accept(this);
				3361	}
				3362	}
				3363
				3364
				3365	void DotPrinter::VisitText(TextNode* that) {
				3366	stream()->Add(" n%p [label=\"", that);
				3367	for (int i = 0; i < that->elements()->length(); i++) {
				3368	if (i > 0) stream()->Add(" ");
				3369	TextElement elm = that->elements()->at(i);
				3370	switch (elm.type) {
				3371	case TextElement::ATOM: {
				3372	stream()->Add("'%w'", elm.data.u_atom->data());
				3373	break;
				3374	}
				3375	case TextElement::CHAR_CLASS: {
				3376	RegExpCharacterClass* node = elm.data.u_char_class;
				3377	stream()->Add("[");
				3378	if (node->is_negated())
				3379	stream()->Add("^");
				3380	for (int j = 0; j < node->ranges()->length(); j++) {
				3381	CharacterRange range = node->ranges()->at(j);
				3382	stream()->Add("%k-%k", range.from(), range.to());
				3383	}
				3384	stream()->Add("]");
				3385	break;
				3386	}
				3387	default:
				3388	UNREACHABLE();
				3389	}
				3390	}
				3391	stream()->Add("\", shape=box, peripheries=2];\n");
				3392	PrintAttributes(that);
				3393	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3394	Visit(that->on_success());
				3395	}
				3396
				3397
				3398	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				3399	stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
				3400	that,
				3401	that->start_register(),
				3402	that->end_register());
				3403	PrintAttributes(that);
				3404	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3405	Visit(that->on_success());
				3406	}
				3407
				3408
				3409	void DotPrinter::VisitEnd(EndNode* that) {
				3410	stream()->Add(" n%p [style=bold, shape=point];\n", that);
				3411	PrintAttributes(that);
				3412	}
				3413
				3414
				3415	void DotPrinter::VisitAssertion(AssertionNode* that) {
				3416	stream()->Add(" n%p [", that);
				3417	switch (that->type()) {
				3418	case AssertionNode::AT_END:
				3419	stream()->Add("label=\"$\", shape=septagon");
				3420	break;
				3421	case AssertionNode::AT_START:
				3422	stream()->Add("label=\"^\", shape=septagon");
				3423	break;
				3424	case AssertionNode::AT_BOUNDARY:
				3425	stream()->Add("label=\"\\b\", shape=septagon");
				3426	break;
				3427	case AssertionNode::AT_NON_BOUNDARY:
				3428	stream()->Add("label=\"\\B\", shape=septagon");
				3429	break;
				3430	case AssertionNode::AFTER_NEWLINE:
				3431	stream()->Add("label=\"(?<=\\n)\", shape=septagon");
				3432	break;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3433	case AssertionNode::AFTER_WORD_CHARACTER:
				3434	stream()->Add("label=\"(?<=\\w)\", shape=septagon");
				3435	break;
				3436	case AssertionNode::AFTER_NONWORD_CHARACTER:
				3437	stream()->Add("label=\"(?<=\\W)\", shape=septagon");
				3438	break;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3439	}
				3440	stream()->Add("];\n");
				3441	PrintAttributes(that);
				3442	RegExpNode* successor = that->on_success();
				3443	stream()->Add(" n%p -> n%p;\n", that, successor);
				3444	Visit(successor);
				3445	}
				3446
				3447
				3448	void DotPrinter::VisitAction(ActionNode* that) {
				3449	stream()->Add(" n%p [", that);
				3450	switch (that->type_) {
				3451	case ActionNode::SET_REGISTER:
				3452	stream()->Add("label=\"$%i:=%i\", shape=octagon",
				3453	that->data_.u_store_register.reg,
				3454	that->data_.u_store_register.value);
				3455	break;
				3456	case ActionNode::INCREMENT_REGISTER:
				3457	stream()->Add("label=\"$%i++\", shape=octagon",
				3458	that->data_.u_increment_register.reg);
				3459	break;
				3460	case ActionNode::STORE_POSITION:
				3461	stream()->Add("label=\"$%i:=$pos\", shape=octagon",
				3462	that->data_.u_position_register.reg);
				3463	break;
				3464	case ActionNode::BEGIN_SUBMATCH:
				3465	stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
				3466	that->data_.u_submatch.current_position_register);
				3467	break;
				3468	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				3469	stream()->Add("label=\"escape\", shape=septagon");
				3470	break;
				3471	case ActionNode::EMPTY_MATCH_CHECK:
				3472	stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
				3473	that->data_.u_empty_match_check.start_register,
				3474	that->data_.u_empty_match_check.repetition_register,
				3475	that->data_.u_empty_match_check.repetition_limit);
				3476	break;
				3477	case ActionNode::CLEAR_CAPTURES: {
				3478	stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
				3479	that->data_.u_clear_captures.range_from,
				3480	that->data_.u_clear_captures.range_to);
				3481	break;
				3482	}
				3483	}
				3484	stream()->Add("];\n");
				3485	PrintAttributes(that);
				3486	RegExpNode* successor = that->on_success();
				3487	stream()->Add(" n%p -> n%p;\n", that, successor);
				3488	Visit(successor);
				3489	}
				3490
				3491
				3492	class DispatchTableDumper {
				3493	public:
				3494	explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
				3495	void Call(uc16 key, DispatchTable::Entry entry);
				3496	StringStream* stream() { return stream_; }
				3497	private:
				3498	StringStream* stream_;
				3499	};
				3500
				3501
				3502	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				3503	stream()->Add("[%k-%k]: {", key, entry.to());
				3504	OutSet* set = entry.out_set();
				3505	bool first = true;
				3506	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3507	if (set->Get(i)) {
				3508	if (first) {
				3509	first = false;
				3510	} else {
				3511	stream()->Add(", ");
				3512	}
				3513	stream()->Add("%i", i);
				3514	}
				3515	}
				3516	stream()->Add("}\n");
				3517	}
				3518
				3519
				3520	void DispatchTable::Dump() {
				3521	HeapStringAllocator alloc;
				3522	StringStream stream(&alloc);
				3523	DispatchTableDumper dumper(&stream);
				3524	tree()->ForEach(&dumper);
				3525	OS::PrintError("%s", *stream.ToCString());
				3526	}
				3527
				3528
				3529	void RegExpEngine::DotPrint(const char* label,
				3530	RegExpNode* node,
				3531	bool ignore_case) {
				3532	DotPrinter printer(ignore_case);
				3533	printer.PrintNode(label, node);
				3534	}
				3535
				3536
				3537	#endif // DEBUG
				3538
				3539
				3540	// -------------------------------------------------------------------
				3541	// Tree to graph conversion
				3542
				3543	static const int kSpaceRangeCount = 20;
				3544	static const int kSpaceRangeAsciiCount = 4;
				3545	static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
				3546	0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
				3547	0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
				3548
				3549	static const int kWordRangeCount = 8;
				3550	static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
				3551	'_', 'a', 'z' };
				3552
				3553	static const int kDigitRangeCount = 2;
				3554	static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
				3555
				3556	static const int kLineTerminatorRangeCount = 6;
				3557	static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
				3558	0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
				3559
				3560	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				3561	RegExpNode* on_success) {
				3562	ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
				3563	elms->Add(TextElement::Atom(this));
				3564	return new TextNode(elms, on_success);
				3565	}
				3566
				3567
				3568	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				3569	RegExpNode* on_success) {
				3570	return new TextNode(elements(), on_success);
				3571	}
				3572
				3573	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				3574	const uc16* special_class,
				3575	int length) {
				3576	ASSERT(ranges->length() != 0);
				3577	ASSERT(length != 0);
				3578	ASSERT(special_class[0] != 0);
				3579	if (ranges->length() != (length >> 1) + 1) {
				3580	return false;
				3581	}
				3582	CharacterRange range = ranges->at(0);
				3583	if (range.from() != 0) {
				3584	return false;
				3585	}
				3586	for (int i = 0; i < length; i += 2) {
				3587	if (special_class[i] != (range.to() + 1)) {
				3588	return false;
				3589	}
				3590	range = ranges->at((i >> 1) + 1);
				3591	if (special_class[i+1] != range.from() - 1) {
				3592	return false;
				3593	}
				3594	}
				3595	if (range.to() != 0xffff) {
				3596	return false;
				3597	}
				3598	return true;
				3599	}
				3600
				3601
				3602	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				3603	const uc16* special_class,
				3604	int length) {
				3605	if (ranges->length() * 2 != length) {
				3606	return false;
				3607	}
				3608	for (int i = 0; i < length; i += 2) {
				3609	CharacterRange range = ranges->at(i >> 1);
				3610	if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {
				3611	return false;
				3612	}
				3613	}
				3614	return true;
				3615	}
				3616
				3617
				3618	bool RegExpCharacterClass::is_standard() {
				3619	// TODO(lrn): Remove need for this function, by not throwing away information
				3620	// along the way.
				3621	if (is_negated_) {
				3622	return false;
				3623	}
				3624	if (set_.is_standard()) {
				3625	return true;
				3626	}
				3627	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3628	set_.set_standard_set_type('s');
				3629	return true;
				3630	}
				3631	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3632	set_.set_standard_set_type('S');
				3633	return true;
				3634	}
				3635	if (CompareInverseRanges(set_.ranges(),
				3636	kLineTerminatorRanges,
				3637	kLineTerminatorRangeCount)) {
				3638	set_.set_standard_set_type('.');
				3639	return true;
				3640	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3641	if (CompareRanges(set_.ranges(),
				3642	kLineTerminatorRanges,
				3643	kLineTerminatorRangeCount)) {
				3644	set_.set_standard_set_type('n');
				3645	return true;
				3646	}
				3647	if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3648	set_.set_standard_set_type('w');
				3649	return true;
				3650	}
				3651	if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3652	set_.set_standard_set_type('W');
				3653	return true;
				3654	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3655	return false;
				3656	}
				3657
				3658
				3659	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				3660	RegExpNode* on_success) {
				3661	return new TextNode(this, on_success);
				3662	}
				3663
				3664
				3665	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				3666	RegExpNode* on_success) {
				3667	ZoneList<RegExpTree> alternatives = this->alternatives();
				3668	int length = alternatives->length();
				3669	ChoiceNode* result = new ChoiceNode(length);
				3670	for (int i = 0; i < length; i++) {
				3671	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				3672	on_success));
				3673	result->AddAlternative(alternative);
				3674	}
				3675	return result;
				3676	}
				3677
				3678
				3679	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				3680	RegExpNode* on_success) {
				3681	return ToNode(min(),
				3682	max(),
				3683	is_greedy(),
				3684	body(),
				3685	compiler,
				3686	on_success);
				3687	}
				3688
				3689
				3690	RegExpNode* RegExpQuantifier::ToNode(int min,
				3691	int max,
				3692	bool is_greedy,
				3693	RegExpTree* body,
				3694	RegExpCompiler* compiler,
				3695	RegExpNode* on_success,
				3696	bool not_at_start) {
				3697	// x{f, t} becomes this:
				3698	//
				3699	// (r++)<-.
				3700	// \| `
				3701	// \| (x)
				3702	// v ^
				3703	// (r=0)-->(?)---/ [if r < t]
				3704	// \|
				3705	// [if r >= f] \----> ...
				3706	//
				3707
				3708	// 15.10.2.5 RepeatMatcher algorithm.
				3709	// The parser has already eliminated the case where max is 0. In the case
				3710	// where max_match is zero the parser has removed the quantifier if min was
				3711	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				3712
				3713	// If we know that we cannot match zero length then things are a little
				3714	// simpler since we don't need to make the special zero length match check
				3715	// from step 2.1. If the min and max are small we can unroll a little in
				3716	// this case.
				3717	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				3718	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				3719	if (max == 0) return on_success; // This can happen due to recursion.
				3720	bool body_can_be_empty = (body->min_match() == 0);
				3721	int body_start_reg = RegExpCompiler::kNoRegister;
				3722	Interval capture_registers = body->CaptureRegisters();
				3723	bool needs_capture_clearing = !capture_registers.is_empty();
				3724	if (body_can_be_empty) {
				3725	body_start_reg = compiler->AllocateRegister();
				3726	} else if (FLAG_regexp_optimization && !needs_capture_clearing) {
				3727	// Only unroll if there are no captures and the body can't be
				3728	// empty.
				3729	if (min > 0 && min <= kMaxUnrolledMinMatches) {
				3730	int new_max = (max == kInfinity) ? max : max - min;
				3731	// Recurse once to get the loop or optional matches after the fixed ones.
				3732	RegExpNode* answer = ToNode(
				3733	0, new_max, is_greedy, body, compiler, on_success, true);
				3734	// Unroll the forced matches from 0 to min. This can cause chains of
				3735	// TextNodes (which the parser does not generate). These should be
				3736	// combined if it turns out they hinder good code generation.
				3737	for (int i = 0; i < min; i++) {
				3738	answer = body->ToNode(compiler, answer);
				3739	}
				3740	return answer;
				3741	}
				3742	if (max <= kMaxUnrolledMaxMatches) {
				3743	ASSERT(min == 0);
				3744	// Unroll the optional matches up to max.
				3745	RegExpNode* answer = on_success;
				3746	for (int i = 0; i < max; i++) {
				3747	ChoiceNode* alternation = new ChoiceNode(2);
				3748	if (is_greedy) {
				3749	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3750	answer)));
				3751	alternation->AddAlternative(GuardedAlternative(on_success));
				3752	} else {
				3753	alternation->AddAlternative(GuardedAlternative(on_success));
				3754	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3755	answer)));
				3756	}
				3757	answer = alternation;
				3758	if (not_at_start) alternation->set_not_at_start();
				3759	}
				3760	return answer;
				3761	}
				3762	}
				3763	bool has_min = min > 0;
				3764	bool has_max = max < RegExpTree::kInfinity;
				3765	bool needs_counter = has_min \|\| has_max;
				3766	int reg_ctr = needs_counter
				3767	? compiler->AllocateRegister()
				3768	: RegExpCompiler::kNoRegister;
				3769	LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
				3770	if (not_at_start) center->set_not_at_start();
				3771	RegExpNode* loop_return = needs_counter
				3772	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				3773	: static_cast<RegExpNode*>(center);
				3774	if (body_can_be_empty) {
				3775	// If the body can be empty we need to check if it was and then
				3776	// backtrack.
				3777	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				3778	reg_ctr,
				3779	min,
				3780	loop_return);
				3781	}
				3782	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				3783	if (body_can_be_empty) {
				3784	// If the body can be empty we need to store the start position
				3785	// so we can bail out if it was empty.
				3786	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				3787	}
				3788	if (needs_capture_clearing) {
				3789	// Before entering the body of this loop we need to clear captures.
				3790	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				3791	}
				3792	GuardedAlternative body_alt(body_node);
				3793	if (has_max) {
				3794	Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
				3795	body_alt.AddGuard(body_guard);
				3796	}
				3797	GuardedAlternative rest_alt(on_success);
				3798	if (has_min) {
				3799	Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
				3800	rest_alt.AddGuard(rest_guard);
				3801	}
				3802	if (is_greedy) {
				3803	center->AddLoopAlternative(body_alt);
				3804	center->AddContinueAlternative(rest_alt);
				3805	} else {
				3806	center->AddContinueAlternative(rest_alt);
				3807	center->AddLoopAlternative(body_alt);
				3808	}
				3809	if (needs_counter) {
				3810	return ActionNode::SetRegister(reg_ctr, 0, center);
				3811	} else {
				3812	return center;
				3813	}
				3814	}
				3815
				3816
				3817	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				3818	RegExpNode* on_success) {
				3819	NodeInfo info;
				3820	switch (type()) {
				3821	case START_OF_LINE:
				3822	return AssertionNode::AfterNewline(on_success);
				3823	case START_OF_INPUT:
				3824	return AssertionNode::AtStart(on_success);
				3825	case BOUNDARY:
				3826	return AssertionNode::AtBoundary(on_success);
				3827	case NON_BOUNDARY:
				3828	return AssertionNode::AtNonBoundary(on_success);
				3829	case END_OF_INPUT:
				3830	return AssertionNode::AtEnd(on_success);
				3831	case END_OF_LINE: {
				3832	// Compile $ in multiline regexps as an alternation with a positive
				3833	// lookahead in one side and an end-of-input on the other side.
				3834	// We need two registers for the lookahead.
				3835	int stack_pointer_register = compiler->AllocateRegister();
				3836	int position_register = compiler->AllocateRegister();
				3837	// The ChoiceNode to distinguish between a newline and end-of-input.
				3838	ChoiceNode* result = new ChoiceNode(2);
				3839	// Create a newline atom.
				3840	ZoneList<CharacterRange>* newline_ranges =
				3841	new ZoneList<CharacterRange>(3);
				3842	CharacterRange::AddClassEscape('n', newline_ranges);
				3843	RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
				3844	TextNode* newline_matcher = new TextNode(
				3845	newline_atom,
				3846	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3847	position_register,
				3848	0, // No captures inside.
				3849	-1, // Ignored if no captures.
				3850	on_success));
				3851	// Create an end-of-input matcher.
				3852	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				3853	stack_pointer_register,
				3854	position_register,
				3855	newline_matcher);
				3856	// Add the two alternatives to the ChoiceNode.
				3857	GuardedAlternative eol_alternative(end_of_line);
				3858	result->AddAlternative(eol_alternative);
				3859	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				3860	result->AddAlternative(end_alternative);
				3861	return result;
				3862	}
				3863	default:
				3864	UNREACHABLE();
				3865	}
				3866	return on_success;
				3867	}
				3868
				3869
				3870	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				3871	RegExpNode* on_success) {
				3872	return new BackReferenceNode(RegExpCapture::StartRegister(index()),
				3873	RegExpCapture::EndRegister(index()),
				3874	on_success);
				3875	}
				3876
				3877
				3878	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				3879	RegExpNode* on_success) {
				3880	return on_success;
				3881	}
				3882
				3883
				3884	RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
				3885	RegExpNode* on_success) {
				3886	int stack_pointer_register = compiler->AllocateRegister();
				3887	int position_register = compiler->AllocateRegister();
				3888
				3889	const int registers_per_capture = 2;
				3890	const int register_of_first_capture = 2;
				3891	int register_count = capture_count_ * registers_per_capture;
				3892	int register_start =
				3893	register_of_first_capture + capture_from_ * registers_per_capture;
				3894
				3895	RegExpNode* success;
				3896	if (is_positive()) {
				3897	RegExpNode* node = ActionNode::BeginSubmatch(
				3898	stack_pointer_register,
				3899	position_register,
				3900	body()->ToNode(
				3901	compiler,
				3902	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3903	position_register,
				3904	register_count,
				3905	register_start,
				3906	on_success)));
				3907	return node;
				3908	} else {
				3909	// We use a ChoiceNode for a negative lookahead because it has most of
				3910	// the characteristics we need. It has the body of the lookahead as its
				3911	// first alternative and the expression after the lookahead of the second
				3912	// alternative. If the first alternative succeeds then the
				3913	// NegativeSubmatchSuccess will unwind the stack including everything the
				3914	// choice node set up and backtrack. If the first alternative fails then
				3915	// the second alternative is tried, which is exactly the desired result
				3916	// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
				3917	// ChoiceNode that knows to ignore the first exit when calculating quick
				3918	// checks.
				3919	GuardedAlternative body_alt(
				3920	body()->ToNode(
				3921	compiler,
				3922	success = new NegativeSubmatchSuccess(stack_pointer_register,
				3923	position_register,
				3924	register_count,
				3925	register_start)));
				3926	ChoiceNode* choice_node =
				3927	new NegativeLookaheadChoiceNode(body_alt,
				3928	GuardedAlternative(on_success));
				3929	return ActionNode::BeginSubmatch(stack_pointer_register,
				3930	position_register,
				3931	choice_node);
				3932	}
				3933	}
				3934
				3935
				3936	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				3937	RegExpNode* on_success) {
				3938	return ToNode(body(), index(), compiler, on_success);
				3939	}
				3940
				3941
				3942	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				3943	int index,
				3944	RegExpCompiler* compiler,
				3945	RegExpNode* on_success) {
				3946	int start_reg = RegExpCapture::StartRegister(index);
				3947	int end_reg = RegExpCapture::EndRegister(index);
				3948	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				3949	RegExpNode* body_node = body->ToNode(compiler, store_end);
				3950	return ActionNode::StorePosition(start_reg, true, body_node);
				3951	}
				3952
				3953
				3954	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				3955	RegExpNode* on_success) {
				3956	ZoneList<RegExpTree> children = nodes();
				3957	RegExpNode* current = on_success;
				3958	for (int i = children->length() - 1; i >= 0; i--) {
				3959	current = children->at(i)->ToNode(compiler, current);
				3960	}
				3961	return current;
				3962	}
				3963
				3964
				3965	static void AddClass(const uc16* elmv,
				3966	int elmc,
				3967	ZoneList<CharacterRange>* ranges) {
				3968	for (int i = 0; i < elmc; i += 2) {
				3969	ASSERT(elmv[i] <= elmv[i + 1]);
				3970	ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
				3971	}
				3972	}
				3973
				3974
				3975	static void AddClassNegated(const uc16 *elmv,
				3976	int elmc,
				3977	ZoneList<CharacterRange>* ranges) {
				3978	ASSERT(elmv[0] != 0x0000);
				3979	ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
				3980	uc16 last = 0x0000;
				3981	for (int i = 0; i < elmc; i += 2) {
				3982	ASSERT(last <= elmv[i] - 1);
				3983	ASSERT(elmv[i] <= elmv[i + 1]);
				3984	ranges->Add(CharacterRange(last, elmv[i] - 1));
				3985	last = elmv[i + 1] + 1;
				3986	}
				3987	ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
				3988	}
				3989
				3990
				3991	void CharacterRange::AddClassEscape(uc16 type,
				3992	ZoneList<CharacterRange>* ranges) {
				3993	switch (type) {
				3994	case 's':
				3995	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
				3996	break;
				3997	case 'S':
				3998	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
				3999	break;
				4000	case 'w':
				4001	AddClass(kWordRanges, kWordRangeCount, ranges);
				4002	break;
				4003	case 'W':
				4004	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
				4005	break;
				4006	case 'd':
				4007	AddClass(kDigitRanges, kDigitRangeCount, ranges);
				4008	break;
				4009	case 'D':
				4010	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
				4011	break;
				4012	case '.':
				4013	AddClassNegated(kLineTerminatorRanges,
				4014	kLineTerminatorRangeCount,
				4015	ranges);
				4016	break;
				4017	// This is not a character range as defined by the spec but a
				4018	// convenient shorthand for a character class that matches any
				4019	// character.
				4020	case '*':
				4021	ranges->Add(CharacterRange::Everything());
				4022	break;
				4023	// This is the set of characters matched by the $ and ^ symbols
				4024	// in multiline mode.
				4025	case 'n':
				4026	AddClass(kLineTerminatorRanges,
				4027	kLineTerminatorRangeCount,
				4028	ranges);
				4029	break;
				4030	default:
				4031	UNREACHABLE();
				4032	}
				4033	}
				4034
				4035
				4036	Vector<const uc16> CharacterRange::GetWordBounds() {
				4037	return Vector<const uc16>(kWordRanges, kWordRangeCount);
				4038	}
				4039
				4040
				4041	class CharacterRangeSplitter {
				4042	public:
				4043	CharacterRangeSplitter(ZoneList<CharacterRange>** included,
				4044	ZoneList<CharacterRange>** excluded)
				4045	: included_(included),
				4046	excluded_(excluded) { }
				4047	void Call(uc16 from, DispatchTable::Entry entry);
				4048
				4049	static const int kInBase = 0;
				4050	static const int kInOverlay = 1;
				4051
				4052	private:
				4053	ZoneList<CharacterRange>** included_;
				4054	ZoneList<CharacterRange>** excluded_;
				4055	};
				4056
				4057
				4058	void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
				4059	if (!entry.out_set()->Get(kInBase)) return;
				4060	ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
				4061	? included_
				4062	: excluded_;
				4063	if (target == NULL) target = new ZoneList<CharacterRange>(2);
				4064	(*target)->Add(CharacterRange(entry.from(), entry.to()));
				4065	}
				4066
				4067
				4068	void CharacterRange::Split(ZoneList<CharacterRange>* base,
				4069	Vector<const uc16> overlay,
				4070	ZoneList<CharacterRange>** included,
				4071	ZoneList<CharacterRange>** excluded) {
				4072	ASSERT_EQ(NULL, *included);
				4073	ASSERT_EQ(NULL, *excluded);
				4074	DispatchTable table;
				4075	for (int i = 0; i < base->length(); i++)
				4076	table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
				4077	for (int i = 0; i < overlay.length(); i += 2) {
				4078	table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
				4079	CharacterRangeSplitter::kInOverlay);
				4080	}
				4081	CharacterRangeSplitter callback(included, excluded);
				4082	table.ForEach(&callback);
				4083	}
				4084
				4085
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4086	static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
				4087	int bottom,
				4088	int top);
				4089
				4090
				4091	void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
				4092	bool is_ascii) {
				4093	uc16 bottom = from();
				4094	uc16 top = to();
				4095	if (is_ascii) {
				4096	if (bottom > String::kMaxAsciiCharCode) return;
				4097	if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
				4098	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4099	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4100	if (top == bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4101	// If this is a singleton we just expand the one character.
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4102	int length = uncanonicalize.get(bottom, '\0', chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4103	for (int i = 0; i < length; i++) {
				4104	uc32 chr = chars[i];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4105	if (chr != bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4106	ranges->Add(CharacterRange::Singleton(chars[i]));
				4107	}
				4108	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4109	} else {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4110	// If this is a range we expand the characters block by block,
				4111	// expanding contiguous subranges (blocks) one at a time.
				4112	// The approach is as follows. For a given start character we
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4113	// look up the remainder of the block that contains it (represented
				4114	// by the end point), for instance we find 'z' if the character
				4115	// is 'c'. A block is characterized by the property
				4116	// that all characters uncanonicalize in the same way, except that
				4117	// each entry in the result is incremented by the distance from the first
				4118	// element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
				4119	// the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
				4120	// Once we've found the end point we look up its uncanonicalization
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4121	// and produce a range for each element. For instance for [c-f]
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4122	// we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4123	// add a range if it is not already contained in the input, so [c-f]
				4124	// will be skipped but [C-F] will be added. If this range is not
				4125	// completely contained in a block we do this for all the blocks
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4126	// covered by the range (handling characters that is not in a block
				4127	// as a "singleton block").
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4128	unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4129	int pos = bottom;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4130	while (pos < top) {
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4131	int length = canonrange.get(pos, '\0', range);
				4132	uc16 block_end;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4133	if (length == 0) {
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4134	block_end = pos;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4135	} else {
				4136	ASSERT_EQ(1, length);
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4137	block_end = range[0];
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4138	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4139	int end = (block_end > top) ? top : block_end;
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4140	length = uncanonicalize.get(block_end, '\0', range);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4141	for (int i = 0; i < length; i++) {
				4142	uc32 c = range[i];
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4143	uc16 range_from = c - (block_end - pos);
				4144	uc16 range_to = c - (block_end - end);
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4145	if (!(bottom <= range_from && range_to <= top)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4146	ranges->Add(CharacterRange(range_from, range_to));
				4147	}
				4148	}
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4149	pos = end + 1;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4150	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4151	}
				4152	}
				4153
				4154
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4155	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				4156	ASSERT_NOT_NULL(ranges);
				4157	int n = ranges->length();
				4158	if (n <= 1) return true;
				4159	int max = ranges->at(0).to();
				4160	for (int i = 1; i < n; i++) {
				4161	CharacterRange next_range = ranges->at(i);
				4162	if (next_range.from() <= max + 1) return false;
				4163	max = next_range.to();
				4164	}
				4165	return true;
				4166	}
				4167
				4168	SetRelation CharacterRange::WordCharacterRelation(
				4169	ZoneList<CharacterRange>* range) {
				4170	ASSERT(IsCanonical(range));
				4171	int i = 0; // Word character range index.
				4172	int j = 0; // Argument range index.
				4173	ASSERT_NE(0, kWordRangeCount);
				4174	SetRelation result;
				4175	if (range->length() == 0) {
				4176	result.SetElementsInSecondSet();
				4177	return result;
				4178	}
				4179	CharacterRange argument_range = range->at(0);
				4180	CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
				4181	while (i < kWordRangeCount && j < range->length()) {
				4182	// Check the two ranges for the five cases:
				4183	// - no overlap.
				4184	// - partial overlap (there are elements in both ranges that isn't
				4185	// in the other, and there are also elements that are in both).
				4186	// - argument range entirely inside word range.
				4187	// - word range entirely inside argument range.
				4188	// - ranges are completely equal.
				4189
				4190	// First check for no overlap. The earlier range is not in the other set.
				4191	if (argument_range.from() > word_range.to()) {
				4192	// Ranges are disjoint. The earlier word range contains elements that
				4193	// cannot be in the argument set.
				4194	result.SetElementsInSecondSet();
				4195	} else if (word_range.from() > argument_range.to()) {
				4196	// Ranges are disjoint. The earlier argument range contains elements that
				4197	// cannot be in the word set.
				4198	result.SetElementsInFirstSet();
				4199	} else if (word_range.from() <= argument_range.from() &&
				4200	word_range.to() >= argument_range.from()) {
				4201	result.SetElementsInBothSets();
				4202	// argument range completely inside word range.
				4203	if (word_range.from() < argument_range.from() \|\|
				4204	word_range.to() > argument_range.from()) {
				4205	result.SetElementsInSecondSet();
				4206	}
				4207	} else if (word_range.from() >= argument_range.from() &&
				4208	word_range.to() <= argument_range.from()) {
				4209	result.SetElementsInBothSets();
				4210	result.SetElementsInFirstSet();
				4211	} else {
				4212	// There is overlap, and neither is a subrange of the other
				4213	result.SetElementsInFirstSet();
				4214	result.SetElementsInSecondSet();
				4215	result.SetElementsInBothSets();
				4216	}
				4217	if (result.NonTrivialIntersection()) {
				4218	// The result is as (im)precise as we can possibly make it.
				4219	return result;
				4220	}
				4221	// Progress the range(s) with minimal to-character.
				4222	uc16 word_to = word_range.to();
				4223	uc16 argument_to = argument_range.to();
				4224	if (argument_to <= word_to) {
				4225	j++;
				4226	if (j < range->length()) {
				4227	argument_range = range->at(j);
				4228	}
				4229	}
				4230	if (word_to <= argument_to) {
				4231	i += 2;
				4232	if (i < kWordRangeCount) {
				4233	word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
				4234	}
				4235	}
				4236	}
				4237	// Check if anything wasn't compared in the loop.
				4238	if (i < kWordRangeCount) {
				4239	// word range contains something not in argument range.
				4240	result.SetElementsInSecondSet();
				4241	} else if (j < range->length()) {
				4242	// Argument range contains something not in word range.
				4243	result.SetElementsInFirstSet();
				4244	}
				4245
				4246	return result;
				4247	}
				4248
				4249
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4250	static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
				4251	int bottom,
				4252	int top) {
				4253	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				4254	// Zones with no case mappings. There is a DEBUG-mode loop to assert that
				4255	// this table is correct.
				4256	// 0x0600 - 0x0fff
				4257	// 0x1100 - 0x1cff
				4258	// 0x2000 - 0x20ff
				4259	// 0x2200 - 0x23ff
				4260	// 0x2500 - 0x2bff
				4261	// 0x2e00 - 0xa5ff
				4262	// 0xa800 - 0xfaff
				4263	// 0xfc00 - 0xfeff
				4264	const int boundary_count = 18;
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4265	int boundaries[] = {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4266	0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
				4267	0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
				4268
				4269	// Special ASCII rule from spec can save us some work here.
				4270	if (bottom == 0x80 && top == 0xffff) return;
				4271
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4272	if (top <= boundaries[0]) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4273	CharacterRange range(bottom, top);
				4274	range.AddCaseEquivalents(ranges, false);
				4275	return;
				4276	}
				4277
				4278	// Split up very large ranges. This helps remove ranges where there are no
				4279	// case mappings.
				4280	for (int i = 0; i < boundary_count; i++) {
				4281	if (bottom < boundaries[i] && top >= boundaries[i]) {
				4282	AddUncanonicals(ranges, bottom, boundaries[i] - 1);
				4283	AddUncanonicals(ranges, boundaries[i], top);
				4284	return;
				4285	}
				4286	}
				4287
				4288	// If we are completely in a zone with no case mappings then we are done.
Ben Murdoch	bb769b2	2010-08-11 14:56:33 +0100	[diff] [blame]	4289	for (int i = 0; i < boundary_count; i += 2) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4290	if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
				4291	#ifdef DEBUG
				4292	for (int j = bottom; j <= top; j++) {
				4293	unsigned current_char = j;
				4294	int length = uncanonicalize.get(current_char, '\0', chars);
				4295	for (int k = 0; k < length; k++) {
				4296	ASSERT(chars[k] == current_char);
				4297	}
				4298	}
				4299	#endif
				4300	return;
				4301	}
				4302	}
				4303
				4304	// Step through the range finding equivalent characters.
				4305	ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
				4306	for (int i = bottom; i <= top; i++) {
				4307	int length = uncanonicalize.get(i, '\0', chars);
				4308	for (int j = 0; j < length; j++) {
				4309	uc32 chr = chars[j];
				4310	if (chr != i && (chr < bottom \|\| chr > top)) {
				4311	characters->Add(chr);
				4312	}
				4313	}
				4314	}
				4315
				4316	// Step through the equivalent characters finding simple ranges and
				4317	// adding ranges to the character class.
				4318	if (characters->length() > 0) {
				4319	int new_from = characters->at(0);
				4320	int new_to = new_from;
				4321	for (int i = 1; i < characters->length(); i++) {
				4322	int chr = characters->at(i);
				4323	if (chr == new_to + 1) {
				4324	new_to++;
				4325	} else {
				4326	if (new_to == new_from) {
				4327	ranges->Add(CharacterRange::Singleton(new_from));
				4328	} else {
				4329	ranges->Add(CharacterRange(new_from, new_to));
				4330	}
				4331	new_from = new_to = chr;
				4332	}
				4333	}
				4334	if (new_to == new_from) {
				4335	ranges->Add(CharacterRange::Singleton(new_from));
				4336	} else {
				4337	ranges->Add(CharacterRange(new_from, new_to));
				4338	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4339	}
				4340	}
				4341
				4342
				4343	ZoneList<CharacterRange>* CharacterSet::ranges() {
				4344	if (ranges_ == NULL) {
				4345	ranges_ = new ZoneList<CharacterRange>(2);
				4346	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
				4347	}
				4348	return ranges_;
				4349	}
				4350
				4351
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4352	// Move a number of elements in a zonelist to another position
				4353	// in the same list. Handles overlapping source and target areas.
				4354	static void MoveRanges(ZoneList<CharacterRange>* list,
				4355	int from,
				4356	int to,
				4357	int count) {
				4358	// Ranges are potentially overlapping.
				4359	if (from < to) {
				4360	for (int i = count - 1; i >= 0; i--) {
				4361	list->at(to + i) = list->at(from + i);
				4362	}
				4363	} else {
				4364	for (int i = 0; i < count; i++) {
				4365	list->at(to + i) = list->at(from + i);
				4366	}
				4367	}
				4368	}
				4369
				4370
				4371	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				4372	int count,
				4373	CharacterRange insert) {
				4374	// Inserts a range into list[0..count[, which must be sorted
				4375	// by from value and non-overlapping and non-adjacent, using at most
				4376	// list[0..count] for the result. Returns the number of resulting
				4377	// canonicalized ranges. Inserting a range may collapse existing ranges into
				4378	// fewer ranges, so the return value can be anything in the range 1..count+1.
				4379	uc16 from = insert.from();
				4380	uc16 to = insert.to();
				4381	int start_pos = 0;
				4382	int end_pos = count;
				4383	for (int i = count - 1; i >= 0; i--) {
				4384	CharacterRange current = list->at(i);
				4385	if (current.from() > to + 1) {
				4386	end_pos = i;
				4387	} else if (current.to() + 1 < from) {
				4388	start_pos = i + 1;
				4389	break;
				4390	}
				4391	}
				4392
				4393	// Inserted range overlaps, or is adjacent to, ranges at positions
				4394	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				4395	// not affected by the insertion.
				4396	// If start_pos == end_pos, the range must be inserted before start_pos.
				4397	// if start_pos < end_pos, the entire range from start_pos to end_pos
				4398	// must be merged with the insert range.
				4399
				4400	if (start_pos == end_pos) {
				4401	// Insert between existing ranges at position start_pos.
				4402	if (start_pos < count) {
				4403	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				4404	}
				4405	list->at(start_pos) = insert;
				4406	return count + 1;
				4407	}
				4408	if (start_pos + 1 == end_pos) {
				4409	// Replace single existing range at position start_pos.
				4410	CharacterRange to_replace = list->at(start_pos);
				4411	int new_from = Min(to_replace.from(), from);
				4412	int new_to = Max(to_replace.to(), to);
				4413	list->at(start_pos) = CharacterRange(new_from, new_to);
				4414	return count;
				4415	}
				4416	// Replace a number of existing ranges from start_pos to end_pos - 1.
				4417	// Move the remaining ranges down.
				4418
				4419	int new_from = Min(list->at(start_pos).from(), from);
				4420	int new_to = Max(list->at(end_pos - 1).to(), to);
				4421	if (end_pos < count) {
				4422	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				4423	}
				4424	list->at(start_pos) = CharacterRange(new_from, new_to);
				4425	return count - (end_pos - start_pos) + 1;
				4426	}
				4427
				4428
				4429	void CharacterSet::Canonicalize() {
				4430	// Special/default classes are always considered canonical. The result
				4431	// of calling ranges() will be sorted.
				4432	if (ranges_ == NULL) return;
				4433	CharacterRange::Canonicalize(ranges_);
				4434	}
				4435
				4436
				4437	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				4438	if (character_ranges->length() <= 1) return;
				4439	// Check whether ranges are already canonical (increasing, non-overlapping,
				4440	// non-adjacent).
				4441	int n = character_ranges->length();
				4442	int max = character_ranges->at(0).to();
				4443	int i = 1;
				4444	while (i < n) {
				4445	CharacterRange current = character_ranges->at(i);
				4446	if (current.from() <= max + 1) {
				4447	break;
				4448	}
				4449	max = current.to();
				4450	i++;
				4451	}
				4452	// Canonical until the i'th range. If that's all of them, we are done.
				4453	if (i == n) return;
				4454
				4455	// The ranges at index i and forward are not canonicalized. Make them so by
				4456	// doing the equivalent of insertion sort (inserting each into the previous
				4457	// list, in order).
				4458	// Notice that inserting a range can reduce the number of ranges in the
				4459	// result due to combining of adjacent and overlapping ranges.
				4460	int read = i; // Range to insert.
				4461	int num_canonical = i; // Length of canonicalized part of list.
				4462	do {
				4463	num_canonical = InsertRangeInCanonicalList(character_ranges,
				4464	num_canonical,
				4465	character_ranges->at(read));
				4466	read++;
				4467	} while (read < n);
				4468	character_ranges->Rewind(num_canonical);
				4469
				4470	ASSERT(CharacterRange::IsCanonical(character_ranges));
				4471	}
				4472
				4473
				4474	// Utility function for CharacterRange::Merge. Adds a range at the end of
				4475	// a canonicalized range list, if necessary merging the range with the last
				4476	// range of the list.
				4477	static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
				4478	if (set == NULL) return;
				4479	ASSERT(set->length() == 0 \|\| set->at(set->length() - 1).to() < range.from());
				4480	int n = set->length();
				4481	if (n > 0) {
				4482	CharacterRange lastRange = set->at(n - 1);
				4483	if (lastRange.to() == range.from() - 1) {
				4484	set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
				4485	return;
				4486	}
				4487	}
				4488	set->Add(range);
				4489	}
				4490
				4491
				4492	static void AddRangeToSelectedSet(int selector,
				4493	ZoneList<CharacterRange>* first_set,
				4494	ZoneList<CharacterRange>* second_set,
				4495	ZoneList<CharacterRange>* intersection_set,
				4496	CharacterRange range) {
				4497	switch (selector) {
				4498	case kInsideFirst:
				4499	AddRangeToSet(first_set, range);
				4500	break;
				4501	case kInsideSecond:
				4502	AddRangeToSet(second_set, range);
				4503	break;
				4504	case kInsideBoth:
				4505	AddRangeToSet(intersection_set, range);
				4506	break;
				4507	}
				4508	}
				4509
				4510
				4511
				4512	void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
				4513	ZoneList<CharacterRange>* second_set,
				4514	ZoneList<CharacterRange>* first_set_only_out,
				4515	ZoneList<CharacterRange>* second_set_only_out,
				4516	ZoneList<CharacterRange>* both_sets_out) {
				4517	// Inputs are canonicalized.
				4518	ASSERT(CharacterRange::IsCanonical(first_set));
				4519	ASSERT(CharacterRange::IsCanonical(second_set));
				4520	// Outputs are empty, if applicable.
				4521	ASSERT(first_set_only_out == NULL \|\| first_set_only_out->length() == 0);
				4522	ASSERT(second_set_only_out == NULL \|\| second_set_only_out->length() == 0);
				4523	ASSERT(both_sets_out == NULL \|\| both_sets_out->length() == 0);
				4524
				4525	// Merge sets by iterating through the lists in order of lowest "from" value,
				4526	// and putting intervals into one of three sets.
				4527
				4528	if (first_set->length() == 0) {
				4529	second_set_only_out->AddAll(*second_set);
				4530	return;
				4531	}
				4532	if (second_set->length() == 0) {
				4533	first_set_only_out->AddAll(*first_set);
				4534	return;
				4535	}
				4536	// Indices into input lists.
				4537	int i1 = 0;
				4538	int i2 = 0;
				4539	// Cache length of input lists.
				4540	int n1 = first_set->length();
				4541	int n2 = second_set->length();
				4542	// Current range. May be invalid if state is kInsideNone.
				4543	int from = 0;
				4544	int to = -1;
				4545	// Where current range comes from.
				4546	int state = kInsideNone;
				4547
				4548	while (i1 < n1 \|\| i2 < n2) {
				4549	CharacterRange next_range;
				4550	int range_source;
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4551	if (i2 == n2 \|\|
				4552	(i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
				4553	// Next smallest element is in first set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4554	next_range = first_set->at(i1++);
				4555	range_source = kInsideFirst;
				4556	} else {
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4557	// Next smallest element is in second set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4558	next_range = second_set->at(i2++);
				4559	range_source = kInsideSecond;
				4560	}
				4561	if (to < next_range.from()) {
				4562	// Ranges disjoint: \|current\| \|next\|
				4563	AddRangeToSelectedSet(state,
				4564	first_set_only_out,
				4565	second_set_only_out,
				4566	both_sets_out,
				4567	CharacterRange(from, to));
				4568	from = next_range.from();
				4569	to = next_range.to();
				4570	state = range_source;
				4571	} else {
				4572	if (from < next_range.from()) {
				4573	AddRangeToSelectedSet(state,
				4574	first_set_only_out,
				4575	second_set_only_out,
				4576	both_sets_out,
				4577	CharacterRange(from, next_range.from()-1));
				4578	}
				4579	if (to < next_range.to()) {
				4580	// Ranges overlap: \|current\|
				4581	// \|next\|
				4582	AddRangeToSelectedSet(state \| range_source,
				4583	first_set_only_out,
				4584	second_set_only_out,
				4585	both_sets_out,
				4586	CharacterRange(next_range.from(), to));
				4587	from = to + 1;
				4588	to = next_range.to();
				4589	state = range_source;
				4590	} else {
				4591	// Range included: \|current\| , possibly ending at same character.
				4592	// \|next\|
				4593	AddRangeToSelectedSet(
				4594	state \| range_source,
				4595	first_set_only_out,
				4596	second_set_only_out,
				4597	both_sets_out,
				4598	CharacterRange(next_range.from(), next_range.to()));
				4599	from = next_range.to() + 1;
				4600	// If ranges end at same character, both ranges are consumed completely.
				4601	if (next_range.to() == to) state = kInsideNone;
				4602	}
				4603	}
				4604	}
				4605	AddRangeToSelectedSet(state,
				4606	first_set_only_out,
				4607	second_set_only_out,
				4608	both_sets_out,
				4609	CharacterRange(from, to));
				4610	}
				4611
				4612
				4613	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				4614	ZoneList<CharacterRange>* negated_ranges) {
				4615	ASSERT(CharacterRange::IsCanonical(ranges));
				4616	ASSERT_EQ(0, negated_ranges->length());
				4617	int range_count = ranges->length();
				4618	uc16 from = 0;
				4619	int i = 0;
				4620	if (range_count > 0 && ranges->at(0).from() == 0) {
				4621	from = ranges->at(0).to();
				4622	i = 1;
				4623	}
				4624	while (i < range_count) {
				4625	CharacterRange range = ranges->at(i);
				4626	negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
				4627	from = range.to();
				4628	i++;
				4629	}
				4630	if (from < String::kMaxUC16CharCode) {
				4631	negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
				4632	}
				4633	}
				4634
				4635
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4636
				4637	// -------------------------------------------------------------------
				4638	// Interest propagation
				4639
				4640
				4641	RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
				4642	for (int i = 0; i < siblings_.length(); i++) {
				4643	RegExpNode* sibling = siblings_.Get(i);
				4644	if (sibling->info()->Matches(info))
				4645	return sibling;
				4646	}
				4647	return NULL;
				4648	}
				4649
				4650
				4651	RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
				4652	ASSERT_EQ(false, *cloned);
				4653	siblings_.Ensure(this);
				4654	RegExpNode* result = TryGetSibling(info);
				4655	if (result != NULL) return result;
				4656	result = this->Clone();
				4657	NodeInfo* new_info = result->info();
				4658	new_info->ResetCompilationState();
				4659	new_info->AddFromPreceding(info);
				4660	AddSibling(result);
				4661	*cloned = true;
				4662	return result;
				4663	}
				4664
				4665
				4666	template <class C>
				4667	static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
				4668	NodeInfo full_info(*node->info());
				4669	full_info.AddFromPreceding(info);
				4670	bool cloned = false;
				4671	return RegExpNode::EnsureSibling(node, &full_info, &cloned);
				4672	}
				4673
				4674
				4675	// -------------------------------------------------------------------
				4676	// Splay tree
				4677
				4678
				4679	OutSet* OutSet::Extend(unsigned value) {
				4680	if (Get(value))
				4681	return this;
				4682	if (successors() != NULL) {
				4683	for (int i = 0; i < successors()->length(); i++) {
				4684	OutSet* successor = successors()->at(i);
				4685	if (successor->Get(value))
				4686	return successor;
				4687	}
				4688	} else {
				4689	successors_ = new ZoneList<OutSet*>(2);
				4690	}
				4691	OutSet* result = new OutSet(first_, remaining_);
				4692	result->Set(value);
				4693	successors()->Add(result);
				4694	return result;
				4695	}
				4696
				4697
				4698	void OutSet::Set(unsigned value) {
				4699	if (value < kFirstLimit) {
				4700	first_ \|= (1 << value);
				4701	} else {
				4702	if (remaining_ == NULL)
				4703	remaining_ = new ZoneList<unsigned>(1);
				4704	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				4705	remaining_->Add(value);
				4706	}
				4707	}
				4708
				4709
				4710	bool OutSet::Get(unsigned value) {
				4711	if (value < kFirstLimit) {
				4712	return (first_ & (1 << value)) != 0;
				4713	} else if (remaining_ == NULL) {
				4714	return false;
				4715	} else {
				4716	return remaining_->Contains(value);
				4717	}
				4718	}
				4719
				4720
				4721	const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
				4722	const DispatchTable::Entry DispatchTable::Config::kNoValue;
				4723
				4724
				4725	void DispatchTable::AddRange(CharacterRange full_range, int value) {
				4726	CharacterRange current = full_range;
				4727	if (tree()->is_empty()) {
				4728	// If this is the first range we just insert into the table.
				4729	ZoneSplayTree<Config>::Locator loc;
				4730	ASSERT_RESULT(tree()->Insert(current.from(), &loc));
				4731	loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
				4732	return;
				4733	}
				4734	// First see if there is a range to the left of this one that
				4735	// overlaps.
				4736	ZoneSplayTree<Config>::Locator loc;
				4737	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				4738	Entry* entry = &loc.value();
				4739	// If we've found a range that overlaps with this one, and it
				4740	// starts strictly to the left of this one, we have to fix it
				4741	// because the following code only handles ranges that start on
				4742	// or after the start point of the range we're adding.
				4743	if (entry->from() < current.from() && entry->to() >= current.from()) {
				4744	// Snap the overlapping range in half around the start point of
				4745	// the range we're adding.
				4746	CharacterRange left(entry->from(), current.from() - 1);
				4747	CharacterRange right(current.from(), entry->to());
				4748	// The left part of the overlapping range doesn't overlap.
				4749	// Truncate the whole entry to be just the left part.
				4750	entry->set_to(left.to());
				4751	// The right part is the one that overlaps. We add this part
				4752	// to the map and let the next step deal with merging it with
				4753	// the range we're adding.
				4754	ZoneSplayTree<Config>::Locator loc;
				4755	ASSERT_RESULT(tree()->Insert(right.from(), &loc));
				4756	loc.set_value(Entry(right.from(),
				4757	right.to(),
				4758	entry->out_set()));
				4759	}
				4760	}
				4761	while (current.is_valid()) {
				4762	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				4763	(loc.value().from() <= current.to()) &&
				4764	(loc.value().to() >= current.from())) {
				4765	Entry* entry = &loc.value();
				4766	// We have overlap. If there is space between the start point of
				4767	// the range we're adding and where the overlapping range starts
				4768	// then we have to add a range covering just that space.
				4769	if (current.from() < entry->from()) {
				4770	ZoneSplayTree<Config>::Locator ins;
				4771	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4772	ins.set_value(Entry(current.from(),
				4773	entry->from() - 1,
				4774	empty()->Extend(value)));
				4775	current.set_from(entry->from());
				4776	}
				4777	ASSERT_EQ(current.from(), entry->from());
				4778	// If the overlapping range extends beyond the one we want to add
				4779	// we have to snap the right part off and add it separately.
				4780	if (entry->to() > current.to()) {
				4781	ZoneSplayTree<Config>::Locator ins;
				4782	ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
				4783	ins.set_value(Entry(current.to() + 1,
				4784	entry->to(),
				4785	entry->out_set()));
				4786	entry->set_to(current.to());
				4787	}
				4788	ASSERT(entry->to() <= current.to());
				4789	// The overlapping range is now completely contained by the range
				4790	// we're adding so we can just update it and move the start point
				4791	// of the range we're adding just past it.
				4792	entry->AddValue(value);
				4793	// Bail out if the last interval ended at 0xFFFF since otherwise
				4794	// adding 1 will wrap around to 0.
				4795	if (entry->to() == String::kMaxUC16CharCode)
				4796	break;
				4797	ASSERT(entry->to() + 1 > current.from());
				4798	current.set_from(entry->to() + 1);
				4799	} else {
				4800	// There is no overlap so we can just add the range
				4801	ZoneSplayTree<Config>::Locator ins;
				4802	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4803	ins.set_value(Entry(current.from(),
				4804	current.to(),
				4805	empty()->Extend(value)));
				4806	break;
				4807	}
				4808	}
				4809	}
				4810
				4811
				4812	OutSet* DispatchTable::Get(uc16 value) {
				4813	ZoneSplayTree<Config>::Locator loc;
				4814	if (!tree()->FindGreatestLessThan(value, &loc))
				4815	return empty();
				4816	Entry* entry = &loc.value();
				4817	if (value <= entry->to())
				4818	return entry->out_set();
				4819	else
				4820	return empty();
				4821	}
				4822
				4823
				4824	// -------------------------------------------------------------------
				4825	// Analysis
				4826
				4827
				4828	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				4829	StackLimitCheck check;
				4830	if (check.HasOverflowed()) {
				4831	fail("Stack overflow");
				4832	return;
				4833	}
				4834	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				4835	return;
				4836	that->info()->being_analyzed = true;
				4837	that->Accept(this);
				4838	that->info()->being_analyzed = false;
				4839	that->info()->been_analyzed = true;
				4840	}
				4841
				4842
				4843	void Analysis::VisitEnd(EndNode* that) {
				4844	// nothing to do
				4845	}
				4846
				4847
				4848	void TextNode::CalculateOffsets() {
				4849	int element_count = elements()->length();
				4850	// Set up the offsets of the elements relative to the start. This is a fixed
				4851	// quantity since a TextNode can only contain fixed-width things.
				4852	int cp_offset = 0;
				4853	for (int i = 0; i < element_count; i++) {
				4854	TextElement& elm = elements()->at(i);
				4855	elm.cp_offset = cp_offset;
				4856	if (elm.type == TextElement::ATOM) {
				4857	cp_offset += elm.data.u_atom->data().length();
				4858	} else {
				4859	cp_offset++;
				4860	Vector<const uc16> quarks = elm.data.u_atom->data();
				4861	}
				4862	}
				4863	}
				4864
				4865
				4866	void Analysis::VisitText(TextNode* that) {
				4867	if (ignore_case_) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4868	that->MakeCaseIndependent(is_ascii_);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4869	}
				4870	EnsureAnalyzed(that->on_success());
				4871	if (!has_failed()) {
				4872	that->CalculateOffsets();
				4873	}
				4874	}
				4875
				4876
				4877	void Analysis::VisitAction(ActionNode* that) {
				4878	RegExpNode* target = that->on_success();
				4879	EnsureAnalyzed(target);
				4880	if (!has_failed()) {
				4881	// If the next node is interested in what it follows then this node
				4882	// has to be interested too so it can pass the information on.
				4883	that->info()->AddFromFollowing(target->info());
				4884	}
				4885	}
				4886
				4887
				4888	void Analysis::VisitChoice(ChoiceNode* that) {
				4889	NodeInfo* info = that->info();
				4890	for (int i = 0; i < that->alternatives()->length(); i++) {
				4891	RegExpNode* node = that->alternatives()->at(i).node();
				4892	EnsureAnalyzed(node);
				4893	if (has_failed()) return;
				4894	// Anything the following nodes need to know has to be known by
				4895	// this node also, so it can pass it on.
				4896	info->AddFromFollowing(node->info());
				4897	}
				4898	}
				4899
				4900
				4901	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				4902	NodeInfo* info = that->info();
				4903	for (int i = 0; i < that->alternatives()->length(); i++) {
				4904	RegExpNode* node = that->alternatives()->at(i).node();
				4905	if (node != that->loop_node()) {
				4906	EnsureAnalyzed(node);
				4907	if (has_failed()) return;
				4908	info->AddFromFollowing(node->info());
				4909	}
				4910	}
				4911	// Check the loop last since it may need the value of this node
				4912	// to get a correct result.
				4913	EnsureAnalyzed(that->loop_node());
				4914	if (!has_failed()) {
				4915	info->AddFromFollowing(that->loop_node()->info());
				4916	}
				4917	}
				4918
				4919
				4920	void Analysis::VisitBackReference(BackReferenceNode* that) {
				4921	EnsureAnalyzed(that->on_success());
				4922	}
				4923
				4924
				4925	void Analysis::VisitAssertion(AssertionNode* that) {
				4926	EnsureAnalyzed(that->on_success());
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4927	AssertionNode::AssertionNodeType type = that->type();
				4928	if (type == AssertionNode::AT_BOUNDARY \|\|
				4929	type == AssertionNode::AT_NON_BOUNDARY) {
				4930	// Check if the following character is known to be a word character
				4931	// or known to not be a word character.
				4932	ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
				4933
				4934	CharacterRange::Canonicalize(following_chars);
				4935
				4936	SetRelation word_relation =
				4937	CharacterRange::WordCharacterRelation(following_chars);
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4938	if (word_relation.Disjoint()) {
				4939	// Includes the case where following_chars is empty (e.g., end-of-input).
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4940	// Following character is definitely not a word character.
				4941	type = (type == AssertionNode::AT_BOUNDARY) ?
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4942	AssertionNode::AFTER_WORD_CHARACTER :
				4943	AssertionNode::AFTER_NONWORD_CHARACTER;
				4944	that->set_type(type);
				4945	} else if (word_relation.ContainedIn()) {
				4946	// Following character is definitely a word character.
				4947	type = (type == AssertionNode::AT_BOUNDARY) ?
				4948	AssertionNode::AFTER_NONWORD_CHARACTER :
				4949	AssertionNode::AFTER_WORD_CHARACTER;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4950	that->set_type(type);
				4951	}
				4952	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4953	}
				4954
				4955
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4956	ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
				4957	if (first_character_set_ == NULL) {
				4958	if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
				4959	// If we can't find an exact solution within the budget, we
				4960	// set the value to the set of every character, i.e., all characters
				4961	// are possible.
				4962	ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
				4963	all_set->Add(CharacterRange::Everything());
				4964	first_character_set_ = all_set;
				4965	}
				4966	}
				4967	return first_character_set_;
				4968	}
				4969
				4970
				4971	int RegExpNode::ComputeFirstCharacterSet(int budget) {
				4972	// Default behavior is to not be able to determine the first character.
				4973	return kComputeFirstCharacterSetFail;
				4974	}
				4975
				4976
				4977	int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
				4978	budget--;
				4979	if (budget >= 0) {
				4980	// Find loop min-iteration. It's the value of the guarded choice node
				4981	// with a GEQ guard, if any.
				4982	int min_repetition = 0;
				4983
				4984	for (int i = 0; i <= 1; i++) {
				4985	GuardedAlternative alternative = alternatives()->at(i);
				4986	ZoneList<Guard> guards = alternative.guards();
				4987	if (guards != NULL && guards->length() > 0) {
				4988	Guard* guard = guards->at(0);
				4989	if (guard->op() == Guard::GEQ) {
				4990	min_repetition = guard->value();
				4991	break;
				4992	}
				4993	}
				4994	}
				4995
				4996	budget = loop_node()->ComputeFirstCharacterSet(budget);
				4997	if (budget >= 0) {
				4998	ZoneList<CharacterRange>* character_set =
				4999	loop_node()->first_character_set();
				5000	if (body_can_be_zero_length() \|\| min_repetition == 0) {
				5001	budget = continue_node()->ComputeFirstCharacterSet(budget);
				5002	if (budget < 0) return budget;
				5003	ZoneList<CharacterRange>* body_set =
				5004	continue_node()->first_character_set();
				5005	ZoneList<CharacterRange>* union_set =
				5006	new ZoneList<CharacterRange>(Max(character_set->length(),
				5007	body_set->length()));
				5008	CharacterRange::Merge(character_set,
				5009	body_set,
				5010	union_set,
				5011	union_set,
				5012	union_set);
				5013	character_set = union_set;
				5014	}
				5015	set_first_character_set(character_set);
				5016	}
				5017	}
				5018	return budget;
				5019	}
				5020
				5021
				5022	int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
				5023	budget--;
				5024	if (budget >= 0) {
				5025	GuardedAlternative successor = this->alternatives()->at(1);
				5026	RegExpNode* successor_node = successor.node();
				5027	budget = successor_node->ComputeFirstCharacterSet(budget);
				5028	if (budget >= 0) {
				5029	set_first_character_set(successor_node->first_character_set());
				5030	}
				5031	}
				5032	return budget;
				5033	}
				5034
				5035
				5036	// The first character set of an EndNode is unknowable. Just use the
				5037	// default implementation that fails and returns all characters as possible.
				5038
				5039
				5040	int AssertionNode::ComputeFirstCharacterSet(int budget) {
				5041	budget -= 1;
				5042	if (budget >= 0) {
				5043	switch (type_) {
				5044	case AT_END: {
				5045	set_first_character_set(new ZoneList<CharacterRange>(0));
				5046	break;
				5047	}
				5048	case AT_START:
				5049	case AT_BOUNDARY:
				5050	case AT_NON_BOUNDARY:
				5051	case AFTER_NEWLINE:
				5052	case AFTER_NONWORD_CHARACTER:
				5053	case AFTER_WORD_CHARACTER: {
				5054	ASSERT_NOT_NULL(on_success());
				5055	budget = on_success()->ComputeFirstCharacterSet(budget);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5056	if (budget >= 0) {
				5057	set_first_character_set(on_success()->first_character_set());
				5058	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5059	break;
				5060	}
				5061	}
				5062	}
				5063	return budget;
				5064	}
				5065
				5066
				5067	int ActionNode::ComputeFirstCharacterSet(int budget) {
				5068	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
				5069	budget--;
				5070	if (budget >= 0) {
				5071	ASSERT_NOT_NULL(on_success());
				5072	budget = on_success()->ComputeFirstCharacterSet(budget);
				5073	if (budget >= 0) {
				5074	set_first_character_set(on_success()->first_character_set());
				5075	}
				5076	}
				5077	return budget;
				5078	}
				5079
				5080
				5081	int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
				5082	// We don't know anything about the first character of a backreference
				5083	// at this point.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5084	// The potential first characters are the first characters of the capture,
				5085	// and the first characters of the on_success node, depending on whether the
				5086	// capture can be empty and whether it is known to be participating or known
				5087	// not to be.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5088	return kComputeFirstCharacterSetFail;
				5089	}
				5090
				5091
				5092	int TextNode::ComputeFirstCharacterSet(int budget) {
				5093	budget--;
				5094	if (budget >= 0) {
				5095	ASSERT_NE(0, elements()->length());
				5096	TextElement text = elements()->at(0);
				5097	if (text.type == TextElement::ATOM) {
				5098	RegExpAtom* atom = text.data.u_atom;
				5099	ASSERT_NE(0, atom->length());
				5100	uc16 first_char = atom->data()[0];
				5101	ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
				5102	range->Add(CharacterRange(first_char, first_char));
				5103	set_first_character_set(range);
				5104	} else {
				5105	ASSERT(text.type == TextElement::CHAR_CLASS);
				5106	RegExpCharacterClass* char_class = text.data.u_char_class;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5107	ZoneList<CharacterRange>* ranges = char_class->ranges();
				5108	// TODO(lrn): Canonicalize ranges when they are created
				5109	// instead of waiting until now.
				5110	CharacterRange::Canonicalize(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5111	if (char_class->is_negated()) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5112	int length = ranges->length();
				5113	int new_length = length + 1;
				5114	if (length > 0) {
				5115	if (ranges->at(0).from() == 0) new_length--;
				5116	if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
				5117	new_length--;
				5118	}
				5119	}
				5120	ZoneList<CharacterRange>* negated_ranges =
				5121	new ZoneList<CharacterRange>(new_length);
				5122	CharacterRange::Negate(ranges, negated_ranges);
				5123	set_first_character_set(negated_ranges);
				5124	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5125	set_first_character_set(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5126	}
				5127	}
				5128	}
				5129	return budget;
				5130	}
				5131
				5132
				5133
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5134	// -------------------------------------------------------------------
				5135	// Dispatch table construction
				5136
				5137
				5138	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				5139	AddRange(CharacterRange::Everything());
				5140	}
				5141
				5142
				5143	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				5144	node->set_being_calculated(true);
				5145	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				5146	for (int i = 0; i < alternatives->length(); i++) {
				5147	set_choice_index(i);
				5148	alternatives->at(i).node()->Accept(this);
				5149	}
				5150	node->set_being_calculated(false);
				5151	}
				5152
				5153
				5154	class AddDispatchRange {
				5155	public:
				5156	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				5157	: constructor_(constructor) { }
				5158	void Call(uc32 from, DispatchTable::Entry entry);
				5159	private:
				5160	DispatchTableConstructor* constructor_;
				5161	};
				5162
				5163
				5164	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
				5165	CharacterRange range(from, entry.to());
				5166	constructor_->AddRange(range);
				5167	}
				5168
				5169
				5170	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				5171	if (node->being_calculated())
				5172	return;
				5173	DispatchTable* table = node->GetTable(ignore_case_);
				5174	AddDispatchRange adder(this);
				5175	table->ForEach(&adder);
				5176	}
				5177
				5178
				5179	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				5180	// TODO(160): Find the node that we refer back to and propagate its start
				5181	// set back to here. For now we just accept anything.
				5182	AddRange(CharacterRange::Everything());
				5183	}
				5184
				5185
				5186	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				5187	RegExpNode* target = that->on_success();
				5188	target->Accept(this);
				5189	}
				5190
				5191
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5192	static int CompareRangeByFrom(const CharacterRange* a,
				5193	const CharacterRange* b) {
				5194	return Compare<uc16>(a->from(), b->from());
				5195	}
				5196
				5197
				5198	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				5199	ranges->Sort(CompareRangeByFrom);
				5200	uc16 last = 0;
				5201	for (int i = 0; i < ranges->length(); i++) {
				5202	CharacterRange range = ranges->at(i);
				5203	if (last < range.from())
				5204	AddRange(CharacterRange(last, range.from() - 1));
				5205	if (range.to() >= last) {
				5206	if (range.to() == String::kMaxUC16CharCode) {
				5207	return;
				5208	} else {
				5209	last = range.to() + 1;
				5210	}
				5211	}
				5212	}
				5213	AddRange(CharacterRange(last, String::kMaxUC16CharCode));
				5214	}
				5215
				5216
				5217	void DispatchTableConstructor::VisitText(TextNode* that) {
				5218	TextElement elm = that->elements()->at(0);
				5219	switch (elm.type) {
				5220	case TextElement::ATOM: {
				5221	uc16 c = elm.data.u_atom->data()[0];
				5222	AddRange(CharacterRange(c, c));
				5223	break;
				5224	}
				5225	case TextElement::CHAR_CLASS: {
				5226	RegExpCharacterClass* tree = elm.data.u_char_class;
				5227	ZoneList<CharacterRange>* ranges = tree->ranges();
				5228	if (tree->is_negated()) {
				5229	AddInverse(ranges);
				5230	} else {
				5231	for (int i = 0; i < ranges->length(); i++)
				5232	AddRange(ranges->at(i));
				5233	}
				5234	break;
				5235	}
				5236	default: {
				5237	UNIMPLEMENTED();
				5238	}
				5239	}
				5240	}
				5241
				5242
				5243	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				5244	RegExpNode* target = that->on_success();
				5245	target->Accept(this);
				5246	}
				5247
				5248
				5249	RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
				5250	bool ignore_case,
				5251	bool is_multiline,
				5252	Handle<String> pattern,
				5253	bool is_ascii) {
				5254	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				5255	return IrregexpRegExpTooBig();
				5256	}
				5257	RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
				5258	// Wrap the body of the regexp in capture #0.
				5259	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				5260	0,
				5261	&compiler,
				5262	compiler.accept());
				5263	RegExpNode* node = captured_body;
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5264	bool is_end_anchored = data->tree->IsAnchoredAtEnd();
				5265	bool is_start_anchored = data->tree->IsAnchoredAtStart();
				5266	int max_length = data->tree->max_match();
				5267	if (!is_start_anchored) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5268	// Add a .*? at the beginning, outside the body capture, unless
				5269	// this expression is anchored at the beginning.
				5270	RegExpNode* loop_node =
				5271	RegExpQuantifier::ToNode(0,
				5272	RegExpTree::kInfinity,
				5273	false,
				5274	new RegExpCharacterClass('*'),
				5275	&compiler,
				5276	captured_body,
				5277	data->contains_anchor);
				5278
				5279	if (data->contains_anchor) {
				5280	// Unroll loop once, to take care of the case that might start
				5281	// at the start of input.
				5282	ChoiceNode* first_step_node = new ChoiceNode(2);
				5283	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				5284	first_step_node->AddAlternative(GuardedAlternative(
				5285	new TextNode(new RegExpCharacterClass('*'), loop_node)));
				5286	node = first_step_node;
				5287	} else {
				5288	node = loop_node;
				5289	}
				5290	}
				5291	data->node = node;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	5292	Analysis analysis(ignore_case, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5293	analysis.EnsureAnalyzed(node);
				5294	if (analysis.has_failed()) {
				5295	const char* error_message = analysis.error_message();
				5296	return CompilationResult(error_message);
				5297	}
				5298
				5299	NodeInfo info = *node->info();
				5300
				5301	// Create the correct assembler for the architecture.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5302	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5303	// Native regexp implementation.
				5304
				5305	NativeRegExpMacroAssembler::Mode mode =
				5306	is_ascii ? NativeRegExpMacroAssembler::ASCII
				5307	: NativeRegExpMacroAssembler::UC16;
				5308
				5309	#if V8_TARGET_ARCH_IA32
				5310	RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
				5311	#elif V8_TARGET_ARCH_X64
				5312	RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
				5313	#elif V8_TARGET_ARCH_ARM
				5314	RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
				5315	#endif
				5316
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5317	#else // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5318	// Interpreted regexp implementation.
				5319	EmbeddedVector<byte, 1024> codes;
				5320	RegExpMacroAssemblerIrregexp macro_assembler(codes);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5321	#endif // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5322
Ben Murdoch	f87a203	2010-10-22 12:50:53 +0100	[diff] [blame]	5323	// Inserted here, instead of in Assembler, because it depends on information
				5324	// in the AST that isn't replicated in the Node structure.
				5325	static const int kMaxBacksearchLimit = 1024;
				5326	if (is_end_anchored &&
				5327	!is_start_anchored &&
				5328	max_length < kMaxBacksearchLimit) {
				5329	macro_assembler.SetCurrentPositionFromEnd(max_length);
				5330	}
				5331
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5332	return compiler.Assemble(&macro_assembler,
				5333	node,
				5334	data->capture_count,
				5335	pattern);
				5336	}
				5337
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5338
				// Out-of-class definition of the static int array declared in
				// OffsetsVector; it holds kStaticOffsetsVectorSize elements.
				5339	int OffsetsVector::static_offsets_vector_[
				5340	OffsetsVector::kStaticOffsetsVectorSize];
				5341
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5342	}} // namespace v8::internal