Blame - src/jsregexp.cc - fp2-dev/platform/external/v8

blob: 9a1f1f114c0145de49657882f6df3868d5a8db78 [file] [log] [blame]

Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1	// Copyright 2006-2009 the V8 project authors. All rights reserved.
				2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "compiler.h"
				32	#include "execution.h"
				33	#include "factory.h"
				34	#include "jsregexp.h"
				35	#include "platform.h"
				36	#include "runtime.h"
				37	#include "top.h"
				38	#include "compilation-cache.h"
				39	#include "string-stream.h"
				40	#include "parser.h"
				41	#include "regexp-macro-assembler.h"
				42	#include "regexp-macro-assembler-tracer.h"
				43	#include "regexp-macro-assembler-irregexp.h"
				44	#include "regexp-stack.h"
				45
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	46	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	47	#if V8_TARGET_ARCH_IA32
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	48	#include "ia32/regexp-macro-assembler-ia32.h"
				49	#elif V8_TARGET_ARCH_X64
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	50	#include "x64/regexp-macro-assembler-x64.h"
				51	#elif V8_TARGET_ARCH_ARM
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	52	#include "arm/regexp-macro-assembler-arm.h"
				53	#else
				54	#error Unsupported target architecture.
				55	#endif
				56	#endif
				57
				58	#include "interpreter-irregexp.h"
				59
				60
				61	namespace v8 {
				62	namespace internal {
				63
				64
				65	Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
				66	Handle<String> pattern,
				67	Handle<String> flags,
				68	bool* has_pending_exception) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	69	// Call the construct code with 2 arguments.
				70	Object** argv[2] = { Handle<Object>::cast(pattern).location(),
				71	Handle<Object>::cast(flags).location() };
				72	return Execution::New(constructor, 2, argv, has_pending_exception);
				73	}
				74
				75
				76	static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
				77	int flags = JSRegExp::NONE;
				78	for (int i = 0; i < str->length(); i++) {
				79	switch (str->Get(i)) {
				80	case 'i':
				81	flags |= JSRegExp::IGNORE_CASE;
				82	break;
				83	case 'g':
				84	flags |= JSRegExp::GLOBAL;
				85	break;
				86	case 'm':
				87	flags |= JSRegExp::MULTILINE;
				88	break;
				89	}
				90	}
				91	return JSRegExp::Flags(flags);
				92	}
				93
				94
				95	static inline void ThrowRegExpException(Handle<JSRegExp> re,
				96	Handle<String> pattern,
				97	Handle<String> error_text,
				98	const char* message) {
				99	Handle<JSArray> array = Factory::NewJSArray(2);
				100	SetElement(array, 0, pattern);
				101	SetElement(array, 1, error_text);
				102	Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
				103	Top::Throw(*regexp_err);
				104	}
				105
				106
				107	// Generic RegExp methods. Dispatches to implementation specific methods.
				108
				109
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	110	Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
				111	Handle<String> pattern,
				112	Handle<String> flag_str) {
				113	JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
				114	Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
				115	bool in_cache = !cached.is_null();
				116	LOG(RegExpCompileEvent(re, in_cache));
				117
				118	Handle<Object> result;
				119	if (in_cache) {
				120	re->set_data(*cached);
				121	return re;
				122	}
				123	FlattenString(pattern);
				124	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	125	PostponeInterruptsScope postpone;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	126	RegExpCompileData parse_result;
				127	FlatStringReader reader(pattern);
				128	if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
				129	// Throw an exception if we fail to parse the pattern.
				130	ThrowRegExpException(re,
				131	pattern,
				132	parse_result.error,
				133	"malformed_regexp");
				134	return Handle<Object>::null();
				135	}
				136
				137	if (parse_result.simple && !flags.is_ignore_case()) {
				138	// Parse-tree is a single atom that is equal to the pattern.
				139	AtomCompile(re, pattern, flags, pattern);
				140	} else if (parse_result.tree->IsAtom() &&
				141	!flags.is_ignore_case() &&
				142	parse_result.capture_count == 0) {
				143	RegExpAtom* atom = parse_result.tree->AsAtom();
				144	Vector<const uc16> atom_pattern = atom->data();
				145	Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
				146	AtomCompile(re, pattern, flags, atom_string);
				147	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	148	IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	149	}
				150	ASSERT(re->data()->IsFixedArray());
				151	// Compilation succeeded so the data is set on the regexp
				152	// and we can store it in the cache.
				153	Handle<FixedArray> data(FixedArray::cast(re->data()));
				154	CompilationCache::PutRegExp(pattern, flags, data);
				155
				156	return re;
				157	}
				158
				159
				160	Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
				161	Handle<String> subject,
				162	int index,
				163	Handle<JSArray> last_match_info) {
				164	switch (regexp->TypeTag()) {
				165	case JSRegExp::ATOM:
				166	return AtomExec(regexp, subject, index, last_match_info);
				167	case JSRegExp::IRREGEXP: {
				168	Handle<Object> result =
				169	IrregexpExec(regexp, subject, index, last_match_info);
				170	ASSERT(!result.is_null() || Top::has_pending_exception());
				171	return result;
				172	}
				173	default:
				174	UNREACHABLE();
				175	return Handle<Object>::null();
				176	}
				177	}
				178
				179
				180	// RegExp Atom implementation: Simple string search using indexOf.
				181
				182
				183	void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
				184	Handle<String> pattern,
				185	JSRegExp::Flags flags,
				186	Handle<String> match_pattern) {
				187	Factory::SetRegExpAtomData(re,
				188	JSRegExp::ATOM,
				189	pattern,
				190	flags,
				191	match_pattern);
				192	}
				193
				194
				195	static void SetAtomLastCapture(FixedArray* array,
				196	String* subject,
				197	int from,
				198	int to) {
				199	NoHandleAllocation no_handles;
				200	RegExpImpl::SetLastCaptureCount(array, 2);
				201	RegExpImpl::SetLastSubject(array, subject);
				202	RegExpImpl::SetLastInput(array, subject);
				203	RegExpImpl::SetCapture(array, 0, from);
				204	RegExpImpl::SetCapture(array, 1, to);
				205	}
				206
				207
				208	Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
				209	Handle<String> subject,
				210	int index,
				211	Handle<JSArray> last_match_info) {
				212	Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
				213
				214	uint32_t start_index = index;
				215
				216	int value = Runtime::StringMatch(subject, needle, start_index);
				217	if (value == -1) return Factory::null_value();
				218	ASSERT(last_match_info->HasFastElements());
				219
				220	{
				221	NoHandleAllocation no_handles;
				222	FixedArray* array = FixedArray::cast(last_match_info->elements());
				223	SetAtomLastCapture(array, *subject, value, value + needle->length());
				224	}
				225	return last_match_info;
				226	}
				227
				228
				229	// Irregexp implementation.
				230
				231	// Ensures that the regexp object contains a compiled version of the
				232	// source for either ASCII or non-ASCII strings.
				233	// If the compiled version doesn't already exist, it is compiled
				234	// from the source pattern.
				235	// If compilation fails, an exception is thrown and this function
				236	// returns false.
				237	bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				238	Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	239	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	240	if (compiled_code->IsByteArray()) return true;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	241	#else // V8_INTERPRETED_REGEXP (RegExp native code)
				242	if (compiled_code->IsCode()) return true;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	243	#endif
				244	return CompileIrregexp(re, is_ascii);
				245	}
				246
				247
				248	bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
				249	// Compile the RegExp.
				250	CompilationZoneScope zone_scope(DELETE_ON_EXIT);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	251	PostponeInterruptsScope postpone;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	252	Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
				253	if (entry->IsJSObject()) {
				254	// If it's a JSObject, a previous compilation failed and threw this object.
				255	// Re-throw the object without trying again.
				256	Top::Throw(entry);
				257	return false;
				258	}
				259	ASSERT(entry->IsTheHole());
				260
				261	JSRegExp::Flags flags = re->GetFlags();
				262
				263	Handle<String> pattern(re->Pattern());
				264	if (!pattern->IsFlat()) {
				265	FlattenString(pattern);
				266	}
				267
				268	RegExpCompileData compile_data;
				269	FlatStringReader reader(pattern);
				270	if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
				271	// Throw an exception if we fail to parse the pattern.
				272	// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
				273	ThrowRegExpException(re,
				274	pattern,
				275	compile_data.error,
				276	"malformed_regexp");
				277	return false;
				278	}
				279	RegExpEngine::CompilationResult result =
				280	RegExpEngine::Compile(&compile_data,
				281	flags.is_ignore_case(),
				282	flags.is_multiline(),
				283	pattern,
				284	is_ascii);
				285	if (result.error_message != NULL) {
				286	// Unable to compile regexp.
				287	Handle<JSArray> array = Factory::NewJSArray(2);
				288	SetElement(array, 0, pattern);
				289	SetElement(array,
				290	1,
				291	Factory::NewStringFromUtf8(CStrVector(result.error_message)));
				292	Handle<Object> regexp_err =
				293	Factory::NewSyntaxError("malformed_regexp", array);
				294	Top::Throw(*regexp_err);
				295	re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
				296	return false;
				297	}
				298
				299	Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
				300	data->set(JSRegExp::code_index(is_ascii), result.code);
				301	int register_max = IrregexpMaxRegisterCount(*data);
				302	if (result.num_registers > register_max) {
				303	SetIrregexpMaxRegisterCount(*data, result.num_registers);
				304	}
				305
				306	return true;
				307	}
				308
				309
				310	int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
				311	return Smi::cast(
				312	re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				313	}
				314
				315
				316	void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
				317	re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
				318	}
				319
				320
				321	int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
				322	return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
				323	}
				324
				325
				326	int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
				327	return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
				328	}
				329
				330
				331	ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
				332	return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
				333	}
				334
				335
				336	Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
				337	return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
				338	}
				339
				340
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	341	void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
				342	Handle<String> pattern,
				343	JSRegExp::Flags flags,
				344	int capture_count) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	345	// Initialize compiled code entries to null.
				346	Factory::SetRegExpIrregexpData(re,
				347	JSRegExp::IRREGEXP,
				348	pattern,
				349	flags,
				350	capture_count);
				351	}
				352
				353
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	354	int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
				355	Handle<String> subject) {
				356	if (!subject->IsFlat()) {
				357	FlattenString(subject);
				358	}
				359	bool is_ascii = subject->IsAsciiRepresentation();
				360	if (!EnsureCompiledIrregexp(regexp, is_ascii)) {
				361	return -1;
				362	}
				363	#ifdef V8_INTERPRETED_REGEXP
				364	// Byte-code regexp needs space allocated for all its registers.
				365	return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
				366	#else // V8_INTERPRETED_REGEXP
				367	// Native regexp only needs room to output captures. Registers are handled
				368	// internally.
				369	return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
				370	#endif // V8_INTERPRETED_REGEXP
				371	}
				372
				373
				374	RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
				375	Handle<String> subject,
				376	int index,
				377	Vector<int> output) {
				378	Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()));
				379
				380	ASSERT(index >= 0);
				381	ASSERT(index <= subject->length());
				382	ASSERT(subject->IsFlat());
				383
				384	#ifndef V8_INTERPRETED_REGEXP
				385	ASSERT(output.length() >=
				386	(IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
				387	do {
				388	bool is_ascii = subject->IsAsciiRepresentation();
				389	Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii));
				390	NativeRegExpMacroAssembler::Result res =
				391	NativeRegExpMacroAssembler::Match(code,
				392	subject,
				393	output.start(),
				394	output.length(),
				395	index);
				396	if (res != NativeRegExpMacroAssembler::RETRY) {
				397	ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
				398	Top::has_pending_exception());
				399	STATIC_ASSERT(
				400	static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
				401	STATIC_ASSERT(
				402	static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
				403	STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
				404	== RE_EXCEPTION);
				405	return static_cast<IrregexpResult>(res);
				406	}
				407	// If result is RETRY, the string has changed representation, and we
				408	// must restart from scratch.
				409	// In this case, it means we must make sure we are prepared to handle
				410	// the, potentially, different subject (the string can switch between
				411	// being internal and external, and even between being ASCII and UC16,
				412	// but the characters are always the same).
				413	IrregexpPrepare(regexp, subject);
				414	} while (true);
				415	UNREACHABLE();
				416	return RE_EXCEPTION;
				417	#else // V8_INTERPRETED_REGEXP
				418
				419	ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
				420	bool is_ascii = subject->IsAsciiRepresentation();
				421	// We must have done EnsureCompiledIrregexp, so we can get the number of
				422	// registers.
				423	int* register_vector = output.start();
				424	int number_of_capture_registers =
				425	(IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
				426	for (int i = number_of_capture_registers - 1; i >= 0; i--) {
				427	register_vector[i] = -1;
				428	}
				429	Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii));
				430
				431	if (IrregexpInterpreter::Match(byte_codes,
				432	subject,
				433	register_vector,
				434	index)) {
				435	return RE_SUCCESS;
				436	}
				437	return RE_FAILURE;
				438	#endif // V8_INTERPRETED_REGEXP
				439	}
				440
				441
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	442	Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
				443	Handle<String> subject,
				444	int previous_index,
				445	Handle<JSArray> last_match_info) {
				446	ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
				447
				448	// Prepare space for the return values.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	449	#ifdef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	450	#ifdef DEBUG
				451	if (FLAG_trace_regexp_bytecodes) {
				452	String* pattern = jsregexp->Pattern();
				453	PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
				454	PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
				455	}
				456	#endif
				457	#endif
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	458	int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
				459	if (required_registers < 0) {
				460	// Compiling failed with an exception.
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	461	ASSERT(Top::has_pending_exception());
				462	return Handle<Object>::null();
				463	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	464
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	465	OffsetsVector registers(required_registers);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	466
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	467	IrregexpResult res = IrregexpExecOnce(jsregexp,
				468	subject,
				469	previous_index,
				470	Vector<int>(registers.vector(),
				471	registers.length()));
				472	if (res == RE_SUCCESS) {
				473	int capture_register_count =
				474	(IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
				475	last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
				476	AssertNoAllocation no_gc;
				477	int* register_vector = registers.vector();
				478	FixedArray* array = FixedArray::cast(last_match_info->elements());
				479	for (int i = 0; i < capture_register_count; i += 2) {
				480	SetCapture(array, i, register_vector[i]);
				481	SetCapture(array, i + 1, register_vector[i + 1]);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	482	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	483	SetLastCaptureCount(array, capture_register_count);
				484	SetLastSubject(array, *subject);
				485	SetLastInput(array, *subject);
				486	return last_match_info;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	487	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	488	if (res == RE_EXCEPTION) {
				489	ASSERT(Top::has_pending_exception());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	490	return Handle<Object>::null();
				491	}
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	492	ASSERT(res == RE_FAILURE);
				493	return Factory::null_value();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	494	}
				495
				496
				497	// -------------------------------------------------------------------
				498	// Implementation of the Irregexp regular expression engine.
				499	//
				500	// The Irregexp regular expression engine is intended to be a complete
				501	// implementation of ECMAScript regular expressions. It generates either
				502	// bytecodes or native code.
				503
				504	// The Irregexp regexp engine is structured in three steps.
				505	// 1) The parser generates an abstract syntax tree. See ast.cc.
				506	// 2) From the AST a node network is created. The nodes are all
				507	// subclasses of RegExpNode. The nodes represent states when
				508	// executing a regular expression. Several optimizations are
				509	// performed on the node network.
				510	// 3) From the nodes we generate either byte codes or native code
				511	// that can actually execute the regular expression (perform
				512	// the search). The code generation step is described in more
				513	// detail below.
				514
				515	// Code generation.
				516	//
				517	// The nodes are divided into four main categories.
				518	// * Choice nodes
				519	// These represent places where the regular expression can
				520	// match in more than one way. For example on entry to an
				521	// alternation (foo|bar) or a repetition (*, +, ? or {}).
				522	// * Action nodes
				523	// These represent places where some action should be
				524	// performed. Examples include recording the current position
				525	// in the input string to a register (in order to implement
				526	// captures) or other actions on register for example in order
				527	// to implement the counters needed for {} repetitions.
				528	// * Matching nodes
				529	// These attempt to match some element part of the input string.
				530	// Examples of elements include character classes, plain strings
				531	// or back references.
				532	// * End nodes
				533	// These are used to implement the actions required on finding
				534	// a successful match or failing to find a match.
				535	//
				536	// The code generated (whether as byte codes or native code) maintains
				537	// some state as it runs. This consists of the following elements:
				538	//
				539	// * The capture registers. Used for string captures.
				540	// * Other registers. Used for counters etc.
				541	// * The current position.
				542	// * The stack of backtracking information. Used when a matching node
				543	// fails to find a match and needs to try an alternative.
				544	//
				545	// Conceptual regular expression execution model:
				546	//
				547	// There is a simple conceptual model of regular expression execution
				548	// which will be presented first. The actual code generated is a more
				549	// efficient simulation of the simple conceptual model:
				550	//
				551	// * Choice nodes are implemented as follows:
				552	// For each choice except the last {
				553	// push current position
				554	// push backtrack code location
				555	// <generate code to test for choice>
				556	// backtrack code location:
				557	// pop current position
				558	// }
				559	// <generate code to test for last choice>
				560	//
				561	// * Action nodes are generated as follows:
				562	// <push affected registers on backtrack stack>
				563	// <generate code to perform action>
				564	// push backtrack code location
				565	// <generate code to test for following nodes>
				566	// backtrack code location:
				567	// <pop affected registers to restore their state>
				568	// <pop backtrack location from stack and go to it>
				569	//
				570	// * Matching nodes are generated as follows:
				571	// if input string matches at current position
				572	// update current position
				573	// <generate code to test for following nodes>
				574	// else
				575	// <pop backtrack location from stack and go to it>
				576	//
				577	// Thus it can be seen that the current position is saved and restored
				578	// by the choice nodes, whereas the registers are saved and restored by
				579	// the action nodes that manipulate them.
				580	//
				581	// The other interesting aspect of this model is that nodes are generated
				582	// at the point where they are needed by a recursive call to Emit(). If
				583	// the node has already been code generated then the Emit() call will
				584	// generate a jump to the previously generated code instead. In order to
				585	// limit recursion it is possible for the Emit() function to put the node
				586	// on a work list for later generation and instead generate a jump. The
				587	// destination of the jump is resolved later when the code is generated.
				588	//
				589	// Actual regular expression code generation.
				590	//
				591	// Code generation is actually more complicated than the above. In order
				592	// to improve the efficiency of the generated code some optimizations are
				593	// performed
				594	//
				595	// * Choice nodes have 1-character lookahead.
				596	// A choice node looks at the following character and eliminates some of
				597	// the choices immediately based on that character. This is not yet
				598	// implemented.
				599	// * Simple greedy loops store reduced backtracking information.
				600	// A quantifier like /.*foo/m will greedily match the whole input. It will
				601	// then need to backtrack to a point where it can match "foo". The naive
				602	// implementation of this would push each character position onto the
				603	// backtracking stack, then pop them off one by one. This would use space
				604	// proportional to the length of the input string. However since the "."
				605	// can only match in one way and always has a constant length (in this case
				606	// of 1) it suffices to store the current position on the top of the stack
				607	// once. Matching now becomes merely incrementing the current position and
				608	// backtracking becomes decrementing the current position and checking the
				609	// result against the stored current position. This is faster and saves
				610	// space.
				611	// * The current state is virtualized.
				612	// This is used to defer expensive operations until it is clear that they
				613	// are needed and to generate code for a node more than once, allowing
				614	// specialized and efficient versions of the code to be created. This is
				615	// explained in the section below.
				616	//
				617	// Execution state virtualization.
				618	//
				619	// Instead of emitting code, nodes that manipulate the state can record their
				620	// manipulation in an object called the Trace. The Trace object can record a
				621	// current position offset, an optional backtrack code location on the top of
				622	// the virtualized backtrack stack and some register changes. When a node is
				623	// to be emitted it can flush the Trace or update it. Flushing the Trace
				624	// will emit code to bring the actual state into line with the virtual state.
				625	// Avoiding flushing the state can postpone some work (eg updates of capture
				626	// registers). Postponing work can save time when executing the regular
				627	// expression since it may be found that the work never has to be done as a
				628	// failure to match can occur. In addition it is much faster to jump to a
				629	// known backtrack code location than it is to pop an unknown backtrack
				630	// location from the stack and jump there.
				631	//
				632	// The virtual state found in the Trace affects code generation. For example
				633	// the virtual state contains the difference between the actual current
				634	// position and the virtual current position, and matching code needs to use
				635	// this offset to attempt a match in the correct location of the input
				636	// string. Therefore code generated for a non-trivial trace is specialized
				637	// to that trace. The code generator therefore has the ability to generate
				638	// code for each node several times. In order to limit the size of the
				639	// generated code there is an arbitrary limit on how many specialized sets of
				640	// code may be generated for a given node. If the limit is reached, the
				641	// trace is flushed and a generic version of the code for a node is emitted.
				642	// This is subsequently used for that node. The code emitted for non-generic
				643	// trace is not recorded in the node and so it cannot currently be reused in
				644	// the event that code generation is requested for an identical trace.
				645
				646
				647	void RegExpTree::AppendToText(RegExpText* text) {
				648	UNREACHABLE();
				649	}
				650
				651
				652	void RegExpAtom::AppendToText(RegExpText* text) {
				653	text->AddElement(TextElement::Atom(this));
				654	}
				655
				656
				657	void RegExpCharacterClass::AppendToText(RegExpText* text) {
				658	text->AddElement(TextElement::CharClass(this));
				659	}
				660
				661
				662	void RegExpText::AppendToText(RegExpText* text) {
				663	for (int i = 0; i < elements()->length(); i++)
				664	text->AddElement(elements()->at(i));
				665	}
				666
				667
				668	TextElement TextElement::Atom(RegExpAtom* atom) {
				669	TextElement result = TextElement(ATOM);
				670	result.data.u_atom = atom;
				671	return result;
				672	}
				673
				674
				675	TextElement TextElement::CharClass(
				676	RegExpCharacterClass* char_class) {
				677	TextElement result = TextElement(CHAR_CLASS);
				678	result.data.u_char_class = char_class;
				679	return result;
				680	}
				681
				682
				683	int TextElement::length() {
				684	if (type == ATOM) {
				685	return data.u_atom->length();
				686	} else {
				687	ASSERT(type == CHAR_CLASS);
				688	return 1;
				689	}
				690	}
				691
				692
				693	DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
				694	if (table_ == NULL) {
				695	table_ = new DispatchTable();
				696	DispatchTableConstructor cons(table_, ignore_case);
				697	cons.BuildTable(this);
				698	}
				699	return table_;
				700	}
				701
				702
				703	class RegExpCompiler {
				704	public:
				705	RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
				706
				707	int AllocateRegister() {
				708	if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
				709	reg_exp_too_big_ = true;
				710	return next_register_;
				711	}
				712	return next_register_++;
				713	}
				714
				715	RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
				716	RegExpNode* start,
				717	int capture_count,
				718	Handle<String> pattern);
				719
				720	inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
				721
				722	static const int kImplementationOffset = 0;
				723	static const int kNumberOfRegistersOffset = 0;
				724	static const int kCodeOffset = 1;
				725
				726	RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
				727	EndNode* accept() { return accept_; }
				728
				729	static const int kMaxRecursion = 100;
				730	inline int recursion_depth() { return recursion_depth_; }
				731	inline void IncrementRecursionDepth() { recursion_depth_++; }
				732	inline void DecrementRecursionDepth() { recursion_depth_--; }
				733
				734	void SetRegExpTooBig() { reg_exp_too_big_ = true; }
				735
				736	inline bool ignore_case() { return ignore_case_; }
				737	inline bool ascii() { return ascii_; }
				738
				739	static const int kNoRegister = -1;
				740	private:
				741	EndNode* accept_;
				742	int next_register_;
				743	List<RegExpNode> work_list_;
				744	int recursion_depth_;
				745	RegExpMacroAssembler* macro_assembler_;
				746	bool ignore_case_;
				747	bool ascii_;
				748	bool reg_exp_too_big_;
				749	};
				750
				751
				752	class RecursionCheck {
				753	public:
				754	explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
				755	compiler->IncrementRecursionDepth();
				756	}
				757	~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
				758	private:
				759	RegExpCompiler* compiler_;
				760	};
				761
				762
				763	static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
				764	return RegExpEngine::CompilationResult("RegExp too big");
				765	}
				766
				767
				768	// Attempts to compile the regexp using an Irregexp code generator. Returns
				769	// a fixed array or a null handle depending on whether it succeeded.
				770	RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
				771	: next_register_(2 * (capture_count + 1)),
				772	work_list_(NULL),
				773	recursion_depth_(0),
				774	ignore_case_(ignore_case),
				775	ascii_(ascii),
				776	reg_exp_too_big_(false) {
				777	accept_ = new EndNode(EndNode::ACCEPT);
				778	ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
				779	}
				780
				781
				782	RegExpEngine::CompilationResult RegExpCompiler::Assemble(
				783	RegExpMacroAssembler* macro_assembler,
				784	RegExpNode* start,
				785	int capture_count,
				786	Handle<String> pattern) {
				787	#ifdef DEBUG
				788	if (FLAG_trace_regexp_assembler)
				789	macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
				790	else
				791	#endif
				792	macro_assembler_ = macro_assembler;
				793	List <RegExpNode*> work_list(0);
				794	work_list_ = &work_list;
				795	Label fail;
				796	macro_assembler_->PushBacktrack(&fail);
				797	Trace new_trace;
				798	start->Emit(this, &new_trace);
				799	macro_assembler_->Bind(&fail);
				800	macro_assembler_->Fail();
				801	while (!work_list.is_empty()) {
				802	work_list.RemoveLast()->Emit(this, &new_trace);
				803	}
				804	if (reg_exp_too_big_) return IrregexpRegExpTooBig();
				805
				806	Handle<Object> code = macro_assembler_->GetCode(pattern);
				807
				808	work_list_ = NULL;
				809	#ifdef DEBUG
				810	if (FLAG_trace_regexp_assembler) {
				811	delete macro_assembler_;
				812	}
				813	#endif
				814	return RegExpEngine::CompilationResult(*code, next_register_);
				815	}
				816
				817
				818	bool Trace::DeferredAction::Mentions(int that) {
				819	if (type() == ActionNode::CLEAR_CAPTURES) {
				820	Interval range = static_cast<DeferredClearCaptures*>(this)->range();
				821	return range.Contains(that);
				822	} else {
				823	return reg() == that;
				824	}
				825	}
				826
				827
				828	bool Trace::mentions_reg(int reg) {
				829	for (DeferredAction* action = actions_;
				830	action != NULL;
				831	action = action->next()) {
				832	if (action->Mentions(reg))
				833	return true;
				834	}
				835	return false;
				836	}
				837
				838
				839	bool Trace::GetStoredPosition(int reg, int* cp_offset) {
				840	ASSERT_EQ(0, *cp_offset);
				841	for (DeferredAction* action = actions_;
				842	action != NULL;
				843	action = action->next()) {
				844	if (action->Mentions(reg)) {
				845	if (action->type() == ActionNode::STORE_POSITION) {
				846	cp_offset = static_cast<DeferredCapture>(action)->cp_offset();
				847	return true;
				848	} else {
				849	return false;
				850	}
				851	}
				852	}
				853	return false;
				854	}
				855
				856
				857	int Trace::FindAffectedRegisters(OutSet* affected_registers) {
				858	int max_register = RegExpCompiler::kNoRegister;
				859	for (DeferredAction* action = actions_;
				860	action != NULL;
				861	action = action->next()) {
				862	if (action->type() == ActionNode::CLEAR_CAPTURES) {
				863	Interval range = static_cast<DeferredClearCaptures*>(action)->range();
				864	for (int i = range.from(); i <= range.to(); i++)
				865	affected_registers->Set(i);
				866	if (range.to() > max_register) max_register = range.to();
				867	} else {
				868	affected_registers->Set(action->reg());
				869	if (action->reg() > max_register) max_register = action->reg();
				870	}
				871	}
				872	return max_register;
				873	}
				874
				875
				876	void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
				877	int max_register,
				878	OutSet& registers_to_pop,
				879	OutSet& registers_to_clear) {
				880	for (int reg = max_register; reg >= 0; reg--) {
				881	if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
				882	else if (registers_to_clear.Get(reg)) {
				883	int clear_to = reg;
				884	while (reg > 0 && registers_to_clear.Get(reg - 1)) {
				885	reg--;
				886	}
				887	assembler->ClearRegisters(reg, clear_to);
				888	}
				889	}
				890	}
				891
				892
				893	void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
				894	int max_register,
				895	OutSet& affected_registers,
				896	OutSet* registers_to_pop,
				897	OutSet* registers_to_clear) {
				898	// The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
				899	const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
				900
				901	// Count pushes performed to force a stack limit check occasionally.
				902	int pushes = 0;
				903
				904	for (int reg = 0; reg <= max_register; reg++) {
				905	if (!affected_registers.Get(reg)) {
				906	continue;
				907	}
				908
				909	// The chronologically first deferred action in the trace
				910	// is used to infer the action needed to restore a register
				911	// to its previous state (or not, if it's safe to ignore it).
				912	enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
				913	DeferredActionUndoType undo_action = IGNORE;
				914
				915	int value = 0;
				916	bool absolute = false;
				917	bool clear = false;
				918	int store_position = -1;
				919	// This is a little tricky because we are scanning the actions in reverse
				920	// historical order (newest first).
				921	for (DeferredAction* action = actions_;
				922	action != NULL;
				923	action = action->next()) {
				924	if (action->Mentions(reg)) {
				925	switch (action->type()) {
				926	case ActionNode::SET_REGISTER: {
				927	Trace::DeferredSetRegister* psr =
				928	static_cast<Trace::DeferredSetRegister*>(action);
				929	if (!absolute) {
				930	value += psr->value();
				931	absolute = true;
				932	}
				933	// SET_REGISTER is currently only used for newly introduced loop
				934	// counters. They can have a significant previous value if they
				935	// occour in a loop. TODO(lrn): Propagate this information, so
				936	// we can set undo_action to IGNORE if we know there is no value to
				937	// restore.
				938	undo_action = RESTORE;
				939	ASSERT_EQ(store_position, -1);
				940	ASSERT(!clear);
				941	break;
				942	}
				943	case ActionNode::INCREMENT_REGISTER:
				944	if (!absolute) {
				945	value++;
				946	}
				947	ASSERT_EQ(store_position, -1);
				948	ASSERT(!clear);
				949	undo_action = RESTORE;
				950	break;
				951	case ActionNode::STORE_POSITION: {
				952	Trace::DeferredCapture* pc =
				953	static_cast<Trace::DeferredCapture*>(action);
				954	if (!clear && store_position == -1) {
				955	store_position = pc->cp_offset();
				956	}
				957
				958	// For captures we know that stores and clears alternate.
				959	// Other register, are never cleared, and if the occur
				960	// inside a loop, they might be assigned more than once.
				961	if (reg <= 1) {
				962	// Registers zero and one, aka "capture zero", is
				963	// always set correctly if we succeed. There is no
				964	// need to undo a setting on backtrack, because we
				965	// will set it again or fail.
				966	undo_action = IGNORE;
				967	} else {
				968	undo_action = pc->is_capture() ? CLEAR : RESTORE;
				969	}
				970	ASSERT(!absolute);
				971	ASSERT_EQ(value, 0);
				972	break;
				973	}
				974	case ActionNode::CLEAR_CAPTURES: {
				975	// Since we're scanning in reverse order, if we've already
				976	// set the position we have to ignore historically earlier
				977	// clearing operations.
				978	if (store_position == -1) {
				979	clear = true;
				980	}
				981	undo_action = RESTORE;
				982	ASSERT(!absolute);
				983	ASSERT_EQ(value, 0);
				984	break;
				985	}
				986	default:
				987	UNREACHABLE();
				988	break;
				989	}
				990	}
				991	}
				992	// Prepare for the undo-action (e.g., push if it's going to be popped).
				993	if (undo_action == RESTORE) {
				994	pushes++;
				995	RegExpMacroAssembler::StackCheckFlag stack_check =
				996	RegExpMacroAssembler::kNoStackLimitCheck;
				997	if (pushes == push_limit) {
				998	stack_check = RegExpMacroAssembler::kCheckStackLimit;
				999	pushes = 0;
				1000	}
				1001
				1002	assembler->PushRegister(reg, stack_check);
				1003	registers_to_pop->Set(reg);
				1004	} else if (undo_action == CLEAR) {
				1005	registers_to_clear->Set(reg);
				1006	}
				1007	// Perform the chronologically last action (or accumulated increment)
				1008	// for the register.
				1009	if (store_position != -1) {
				1010	assembler->WriteCurrentPositionToRegister(reg, store_position);
				1011	} else if (clear) {
				1012	assembler->ClearRegisters(reg, reg);
				1013	} else if (absolute) {
				1014	assembler->SetRegister(reg, value);
				1015	} else if (value != 0) {
				1016	assembler->AdvanceRegister(reg, value);
				1017	}
				1018	}
				1019	}
				1020
				1021
				1022	// This is called as we come into a loop choice node and some other tricky
				1023	// nodes. It normalizes the state of the code generator to ensure we can
				1024	// generate generic code.
				1025	void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
				1026	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1027
				1028	ASSERT(!is_trivial());
				1029
				1030	if (actions_ == NULL && backtrack() == NULL) {
				1031	// Here we just have some deferred cp advances to fix and we are back to
				1032	// a normal situation. We may also have to forget some information gained
				1033	// through a quick check that was already performed.
				1034	if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
				1035	// Create a new trivial state and generate the node with that.
				1036	Trace new_state;
				1037	successor->Emit(compiler, &new_state);
				1038	return;
				1039	}
				1040
				1041	// Generate deferred actions here along with code to undo them again.
				1042	OutSet affected_registers;
				1043
				1044	if (backtrack() != NULL) {
				1045	// Here we have a concrete backtrack location. These are set up by choice
				1046	// nodes and so they indicate that we have a deferred save of the current
				1047	// position which we may need to emit here.
				1048	assembler->PushCurrentPosition();
				1049	}
				1050
				1051	int max_register = FindAffectedRegisters(&affected_registers);
				1052	OutSet registers_to_pop;
				1053	OutSet registers_to_clear;
				1054	PerformDeferredActions(assembler,
				1055	max_register,
				1056	affected_registers,
				1057	&registers_to_pop,
				1058	&registers_to_clear);
				1059	if (cp_offset_ != 0) {
				1060	assembler->AdvanceCurrentPosition(cp_offset_);
				1061	}
				1062
				1063	// Create a new trivial state and generate the node with that.
				1064	Label undo;
				1065	assembler->PushBacktrack(&undo);
				1066	Trace new_state;
				1067	successor->Emit(compiler, &new_state);
				1068
				1069	// On backtrack we need to restore state.
				1070	assembler->Bind(&undo);
				1071	RestoreAffectedRegisters(assembler,
				1072	max_register,
				1073	registers_to_pop,
				1074	registers_to_clear);
				1075	if (backtrack() == NULL) {
				1076	assembler->Backtrack();
				1077	} else {
				1078	assembler->PopCurrentPosition();
				1079	assembler->GoTo(backtrack());
				1080	}
				1081	}
				1082
				1083
				1084	void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
				1085	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1086
				1087	// Omit flushing the trace. We discard the entire stack frame anyway.
				1088
				1089	if (!label()->is_bound()) {
				1090	// We are completely independent of the trace, since we ignore it,
				1091	// so this code can be used as the generic version.
				1092	assembler->Bind(label());
				1093	}
				1094
				1095	// Throw away everything on the backtrack stack since the start
				1096	// of the negative submatch and restore the character position.
				1097	assembler->ReadCurrentPositionFromRegister(current_position_register_);
				1098	assembler->ReadStackPointerFromRegister(stack_pointer_register_);
				1099	if (clear_capture_count_ > 0) {
				1100	// Clear any captures that might have been performed during the success
				1101	// of the body of the negative look-ahead.
				1102	int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
				1103	assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
				1104	}
				1105	// Now that we have unwound the stack we find at the top of the stack the
				1106	// backtrack that the BeginSubmatch node got.
				1107	assembler->Backtrack();
				1108	}
				1109
				1110
				1111	void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				1112	if (!trace->is_trivial()) {
				1113	trace->Flush(compiler, this);
				1114	return;
				1115	}
				1116	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1117	if (!label()->is_bound()) {
				1118	assembler->Bind(label());
				1119	}
				1120	switch (action_) {
				1121	case ACCEPT:
				1122	assembler->Succeed();
				1123	return;
				1124	case BACKTRACK:
				1125	assembler->GoTo(trace->backtrack());
				1126	return;
				1127	case NEGATIVE_SUBMATCH_SUCCESS:
				1128	// This case is handled in a different virtual method.
				1129	UNREACHABLE();
				1130	}
				1131	UNIMPLEMENTED();
				1132	}
				1133
				1134
				1135	void GuardedAlternative::AddGuard(Guard* guard) {
				1136	if (guards_ == NULL)
				1137	guards_ = new ZoneList<Guard*>(1);
				1138	guards_->Add(guard);
				1139	}
				1140
				1141
				1142	ActionNode* ActionNode::SetRegister(int reg,
				1143	int val,
				1144	RegExpNode* on_success) {
				1145	ActionNode* result = new ActionNode(SET_REGISTER, on_success);
				1146	result->data_.u_store_register.reg = reg;
				1147	result->data_.u_store_register.value = val;
				1148	return result;
				1149	}
				1150
				1151
				1152	ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
				1153	ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
				1154	result->data_.u_increment_register.reg = reg;
				1155	return result;
				1156	}
				1157
				1158
				1159	ActionNode* ActionNode::StorePosition(int reg,
				1160	bool is_capture,
				1161	RegExpNode* on_success) {
				1162	ActionNode* result = new ActionNode(STORE_POSITION, on_success);
				1163	result->data_.u_position_register.reg = reg;
				1164	result->data_.u_position_register.is_capture = is_capture;
				1165	return result;
				1166	}
				1167
				1168
				1169	ActionNode* ActionNode::ClearCaptures(Interval range,
				1170	RegExpNode* on_success) {
				1171	ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
				1172	result->data_.u_clear_captures.range_from = range.from();
				1173	result->data_.u_clear_captures.range_to = range.to();
				1174	return result;
				1175	}
				1176
				1177
				1178	ActionNode* ActionNode::BeginSubmatch(int stack_reg,
				1179	int position_reg,
				1180	RegExpNode* on_success) {
				1181	ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
				1182	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1183	result->data_.u_submatch.current_position_register = position_reg;
				1184	return result;
				1185	}
				1186
				1187
				1188	ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
				1189	int position_reg,
				1190	int clear_register_count,
				1191	int clear_register_from,
				1192	RegExpNode* on_success) {
				1193	ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
				1194	result->data_.u_submatch.stack_pointer_register = stack_reg;
				1195	result->data_.u_submatch.current_position_register = position_reg;
				1196	result->data_.u_submatch.clear_register_count = clear_register_count;
				1197	result->data_.u_submatch.clear_register_from = clear_register_from;
				1198	return result;
				1199	}
				1200
				1201
				1202	ActionNode* ActionNode::EmptyMatchCheck(int start_register,
				1203	int repetition_register,
				1204	int repetition_limit,
				1205	RegExpNode* on_success) {
				1206	ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
				1207	result->data_.u_empty_match_check.start_register = start_register;
				1208	result->data_.u_empty_match_check.repetition_register = repetition_register;
				1209	result->data_.u_empty_match_check.repetition_limit = repetition_limit;
				1210	return result;
				1211	}
				1212
				1213
				1214	#define DEFINE_ACCEPT(Type) \
				1215	void Type##Node::Accept(NodeVisitor* visitor) { \
				1216	visitor->Visit##Type(this); \
				1217	}
				1218	FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
				1219	#undef DEFINE_ACCEPT
				1220
				1221
				1222	void LoopChoiceNode::Accept(NodeVisitor* visitor) {
				1223	visitor->VisitLoopChoice(this);
				1224	}
				1225
				1226
				1227	// -------------------------------------------------------------------
				1228	// Emit code.
				1229
				1230
				1231	void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
				1232	Guard* guard,
				1233	Trace* trace) {
				1234	switch (guard->op()) {
				1235	case Guard::LT:
				1236	ASSERT(!trace->mentions_reg(guard->reg()));
				1237	macro_assembler->IfRegisterGE(guard->reg(),
				1238	guard->value(),
				1239	trace->backtrack());
				1240	break;
				1241	case Guard::GEQ:
				1242	ASSERT(!trace->mentions_reg(guard->reg()));
				1243	macro_assembler->IfRegisterLT(guard->reg(),
				1244	guard->value(),
				1245	trace->backtrack());
				1246	break;
				1247	}
				1248	}
				1249
				1250
				1251	static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
				1252	static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
				1253
				1254
				1255	// Returns the number of characters in the equivalence class, omitting those
				1256	// that cannot occur in the source string because it is ASCII.
				1257	static int GetCaseIndependentLetters(uc16 character,
				1258	bool ascii_subject,
				1259	unibrow::uchar* letters) {
				1260	int length = uncanonicalize.get(character, '\0', letters);
				1261	// Unibrow returns 0 or 1 for characters where case independependence is
				1262	// trivial.
				1263	if (length == 0) {
				1264	letters[0] = character;
				1265	length = 1;
				1266	}
				1267	if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {
				1268	return length;
				1269	}
				1270	// The standard requires that non-ASCII characters cannot have ASCII
				1271	// character codes in their equivalence class.
				1272	return 0;
				1273	}
				1274
				1275
				1276	static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
				1277	uc16 c,
				1278	Label* on_failure,
				1279	int cp_offset,
				1280	bool check,
				1281	bool preloaded) {
				1282	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1283	bool bound_checked = false;
				1284	if (!preloaded) {
				1285	assembler->LoadCurrentCharacter(
				1286	cp_offset,
				1287	on_failure,
				1288	check);
				1289	bound_checked = true;
				1290	}
				1291	assembler->CheckNotCharacter(c, on_failure);
				1292	return bound_checked;
				1293	}
				1294
				1295
				1296	// Only emits non-letters (things that don't have case). Only used for case
				1297	// independent matches.
				1298	static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
				1299	uc16 c,
				1300	Label* on_failure,
				1301	int cp_offset,
				1302	bool check,
				1303	bool preloaded) {
				1304	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1305	bool ascii = compiler->ascii();
				1306	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1307	int length = GetCaseIndependentLetters(c, ascii, chars);
				1308	if (length < 1) {
				1309	// This can't match. Must be an ASCII subject and a non-ASCII character.
				1310	// We do not need to do anything since the ASCII pass already handled this.
				1311	return false; // Bounds not checked.
				1312	}
				1313	bool checked = false;
				1314	// We handle the length > 1 case in a later pass.
				1315	if (length == 1) {
				1316	if (ascii && c > String::kMaxAsciiCharCodeU) {
				1317	// Can't match - see above.
				1318	return false; // Bounds not checked.
				1319	}
				1320	if (!preloaded) {
				1321	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1322	checked = check;
				1323	}
				1324	macro_assembler->CheckNotCharacter(c, on_failure);
				1325	}
				1326	return checked;
				1327	}
				1328
				1329
				1330	static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
				1331	bool ascii,
				1332	uc16 c1,
				1333	uc16 c2,
				1334	Label* on_failure) {
				1335	uc16 char_mask;
				1336	if (ascii) {
				1337	char_mask = String::kMaxAsciiCharCode;
				1338	} else {
				1339	char_mask = String::kMaxUC16CharCode;
				1340	}
				1341	uc16 exor = c1 ^ c2;
				1342	// Check whether exor has only one bit set.
				1343	if (((exor - 1) & exor) == 0) {
				1344	// If c1 and c2 differ only by one bit.
				1345	// Ecma262UnCanonicalize always gives the highest number last.
				1346	ASSERT(c2 > c1);
				1347	uc16 mask = char_mask ^ exor;
				1348	macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
				1349	return true;
				1350	}
				1351	ASSERT(c2 > c1);
				1352	uc16 diff = c2 - c1;
				1353	if (((diff - 1) & diff) == 0 && c1 >= diff) {
				1354	// If the characters differ by 2^n but don't differ by one bit then
				1355	// subtract the difference from the found character, then do the or
				1356	// trick. We avoid the theoretical case where negative numbers are
				1357	// involved in order to simplify code generation.
				1358	uc16 mask = char_mask ^ diff;
				1359	macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
				1360	diff,
				1361	mask,
				1362	on_failure);
				1363	return true;
				1364	}
				1365	return false;
				1366	}
				1367
				1368
				1369	typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
				1370	uc16 c,
				1371	Label* on_failure,
				1372	int cp_offset,
				1373	bool check,
				1374	bool preloaded);
				1375
				1376	// Only emits letters (things that have case). Only used for case independent
				1377	// matches.
				1378	static inline bool EmitAtomLetter(RegExpCompiler* compiler,
				1379	uc16 c,
				1380	Label* on_failure,
				1381	int cp_offset,
				1382	bool check,
				1383	bool preloaded) {
				1384	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1385	bool ascii = compiler->ascii();
				1386	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1387	int length = GetCaseIndependentLetters(c, ascii, chars);
				1388	if (length <= 1) return false;
				1389	// We may not need to check against the end of the input string
				1390	// if this character lies before a character that matched.
				1391	if (!preloaded) {
				1392	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
				1393	}
				1394	Label ok;
				1395	ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
				1396	switch (length) {
				1397	case 2: {
				1398	if (ShortCutEmitCharacterPair(macro_assembler,
				1399	ascii,
				1400	chars[0],
				1401	chars[1],
				1402	on_failure)) {
				1403	} else {
				1404	macro_assembler->CheckCharacter(chars[0], &ok);
				1405	macro_assembler->CheckNotCharacter(chars[1], on_failure);
				1406	macro_assembler->Bind(&ok);
				1407	}
				1408	break;
				1409	}
				1410	case 4:
				1411	macro_assembler->CheckCharacter(chars[3], &ok);
				1412	// Fall through!
				1413	case 3:
				1414	macro_assembler->CheckCharacter(chars[0], &ok);
				1415	macro_assembler->CheckCharacter(chars[1], &ok);
				1416	macro_assembler->CheckNotCharacter(chars[2], on_failure);
				1417	macro_assembler->Bind(&ok);
				1418	break;
				1419	default:
				1420	UNREACHABLE();
				1421	break;
				1422	}
				1423	return true;
				1424	}
				1425
				1426
				1427	static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
				1428	RegExpCharacterClass* cc,
				1429	bool ascii,
				1430	Label* on_failure,
				1431	int cp_offset,
				1432	bool check_offset,
				1433	bool preloaded) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1434	ZoneList<CharacterRange>* ranges = cc->ranges();
				1435	int max_char;
				1436	if (ascii) {
				1437	max_char = String::kMaxAsciiCharCode;
				1438	} else {
				1439	max_char = String::kMaxUC16CharCode;
				1440	}
				1441
				1442	Label success;
				1443
				1444	Label* char_is_in_class =
				1445	cc->is_negated() ? on_failure : &success;
				1446
				1447	int range_count = ranges->length();
				1448
				1449	int last_valid_range = range_count - 1;
				1450	while (last_valid_range >= 0) {
				1451	CharacterRange& range = ranges->at(last_valid_range);
				1452	if (range.from() <= max_char) {
				1453	break;
				1454	}
				1455	last_valid_range--;
				1456	}
				1457
				1458	if (last_valid_range < 0) {
				1459	if (!cc->is_negated()) {
				1460	// TODO(plesner): We can remove this when the node level does our
				1461	// ASCII optimizations for us.
				1462	macro_assembler->GoTo(on_failure);
				1463	}
				1464	if (check_offset) {
				1465	macro_assembler->CheckPosition(cp_offset, on_failure);
				1466	}
				1467	return;
				1468	}
				1469
				1470	if (last_valid_range == 0 &&
				1471	!cc->is_negated() &&
				1472	ranges->at(0).IsEverything(max_char)) {
				1473	// This is a common case hit by non-anchored expressions.
				1474	if (check_offset) {
				1475	macro_assembler->CheckPosition(cp_offset, on_failure);
				1476	}
				1477	return;
				1478	}
				1479
				1480	if (!preloaded) {
				1481	macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
				1482	}
				1483
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1484	if (cc->is_standard() &&
				1485	macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
				1486	on_failure)) {
				1487	return;
				1488	}
				1489
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1490	for (int i = 0; i < last_valid_range; i++) {
				1491	CharacterRange& range = ranges->at(i);
				1492	Label next_range;
				1493	uc16 from = range.from();
				1494	uc16 to = range.to();
				1495	if (from > max_char) {
				1496	continue;
				1497	}
				1498	if (to > max_char) to = max_char;
				1499	if (to == from) {
				1500	macro_assembler->CheckCharacter(to, char_is_in_class);
				1501	} else {
				1502	if (from != 0) {
				1503	macro_assembler->CheckCharacterLT(from, &next_range);
				1504	}
				1505	if (to != max_char) {
				1506	macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
				1507	} else {
				1508	macro_assembler->GoTo(char_is_in_class);
				1509	}
				1510	}
				1511	macro_assembler->Bind(&next_range);
				1512	}
				1513
				1514	CharacterRange& range = ranges->at(last_valid_range);
				1515	uc16 from = range.from();
				1516	uc16 to = range.to();
				1517
				1518	if (to > max_char) to = max_char;
				1519	ASSERT(to >= from);
				1520
				1521	if (to == from) {
				1522	if (cc->is_negated()) {
				1523	macro_assembler->CheckCharacter(to, on_failure);
				1524	} else {
				1525	macro_assembler->CheckNotCharacter(to, on_failure);
				1526	}
				1527	} else {
				1528	if (from != 0) {
				1529	if (cc->is_negated()) {
				1530	macro_assembler->CheckCharacterLT(from, &success);
				1531	} else {
				1532	macro_assembler->CheckCharacterLT(from, on_failure);
				1533	}
				1534	}
				1535	if (to != String::kMaxUC16CharCode) {
				1536	if (cc->is_negated()) {
				1537	macro_assembler->CheckCharacterLT(to + 1, on_failure);
				1538	} else {
				1539	macro_assembler->CheckCharacterGT(to, on_failure);
				1540	}
				1541	} else {
				1542	if (cc->is_negated()) {
				1543	macro_assembler->GoTo(on_failure);
				1544	}
				1545	}
				1546	}
				1547	macro_assembler->Bind(&success);
				1548	}
				1549
				1550
				1551	RegExpNode::~RegExpNode() {
				1552	}
				1553
				1554
				1555	RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
				1556	Trace* trace) {
				1557	// If we are generating a greedy loop then don't stop and don't reuse code.
				1558	if (trace->stop_node() != NULL) {
				1559	return CONTINUE;
				1560	}
				1561
				1562	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				1563	if (trace->is_trivial()) {
				1564	if (label_.is_bound()) {
				1565	// We are being asked to generate a generic version, but that's already
				1566	// been done so just go to it.
				1567	macro_assembler->GoTo(&label_);
				1568	return DONE;
				1569	}
				1570	if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
				1571	// To avoid too deep recursion we push the node to the work queue and just
				1572	// generate a goto here.
				1573	compiler->AddWork(this);
				1574	macro_assembler->GoTo(&label_);
				1575	return DONE;
				1576	}
				1577	// Generate generic version of the node and bind the label for later use.
				1578	macro_assembler->Bind(&label_);
				1579	return CONTINUE;
				1580	}
				1581
				1582	// We are being asked to make a non-generic version. Keep track of how many
				1583	// non-generic versions we generate so as not to overdo it.
				1584	trace_count_++;
				1585	if (FLAG_regexp_optimization &&
				1586	trace_count_ < kMaxCopiesCodeGenerated &&
				1587	compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
				1588	return CONTINUE;
				1589	}
				1590
				1591	// If we get here code has been generated for this node too many times or
				1592	// recursion is too deep. Time to switch to a generic version. The code for
				1593	// generic versions above can handle deep recursion properly.
				1594	trace->Flush(compiler, this);
				1595	return DONE;
				1596	}
				1597
				1598
				1599	int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1600	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1601	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
				1602	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1603	}
				1604
				1605
				1606	int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1607	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1608	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1609	}
				1610
				1611
				1612	int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1613	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1614	return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
				1615	}
				1616
				1617
				1618	int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1619	int answer = Length();
				1620	if (answer >= still_to_find) return answer;
				1621	if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
				1622	return answer + on_success()->EatsAtLeast(still_to_find - answer,
				1623	recursion_depth + 1);
				1624	}
				1625
				1626
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	1627	int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
				1628	int recursion_depth) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1629	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1630	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1631	// afterwards.
				1632	RegExpNode* node = alternatives_->at(1).node();
				1633	return node->EatsAtLeast(still_to_find, recursion_depth + 1);
				1634	}
				1635
				1636
				1637	void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
				1638	QuickCheckDetails* details,
				1639	RegExpCompiler* compiler,
				1640	int filled_in,
				1641	bool not_at_start) {
				1642	// Alternative 0 is the negative lookahead, alternative 1 is what comes
				1643	// afterwards.
				1644	RegExpNode* node = alternatives_->at(1).node();
				1645	return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
				1646	}
				1647
				1648
				1649	int ChoiceNode::EatsAtLeastHelper(int still_to_find,
				1650	int recursion_depth,
				1651	RegExpNode* ignore_this_node) {
				1652	if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
				1653	int min = 100;
				1654	int choice_count = alternatives_->length();
				1655	for (int i = 0; i < choice_count; i++) {
				1656	RegExpNode* node = alternatives_->at(i).node();
				1657	if (node == ignore_this_node) continue;
				1658	int node_eats_at_least = node->EatsAtLeast(still_to_find,
				1659	recursion_depth + 1);
				1660	if (node_eats_at_least < min) min = node_eats_at_least;
				1661	}
				1662	return min;
				1663	}
				1664
				1665
				1666	int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1667	return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
				1668	}
				1669
				1670
				1671	int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
				1672	return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
				1673	}
				1674
				1675
				1676	// Takes the left-most 1-bit and smears it out, setting all bits to its right.
				1677	static inline uint32_t SmearBitsRight(uint32_t v) {
				1678	v \|= v >> 1;
				1679	v \|= v >> 2;
				1680	v \|= v >> 4;
				1681	v \|= v >> 8;
				1682	v \|= v >> 16;
				1683	return v;
				1684	}
				1685
				1686
				1687	bool QuickCheckDetails::Rationalize(bool asc) {
				1688	bool found_useful_op = false;
				1689	uint32_t char_mask;
				1690	if (asc) {
				1691	char_mask = String::kMaxAsciiCharCode;
				1692	} else {
				1693	char_mask = String::kMaxUC16CharCode;
				1694	}
				1695	mask_ = 0;
				1696	value_ = 0;
				1697	int char_shift = 0;
				1698	for (int i = 0; i < characters_; i++) {
				1699	Position* pos = &positions_[i];
				1700	if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
				1701	found_useful_op = true;
				1702	}
				1703	mask_ \|= (pos->mask & char_mask) << char_shift;
				1704	value_ \|= (pos->value & char_mask) << char_shift;
				1705	char_shift += asc ? 8 : 16;
				1706	}
				1707	return found_useful_op;
				1708	}
				1709
				1710
				1711	bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
				1712	Trace* trace,
				1713	bool preload_has_checked_bounds,
				1714	Label* on_possible_success,
				1715	QuickCheckDetails* details,
				1716	bool fall_through_on_failure) {
				1717	if (details->characters() == 0) return false;
				1718	GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
				1719	if (details->cannot_match()) return false;
				1720	if (!details->Rationalize(compiler->ascii())) return false;
				1721	ASSERT(details->characters() == 1 \|\|
				1722	compiler->macro_assembler()->CanReadUnaligned());
				1723	uint32_t mask = details->mask();
				1724	uint32_t value = details->value();
				1725
				1726	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				1727
				1728	if (trace->characters_preloaded() != details->characters()) {
				1729	assembler->LoadCurrentCharacter(trace->cp_offset(),
				1730	trace->backtrack(),
				1731	!preload_has_checked_bounds,
				1732	details->characters());
				1733	}
				1734
				1735
				1736	bool need_mask = true;
				1737
				1738	if (details->characters() == 1) {
				1739	// If number of characters preloaded is 1 then we used a byte or 16 bit
				1740	// load so the value is already masked down.
				1741	uint32_t char_mask;
				1742	if (compiler->ascii()) {
				1743	char_mask = String::kMaxAsciiCharCode;
				1744	} else {
				1745	char_mask = String::kMaxUC16CharCode;
				1746	}
				1747	if ((mask & char_mask) == char_mask) need_mask = false;
				1748	mask &= char_mask;
				1749	} else {
				1750	// For 2-character preloads in ASCII mode we also use a 16 bit load with
				1751	// zero extend.
				1752	if (details->characters() == 2 && compiler->ascii()) {
				1753	if ((mask & 0xffff) == 0xffff) need_mask = false;
				1754	} else {
				1755	if (mask == 0xffffffff) need_mask = false;
				1756	}
				1757	}
				1758
				1759	if (fall_through_on_failure) {
				1760	if (need_mask) {
				1761	assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
				1762	} else {
				1763	assembler->CheckCharacter(value, on_possible_success);
				1764	}
				1765	} else {
				1766	if (need_mask) {
				1767	assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
				1768	} else {
				1769	assembler->CheckNotCharacter(value, trace->backtrack());
				1770	}
				1771	}
				1772	return true;
				1773	}
				1774
				1775
				1776	// Here is the meat of GetQuickCheckDetails (see also the comment on the
				1777	// super-class in the .h file).
				1778	//
				1779	// We iterate along the text object, building up for each character a
				1780	// mask and value that can be used to test for a quick failure to match.
				1781	// The masks and values for the positions will be combined into a single
				1782	// machine word for the current character width in order to be used in
				1783	// generating a quick check.
				1784	void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
				1785	RegExpCompiler* compiler,
				1786	int characters_filled_in,
				1787	bool not_at_start) {
				1788	ASSERT(characters_filled_in < details->characters());
				1789	int characters = details->characters();
				1790	int char_mask;
				1791	int char_shift;
				1792	if (compiler->ascii()) {
				1793	char_mask = String::kMaxAsciiCharCode;
				1794	char_shift = 8;
				1795	} else {
				1796	char_mask = String::kMaxUC16CharCode;
				1797	char_shift = 16;
				1798	}
				1799	for (int k = 0; k < elms_->length(); k++) {
				1800	TextElement elm = elms_->at(k);
				1801	if (elm.type == TextElement::ATOM) {
				1802	Vector<const uc16> quarks = elm.data.u_atom->data();
				1803	for (int i = 0; i < characters && i < quarks.length(); i++) {
				1804	QuickCheckDetails::Position* pos =
				1805	details->positions(characters_filled_in);
				1806	uc16 c = quarks[i];
				1807	if (c > char_mask) {
				1808	// If we expect a non-ASCII character from an ASCII string,
				1809	// there is no way we can match. Not even case independent
				1810	// matching can turn an ASCII character into non-ASCII or
				1811	// vice versa.
				1812	details->set_cannot_match();
				1813	pos->determines_perfectly = false;
				1814	return;
				1815	}
				1816	if (compiler->ignore_case()) {
				1817	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				1818	int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
				1819	ASSERT(length != 0); // Can only happen if c > char_mask (see above).
				1820	if (length == 1) {
				1821	// This letter has no case equivalents, so it's nice and simple
				1822	// and the mask-compare will determine definitely whether we have
				1823	// a match at this character position.
				1824	pos->mask = char_mask;
				1825	pos->value = c;
				1826	pos->determines_perfectly = true;
				1827	} else {
				1828	uint32_t common_bits = char_mask;
				1829	uint32_t bits = chars[0];
				1830	for (int j = 1; j < length; j++) {
				1831	uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
				1832	common_bits ^= differing_bits;
				1833	bits &= common_bits;
				1834	}
				1835	// If length is 2 and common bits has only one zero in it then
				1836	// our mask and compare instruction will determine definitely
				1837	// whether we have a match at this character position. Otherwise
				1838	// it can only be an approximate check.
				1839	uint32_t one_zero = (common_bits \| ~char_mask);
				1840	if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
				1841	pos->determines_perfectly = true;
				1842	}
				1843	pos->mask = common_bits;
				1844	pos->value = bits;
				1845	}
				1846	} else {
				1847	// Don't ignore case. Nice simple case where the mask-compare will
				1848	// determine definitely whether we have a match at this character
				1849	// position.
				1850	pos->mask = char_mask;
				1851	pos->value = c;
				1852	pos->determines_perfectly = true;
				1853	}
				1854	characters_filled_in++;
				1855	ASSERT(characters_filled_in <= details->characters());
				1856	if (characters_filled_in == details->characters()) {
				1857	return;
				1858	}
				1859	}
				1860	} else {
				1861	QuickCheckDetails::Position* pos =
				1862	details->positions(characters_filled_in);
				1863	RegExpCharacterClass* tree = elm.data.u_char_class;
				1864	ZoneList<CharacterRange>* ranges = tree->ranges();
				1865	if (tree->is_negated()) {
				1866	// A quick check uses multi-character mask and compare. There is no
				1867	// useful way to incorporate a negative char class into this scheme
				1868	// so we just conservatively create a mask and value that will always
				1869	// succeed.
				1870	pos->mask = 0;
				1871	pos->value = 0;
				1872	} else {
				1873	int first_range = 0;
				1874	while (ranges->at(first_range).from() > char_mask) {
				1875	first_range++;
				1876	if (first_range == ranges->length()) {
				1877	details->set_cannot_match();
				1878	pos->determines_perfectly = false;
				1879	return;
				1880	}
				1881	}
				1882	CharacterRange range = ranges->at(first_range);
				1883	uc16 from = range.from();
				1884	uc16 to = range.to();
				1885	if (to > char_mask) {
				1886	to = char_mask;
				1887	}
				1888	uint32_t differing_bits = (from ^ to);
				1889	// A mask and compare is only perfect if the differing bits form a
				1890	// number like 00011111 with one single block of trailing 1s.
				1891	if ((differing_bits & (differing_bits + 1)) == 0 &&
				1892	from + differing_bits == to) {
				1893	pos->determines_perfectly = true;
				1894	}
				1895	uint32_t common_bits = ~SmearBitsRight(differing_bits);
				1896	uint32_t bits = (from & common_bits);
				1897	for (int i = first_range + 1; i < ranges->length(); i++) {
				1898	CharacterRange range = ranges->at(i);
				1899	uc16 from = range.from();
				1900	uc16 to = range.to();
				1901	if (from > char_mask) continue;
				1902	if (to > char_mask) to = char_mask;
				1903	// Here we are combining more ranges into the mask and compare
				1904	// value. With each new range the mask becomes more sparse and
				1905	// so the chances of a false positive rise. A character class
				1906	// with multiple ranges is assumed never to be equivalent to a
				1907	// mask and compare operation.
				1908	pos->determines_perfectly = false;
				1909	uint32_t new_common_bits = (from ^ to);
				1910	new_common_bits = ~SmearBitsRight(new_common_bits);
				1911	common_bits &= new_common_bits;
				1912	bits &= new_common_bits;
				1913	uint32_t differing_bits = (from & common_bits) ^ bits;
				1914	common_bits ^= differing_bits;
				1915	bits &= common_bits;
				1916	}
				1917	pos->mask = common_bits;
				1918	pos->value = bits;
				1919	}
				1920	characters_filled_in++;
				1921	ASSERT(characters_filled_in <= details->characters());
				1922	if (characters_filled_in == details->characters()) {
				1923	return;
				1924	}
				1925	}
				1926	}
				1927	ASSERT(characters_filled_in != details->characters());
				1928	on_success()-> GetQuickCheckDetails(details,
				1929	compiler,
				1930	characters_filled_in,
				1931	true);
				1932	}
				1933
				1934
				1935	void QuickCheckDetails::Clear() {
				1936	for (int i = 0; i < characters_; i++) {
				1937	positions_[i].mask = 0;
				1938	positions_[i].value = 0;
				1939	positions_[i].determines_perfectly = false;
				1940	}
				1941	characters_ = 0;
				1942	}
				1943
				1944
				1945	void QuickCheckDetails::Advance(int by, bool ascii) {
				1946	ASSERT(by >= 0);
				1947	if (by >= characters_) {
				1948	Clear();
				1949	return;
				1950	}
				1951	for (int i = 0; i < characters_ - by; i++) {
				1952	positions_[i] = positions_[by + i];
				1953	}
				1954	for (int i = characters_ - by; i < characters_; i++) {
				1955	positions_[i].mask = 0;
				1956	positions_[i].value = 0;
				1957	positions_[i].determines_perfectly = false;
				1958	}
				1959	characters_ -= by;
				1960	// We could change mask_ and value_ here but we would never advance unless
				1961	// they had already been used in a check and they won't be used again because
				1962	// it would gain us nothing. So there's no point.
				1963	}
				1964
				1965
				1966	void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
				1967	ASSERT(characters_ == other->characters_);
				1968	if (other->cannot_match_) {
				1969	return;
				1970	}
				1971	if (cannot_match_) {
				1972	this = other;
				1973	return;
				1974	}
				1975	for (int i = from_index; i < characters_; i++) {
				1976	QuickCheckDetails::Position* pos = positions(i);
				1977	QuickCheckDetails::Position* other_pos = other->positions(i);
				1978	if (pos->mask != other_pos->mask \|\|
				1979	pos->value != other_pos->value \|\|
				1980	!other_pos->determines_perfectly) {
				1981	// Our mask-compare operation will be approximate unless we have the
				1982	// exact same operation on both sides of the alternation.
				1983	pos->determines_perfectly = false;
				1984	}
				1985	pos->mask &= other_pos->mask;
				1986	pos->value &= pos->mask;
				1987	other_pos->value &= pos->mask;
				1988	uc16 differing_bits = (pos->value ^ other_pos->value);
				1989	pos->mask &= ~differing_bits;
				1990	pos->value &= pos->mask;
				1991	}
				1992	}
				1993
				1994
				1995	class VisitMarker {
				1996	public:
				1997	explicit VisitMarker(NodeInfo* info) : info_(info) {
				1998	ASSERT(!info->visited);
				1999	info->visited = true;
				2000	}
				2001	~VisitMarker() {
				2002	info_->visited = false;
				2003	}
				2004	private:
				2005	NodeInfo* info_;
				2006	};
				2007
				2008
				2009	void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2010	RegExpCompiler* compiler,
				2011	int characters_filled_in,
				2012	bool not_at_start) {
				2013	if (body_can_be_zero_length_ \|\| info()->visited) return;
				2014	VisitMarker marker(info());
				2015	return ChoiceNode::GetQuickCheckDetails(details,
				2016	compiler,
				2017	characters_filled_in,
				2018	not_at_start);
				2019	}
				2020
				2021
				2022	void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2023	RegExpCompiler* compiler,
				2024	int characters_filled_in,
				2025	bool not_at_start) {
				2026	not_at_start = (not_at_start \|\| not_at_start_);
				2027	int choice_count = alternatives_->length();
				2028	ASSERT(choice_count > 0);
				2029	alternatives_->at(0).node()->GetQuickCheckDetails(details,
				2030	compiler,
				2031	characters_filled_in,
				2032	not_at_start);
				2033	for (int i = 1; i < choice_count; i++) {
				2034	QuickCheckDetails new_details(details->characters());
				2035	RegExpNode* node = alternatives_->at(i).node();
				2036	node->GetQuickCheckDetails(&new_details, compiler,
				2037	characters_filled_in,
				2038	not_at_start);
				2039	// Here we merge the quick match details of the two branches.
				2040	details->Merge(&new_details, characters_filled_in);
				2041	}
				2042	}
				2043
				2044
				2045	// Check for [0-9A-Z_a-z].
				2046	static void EmitWordCheck(RegExpMacroAssembler* assembler,
				2047	Label* word,
				2048	Label* non_word,
				2049	bool fall_through_on_word) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2050	if (assembler->CheckSpecialCharacterClass(
				2051	fall_through_on_word ? 'w' : 'W',
				2052	fall_through_on_word ? non_word : word)) {
				2053	// Optimized implementation available.
				2054	return;
				2055	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2056	assembler->CheckCharacterGT('z', non_word);
				2057	assembler->CheckCharacterLT('0', non_word);
				2058	assembler->CheckCharacterGT('a' - 1, word);
				2059	assembler->CheckCharacterLT('9' + 1, word);
				2060	assembler->CheckCharacterLT('A', non_word);
				2061	assembler->CheckCharacterLT('Z' + 1, word);
				2062	if (fall_through_on_word) {
				2063	assembler->CheckNotCharacter('_', non_word);
				2064	} else {
				2065	assembler->CheckCharacter('_', word);
				2066	}
				2067	}
				2068
				2069
				2070	// Emit the code to check for a ^ in multiline mode (1-character lookbehind
				2071	// that matches newline or the start of input).
				2072	static void EmitHat(RegExpCompiler* compiler,
				2073	RegExpNode* on_success,
				2074	Trace* trace) {
				2075	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2076	// We will be loading the previous character into the current character
				2077	// register.
				2078	Trace new_trace(*trace);
				2079	new_trace.InvalidateCurrentCharacter();
				2080
				2081	Label ok;
				2082	if (new_trace.cp_offset() == 0) {
				2083	// The start of input counts as a newline in this context, so skip to
				2084	// ok if we are at the start.
				2085	assembler->CheckAtStart(&ok);
				2086	}
				2087	// We already checked that we are not at the start of input so it must be
				2088	// OK to load the previous character.
				2089	assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
				2090	new_trace.backtrack(),
				2091	false);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2092	if (!assembler->CheckSpecialCharacterClass('n',
				2093	new_trace.backtrack())) {
				2094	// Newline means \n, \r, 0x2028 or 0x2029.
				2095	if (!compiler->ascii()) {
				2096	assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
				2097	}
				2098	assembler->CheckCharacter('\n', &ok);
				2099	assembler->CheckNotCharacter('\r', new_trace.backtrack());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2100	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2101	assembler->Bind(&ok);
				2102	on_success->Emit(compiler, &new_trace);
				2103	}
				2104
				2105
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2106	// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
				2107	// when we know whether the next character must be a word character or not.
				2108	static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
				2109	RegExpCompiler* compiler,
				2110	RegExpNode* on_success,
				2111	Trace* trace) {
				2112	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2113	Label done;
				2114
				2115	Trace new_trace(*trace);
				2116
				2117	bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
				2118	Label* on_word = expect_word_character ? &done : new_trace.backtrack();
				2119	Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
				2120
				2121	// Check whether previous character was a word character.
				2122	switch (trace->at_start()) {
				2123	case Trace::TRUE:
				2124	if (expect_word_character) {
				2125	assembler->GoTo(on_non_word);
				2126	}
				2127	break;
				2128	case Trace::UNKNOWN:
				2129	ASSERT_EQ(0, trace->cp_offset());
				2130	assembler->CheckAtStart(on_non_word);
				2131	// Fall through.
				2132	case Trace::FALSE:
				2133	int prev_char_offset = trace->cp_offset() - 1;
				2134	assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
				2135	EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
				2136	// We may or may not have loaded the previous character.
				2137	new_trace.InvalidateCurrentCharacter();
				2138	}
				2139
				2140	assembler->Bind(&done);
				2141
				2142	on_success->Emit(compiler, &new_trace);
				2143	}
				2144
				2145
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2146	// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
				2147	static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
				2148	RegExpCompiler* compiler,
				2149	RegExpNode* on_success,
				2150	Trace* trace) {
				2151	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2152	Label before_non_word;
				2153	Label before_word;
				2154	if (trace->characters_preloaded() != 1) {
				2155	assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
				2156	}
				2157	// Fall through on non-word.
				2158	EmitWordCheck(assembler, &before_word, &before_non_word, false);
				2159
				2160	// We will be loading the previous character into the current character
				2161	// register.
				2162	Trace new_trace(*trace);
				2163	new_trace.InvalidateCurrentCharacter();
				2164
				2165	Label ok;
				2166	Label* boundary;
				2167	Label* not_boundary;
				2168	if (type == AssertionNode::AT_BOUNDARY) {
				2169	boundary = &ok;
				2170	not_boundary = new_trace.backtrack();
				2171	} else {
				2172	not_boundary = &ok;
				2173	boundary = new_trace.backtrack();
				2174	}
				2175
				2176	// Next character is not a word character.
				2177	assembler->Bind(&before_non_word);
				2178	if (new_trace.cp_offset() == 0) {
				2179	// The start of input counts as a non-word character, so the question is
				2180	// decided if we are at the start.
				2181	assembler->CheckAtStart(not_boundary);
				2182	}
				2183	// We already checked that we are not at the start of input so it must be
				2184	// OK to load the previous character.
				2185	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2186	&ok, // Unused dummy label in this call.
				2187	false);
				2188	// Fall through on non-word.
				2189	EmitWordCheck(assembler, boundary, not_boundary, false);
				2190	assembler->GoTo(not_boundary);
				2191
				2192	// Next character is a word character.
				2193	assembler->Bind(&before_word);
				2194	if (new_trace.cp_offset() == 0) {
				2195	// The start of input counts as a non-word character, so the question is
				2196	// decided if we are at the start.
				2197	assembler->CheckAtStart(boundary);
				2198	}
				2199	// We already checked that we are not at the start of input so it must be
				2200	// OK to load the previous character.
				2201	assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
				2202	&ok, // Unused dummy label in this call.
				2203	false);
				2204	bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
				2205	EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
				2206
				2207	assembler->Bind(&ok);
				2208
				2209	on_success->Emit(compiler, &new_trace);
				2210	}
				2211
				2212
				2213	void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
				2214	RegExpCompiler* compiler,
				2215	int filled_in,
				2216	bool not_at_start) {
				2217	if (type_ == AT_START && not_at_start) {
				2218	details->set_cannot_match();
				2219	return;
				2220	}
				2221	return on_success()->GetQuickCheckDetails(details,
				2222	compiler,
				2223	filled_in,
				2224	not_at_start);
				2225	}
				2226
				2227
				2228	void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2229	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2230	switch (type_) {
				2231	case AT_END: {
				2232	Label ok;
				2233	assembler->CheckPosition(trace->cp_offset(), &ok);
				2234	assembler->GoTo(trace->backtrack());
				2235	assembler->Bind(&ok);
				2236	break;
				2237	}
				2238	case AT_START: {
				2239	if (trace->at_start() == Trace::FALSE) {
				2240	assembler->GoTo(trace->backtrack());
				2241	return;
				2242	}
				2243	if (trace->at_start() == Trace::UNKNOWN) {
				2244	assembler->CheckNotAtStart(trace->backtrack());
				2245	Trace at_start_trace = *trace;
				2246	at_start_trace.set_at_start(true);
				2247	on_success()->Emit(compiler, &at_start_trace);
				2248	return;
				2249	}
				2250	}
				2251	break;
				2252	case AFTER_NEWLINE:
				2253	EmitHat(compiler, on_success(), trace);
				2254	return;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2255	case AT_BOUNDARY:
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2256	case AT_NON_BOUNDARY: {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2257	EmitBoundaryCheck(type_, compiler, on_success(), trace);
				2258	return;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2259	}
				2260	case AFTER_WORD_CHARACTER:
				2261	case AFTER_NONWORD_CHARACTER: {
				2262	EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
				2263	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2264	}
				2265	on_success()->Emit(compiler, trace);
				2266	}
				2267
				2268
				2269	static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
				2270	if (quick_check == NULL) return false;
				2271	if (offset >= quick_check->characters()) return false;
				2272	return quick_check->positions(offset)->determines_perfectly;
				2273	}
				2274
				2275
				2276	static void UpdateBoundsCheck(int index, int* checked_up_to) {
				2277	if (index > *checked_up_to) {
				2278	*checked_up_to = index;
				2279	}
				2280	}
				2281
				2282
				2283	// We call this repeatedly to generate code for each pass over the text node.
				2284	// The passes are in increasing order of difficulty because we hope one
				2285	// of the first passes will fail in which case we are saved the work of the
				2286	// later passes. for example for the case independent regexp /%[asdfghjkl]a/
				2287	// we will check the '%' in the first pass, the case independent 'a' in the
				2288	// second pass and the character class in the last pass.
				2289	//
				2290	// The passes are done from right to left, so for example to test for /bar/
				2291	// we will first test for an 'r' with offset 2, then an 'a' with offset 1
				2292	// and then a 'b' with offset 0. This means we can avoid the end-of-input
				2293	// bounds check most of the time. In the example we only need to check for
				2294	// end-of-input when loading the putative 'r'.
				2295	//
				2296	// A slight complication involves the fact that the first character may already
				2297	// be fetched into a register by the previous node. In this case we want to
				2298	// do the test for that character first. We do this in separate passes. The
				2299	// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
				2300	// pass has been performed then subsequent passes will have true in
				2301	// first_element_checked to indicate that that character does not need to be
				2302	// checked again.
				2303	//
				2304	// In addition to all this we are passed a Trace, which can
				2305	// contain an AlternativeGeneration object. In this AlternativeGeneration
				2306	// object we can see details of any quick check that was already passed in
				2307	// order to get to the code we are now generating. The quick check can involve
				2308	// loading characters, which means we do not need to recheck the bounds
				2309	// up to the limit the quick check already checked. In addition the quick
				2310	// check can have involved a mask and compare operation which may simplify
				2311	// or obviate the need for further checks at some character positions.
				2312	void TextNode::TextEmitPass(RegExpCompiler* compiler,
				2313	TextEmitPassType pass,
				2314	bool preloaded,
				2315	Trace* trace,
				2316	bool first_element_checked,
				2317	int* checked_up_to) {
				2318	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2319	bool ascii = compiler->ascii();
				2320	Label* backtrack = trace->backtrack();
				2321	QuickCheckDetails* quick_check = trace->quick_check_performed();
				2322	int element_count = elms_->length();
				2323	for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
				2324	TextElement elm = elms_->at(i);
				2325	int cp_offset = trace->cp_offset() + elm.cp_offset;
				2326	if (elm.type == TextElement::ATOM) {
				2327	Vector<const uc16> quarks = elm.data.u_atom->data();
				2328	for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
				2329	if (first_element_checked && i == 0 && j == 0) continue;
				2330	if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
				2331	EmitCharacterFunction* emit_function = NULL;
				2332	switch (pass) {
				2333	case NON_ASCII_MATCH:
				2334	ASSERT(ascii);
				2335	if (quarks[j] > String::kMaxAsciiCharCode) {
				2336	assembler->GoTo(backtrack);
				2337	return;
				2338	}
				2339	break;
				2340	case NON_LETTER_CHARACTER_MATCH:
				2341	emit_function = &EmitAtomNonLetter;
				2342	break;
				2343	case SIMPLE_CHARACTER_MATCH:
				2344	emit_function = &EmitSimpleCharacter;
				2345	break;
				2346	case CASE_CHARACTER_MATCH:
				2347	emit_function = &EmitAtomLetter;
				2348	break;
				2349	default:
				2350	break;
				2351	}
				2352	if (emit_function != NULL) {
				2353	bool bound_checked = emit_function(compiler,
				2354	quarks[j],
				2355	backtrack,
				2356	cp_offset + j,
				2357	*checked_up_to < cp_offset + j,
				2358	preloaded);
				2359	if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
				2360	}
				2361	}
				2362	} else {
				2363	ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
				2364	if (pass == CHARACTER_CLASS_MATCH) {
				2365	if (first_element_checked && i == 0) continue;
				2366	if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
				2367	RegExpCharacterClass* cc = elm.data.u_char_class;
				2368	EmitCharClass(assembler,
				2369	cc,
				2370	ascii,
				2371	backtrack,
				2372	cp_offset,
				2373	*checked_up_to < cp_offset,
				2374	preloaded);
				2375	UpdateBoundsCheck(cp_offset, checked_up_to);
				2376	}
				2377	}
				2378	}
				2379	}
				2380
				2381
				2382	int TextNode::Length() {
				2383	TextElement elm = elms_->last();
				2384	ASSERT(elm.cp_offset >= 0);
				2385	if (elm.type == TextElement::ATOM) {
				2386	return elm.cp_offset + elm.data.u_atom->data().length();
				2387	} else {
				2388	return elm.cp_offset + 1;
				2389	}
				2390	}
				2391
				2392
				2393	bool TextNode::SkipPass(int int_pass, bool ignore_case) {
				2394	TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
				2395	if (ignore_case) {
				2396	return pass == SIMPLE_CHARACTER_MATCH;
				2397	} else {
				2398	return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;
				2399	}
				2400	}
				2401
				2402
				2403	// This generates the code to match a text node. A text node can contain
				2404	// straight character sequences (possibly to be matched in a case-independent
				2405	// way) and character classes. For efficiency we do not do this in a single
				2406	// pass from left to right. Instead we pass over the text node several times,
				2407	// emitting code for some character positions every time. See the comment on
				2408	// TextEmitPass for details.
				2409	void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2410	LimitResult limit_result = LimitVersions(compiler, trace);
				2411	if (limit_result == DONE) return;
				2412	ASSERT(limit_result == CONTINUE);
				2413
				2414	if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
				2415	compiler->SetRegExpTooBig();
				2416	return;
				2417	}
				2418
				2419	if (compiler->ascii()) {
				2420	int dummy = 0;
				2421	TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
				2422	}
				2423
				2424	bool first_elt_done = false;
				2425	int bound_checked_to = trace->cp_offset() - 1;
				2426	bound_checked_to += trace->bound_checked_up_to();
				2427
				2428	// If a character is preloaded into the current character register then
				2429	// check that now.
				2430	if (trace->characters_preloaded() == 1) {
				2431	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2432	if (!SkipPass(pass, compiler->ignore_case())) {
				2433	TextEmitPass(compiler,
				2434	static_cast<TextEmitPassType>(pass),
				2435	true,
				2436	trace,
				2437	false,
				2438	&bound_checked_to);
				2439	}
				2440	}
				2441	first_elt_done = true;
				2442	}
				2443
				2444	for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
				2445	if (!SkipPass(pass, compiler->ignore_case())) {
				2446	TextEmitPass(compiler,
				2447	static_cast<TextEmitPassType>(pass),
				2448	false,
				2449	trace,
				2450	first_elt_done,
				2451	&bound_checked_to);
				2452	}
				2453	}
				2454
				2455	Trace successor_trace(*trace);
				2456	successor_trace.set_at_start(false);
				2457	successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
				2458	RecursionCheck rc(compiler);
				2459	on_success()->Emit(compiler, &successor_trace);
				2460	}
				2461
				2462
				2463	void Trace::InvalidateCurrentCharacter() {
				2464	characters_preloaded_ = 0;
				2465	}
				2466
				2467
				2468	void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
				2469	ASSERT(by > 0);
				2470	// We don't have an instruction for shifting the current character register
				2471	// down or for using a shifted value for anything so lets just forget that
				2472	// we preloaded any characters into it.
				2473	characters_preloaded_ = 0;
				2474	// Adjust the offsets of the quick check performed information. This
				2475	// information is used to find out what we already determined about the
				2476	// characters by means of mask and compare.
				2477	quick_check_performed_.Advance(by, compiler->ascii());
				2478	cp_offset_ += by;
				2479	if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
				2480	compiler->SetRegExpTooBig();
				2481	cp_offset_ = 0;
				2482	}
				2483	bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
				2484	}
				2485
				2486
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2487	void TextNode::MakeCaseIndependent(bool is_ascii) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2488	int element_count = elms_->length();
				2489	for (int i = 0; i < element_count; i++) {
				2490	TextElement elm = elms_->at(i);
				2491	if (elm.type == TextElement::CHAR_CLASS) {
				2492	RegExpCharacterClass* cc = elm.data.u_char_class;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2493	// None of the standard character classses is different in the case
				2494	// independent case and it slows us down if we don't know that.
				2495	if (cc->is_standard()) continue;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2496	ZoneList<CharacterRange>* ranges = cc->ranges();
				2497	int range_count = ranges->length();
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	2498	for (int j = 0; j < range_count; j++) {
				2499	ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2500	}
				2501	}
				2502	}
				2503	}
				2504
				2505
				2506	int TextNode::GreedyLoopTextLength() {
				2507	TextElement elm = elms_->at(elms_->length() - 1);
				2508	if (elm.type == TextElement::CHAR_CLASS) {
				2509	return elm.cp_offset + 1;
				2510	} else {
				2511	return elm.cp_offset + elm.data.u_atom->data().length();
				2512	}
				2513	}
				2514
				2515
				2516	// Finds the fixed match length of a sequence of nodes that goes from
				2517	// this alternative and back to this choice node. If there are variable
				2518	// length nodes or other complications in the way then return a sentinel
				2519	// value indicating that a greedy loop cannot be constructed.
				2520	int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
				2521	int length = 0;
				2522	RegExpNode* node = alternative->node();
				2523	// Later we will generate code for all these text nodes using recursion
				2524	// so we have to limit the max number.
				2525	int recursion_depth = 0;
				2526	while (node != this) {
				2527	if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
				2528	return kNodeIsTooComplexForGreedyLoops;
				2529	}
				2530	int node_length = node->GreedyLoopTextLength();
				2531	if (node_length == kNodeIsTooComplexForGreedyLoops) {
				2532	return kNodeIsTooComplexForGreedyLoops;
				2533	}
				2534	length += node_length;
				2535	SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
				2536	node = seq_node->on_success();
				2537	}
				2538	return length;
				2539	}
				2540
				2541
				2542	void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
				2543	ASSERT_EQ(loop_node_, NULL);
				2544	AddAlternative(alt);
				2545	loop_node_ = alt.node();
				2546	}
				2547
				2548
				2549	void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
				2550	ASSERT_EQ(continue_node_, NULL);
				2551	AddAlternative(alt);
				2552	continue_node_ = alt.node();
				2553	}
				2554
				2555
				2556	void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2557	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2558	if (trace->stop_node() == this) {
				2559	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2560	ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
				2561	// Update the counter-based backtracking info on the stack. This is an
				2562	// optimization for greedy loops (see below).
				2563	ASSERT(trace->cp_offset() == text_length);
				2564	macro_assembler->AdvanceCurrentPosition(text_length);
				2565	macro_assembler->GoTo(trace->loop_label());
				2566	return;
				2567	}
				2568	ASSERT(trace->stop_node() == NULL);
				2569	if (!trace->is_trivial()) {
				2570	trace->Flush(compiler, this);
				2571	return;
				2572	}
				2573	ChoiceNode::Emit(compiler, trace);
				2574	}
				2575
				2576
				2577	int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
				2578	int preload_characters = EatsAtLeast(4, 0);
				2579	if (compiler->macro_assembler()->CanReadUnaligned()) {
				2580	bool ascii = compiler->ascii();
				2581	if (ascii) {
				2582	if (preload_characters > 4) preload_characters = 4;
				2583	// We can't preload 3 characters because there is no machine instruction
				2584	// to do that. We can't just load 4 because we could be reading
				2585	// beyond the end of the string, which could cause a memory fault.
				2586	if (preload_characters == 3) preload_characters = 2;
				2587	} else {
				2588	if (preload_characters > 2) preload_characters = 2;
				2589	}
				2590	} else {
				2591	if (preload_characters > 1) preload_characters = 1;
				2592	}
				2593	return preload_characters;
				2594	}
				2595
				2596
				2597	// This class is used when generating the alternatives in a choice node. It
				2598	// records the way the alternative is being code generated.
				2599	class AlternativeGeneration: public Malloced {
				2600	public:
				2601	AlternativeGeneration()
				2602	: possible_success(),
				2603	expects_preload(false),
				2604	after(),
				2605	quick_check_details() { }
				2606	Label possible_success;
				2607	bool expects_preload;
				2608	Label after;
				2609	QuickCheckDetails quick_check_details;
				2610	};
				2611
				2612
				2613	// Creates a list of AlternativeGenerations. If the list has a reasonable
				2614	// size then it is on the stack, otherwise the excess is on the heap.
				2615	class AlternativeGenerationList {
				2616	public:
				2617	explicit AlternativeGenerationList(int count)
				2618	: alt_gens_(count) {
				2619	for (int i = 0; i < count && i < kAFew; i++) {
				2620	alt_gens_.Add(a_few_alt_gens_ + i);
				2621	}
				2622	for (int i = kAFew; i < count; i++) {
				2623	alt_gens_.Add(new AlternativeGeneration());
				2624	}
				2625	}
				2626	~AlternativeGenerationList() {
				2627	for (int i = kAFew; i < alt_gens_.length(); i++) {
				2628	delete alt_gens_[i];
				2629	alt_gens_[i] = NULL;
				2630	}
				2631	}
				2632
				2633	AlternativeGeneration* at(int i) {
				2634	return alt_gens_[i];
				2635	}
				2636	private:
				2637	static const int kAFew = 10;
				2638	ZoneList<AlternativeGeneration*> alt_gens_;
				2639	AlternativeGeneration a_few_alt_gens_[kAFew];
				2640	};
				2641
				2642
/* Code generation for choice nodes.
 *
 * We generate quick checks that do a mask and compare to eliminate a
 * choice.  If the quick check succeeds then it jumps to the continuation to
 * do slow checks and check subsequent nodes.  If it fails (the common case)
 * it falls through to the next choice.
 *
 * Here is the desired flow graph.  Nodes directly below each other imply
 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
 * 3 doesn't have a quick check so we have to call the slow check.
 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
 * regexp continuation is generated directly after the Sn node, up to the
 * next GoTo if we decide to reuse some already generated code.  Some
 * nodes expect preload_characters to be preloaded into the current
 * character register.  R nodes do this preloading.  Vertices are marked
 * F for failures and S for success (possible success in the case of quick
 * nodes).  L, V, < and > are used as arrow heads.
 *
 * ----------> R
 *             |
 *             V
 *            Q1 -----> S1
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            Q2 -----> S2
 *             |   S   /
 *            F|      /
 *             |    F/
 *             |    /
 *             |   R
 *             |  /
 *             V L
 *            S3
 *             |
 *            F|
 *             |
 *             R
 *             |
 * backtrack   V
 * <----------Q4
 *   \    F    |
 *    \        |S
 *     \   F   V
 *      \-----S4
 *
 * For greedy loops we reverse our expectation and expect to match rather
 * than fail.  Therefore we want the loop code to look like this (U is the
 * unwind code that steps back in the greedy loop).  The following alternatives
 * look the same as above.
 *              _____
 *             /     \
 *             V     |
 * ----------> S1    |
 *            /|     |
 *           / |S    |
 *         F/  \_____/
 *         /
 *        |<-----------
 *        |            \
 *        V             \
 *        Q2 ---> S2     \
 *        |  S   /       |
 *       F|     /        |
 *        |   F/         |
 *        |   /          |
 *        |  R           |
 *        | /            |
 *   F    VL             |
 * <------U              |
 * back   |S             |
 *        \______________/
 */
				2720
				2721
				2722	void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2723	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2724	int choice_count = alternatives_->length();
				2725	#ifdef DEBUG
				2726	for (int i = 0; i < choice_count - 1; i++) {
				2727	GuardedAlternative alternative = alternatives_->at(i);
				2728	ZoneList<Guard> guards = alternative.guards();
				2729	int guard_count = (guards == NULL) ? 0 : guards->length();
				2730	for (int j = 0; j < guard_count; j++) {
				2731	ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
				2732	}
				2733	}
				2734	#endif
				2735
				2736	LimitResult limit_result = LimitVersions(compiler, trace);
				2737	if (limit_result == DONE) return;
				2738	ASSERT(limit_result == CONTINUE);
				2739
				2740	int new_flush_budget = trace->flush_budget() / choice_count;
				2741	if (trace->flush_budget() == 0 && trace->actions() != NULL) {
				2742	trace->Flush(compiler, this);
				2743	return;
				2744	}
				2745
				2746	RecursionCheck rc(compiler);
				2747
				2748	Trace* current_trace = trace;
				2749
				2750	int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
				2751	bool greedy_loop = false;
				2752	Label greedy_loop_label;
				2753	Trace counter_backtrack_trace;
				2754	counter_backtrack_trace.set_backtrack(&greedy_loop_label);
				2755	if (not_at_start()) counter_backtrack_trace.set_at_start(false);
				2756
				2757	if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
				2758	// Here we have special handling for greedy loops containing only text nodes
				2759	// and other simple nodes. These are handled by pushing the current
				2760	// position on the stack and then incrementing the current position each
				2761	// time around the switch. On backtrack we decrement the current position
				2762	// and check it against the pushed value. This avoids pushing backtrack
				2763	// information for each iteration of the loop, which could take up a lot of
				2764	// space.
				2765	greedy_loop = true;
				2766	ASSERT(trace->stop_node() == NULL);
				2767	macro_assembler->PushCurrentPosition();
				2768	current_trace = &counter_backtrack_trace;
				2769	Label greedy_match_failed;
				2770	Trace greedy_match_trace;
				2771	if (not_at_start()) greedy_match_trace.set_at_start(false);
				2772	greedy_match_trace.set_backtrack(&greedy_match_failed);
				2773	Label loop_label;
				2774	macro_assembler->Bind(&loop_label);
				2775	greedy_match_trace.set_stop_node(this);
				2776	greedy_match_trace.set_loop_label(&loop_label);
				2777	alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
				2778	macro_assembler->Bind(&greedy_match_failed);
				2779	}
				2780
				2781	Label second_choice; // For use in greedy matches.
				2782	macro_assembler->Bind(&second_choice);
				2783
				2784	int first_normal_choice = greedy_loop ? 1 : 0;
				2785
				2786	int preload_characters = CalculatePreloadCharacters(compiler);
				2787	bool preload_is_current =
				2788	(current_trace->characters_preloaded() == preload_characters);
				2789	bool preload_has_checked_bounds = preload_is_current;
				2790
				2791	AlternativeGenerationList alt_gens(choice_count);
				2792
				2793	// For now we just call all choices one after the other. The idea ultimately
				2794	// is to use the Dispatch table to try only the relevant ones.
				2795	for (int i = first_normal_choice; i < choice_count; i++) {
				2796	GuardedAlternative alternative = alternatives_->at(i);
				2797	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2798	alt_gen->quick_check_details.set_characters(preload_characters);
				2799	ZoneList<Guard> guards = alternative.guards();
				2800	int guard_count = (guards == NULL) ? 0 : guards->length();
				2801	Trace new_trace(*current_trace);
				2802	new_trace.set_characters_preloaded(preload_is_current ?
				2803	preload_characters :
				2804	0);
				2805	if (preload_has_checked_bounds) {
				2806	new_trace.set_bound_checked_up_to(preload_characters);
				2807	}
				2808	new_trace.quick_check_performed()->Clear();
				2809	if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
				2810	alt_gen->expects_preload = preload_is_current;
				2811	bool generate_full_check_inline = false;
				2812	if (FLAG_regexp_optimization &&
				2813	try_to_emit_quick_check_for_alternative(i) &&
				2814	alternative.node()->EmitQuickCheck(compiler,
				2815	&new_trace,
				2816	preload_has_checked_bounds,
				2817	&alt_gen->possible_success,
				2818	&alt_gen->quick_check_details,
				2819	i < choice_count - 1)) {
				2820	// Quick check was generated for this choice.
				2821	preload_is_current = true;
				2822	preload_has_checked_bounds = true;
				2823	// On the last choice in the ChoiceNode we generated the quick
				2824	// check to fall through on possible success. So now we need to
				2825	// generate the full check inline.
				2826	if (i == choice_count - 1) {
				2827	macro_assembler->Bind(&alt_gen->possible_success);
				2828	new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2829	new_trace.set_characters_preloaded(preload_characters);
				2830	new_trace.set_bound_checked_up_to(preload_characters);
				2831	generate_full_check_inline = true;
				2832	}
				2833	} else if (alt_gen->quick_check_details.cannot_match()) {
				2834	if (i == choice_count - 1 && !greedy_loop) {
				2835	macro_assembler->GoTo(trace->backtrack());
				2836	}
				2837	continue;
				2838	} else {
				2839	// No quick check was generated. Put the full code here.
				2840	// If this is not the first choice then there could be slow checks from
				2841	// previous cases that go here when they fail. There's no reason to
				2842	// insist that they preload characters since the slow check we are about
				2843	// to generate probably can't use it.
				2844	if (i != first_normal_choice) {
				2845	alt_gen->expects_preload = false;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	2846	new_trace.InvalidateCurrentCharacter();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	2847	}
				2848	if (i < choice_count - 1) {
				2849	new_trace.set_backtrack(&alt_gen->after);
				2850	}
				2851	generate_full_check_inline = true;
				2852	}
				2853	if (generate_full_check_inline) {
				2854	if (new_trace.actions() != NULL) {
				2855	new_trace.set_flush_budget(new_flush_budget);
				2856	}
				2857	for (int j = 0; j < guard_count; j++) {
				2858	GenerateGuard(macro_assembler, guards->at(j), &new_trace);
				2859	}
				2860	alternative.node()->Emit(compiler, &new_trace);
				2861	preload_is_current = false;
				2862	}
				2863	macro_assembler->Bind(&alt_gen->after);
				2864	}
				2865	if (greedy_loop) {
				2866	macro_assembler->Bind(&greedy_loop_label);
				2867	// If we have unwound to the bottom then backtrack.
				2868	macro_assembler->CheckGreedyLoop(trace->backtrack());
				2869	// Otherwise try the second priority at an earlier position.
				2870	macro_assembler->AdvanceCurrentPosition(-text_length);
				2871	macro_assembler->GoTo(&second_choice);
				2872	}
				2873
				2874	// At this point we need to generate slow checks for the alternatives where
				2875	// the quick check was inlined. We can recognize these because the associated
				2876	// label was bound.
				2877	for (int i = first_normal_choice; i < choice_count - 1; i++) {
				2878	AlternativeGeneration* alt_gen = alt_gens.at(i);
				2879	Trace new_trace(*current_trace);
				2880	// If there are actions to be flushed we have to limit how many times
				2881	// they are flushed. Take the budget of the parent trace and distribute
				2882	// it fairly amongst the children.
				2883	if (new_trace.actions() != NULL) {
				2884	new_trace.set_flush_budget(new_flush_budget);
				2885	}
				2886	EmitOutOfLineContinuation(compiler,
				2887	&new_trace,
				2888	alternatives_->at(i),
				2889	alt_gen,
				2890	preload_characters,
				2891	alt_gens.at(i + 1)->expects_preload);
				2892	}
				2893	}
				2894
				2895
				2896	void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
				2897	Trace* trace,
				2898	GuardedAlternative alternative,
				2899	AlternativeGeneration* alt_gen,
				2900	int preload_characters,
				2901	bool next_expects_preload) {
				2902	if (!alt_gen->possible_success.is_linked()) return;
				2903
				2904	RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
				2905	macro_assembler->Bind(&alt_gen->possible_success);
				2906	Trace out_of_line_trace(*trace);
				2907	out_of_line_trace.set_characters_preloaded(preload_characters);
				2908	out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
				2909	if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
				2910	ZoneList<Guard> guards = alternative.guards();
				2911	int guard_count = (guards == NULL) ? 0 : guards->length();
				2912	if (next_expects_preload) {
				2913	Label reload_current_char;
				2914	out_of_line_trace.set_backtrack(&reload_current_char);
				2915	for (int j = 0; j < guard_count; j++) {
				2916	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				2917	}
				2918	alternative.node()->Emit(compiler, &out_of_line_trace);
				2919	macro_assembler->Bind(&reload_current_char);
				2920	// Reload the current character, since the next quick check expects that.
				2921	// We don't need to check bounds here because we only get into this
				2922	// code through a quick check which already did the checked load.
				2923	macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
				2924	NULL,
				2925	false,
				2926	preload_characters);
				2927	macro_assembler->GoTo(&(alt_gen->after));
				2928	} else {
				2929	out_of_line_trace.set_backtrack(&(alt_gen->after));
				2930	for (int j = 0; j < guard_count; j++) {
				2931	GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
				2932	}
				2933	alternative.node()->Emit(compiler, &out_of_line_trace);
				2934	}
				2935	}
				2936
				2937
				2938	void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				2939	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				2940	LimitResult limit_result = LimitVersions(compiler, trace);
				2941	if (limit_result == DONE) return;
				2942	ASSERT(limit_result == CONTINUE);
				2943
				2944	RecursionCheck rc(compiler);
				2945
				2946	switch (type_) {
				2947	case STORE_POSITION: {
				2948	Trace::DeferredCapture
				2949	new_capture(data_.u_position_register.reg,
				2950	data_.u_position_register.is_capture,
				2951	trace);
				2952	Trace new_trace = *trace;
				2953	new_trace.add_action(&new_capture);
				2954	on_success()->Emit(compiler, &new_trace);
				2955	break;
				2956	}
				2957	case INCREMENT_REGISTER: {
				2958	Trace::DeferredIncrementRegister
				2959	new_increment(data_.u_increment_register.reg);
				2960	Trace new_trace = *trace;
				2961	new_trace.add_action(&new_increment);
				2962	on_success()->Emit(compiler, &new_trace);
				2963	break;
				2964	}
				2965	case SET_REGISTER: {
				2966	Trace::DeferredSetRegister
				2967	new_set(data_.u_store_register.reg, data_.u_store_register.value);
				2968	Trace new_trace = *trace;
				2969	new_trace.add_action(&new_set);
				2970	on_success()->Emit(compiler, &new_trace);
				2971	break;
				2972	}
				2973	case CLEAR_CAPTURES: {
				2974	Trace::DeferredClearCaptures
				2975	new_capture(Interval(data_.u_clear_captures.range_from,
				2976	data_.u_clear_captures.range_to));
				2977	Trace new_trace = *trace;
				2978	new_trace.add_action(&new_capture);
				2979	on_success()->Emit(compiler, &new_trace);
				2980	break;
				2981	}
				2982	case BEGIN_SUBMATCH:
				2983	if (!trace->is_trivial()) {
				2984	trace->Flush(compiler, this);
				2985	} else {
				2986	assembler->WriteCurrentPositionToRegister(
				2987	data_.u_submatch.current_position_register, 0);
				2988	assembler->WriteStackPointerToRegister(
				2989	data_.u_submatch.stack_pointer_register);
				2990	on_success()->Emit(compiler, trace);
				2991	}
				2992	break;
				2993	case EMPTY_MATCH_CHECK: {
				2994	int start_pos_reg = data_.u_empty_match_check.start_register;
				2995	int stored_pos = 0;
				2996	int rep_reg = data_.u_empty_match_check.repetition_register;
				2997	bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
				2998	bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
				2999	if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
				3000	// If we know we haven't advanced and there is no minimum we
				3001	// can just backtrack immediately.
				3002	assembler->GoTo(trace->backtrack());
				3003	} else if (know_dist && stored_pos < trace->cp_offset()) {
				3004	// If we know we've advanced we can generate the continuation
				3005	// immediately.
				3006	on_success()->Emit(compiler, trace);
				3007	} else if (!trace->is_trivial()) {
				3008	trace->Flush(compiler, this);
				3009	} else {
				3010	Label skip_empty_check;
				3011	// If we have a minimum number of repetitions we check the current
				3012	// number first and skip the empty check if it's not enough.
				3013	if (has_minimum) {
				3014	int limit = data_.u_empty_match_check.repetition_limit;
				3015	assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
				3016	}
				3017	// If the match is empty we bail out, otherwise we fall through
				3018	// to the on-success continuation.
				3019	assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
				3020	trace->backtrack());
				3021	assembler->Bind(&skip_empty_check);
				3022	on_success()->Emit(compiler, trace);
				3023	}
				3024	break;
				3025	}
				3026	case POSITIVE_SUBMATCH_SUCCESS: {
				3027	if (!trace->is_trivial()) {
				3028	trace->Flush(compiler, this);
				3029	return;
				3030	}
				3031	assembler->ReadCurrentPositionFromRegister(
				3032	data_.u_submatch.current_position_register);
				3033	assembler->ReadStackPointerFromRegister(
				3034	data_.u_submatch.stack_pointer_register);
				3035	int clear_register_count = data_.u_submatch.clear_register_count;
				3036	if (clear_register_count == 0) {
				3037	on_success()->Emit(compiler, trace);
				3038	return;
				3039	}
				3040	int clear_registers_from = data_.u_submatch.clear_register_from;
				3041	Label clear_registers_backtrack;
				3042	Trace new_trace = *trace;
				3043	new_trace.set_backtrack(&clear_registers_backtrack);
				3044	on_success()->Emit(compiler, &new_trace);
				3045
				3046	assembler->Bind(&clear_registers_backtrack);
				3047	int clear_registers_to = clear_registers_from + clear_register_count - 1;
				3048	assembler->ClearRegisters(clear_registers_from, clear_registers_to);
				3049
				3050	ASSERT(trace->backtrack() == NULL);
				3051	assembler->Backtrack();
				3052	return;
				3053	}
				3054	default:
				3055	UNREACHABLE();
				3056	}
				3057	}
				3058
				3059
				3060	void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
				3061	RegExpMacroAssembler* assembler = compiler->macro_assembler();
				3062	if (!trace->is_trivial()) {
				3063	trace->Flush(compiler, this);
				3064	return;
				3065	}
				3066
				3067	LimitResult limit_result = LimitVersions(compiler, trace);
				3068	if (limit_result == DONE) return;
				3069	ASSERT(limit_result == CONTINUE);
				3070
				3071	RecursionCheck rc(compiler);
				3072
				3073	ASSERT_EQ(start_reg_ + 1, end_reg_);
				3074	if (compiler->ignore_case()) {
				3075	assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
				3076	trace->backtrack());
				3077	} else {
				3078	assembler->CheckNotBackReference(start_reg_, trace->backtrack());
				3079	}
				3080	on_success()->Emit(compiler, trace);
				3081	}
				3082
				3083
				3084	// -------------------------------------------------------------------
				3085	// Dot/dotty output
				3086
				3087
				3088	#ifdef DEBUG
				3089
				3090
				3091	class DotPrinter: public NodeVisitor {
				3092	public:
				3093	explicit DotPrinter(bool ignore_case)
				3094	: ignore_case_(ignore_case),
				3095	stream_(&alloc_) { }
				3096	void PrintNode(const char* label, RegExpNode* node);
				3097	void Visit(RegExpNode* node);
				3098	void PrintAttributes(RegExpNode* from);
				3099	StringStream* stream() { return &stream_; }
				3100	void PrintOnFailure(RegExpNode* from, RegExpNode* to);
				3101	#define DECLARE_VISIT(Type) \
				3102	virtual void Visit##Type(Type##Node* that);
				3103	FOR_EACH_NODE_TYPE(DECLARE_VISIT)
				3104	#undef DECLARE_VISIT
				3105	private:
				3106	bool ignore_case_;
				3107	HeapStringAllocator alloc_;
				3108	StringStream stream_;
				3109	};
				3110
				3111
				3112	void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
				3113	stream()->Add("digraph G {\n graph [label=\"");
				3114	for (int i = 0; label[i]; i++) {
				3115	switch (label[i]) {
				3116	case '\\':
				3117	stream()->Add("\\\\");
				3118	break;
				3119	case '"':
				3120	stream()->Add("\"");
				3121	break;
				3122	default:
				3123	stream()->Put(label[i]);
				3124	break;
				3125	}
				3126	}
				3127	stream()->Add("\"];\n");
				3128	Visit(node);
				3129	stream()->Add("}\n");
				3130	printf("%s", *(stream()->ToCString()));
				3131	}
				3132
				3133
				3134	void DotPrinter::Visit(RegExpNode* node) {
				3135	if (node->info()->visited) return;
				3136	node->info()->visited = true;
				3137	node->Accept(this);
				3138	}
				3139
				3140
				3141	void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
				3142	stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
				3143	Visit(on_failure);
				3144	}
				3145
				3146
				3147	class TableEntryBodyPrinter {
				3148	public:
				3149	TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
				3150	: stream_(stream), choice_(choice) { }
				3151	void Call(uc16 from, DispatchTable::Entry entry) {
				3152	OutSet* out_set = entry.out_set();
				3153	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3154	if (out_set->Get(i)) {
				3155	stream()->Add(" n%p:s%io%i -> n%p;\n",
				3156	choice(),
				3157	from,
				3158	i,
				3159	choice()->alternatives()->at(i).node());
				3160	}
				3161	}
				3162	}
				3163	private:
				3164	StringStream* stream() { return stream_; }
				3165	ChoiceNode* choice() { return choice_; }
				3166	StringStream* stream_;
				3167	ChoiceNode* choice_;
				3168	};
				3169
				3170
				3171	class TableEntryHeaderPrinter {
				3172	public:
				3173	explicit TableEntryHeaderPrinter(StringStream* stream)
				3174	: first_(true), stream_(stream) { }
				3175	void Call(uc16 from, DispatchTable::Entry entry) {
				3176	if (first_) {
				3177	first_ = false;
				3178	} else {
				3179	stream()->Add("\|");
				3180	}
				3181	stream()->Add("{\\%k-\\%k\|{", from, entry.to());
				3182	OutSet* out_set = entry.out_set();
				3183	int priority = 0;
				3184	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3185	if (out_set->Get(i)) {
				3186	if (priority > 0) stream()->Add("\|");
				3187	stream()->Add("<s%io%i> %i", from, i, priority);
				3188	priority++;
				3189	}
				3190	}
				3191	stream()->Add("}}");
				3192	}
				3193	private:
				3194	bool first_;
				3195	StringStream* stream() { return stream_; }
				3196	StringStream* stream_;
				3197	};
				3198
				3199
				3200	class AttributePrinter {
				3201	public:
				3202	explicit AttributePrinter(DotPrinter* out)
				3203	: out_(out), first_(true) { }
				3204	void PrintSeparator() {
				3205	if (first_) {
				3206	first_ = false;
				3207	} else {
				3208	out_->stream()->Add("\|");
				3209	}
				3210	}
				3211	void PrintBit(const char* name, bool value) {
				3212	if (!value) return;
				3213	PrintSeparator();
				3214	out_->stream()->Add("{%s}", name);
				3215	}
				3216	void PrintPositive(const char* name, int value) {
				3217	if (value < 0) return;
				3218	PrintSeparator();
				3219	out_->stream()->Add("{%s\|%x}", name, value);
				3220	}
				3221	private:
				3222	DotPrinter* out_;
				3223	bool first_;
				3224	};
				3225
				3226
				3227	void DotPrinter::PrintAttributes(RegExpNode* that) {
				3228	stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
				3229	"margin=0.1, fontsize=10, label=\"{",
				3230	that);
				3231	AttributePrinter printer(this);
				3232	NodeInfo* info = that->info();
				3233	printer.PrintBit("NI", info->follows_newline_interest);
				3234	printer.PrintBit("WI", info->follows_word_interest);
				3235	printer.PrintBit("SI", info->follows_start_interest);
				3236	Label* label = that->label();
				3237	if (label->is_bound())
				3238	printer.PrintPositive("@", label->pos());
				3239	stream()->Add("}\"];\n");
				3240	stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
				3241	"arrowhead=none];\n", that, that);
				3242	}
				3243
				3244
				3245	static const bool kPrintDispatchTable = false;
				3246	void DotPrinter::VisitChoice(ChoiceNode* that) {
				3247	if (kPrintDispatchTable) {
				3248	stream()->Add(" n%p [shape=Mrecord, label=\"", that);
				3249	TableEntryHeaderPrinter header_printer(stream());
				3250	that->GetTable(ignore_case_)->ForEach(&header_printer);
				3251	stream()->Add("\"]\n", that);
				3252	PrintAttributes(that);
				3253	TableEntryBodyPrinter body_printer(stream(), that);
				3254	that->GetTable(ignore_case_)->ForEach(&body_printer);
				3255	} else {
				3256	stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
				3257	for (int i = 0; i < that->alternatives()->length(); i++) {
				3258	GuardedAlternative alt = that->alternatives()->at(i);
				3259	stream()->Add(" n%p -> n%p;\n", that, alt.node());
				3260	}
				3261	}
				3262	for (int i = 0; i < that->alternatives()->length(); i++) {
				3263	GuardedAlternative alt = that->alternatives()->at(i);
				3264	alt.node()->Accept(this);
				3265	}
				3266	}
				3267
				3268
				3269	void DotPrinter::VisitText(TextNode* that) {
				3270	stream()->Add(" n%p [label=\"", that);
				3271	for (int i = 0; i < that->elements()->length(); i++) {
				3272	if (i > 0) stream()->Add(" ");
				3273	TextElement elm = that->elements()->at(i);
				3274	switch (elm.type) {
				3275	case TextElement::ATOM: {
				3276	stream()->Add("'%w'", elm.data.u_atom->data());
				3277	break;
				3278	}
				3279	case TextElement::CHAR_CLASS: {
				3280	RegExpCharacterClass* node = elm.data.u_char_class;
				3281	stream()->Add("[");
				3282	if (node->is_negated())
				3283	stream()->Add("^");
				3284	for (int j = 0; j < node->ranges()->length(); j++) {
				3285	CharacterRange range = node->ranges()->at(j);
				3286	stream()->Add("%k-%k", range.from(), range.to());
				3287	}
				3288	stream()->Add("]");
				3289	break;
				3290	}
				3291	default:
				3292	UNREACHABLE();
				3293	}
				3294	}
				3295	stream()->Add("\", shape=box, peripheries=2];\n");
				3296	PrintAttributes(that);
				3297	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3298	Visit(that->on_success());
				3299	}
				3300
				3301
				3302	void DotPrinter::VisitBackReference(BackReferenceNode* that) {
				3303	stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
				3304	that,
				3305	that->start_register(),
				3306	that->end_register());
				3307	PrintAttributes(that);
				3308	stream()->Add(" n%p -> n%p;\n", that, that->on_success());
				3309	Visit(that->on_success());
				3310	}
				3311
				3312
				3313	void DotPrinter::VisitEnd(EndNode* that) {
				3314	stream()->Add(" n%p [style=bold, shape=point];\n", that);
				3315	PrintAttributes(that);
				3316	}
				3317
				3318
				3319	void DotPrinter::VisitAssertion(AssertionNode* that) {
				3320	stream()->Add(" n%p [", that);
				3321	switch (that->type()) {
				3322	case AssertionNode::AT_END:
				3323	stream()->Add("label=\"$\", shape=septagon");
				3324	break;
				3325	case AssertionNode::AT_START:
				3326	stream()->Add("label=\"^\", shape=septagon");
				3327	break;
				3328	case AssertionNode::AT_BOUNDARY:
				3329	stream()->Add("label=\"\\b\", shape=septagon");
				3330	break;
				3331	case AssertionNode::AT_NON_BOUNDARY:
				3332	stream()->Add("label=\"\\B\", shape=septagon");
				3333	break;
				3334	case AssertionNode::AFTER_NEWLINE:
				3335	stream()->Add("label=\"(?<=\\n)\", shape=septagon");
				3336	break;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3337	case AssertionNode::AFTER_WORD_CHARACTER:
				3338	stream()->Add("label=\"(?<=\\w)\", shape=septagon");
				3339	break;
				3340	case AssertionNode::AFTER_NONWORD_CHARACTER:
				3341	stream()->Add("label=\"(?<=\\W)\", shape=septagon");
				3342	break;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3343	}
				3344	stream()->Add("];\n");
				3345	PrintAttributes(that);
				3346	RegExpNode* successor = that->on_success();
				3347	stream()->Add(" n%p -> n%p;\n", that, successor);
				3348	Visit(successor);
				3349	}
				3350
				3351
				3352	void DotPrinter::VisitAction(ActionNode* that) {
				3353	stream()->Add(" n%p [", that);
				3354	switch (that->type_) {
				3355	case ActionNode::SET_REGISTER:
				3356	stream()->Add("label=\"$%i:=%i\", shape=octagon",
				3357	that->data_.u_store_register.reg,
				3358	that->data_.u_store_register.value);
				3359	break;
				3360	case ActionNode::INCREMENT_REGISTER:
				3361	stream()->Add("label=\"$%i++\", shape=octagon",
				3362	that->data_.u_increment_register.reg);
				3363	break;
				3364	case ActionNode::STORE_POSITION:
				3365	stream()->Add("label=\"$%i:=$pos\", shape=octagon",
				3366	that->data_.u_position_register.reg);
				3367	break;
				3368	case ActionNode::BEGIN_SUBMATCH:
				3369	stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
				3370	that->data_.u_submatch.current_position_register);
				3371	break;
				3372	case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
				3373	stream()->Add("label=\"escape\", shape=septagon");
				3374	break;
				3375	case ActionNode::EMPTY_MATCH_CHECK:
				3376	stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
				3377	that->data_.u_empty_match_check.start_register,
				3378	that->data_.u_empty_match_check.repetition_register,
				3379	that->data_.u_empty_match_check.repetition_limit);
				3380	break;
				3381	case ActionNode::CLEAR_CAPTURES: {
				3382	stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
				3383	that->data_.u_clear_captures.range_from,
				3384	that->data_.u_clear_captures.range_to);
				3385	break;
				3386	}
				3387	}
				3388	stream()->Add("];\n");
				3389	PrintAttributes(that);
				3390	RegExpNode* successor = that->on_success();
				3391	stream()->Add(" n%p -> n%p;\n", that, successor);
				3392	Visit(successor);
				3393	}
				3394
				3395
				3396	class DispatchTableDumper {
				3397	public:
				3398	explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
				3399	void Call(uc16 key, DispatchTable::Entry entry);
				3400	StringStream* stream() { return stream_; }
				3401	private:
				3402	StringStream* stream_;
				3403	};
				3404
				3405
				3406	void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
				3407	stream()->Add("[%k-%k]: {", key, entry.to());
				3408	OutSet* set = entry.out_set();
				3409	bool first = true;
				3410	for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
				3411	if (set->Get(i)) {
				3412	if (first) {
				3413	first = false;
				3414	} else {
				3415	stream()->Add(", ");
				3416	}
				3417	stream()->Add("%i", i);
				3418	}
				3419	}
				3420	stream()->Add("}\n");
				3421	}
				3422
				3423
				3424	void DispatchTable::Dump() {
				3425	HeapStringAllocator alloc;
				3426	StringStream stream(&alloc);
				3427	DispatchTableDumper dumper(&stream);
				3428	tree()->ForEach(&dumper);
				3429	OS::PrintError("%s", *stream.ToCString());
				3430	}
				3431
				3432
				3433	void RegExpEngine::DotPrint(const char* label,
				3434	RegExpNode* node,
				3435	bool ignore_case) {
				3436	DotPrinter printer(ignore_case);
				3437	printer.PrintNode(label, node);
				3438	}
				3439
				3440
				3441	#endif // DEBUG
				3442
				3443
				3444	// -------------------------------------------------------------------
				3445	// Tree to graph conversion
				3446
				3447	static const int kSpaceRangeCount = 20;
				3448	static const int kSpaceRangeAsciiCount = 4;
				3449	static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
				3450	0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
				3451	0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
				3452
				3453	static const int kWordRangeCount = 8;
				3454	static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
				3455	'_', 'a', 'z' };
				3456
				3457	static const int kDigitRangeCount = 2;
				3458	static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
				3459
				3460	static const int kLineTerminatorRangeCount = 6;
				3461	static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
				3462	0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
				3463
				3464	RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
				3465	RegExpNode* on_success) {
				3466	ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
				3467	elms->Add(TextElement::Atom(this));
				3468	return new TextNode(elms, on_success);
				3469	}
				3470
				3471
				3472	RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
				3473	RegExpNode* on_success) {
				3474	return new TextNode(elements(), on_success);
				3475	}
				3476
				3477	static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
				3478	const uc16* special_class,
				3479	int length) {
				3480	ASSERT(ranges->length() != 0);
				3481	ASSERT(length != 0);
				3482	ASSERT(special_class[0] != 0);
				3483	if (ranges->length() != (length >> 1) + 1) {
				3484	return false;
				3485	}
				3486	CharacterRange range = ranges->at(0);
				3487	if (range.from() != 0) {
				3488	return false;
				3489	}
				3490	for (int i = 0; i < length; i += 2) {
				3491	if (special_class[i] != (range.to() + 1)) {
				3492	return false;
				3493	}
				3494	range = ranges->at((i >> 1) + 1);
				3495	if (special_class[i+1] != range.from() - 1) {
				3496	return false;
				3497	}
				3498	}
				3499	if (range.to() != 0xffff) {
				3500	return false;
				3501	}
				3502	return true;
				3503	}
				3504
				3505
				3506	static bool CompareRanges(ZoneList<CharacterRange>* ranges,
				3507	const uc16* special_class,
				3508	int length) {
				3509	if (ranges->length() * 2 != length) {
				3510	return false;
				3511	}
				3512	for (int i = 0; i < length; i += 2) {
				3513	CharacterRange range = ranges->at(i >> 1);
				3514	if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {
				3515	return false;
				3516	}
				3517	}
				3518	return true;
				3519	}
				3520
				3521
				3522	bool RegExpCharacterClass::is_standard() {
				3523	// TODO(lrn): Remove need for this function, by not throwing away information
				3524	// along the way.
				3525	if (is_negated_) {
				3526	return false;
				3527	}
				3528	if (set_.is_standard()) {
				3529	return true;
				3530	}
				3531	if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3532	set_.set_standard_set_type('s');
				3533	return true;
				3534	}
				3535	if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
				3536	set_.set_standard_set_type('S');
				3537	return true;
				3538	}
				3539	if (CompareInverseRanges(set_.ranges(),
				3540	kLineTerminatorRanges,
				3541	kLineTerminatorRangeCount)) {
				3542	set_.set_standard_set_type('.');
				3543	return true;
				3544	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	3545	if (CompareRanges(set_.ranges(),
				3546	kLineTerminatorRanges,
				3547	kLineTerminatorRangeCount)) {
				3548	set_.set_standard_set_type('n');
				3549	return true;
				3550	}
				3551	if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3552	set_.set_standard_set_type('w');
				3553	return true;
				3554	}
				3555	if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
				3556	set_.set_standard_set_type('W');
				3557	return true;
				3558	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	3559	return false;
				3560	}
				3561
				3562
				3563	RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
				3564	RegExpNode* on_success) {
				3565	return new TextNode(this, on_success);
				3566	}
				3567
				3568
				3569	RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
				3570	RegExpNode* on_success) {
				3571	ZoneList<RegExpTree> alternatives = this->alternatives();
				3572	int length = alternatives->length();
				3573	ChoiceNode* result = new ChoiceNode(length);
				3574	for (int i = 0; i < length; i++) {
				3575	GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
				3576	on_success));
				3577	result->AddAlternative(alternative);
				3578	}
				3579	return result;
				3580	}
				3581
				3582
				3583	RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
				3584	RegExpNode* on_success) {
				3585	return ToNode(min(),
				3586	max(),
				3587	is_greedy(),
				3588	body(),
				3589	compiler,
				3590	on_success);
				3591	}
				3592
				3593
				3594	RegExpNode* RegExpQuantifier::ToNode(int min,
				3595	int max,
				3596	bool is_greedy,
				3597	RegExpTree* body,
				3598	RegExpCompiler* compiler,
				3599	RegExpNode* on_success,
				3600	bool not_at_start) {
				3601	// x{f, t} becomes this:
				3602	//
				3603	// (r++)<-.
				3604	// \| `
				3605	// \| (x)
				3606	// v ^
				3607	// (r=0)-->(?)---/ [if r < t]
				3608	// \|
				3609	// [if r >= f] \----> ...
				3610	//
				3611
				3612	// 15.10.2.5 RepeatMatcher algorithm.
				3613	// The parser has already eliminated the case where max is 0. In the case
				3614	// where max_match is zero the parser has removed the quantifier if min was
				3615	// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
				3616
				3617	// If we know that we cannot match zero length then things are a little
				3618	// simpler since we don't need to make the special zero length match check
				3619	// from step 2.1. If the min and max are small we can unroll a little in
				3620	// this case.
				3621	static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
				3622	static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
				3623	if (max == 0) return on_success; // This can happen due to recursion.
				3624	bool body_can_be_empty = (body->min_match() == 0);
				3625	int body_start_reg = RegExpCompiler::kNoRegister;
				3626	Interval capture_registers = body->CaptureRegisters();
				3627	bool needs_capture_clearing = !capture_registers.is_empty();
				3628	if (body_can_be_empty) {
				3629	body_start_reg = compiler->AllocateRegister();
				3630	} else if (FLAG_regexp_optimization && !needs_capture_clearing) {
				3631	// Only unroll if there are no captures and the body can't be
				3632	// empty.
				3633	if (min > 0 && min <= kMaxUnrolledMinMatches) {
				3634	int new_max = (max == kInfinity) ? max : max - min;
				3635	// Recurse once to get the loop or optional matches after the fixed ones.
				3636	RegExpNode* answer = ToNode(
				3637	0, new_max, is_greedy, body, compiler, on_success, true);
				3638	// Unroll the forced matches from 0 to min. This can cause chains of
				3639	// TextNodes (which the parser does not generate). These should be
				3640	// combined if it turns out they hinder good code generation.
				3641	for (int i = 0; i < min; i++) {
				3642	answer = body->ToNode(compiler, answer);
				3643	}
				3644	return answer;
				3645	}
				3646	if (max <= kMaxUnrolledMaxMatches) {
				3647	ASSERT(min == 0);
				3648	// Unroll the optional matches up to max.
				3649	RegExpNode* answer = on_success;
				3650	for (int i = 0; i < max; i++) {
				3651	ChoiceNode* alternation = new ChoiceNode(2);
				3652	if (is_greedy) {
				3653	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3654	answer)));
				3655	alternation->AddAlternative(GuardedAlternative(on_success));
				3656	} else {
				3657	alternation->AddAlternative(GuardedAlternative(on_success));
				3658	alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
				3659	answer)));
				3660	}
				3661	answer = alternation;
				3662	if (not_at_start) alternation->set_not_at_start();
				3663	}
				3664	return answer;
				3665	}
				3666	}
				3667	bool has_min = min > 0;
				3668	bool has_max = max < RegExpTree::kInfinity;
				3669	bool needs_counter = has_min \|\| has_max;
				3670	int reg_ctr = needs_counter
				3671	? compiler->AllocateRegister()
				3672	: RegExpCompiler::kNoRegister;
				3673	LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
				3674	if (not_at_start) center->set_not_at_start();
				3675	RegExpNode* loop_return = needs_counter
				3676	? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
				3677	: static_cast<RegExpNode*>(center);
				3678	if (body_can_be_empty) {
				3679	// If the body can be empty we need to check if it was and then
				3680	// backtrack.
				3681	loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
				3682	reg_ctr,
				3683	min,
				3684	loop_return);
				3685	}
				3686	RegExpNode* body_node = body->ToNode(compiler, loop_return);
				3687	if (body_can_be_empty) {
				3688	// If the body can be empty we need to store the start position
				3689	// so we can bail out if it was empty.
				3690	body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
				3691	}
				3692	if (needs_capture_clearing) {
				3693	// Before entering the body of this loop we need to clear captures.
				3694	body_node = ActionNode::ClearCaptures(capture_registers, body_node);
				3695	}
				3696	GuardedAlternative body_alt(body_node);
				3697	if (has_max) {
				3698	Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
				3699	body_alt.AddGuard(body_guard);
				3700	}
				3701	GuardedAlternative rest_alt(on_success);
				3702	if (has_min) {
				3703	Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
				3704	rest_alt.AddGuard(rest_guard);
				3705	}
				3706	if (is_greedy) {
				3707	center->AddLoopAlternative(body_alt);
				3708	center->AddContinueAlternative(rest_alt);
				3709	} else {
				3710	center->AddContinueAlternative(rest_alt);
				3711	center->AddLoopAlternative(body_alt);
				3712	}
				3713	if (needs_counter) {
				3714	return ActionNode::SetRegister(reg_ctr, 0, center);
				3715	} else {
				3716	return center;
				3717	}
				3718	}
				3719
				3720
				3721	RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
				3722	RegExpNode* on_success) {
				3723	NodeInfo info;
				3724	switch (type()) {
				3725	case START_OF_LINE:
				3726	return AssertionNode::AfterNewline(on_success);
				3727	case START_OF_INPUT:
				3728	return AssertionNode::AtStart(on_success);
				3729	case BOUNDARY:
				3730	return AssertionNode::AtBoundary(on_success);
				3731	case NON_BOUNDARY:
				3732	return AssertionNode::AtNonBoundary(on_success);
				3733	case END_OF_INPUT:
				3734	return AssertionNode::AtEnd(on_success);
				3735	case END_OF_LINE: {
				3736	// Compile $ in multiline regexps as an alternation with a positive
				3737	// lookahead in one side and an end-of-input on the other side.
				3738	// We need two registers for the lookahead.
				3739	int stack_pointer_register = compiler->AllocateRegister();
				3740	int position_register = compiler->AllocateRegister();
				3741	// The ChoiceNode to distinguish between a newline and end-of-input.
				3742	ChoiceNode* result = new ChoiceNode(2);
				3743	// Create a newline atom.
				3744	ZoneList<CharacterRange>* newline_ranges =
				3745	new ZoneList<CharacterRange>(3);
				3746	CharacterRange::AddClassEscape('n', newline_ranges);
				3747	RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
				3748	TextNode* newline_matcher = new TextNode(
				3749	newline_atom,
				3750	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3751	position_register,
				3752	0, // No captures inside.
				3753	-1, // Ignored if no captures.
				3754	on_success));
				3755	// Create an end-of-input matcher.
				3756	RegExpNode* end_of_line = ActionNode::BeginSubmatch(
				3757	stack_pointer_register,
				3758	position_register,
				3759	newline_matcher);
				3760	// Add the two alternatives to the ChoiceNode.
				3761	GuardedAlternative eol_alternative(end_of_line);
				3762	result->AddAlternative(eol_alternative);
				3763	GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
				3764	result->AddAlternative(end_alternative);
				3765	return result;
				3766	}
				3767	default:
				3768	UNREACHABLE();
				3769	}
				3770	return on_success;
				3771	}
				3772
				3773
				3774	RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
				3775	RegExpNode* on_success) {
				3776	return new BackReferenceNode(RegExpCapture::StartRegister(index()),
				3777	RegExpCapture::EndRegister(index()),
				3778	on_success);
				3779	}
				3780
				3781
				3782	RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
				3783	RegExpNode* on_success) {
				3784	return on_success;
				3785	}
				3786
				3787
				3788	RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
				3789	RegExpNode* on_success) {
				3790	int stack_pointer_register = compiler->AllocateRegister();
				3791	int position_register = compiler->AllocateRegister();
				3792
				3793	const int registers_per_capture = 2;
				3794	const int register_of_first_capture = 2;
				3795	int register_count = capture_count_ * registers_per_capture;
				3796	int register_start =
				3797	register_of_first_capture + capture_from_ * registers_per_capture;
				3798
				3799	RegExpNode* success;
				3800	if (is_positive()) {
				3801	RegExpNode* node = ActionNode::BeginSubmatch(
				3802	stack_pointer_register,
				3803	position_register,
				3804	body()->ToNode(
				3805	compiler,
				3806	ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
				3807	position_register,
				3808	register_count,
				3809	register_start,
				3810	on_success)));
				3811	return node;
				3812	} else {
				3813	// We use a ChoiceNode for a negative lookahead because it has most of
				3814	// the characteristics we need. It has the body of the lookahead as its
				3815	// first alternative and the expression after the lookahead of the second
				3816	// alternative. If the first alternative succeeds then the
				3817	// NegativeSubmatchSuccess will unwind the stack including everything the
				3818	// choice node set up and backtrack. If the first alternative fails then
				3819	// the second alternative is tried, which is exactly the desired result
				3820	// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
				3821	// ChoiceNode that knows to ignore the first exit when calculating quick
				3822	// checks.
				3823	GuardedAlternative body_alt(
				3824	body()->ToNode(
				3825	compiler,
				3826	success = new NegativeSubmatchSuccess(stack_pointer_register,
				3827	position_register,
				3828	register_count,
				3829	register_start)));
				3830	ChoiceNode* choice_node =
				3831	new NegativeLookaheadChoiceNode(body_alt,
				3832	GuardedAlternative(on_success));
				3833	return ActionNode::BeginSubmatch(stack_pointer_register,
				3834	position_register,
				3835	choice_node);
				3836	}
				3837	}
				3838
				3839
				3840	RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
				3841	RegExpNode* on_success) {
				3842	return ToNode(body(), index(), compiler, on_success);
				3843	}
				3844
				3845
				3846	RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
				3847	int index,
				3848	RegExpCompiler* compiler,
				3849	RegExpNode* on_success) {
				3850	int start_reg = RegExpCapture::StartRegister(index);
				3851	int end_reg = RegExpCapture::EndRegister(index);
				3852	RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
				3853	RegExpNode* body_node = body->ToNode(compiler, store_end);
				3854	return ActionNode::StorePosition(start_reg, true, body_node);
				3855	}
				3856
				3857
				3858	RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
				3859	RegExpNode* on_success) {
				3860	ZoneList<RegExpTree> children = nodes();
				3861	RegExpNode* current = on_success;
				3862	for (int i = children->length() - 1; i >= 0; i--) {
				3863	current = children->at(i)->ToNode(compiler, current);
				3864	}
				3865	return current;
				3866	}
				3867
				3868
				3869	static void AddClass(const uc16* elmv,
				3870	int elmc,
				3871	ZoneList<CharacterRange>* ranges) {
				3872	for (int i = 0; i < elmc; i += 2) {
				3873	ASSERT(elmv[i] <= elmv[i + 1]);
				3874	ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
				3875	}
				3876	}
				3877
				3878
				3879	static void AddClassNegated(const uc16 *elmv,
				3880	int elmc,
				3881	ZoneList<CharacterRange>* ranges) {
				3882	ASSERT(elmv[0] != 0x0000);
				3883	ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
				3884	uc16 last = 0x0000;
				3885	for (int i = 0; i < elmc; i += 2) {
				3886	ASSERT(last <= elmv[i] - 1);
				3887	ASSERT(elmv[i] <= elmv[i + 1]);
				3888	ranges->Add(CharacterRange(last, elmv[i] - 1));
				3889	last = elmv[i + 1] + 1;
				3890	}
				3891	ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
				3892	}
				3893
				3894
				3895	void CharacterRange::AddClassEscape(uc16 type,
				3896	ZoneList<CharacterRange>* ranges) {
				3897	switch (type) {
				3898	case 's':
				3899	AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
				3900	break;
				3901	case 'S':
				3902	AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
				3903	break;
				3904	case 'w':
				3905	AddClass(kWordRanges, kWordRangeCount, ranges);
				3906	break;
				3907	case 'W':
				3908	AddClassNegated(kWordRanges, kWordRangeCount, ranges);
				3909	break;
				3910	case 'd':
				3911	AddClass(kDigitRanges, kDigitRangeCount, ranges);
				3912	break;
				3913	case 'D':
				3914	AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
				3915	break;
				3916	case '.':
				3917	AddClassNegated(kLineTerminatorRanges,
				3918	kLineTerminatorRangeCount,
				3919	ranges);
				3920	break;
				3921	// This is not a character range as defined by the spec but a
				3922	// convenient shorthand for a character class that matches any
				3923	// character.
				3924	case '*':
				3925	ranges->Add(CharacterRange::Everything());
				3926	break;
				3927	// This is the set of characters matched by the $ and ^ symbols
				3928	// in multiline mode.
				3929	case 'n':
				3930	AddClass(kLineTerminatorRanges,
				3931	kLineTerminatorRangeCount,
				3932	ranges);
				3933	break;
				3934	default:
				3935	UNREACHABLE();
				3936	}
				3937	}
				3938
				3939
				3940	Vector<const uc16> CharacterRange::GetWordBounds() {
				3941	return Vector<const uc16>(kWordRanges, kWordRangeCount);
				3942	}
				3943
				3944
				3945	class CharacterRangeSplitter {
				3946	public:
				3947	CharacterRangeSplitter(ZoneList<CharacterRange>** included,
				3948	ZoneList<CharacterRange>** excluded)
				3949	: included_(included),
				3950	excluded_(excluded) { }
				3951	void Call(uc16 from, DispatchTable::Entry entry);
				3952
				3953	static const int kInBase = 0;
				3954	static const int kInOverlay = 1;
				3955
				3956	private:
				3957	ZoneList<CharacterRange>** included_;
				3958	ZoneList<CharacterRange>** excluded_;
				3959	};
				3960
				3961
				3962	void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
				3963	if (!entry.out_set()->Get(kInBase)) return;
				3964	ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
				3965	? included_
				3966	: excluded_;
				3967	if (target == NULL) target = new ZoneList<CharacterRange>(2);
				3968	(*target)->Add(CharacterRange(entry.from(), entry.to()));
				3969	}
				3970
				3971
				3972	void CharacterRange::Split(ZoneList<CharacterRange>* base,
				3973	Vector<const uc16> overlay,
				3974	ZoneList<CharacterRange>** included,
				3975	ZoneList<CharacterRange>** excluded) {
				3976	ASSERT_EQ(NULL, *included);
				3977	ASSERT_EQ(NULL, *excluded);
				3978	DispatchTable table;
				3979	for (int i = 0; i < base->length(); i++)
				3980	table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
				3981	for (int i = 0; i < overlay.length(); i += 2) {
				3982	table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
				3983	CharacterRangeSplitter::kInOverlay);
				3984	}
				3985	CharacterRangeSplitter callback(included, excluded);
				3986	table.ForEach(&callback);
				3987	}
				3988
				3989
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	3990	static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
				3991	int bottom,
				3992	int top);
				3993
				3994
				3995	void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
				3996	bool is_ascii) {
				3997	uc16 bottom = from();
				3998	uc16 top = to();
				3999	if (is_ascii) {
				4000	if (bottom > String::kMaxAsciiCharCode) return;
				4001	if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
				4002	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4003	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4004	if (top == bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4005	// If this is a singleton we just expand the one character.
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4006	int length = uncanonicalize.get(bottom, '\0', chars);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4007	for (int i = 0; i < length; i++) {
				4008	uc32 chr = chars[i];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4009	if (chr != bottom) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4010	ranges->Add(CharacterRange::Singleton(chars[i]));
				4011	}
				4012	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4013	} else if (bottom <= kRangeCanonicalizeMax &&
				4014	top <= kRangeCanonicalizeMax) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4015	// If this is a range we expand the characters block by block,
				4016	// expanding contiguous subranges (blocks) one at a time.
				4017	// The approach is as follows. For a given start character we
				4018	// look up the block that contains it, for instance 'a' if the
				4019	// start character is 'c'. A block is characterized by the property
				4020	// that all characters uncanonicalize in the same way as the first
				4021	// element, except that each entry in the result is incremented
				4022	// by the distance from the first element. So a-z is a block
				4023	// because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
				4024	// uncanonicalizes to ['a' + k, 'A' + k].
				4025	// Once we've found the start point we look up its uncanonicalization
				4026	// and produce a range for each element. For instance for [c-f]
				4027	// we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
				4028	// add a range if it is not already contained in the input, so [c-f]
				4029	// will be skipped but [C-F] will be added. If this range is not
				4030	// completely contained in a block we do this for all the blocks
				4031	// covered by the range.
				4032	unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4033	// First, look up the block that contains the 'bottom' character.
				4034	int length = canonrange.get(bottom, '\0', range);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4035	if (length == 0) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4036	range[0] = bottom;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4037	} else {
				4038	ASSERT_EQ(1, length);
				4039	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4040	int pos = bottom;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4041	// The start of the current block. Note that except for the first
				4042	// iteration 'start' is always equal to 'pos'.
				4043	int start;
				4044	// If it is not the start point of a block the entry contains the
				4045	// offset of the character from the start point.
				4046	if ((range[0] & kStartMarker) == 0) {
				4047	start = pos - range[0];
				4048	} else {
				4049	start = pos;
				4050	}
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4051	// Then we add the ranges one at a time, incrementing the current
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4052	// position to be after the last block each time. The position
				4053	// always points to the start of a block.
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4054	while (pos < top) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4055	length = canonrange.get(start, '\0', range);
				4056	if (length == 0) {
				4057	range[0] = start;
				4058	} else {
				4059	ASSERT_EQ(1, length);
				4060	}
				4061	ASSERT((range[0] & kStartMarker) != 0);
				4062	// The start point of a block contains the distance to the end
				4063	// of the range.
				4064	int block_end = start + (range[0] & kPayloadMask) - 1;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4065	int end = (block_end > top) ? top : block_end;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4066	length = uncanonicalize.get(start, '\0', range);
				4067	for (int i = 0; i < length; i++) {
				4068	uc32 c = range[i];
				4069	uc16 range_from = c + (pos - start);
				4070	uc16 range_to = c + (end - start);
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4071	if (!(bottom <= range_from && range_to <= top)) {
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4072	ranges->Add(CharacterRange(range_from, range_to));
				4073	}
				4074	}
				4075	start = pos = block_end + 1;
				4076	}
				4077	} else {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4078	// Unibrow ranges don't work for high characters due to the "2^11 bug".
				4079	// Therefore we do something dumber for these ranges.
				4080	AddUncanonicals(ranges, bottom, top);
				4081	}
				4082	}
				4083
				4084
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4085	bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
				4086	ASSERT_NOT_NULL(ranges);
				4087	int n = ranges->length();
				4088	if (n <= 1) return true;
				4089	int max = ranges->at(0).to();
				4090	for (int i = 1; i < n; i++) {
				4091	CharacterRange next_range = ranges->at(i);
				4092	if (next_range.from() <= max + 1) return false;
				4093	max = next_range.to();
				4094	}
				4095	return true;
				4096	}
				4097
				4098	SetRelation CharacterRange::WordCharacterRelation(
				4099	ZoneList<CharacterRange>* range) {
				4100	ASSERT(IsCanonical(range));
				4101	int i = 0; // Word character range index.
				4102	int j = 0; // Argument range index.
				4103	ASSERT_NE(0, kWordRangeCount);
				4104	SetRelation result;
				4105	if (range->length() == 0) {
				4106	result.SetElementsInSecondSet();
				4107	return result;
				4108	}
				4109	CharacterRange argument_range = range->at(0);
				4110	CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
				4111	while (i < kWordRangeCount && j < range->length()) {
				4112	// Check the two ranges for the five cases:
				4113	// - no overlap.
				4114	// - partial overlap (there are elements in both ranges that isn't
				4115	// in the other, and there are also elements that are in both).
				4116	// - argument range entirely inside word range.
				4117	// - word range entirely inside argument range.
				4118	// - ranges are completely equal.
				4119
				4120	// First check for no overlap. The earlier range is not in the other set.
				4121	if (argument_range.from() > word_range.to()) {
				4122	// Ranges are disjoint. The earlier word range contains elements that
				4123	// cannot be in the argument set.
				4124	result.SetElementsInSecondSet();
				4125	} else if (word_range.from() > argument_range.to()) {
				4126	// Ranges are disjoint. The earlier argument range contains elements that
				4127	// cannot be in the word set.
				4128	result.SetElementsInFirstSet();
				4129	} else if (word_range.from() <= argument_range.from() &&
				4130	word_range.to() >= argument_range.from()) {
				4131	result.SetElementsInBothSets();
				4132	// argument range completely inside word range.
				4133	if (word_range.from() < argument_range.from() \|\|
				4134	word_range.to() > argument_range.from()) {
				4135	result.SetElementsInSecondSet();
				4136	}
				4137	} else if (word_range.from() >= argument_range.from() &&
				4138	word_range.to() <= argument_range.from()) {
				4139	result.SetElementsInBothSets();
				4140	result.SetElementsInFirstSet();
				4141	} else {
				4142	// There is overlap, and neither is a subrange of the other
				4143	result.SetElementsInFirstSet();
				4144	result.SetElementsInSecondSet();
				4145	result.SetElementsInBothSets();
				4146	}
				4147	if (result.NonTrivialIntersection()) {
				4148	// The result is as (im)precise as we can possibly make it.
				4149	return result;
				4150	}
				4151	// Progress the range(s) with minimal to-character.
				4152	uc16 word_to = word_range.to();
				4153	uc16 argument_to = argument_range.to();
				4154	if (argument_to <= word_to) {
				4155	j++;
				4156	if (j < range->length()) {
				4157	argument_range = range->at(j);
				4158	}
				4159	}
				4160	if (word_to <= argument_to) {
				4161	i += 2;
				4162	if (i < kWordRangeCount) {
				4163	word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
				4164	}
				4165	}
				4166	}
				4167	// Check if anything wasn't compared in the loop.
				4168	if (i < kWordRangeCount) {
				4169	// word range contains something not in argument range.
				4170	result.SetElementsInSecondSet();
				4171	} else if (j < range->length()) {
				4172	// Argument range contains something not in word range.
				4173	result.SetElementsInFirstSet();
				4174	}
				4175
				4176	return result;
				4177	}
				4178
				4179
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4180	static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
				4181	int bottom,
				4182	int top) {
				4183	unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
				4184	// Zones with no case mappings. There is a DEBUG-mode loop to assert that
				4185	// this table is correct.
				4186	// 0x0600 - 0x0fff
				4187	// 0x1100 - 0x1cff
				4188	// 0x2000 - 0x20ff
				4189	// 0x2200 - 0x23ff
				4190	// 0x2500 - 0x2bff
				4191	// 0x2e00 - 0xa5ff
				4192	// 0xa800 - 0xfaff
				4193	// 0xfc00 - 0xfeff
				4194	const int boundary_count = 18;
				4195	// The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
				4196	// array. This is to split up big ranges and not because they actually denote
				4197	// a case-mapping-free-zone.
				4198	ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
				4199	const int kFirstRealCaselessZoneIndex = 2;
				4200	int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
				4201	0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
				4202	0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
				4203
				4204	// Special ASCII rule from spec can save us some work here.
				4205	if (bottom == 0x80 && top == 0xffff) return;
				4206
				4207	// We have optimized support for this range.
				4208	if (top <= CharacterRange::kRangeCanonicalizeMax) {
				4209	CharacterRange range(bottom, top);
				4210	range.AddCaseEquivalents(ranges, false);
				4211	return;
				4212	}
				4213
				4214	// Split up very large ranges. This helps remove ranges where there are no
				4215	// case mappings.
				4216	for (int i = 0; i < boundary_count; i++) {
				4217	if (bottom < boundaries[i] && top >= boundaries[i]) {
				4218	AddUncanonicals(ranges, bottom, boundaries[i] - 1);
				4219	AddUncanonicals(ranges, boundaries[i], top);
				4220	return;
				4221	}
				4222	}
				4223
				4224	// If we are completely in a zone with no case mappings then we are done.
				4225	// We start at 2 so as not to except the ASCII range from mappings.
				4226	for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
				4227	if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
				4228	#ifdef DEBUG
				4229	for (int j = bottom; j <= top; j++) {
				4230	unsigned current_char = j;
				4231	int length = uncanonicalize.get(current_char, '\0', chars);
				4232	for (int k = 0; k < length; k++) {
				4233	ASSERT(chars[k] == current_char);
				4234	}
				4235	}
				4236	#endif
				4237	return;
				4238	}
				4239	}
				4240
				4241	// Step through the range finding equivalent characters.
				4242	ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
				4243	for (int i = bottom; i <= top; i++) {
				4244	int length = uncanonicalize.get(i, '\0', chars);
				4245	for (int j = 0; j < length; j++) {
				4246	uc32 chr = chars[j];
				4247	if (chr != i && (chr < bottom \|\| chr > top)) {
				4248	characters->Add(chr);
				4249	}
				4250	}
				4251	}
				4252
				4253	// Step through the equivalent characters finding simple ranges and
				4254	// adding ranges to the character class.
				4255	if (characters->length() > 0) {
				4256	int new_from = characters->at(0);
				4257	int new_to = new_from;
				4258	for (int i = 1; i < characters->length(); i++) {
				4259	int chr = characters->at(i);
				4260	if (chr == new_to + 1) {
				4261	new_to++;
				4262	} else {
				4263	if (new_to == new_from) {
				4264	ranges->Add(CharacterRange::Singleton(new_from));
				4265	} else {
				4266	ranges->Add(CharacterRange(new_from, new_to));
				4267	}
				4268	new_from = new_to = chr;
				4269	}
				4270	}
				4271	if (new_to == new_from) {
				4272	ranges->Add(CharacterRange::Singleton(new_from));
				4273	} else {
				4274	ranges->Add(CharacterRange(new_from, new_to));
				4275	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4276	}
				4277	}
				4278
				4279
				4280	ZoneList<CharacterRange>* CharacterSet::ranges() {
				4281	if (ranges_ == NULL) {
				4282	ranges_ = new ZoneList<CharacterRange>(2);
				4283	CharacterRange::AddClassEscape(standard_set_type_, ranges_);
				4284	}
				4285	return ranges_;
				4286	}
				4287
				4288
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4289	// Move a number of elements in a zonelist to another position
				4290	// in the same list. Handles overlapping source and target areas.
				4291	static void MoveRanges(ZoneList<CharacterRange>* list,
				4292	int from,
				4293	int to,
				4294	int count) {
				4295	// Ranges are potentially overlapping.
				4296	if (from < to) {
				4297	for (int i = count - 1; i >= 0; i--) {
				4298	list->at(to + i) = list->at(from + i);
				4299	}
				4300	} else {
				4301	for (int i = 0; i < count; i++) {
				4302	list->at(to + i) = list->at(from + i);
				4303	}
				4304	}
				4305	}
				4306
				4307
				4308	static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
				4309	int count,
				4310	CharacterRange insert) {
				4311	// Inserts a range into list[0..count[, which must be sorted
				4312	// by from value and non-overlapping and non-adjacent, using at most
				4313	// list[0..count] for the result. Returns the number of resulting
				4314	// canonicalized ranges. Inserting a range may collapse existing ranges into
				4315	// fewer ranges, so the return value can be anything in the range 1..count+1.
				4316	uc16 from = insert.from();
				4317	uc16 to = insert.to();
				4318	int start_pos = 0;
				4319	int end_pos = count;
				4320	for (int i = count - 1; i >= 0; i--) {
				4321	CharacterRange current = list->at(i);
				4322	if (current.from() > to + 1) {
				4323	end_pos = i;
				4324	} else if (current.to() + 1 < from) {
				4325	start_pos = i + 1;
				4326	break;
				4327	}
				4328	}
				4329
				4330	// Inserted range overlaps, or is adjacent to, ranges at positions
				4331	// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
				4332	// not affected by the insertion.
				4333	// If start_pos == end_pos, the range must be inserted before start_pos.
				4334	// if start_pos < end_pos, the entire range from start_pos to end_pos
				4335	// must be merged with the insert range.
				4336
				4337	if (start_pos == end_pos) {
				4338	// Insert between existing ranges at position start_pos.
				4339	if (start_pos < count) {
				4340	MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
				4341	}
				4342	list->at(start_pos) = insert;
				4343	return count + 1;
				4344	}
				4345	if (start_pos + 1 == end_pos) {
				4346	// Replace single existing range at position start_pos.
				4347	CharacterRange to_replace = list->at(start_pos);
				4348	int new_from = Min(to_replace.from(), from);
				4349	int new_to = Max(to_replace.to(), to);
				4350	list->at(start_pos) = CharacterRange(new_from, new_to);
				4351	return count;
				4352	}
				4353	// Replace a number of existing ranges from start_pos to end_pos - 1.
				4354	// Move the remaining ranges down.
				4355
				4356	int new_from = Min(list->at(start_pos).from(), from);
				4357	int new_to = Max(list->at(end_pos - 1).to(), to);
				4358	if (end_pos < count) {
				4359	MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
				4360	}
				4361	list->at(start_pos) = CharacterRange(new_from, new_to);
				4362	return count - (end_pos - start_pos) + 1;
				4363	}
				4364
				4365
				4366	void CharacterSet::Canonicalize() {
				4367	// Special/default classes are always considered canonical. The result
				4368	// of calling ranges() will be sorted.
				4369	if (ranges_ == NULL) return;
				4370	CharacterRange::Canonicalize(ranges_);
				4371	}
				4372
				4373
				4374	void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
				4375	if (character_ranges->length() <= 1) return;
				4376	// Check whether ranges are already canonical (increasing, non-overlapping,
				4377	// non-adjacent).
				4378	int n = character_ranges->length();
				4379	int max = character_ranges->at(0).to();
				4380	int i = 1;
				4381	while (i < n) {
				4382	CharacterRange current = character_ranges->at(i);
				4383	if (current.from() <= max + 1) {
				4384	break;
				4385	}
				4386	max = current.to();
				4387	i++;
				4388	}
				4389	// Canonical until the i'th range. If that's all of them, we are done.
				4390	if (i == n) return;
				4391
				4392	// The ranges at index i and forward are not canonicalized. Make them so by
				4393	// doing the equivalent of insertion sort (inserting each into the previous
				4394	// list, in order).
				4395	// Notice that inserting a range can reduce the number of ranges in the
				4396	// result due to combining of adjacent and overlapping ranges.
				4397	int read = i; // Range to insert.
				4398	int num_canonical = i; // Length of canonicalized part of list.
				4399	do {
				4400	num_canonical = InsertRangeInCanonicalList(character_ranges,
				4401	num_canonical,
				4402	character_ranges->at(read));
				4403	read++;
				4404	} while (read < n);
				4405	character_ranges->Rewind(num_canonical);
				4406
				4407	ASSERT(CharacterRange::IsCanonical(character_ranges));
				4408	}
				4409
				4410
				4411	// Utility function for CharacterRange::Merge. Adds a range at the end of
				4412	// a canonicalized range list, if necessary merging the range with the last
				4413	// range of the list.
				4414	static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
				4415	if (set == NULL) return;
				4416	ASSERT(set->length() == 0 \|\| set->at(set->length() - 1).to() < range.from());
				4417	int n = set->length();
				4418	if (n > 0) {
				4419	CharacterRange lastRange = set->at(n - 1);
				4420	if (lastRange.to() == range.from() - 1) {
				4421	set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
				4422	return;
				4423	}
				4424	}
				4425	set->Add(range);
				4426	}
				4427
				4428
				4429	static void AddRangeToSelectedSet(int selector,
				4430	ZoneList<CharacterRange>* first_set,
				4431	ZoneList<CharacterRange>* second_set,
				4432	ZoneList<CharacterRange>* intersection_set,
				4433	CharacterRange range) {
				4434	switch (selector) {
				4435	case kInsideFirst:
				4436	AddRangeToSet(first_set, range);
				4437	break;
				4438	case kInsideSecond:
				4439	AddRangeToSet(second_set, range);
				4440	break;
				4441	case kInsideBoth:
				4442	AddRangeToSet(intersection_set, range);
				4443	break;
				4444	}
				4445	}
				4446
				4447
				4448
				4449	void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
				4450	ZoneList<CharacterRange>* second_set,
				4451	ZoneList<CharacterRange>* first_set_only_out,
				4452	ZoneList<CharacterRange>* second_set_only_out,
				4453	ZoneList<CharacterRange>* both_sets_out) {
				4454	// Inputs are canonicalized.
				4455	ASSERT(CharacterRange::IsCanonical(first_set));
				4456	ASSERT(CharacterRange::IsCanonical(second_set));
				4457	// Outputs are empty, if applicable.
				4458	ASSERT(first_set_only_out == NULL \|\| first_set_only_out->length() == 0);
				4459	ASSERT(second_set_only_out == NULL \|\| second_set_only_out->length() == 0);
				4460	ASSERT(both_sets_out == NULL \|\| both_sets_out->length() == 0);
				4461
				4462	// Merge sets by iterating through the lists in order of lowest "from" value,
				4463	// and putting intervals into one of three sets.
				4464
				4465	if (first_set->length() == 0) {
				4466	second_set_only_out->AddAll(*second_set);
				4467	return;
				4468	}
				4469	if (second_set->length() == 0) {
				4470	first_set_only_out->AddAll(*first_set);
				4471	return;
				4472	}
				4473	// Indices into input lists.
				4474	int i1 = 0;
				4475	int i2 = 0;
				4476	// Cache length of input lists.
				4477	int n1 = first_set->length();
				4478	int n2 = second_set->length();
				4479	// Current range. May be invalid if state is kInsideNone.
				4480	int from = 0;
				4481	int to = -1;
				4482	// Where current range comes from.
				4483	int state = kInsideNone;
				4484
				4485	while (i1 < n1 \|\| i2 < n2) {
				4486	CharacterRange next_range;
				4487	int range_source;
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4488	if (i2 == n2 \|\|
				4489	(i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
				4490	// Next smallest element is in first set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4491	next_range = first_set->at(i1++);
				4492	range_source = kInsideFirst;
				4493	} else {
Leon Clarke	d91b9f7	2010-01-27 17:25:45 +0000	[diff] [blame]	4494	// Next smallest element is in second set.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4495	next_range = second_set->at(i2++);
				4496	range_source = kInsideSecond;
				4497	}
				4498	if (to < next_range.from()) {
				4499	// Ranges disjoint: \|current\| \|next\|
				4500	AddRangeToSelectedSet(state,
				4501	first_set_only_out,
				4502	second_set_only_out,
				4503	both_sets_out,
				4504	CharacterRange(from, to));
				4505	from = next_range.from();
				4506	to = next_range.to();
				4507	state = range_source;
				4508	} else {
				4509	if (from < next_range.from()) {
				4510	AddRangeToSelectedSet(state,
				4511	first_set_only_out,
				4512	second_set_only_out,
				4513	both_sets_out,
				4514	CharacterRange(from, next_range.from()-1));
				4515	}
				4516	if (to < next_range.to()) {
				4517	// Ranges overlap: \|current\|
				4518	// \|next\|
				4519	AddRangeToSelectedSet(state \| range_source,
				4520	first_set_only_out,
				4521	second_set_only_out,
				4522	both_sets_out,
				4523	CharacterRange(next_range.from(), to));
				4524	from = to + 1;
				4525	to = next_range.to();
				4526	state = range_source;
				4527	} else {
				4528	// Range included: \|current\| , possibly ending at same character.
				4529	// \|next\|
				4530	AddRangeToSelectedSet(
				4531	state \| range_source,
				4532	first_set_only_out,
				4533	second_set_only_out,
				4534	both_sets_out,
				4535	CharacterRange(next_range.from(), next_range.to()));
				4536	from = next_range.to() + 1;
				4537	// If ranges end at same character, both ranges are consumed completely.
				4538	if (next_range.to() == to) state = kInsideNone;
				4539	}
				4540	}
				4541	}
				4542	AddRangeToSelectedSet(state,
				4543	first_set_only_out,
				4544	second_set_only_out,
				4545	both_sets_out,
				4546	CharacterRange(from, to));
				4547	}
				4548
				4549
				4550	void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
				4551	ZoneList<CharacterRange>* negated_ranges) {
				4552	ASSERT(CharacterRange::IsCanonical(ranges));
				4553	ASSERT_EQ(0, negated_ranges->length());
				4554	int range_count = ranges->length();
				4555	uc16 from = 0;
				4556	int i = 0;
				4557	if (range_count > 0 && ranges->at(0).from() == 0) {
				4558	from = ranges->at(0).to();
				4559	i = 1;
				4560	}
				4561	while (i < range_count) {
				4562	CharacterRange range = ranges->at(i);
				4563	negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
				4564	from = range.to();
				4565	i++;
				4566	}
				4567	if (from < String::kMaxUC16CharCode) {
				4568	negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
				4569	}
				4570	}
				4571
				4572
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4573
				4574	// -------------------------------------------------------------------
				4575	// Interest propagation
				4576
				4577
				4578	RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
				4579	for (int i = 0; i < siblings_.length(); i++) {
				4580	RegExpNode* sibling = siblings_.Get(i);
				4581	if (sibling->info()->Matches(info))
				4582	return sibling;
				4583	}
				4584	return NULL;
				4585	}
				4586
				4587
				4588	RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
				4589	ASSERT_EQ(false, *cloned);
				4590	siblings_.Ensure(this);
				4591	RegExpNode* result = TryGetSibling(info);
				4592	if (result != NULL) return result;
				4593	result = this->Clone();
				4594	NodeInfo* new_info = result->info();
				4595	new_info->ResetCompilationState();
				4596	new_info->AddFromPreceding(info);
				4597	AddSibling(result);
				4598	*cloned = true;
				4599	return result;
				4600	}
				4601
				4602
				4603	template <class C>
				4604	static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
				4605	NodeInfo full_info(*node->info());
				4606	full_info.AddFromPreceding(info);
				4607	bool cloned = false;
				4608	return RegExpNode::EnsureSibling(node, &full_info, &cloned);
				4609	}
				4610
				4611
				4612	// -------------------------------------------------------------------
				4613	// Splay tree
				4614
				4615
				4616	OutSet* OutSet::Extend(unsigned value) {
				4617	if (Get(value))
				4618	return this;
				4619	if (successors() != NULL) {
				4620	for (int i = 0; i < successors()->length(); i++) {
				4621	OutSet* successor = successors()->at(i);
				4622	if (successor->Get(value))
				4623	return successor;
				4624	}
				4625	} else {
				4626	successors_ = new ZoneList<OutSet*>(2);
				4627	}
				4628	OutSet* result = new OutSet(first_, remaining_);
				4629	result->Set(value);
				4630	successors()->Add(result);
				4631	return result;
				4632	}
				4633
				4634
				4635	void OutSet::Set(unsigned value) {
				4636	if (value < kFirstLimit) {
				4637	first_ \|= (1 << value);
				4638	} else {
				4639	if (remaining_ == NULL)
				4640	remaining_ = new ZoneList<unsigned>(1);
				4641	if (remaining_->is_empty() \|\| !remaining_->Contains(value))
				4642	remaining_->Add(value);
				4643	}
				4644	}
				4645
				4646
				4647	bool OutSet::Get(unsigned value) {
				4648	if (value < kFirstLimit) {
				4649	return (first_ & (1 << value)) != 0;
				4650	} else if (remaining_ == NULL) {
				4651	return false;
				4652	} else {
				4653	return remaining_->Contains(value);
				4654	}
				4655	}
				4656
				4657
				// Out-of-class definitions of the static constants of DispatchTable::Config.
				// kNoKey takes its value from unibrow::Utf8::kBadChar and kNoValue is a
				// default-constructed Entry. Presumably these are the "no key" / "no value"
				// sentinels of the ZoneSplayTree<Config> behind the table - confirm against
				// the splay tree implementation.
				4658	const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
				4659	const DispatchTable::Entry DispatchTable::Config::kNoValue;
				4660
				4661
				4662	void DispatchTable::AddRange(CharacterRange full_range, int value) {
				4663	CharacterRange current = full_range;
				4664	if (tree()->is_empty()) {
				4665	// If this is the first range we just insert into the table.
				4666	ZoneSplayTree<Config>::Locator loc;
				4667	ASSERT_RESULT(tree()->Insert(current.from(), &loc));
				4668	loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
				4669	return;
				4670	}
				4671	// First see if there is a range to the left of this one that
				4672	// overlaps.
				4673	ZoneSplayTree<Config>::Locator loc;
				4674	if (tree()->FindGreatestLessThan(current.from(), &loc)) {
				4675	Entry* entry = &loc.value();
				4676	// If we've found a range that overlaps with this one, and it
				4677	// starts strictly to the left of this one, we have to fix it
				4678	// because the following code only handles ranges that start on
				4679	// or after the start point of the range we're adding.
				4680	if (entry->from() < current.from() && entry->to() >= current.from()) {
				4681	// Snap the overlapping range in half around the start point of
				4682	// the range we're adding.
				4683	CharacterRange left(entry->from(), current.from() - 1);
				4684	CharacterRange right(current.from(), entry->to());
				4685	// The left part of the overlapping range doesn't overlap.
				4686	// Truncate the whole entry to be just the left part.
				4687	entry->set_to(left.to());
				4688	// The right part is the one that overlaps. We add this part
				4689	// to the map and let the next step deal with merging it with
				4690	// the range we're adding.
				4691	ZoneSplayTree<Config>::Locator loc;
				4692	ASSERT_RESULT(tree()->Insert(right.from(), &loc));
				4693	loc.set_value(Entry(right.from(),
				4694	right.to(),
				4695	entry->out_set()));
				4696	}
				4697	}
				4698	while (current.is_valid()) {
				4699	if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
				4700	(loc.value().from() <= current.to()) &&
				4701	(loc.value().to() >= current.from())) {
				4702	Entry* entry = &loc.value();
				4703	// We have overlap. If there is space between the start point of
				4704	// the range we're adding and where the overlapping range starts
				4705	// then we have to add a range covering just that space.
				4706	if (current.from() < entry->from()) {
				4707	ZoneSplayTree<Config>::Locator ins;
				4708	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4709	ins.set_value(Entry(current.from(),
				4710	entry->from() - 1,
				4711	empty()->Extend(value)));
				4712	current.set_from(entry->from());
				4713	}
				4714	ASSERT_EQ(current.from(), entry->from());
				4715	// If the overlapping range extends beyond the one we want to add
				4716	// we have to snap the right part off and add it separately.
				4717	if (entry->to() > current.to()) {
				4718	ZoneSplayTree<Config>::Locator ins;
				4719	ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
				4720	ins.set_value(Entry(current.to() + 1,
				4721	entry->to(),
				4722	entry->out_set()));
				4723	entry->set_to(current.to());
				4724	}
				4725	ASSERT(entry->to() <= current.to());
				4726	// The overlapping range is now completely contained by the range
				4727	// we're adding so we can just update it and move the start point
				4728	// of the range we're adding just past it.
				4729	entry->AddValue(value);
				4730	// Bail out if the last interval ended at 0xFFFF since otherwise
				4731	// adding 1 will wrap around to 0.
				4732	if (entry->to() == String::kMaxUC16CharCode)
				4733	break;
				4734	ASSERT(entry->to() + 1 > current.from());
				4735	current.set_from(entry->to() + 1);
				4736	} else {
				4737	// There is no overlap so we can just add the range
				4738	ZoneSplayTree<Config>::Locator ins;
				4739	ASSERT_RESULT(tree()->Insert(current.from(), &ins));
				4740	ins.set_value(Entry(current.from(),
				4741	current.to(),
				4742	empty()->Extend(value)));
				4743	break;
				4744	}
				4745	}
				4746	}
				4747
				4748
				4749	OutSet* DispatchTable::Get(uc16 value) {
				4750	ZoneSplayTree<Config>::Locator loc;
				4751	if (!tree()->FindGreatestLessThan(value, &loc))
				4752	return empty();
				4753	Entry* entry = &loc.value();
				4754	if (value <= entry->to())
				4755	return entry->out_set();
				4756	else
				4757	return empty();
				4758	}
				4759
				4760
				4761	// -------------------------------------------------------------------
				4762	// Analysis
				4763
				4764
				4765	void Analysis::EnsureAnalyzed(RegExpNode* that) {
				4766	StackLimitCheck check;
				4767	if (check.HasOverflowed()) {
				4768	fail("Stack overflow");
				4769	return;
				4770	}
				4771	if (that->info()->been_analyzed \|\| that->info()->being_analyzed)
				4772	return;
				4773	that->info()->being_analyzed = true;
				4774	that->Accept(this);
				4775	that->info()->being_analyzed = false;
				4776	that->info()->been_analyzed = true;
				4777	}
				4778
				4779
				4780	void Analysis::VisitEnd(EndNode* that) {
				4781	// nothing to do
				4782	}
				4783
				4784
				4785	void TextNode::CalculateOffsets() {
				4786	int element_count = elements()->length();
				4787	// Set up the offsets of the elements relative to the start. This is a fixed
				4788	// quantity since a TextNode can only contain fixed-width things.
				4789	int cp_offset = 0;
				4790	for (int i = 0; i < element_count; i++) {
				4791	TextElement& elm = elements()->at(i);
				4792	elm.cp_offset = cp_offset;
				4793	if (elm.type == TextElement::ATOM) {
				4794	cp_offset += elm.data.u_atom->data().length();
				4795	} else {
				4796	cp_offset++;
				4797	Vector<const uc16> quarks = elm.data.u_atom->data();
				4798	}
				4799	}
				4800	}
				4801
				4802
				4803	void Analysis::VisitText(TextNode* that) {
				4804	if (ignore_case_) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	4805	that->MakeCaseIndependent(is_ascii_);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4806	}
				4807	EnsureAnalyzed(that->on_success());
				4808	if (!has_failed()) {
				4809	that->CalculateOffsets();
				4810	}
				4811	}
				4812
				4813
				4814	void Analysis::VisitAction(ActionNode* that) {
				4815	RegExpNode* target = that->on_success();
				4816	EnsureAnalyzed(target);
				4817	if (!has_failed()) {
				4818	// If the next node is interested in what it follows then this node
				4819	// has to be interested too so it can pass the information on.
				4820	that->info()->AddFromFollowing(target->info());
				4821	}
				4822	}
				4823
				4824
				4825	void Analysis::VisitChoice(ChoiceNode* that) {
				4826	NodeInfo* info = that->info();
				4827	for (int i = 0; i < that->alternatives()->length(); i++) {
				4828	RegExpNode* node = that->alternatives()->at(i).node();
				4829	EnsureAnalyzed(node);
				4830	if (has_failed()) return;
				4831	// Anything the following nodes need to know has to be known by
				4832	// this node also, so it can pass it on.
				4833	info->AddFromFollowing(node->info());
				4834	}
				4835	}
				4836
				4837
				4838	void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
				4839	NodeInfo* info = that->info();
				4840	for (int i = 0; i < that->alternatives()->length(); i++) {
				4841	RegExpNode* node = that->alternatives()->at(i).node();
				4842	if (node != that->loop_node()) {
				4843	EnsureAnalyzed(node);
				4844	if (has_failed()) return;
				4845	info->AddFromFollowing(node->info());
				4846	}
				4847	}
				4848	// Check the loop last since it may need the value of this node
				4849	// to get a correct result.
				4850	EnsureAnalyzed(that->loop_node());
				4851	if (!has_failed()) {
				4852	info->AddFromFollowing(that->loop_node()->info());
				4853	}
				4854	}
				4855
				4856
				4857	void Analysis::VisitBackReference(BackReferenceNode* that) {
				4858	EnsureAnalyzed(that->on_success());
				4859	}
				4860
				4861
				4862	void Analysis::VisitAssertion(AssertionNode* that) {
				4863	EnsureAnalyzed(that->on_success());
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4864	AssertionNode::AssertionNodeType type = that->type();
				4865	if (type == AssertionNode::AT_BOUNDARY \|\|
				4866	type == AssertionNode::AT_NON_BOUNDARY) {
				4867	// Check if the following character is known to be a word character
				4868	// or known to not be a word character.
				4869	ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
				4870
				4871	CharacterRange::Canonicalize(following_chars);
				4872
				4873	SetRelation word_relation =
				4874	CharacterRange::WordCharacterRelation(following_chars);
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4875	if (word_relation.Disjoint()) {
				4876	// Includes the case where following_chars is empty (e.g., end-of-input).
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4877	// Following character is definitely not a word character.
				4878	type = (type == AssertionNode::AT_BOUNDARY) ?
Andrei Popescu	6d3d5a3	2010-04-27 19:40:12 +0100	[diff] [blame]	4879	AssertionNode::AFTER_WORD_CHARACTER :
				4880	AssertionNode::AFTER_NONWORD_CHARACTER;
				4881	that->set_type(type);
				4882	} else if (word_relation.ContainedIn()) {
				4883	// Following character is definitely a word character.
				4884	type = (type == AssertionNode::AT_BOUNDARY) ?
				4885	AssertionNode::AFTER_NONWORD_CHARACTER :
				4886	AssertionNode::AFTER_WORD_CHARACTER;
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4887	that->set_type(type);
				4888	}
				4889	}
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	4890	}
				4891
				4892
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4893	ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
				4894	if (first_character_set_ == NULL) {
				4895	if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
				4896	// If we can't find an exact solution within the budget, we
				4897	// set the value to the set of every character, i.e., all characters
				4898	// are possible.
				4899	ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
				4900	all_set->Add(CharacterRange::Everything());
				4901	first_character_set_ = all_set;
				4902	}
				4903	}
				4904	return first_character_set_;
				4905	}
				4906
				4907
				4908	int RegExpNode::ComputeFirstCharacterSet(int budget) {
				4909	// Default behavior is to not be able to determine the first character.
				4910	return kComputeFirstCharacterSetFail;
				4911	}
				4912
				4913
				4914	int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
				4915	budget--;
				4916	if (budget >= 0) {
				4917	// Find loop min-iteration. It's the value of the guarded choice node
				4918	// with a GEQ guard, if any.
				4919	int min_repetition = 0;
				4920
				4921	for (int i = 0; i <= 1; i++) {
				4922	GuardedAlternative alternative = alternatives()->at(i);
				4923	ZoneList<Guard> guards = alternative.guards();
				4924	if (guards != NULL && guards->length() > 0) {
				4925	Guard* guard = guards->at(0);
				4926	if (guard->op() == Guard::GEQ) {
				4927	min_repetition = guard->value();
				4928	break;
				4929	}
				4930	}
				4931	}
				4932
				4933	budget = loop_node()->ComputeFirstCharacterSet(budget);
				4934	if (budget >= 0) {
				4935	ZoneList<CharacterRange>* character_set =
				4936	loop_node()->first_character_set();
				4937	if (body_can_be_zero_length() \|\| min_repetition == 0) {
				4938	budget = continue_node()->ComputeFirstCharacterSet(budget);
				4939	if (budget < 0) return budget;
				4940	ZoneList<CharacterRange>* body_set =
				4941	continue_node()->first_character_set();
				4942	ZoneList<CharacterRange>* union_set =
				4943	new ZoneList<CharacterRange>(Max(character_set->length(),
				4944	body_set->length()));
				4945	CharacterRange::Merge(character_set,
				4946	body_set,
				4947	union_set,
				4948	union_set,
				4949	union_set);
				4950	character_set = union_set;
				4951	}
				4952	set_first_character_set(character_set);
				4953	}
				4954	}
				4955	return budget;
				4956	}
				4957
				4958
				4959	int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
				4960	budget--;
				4961	if (budget >= 0) {
				4962	GuardedAlternative successor = this->alternatives()->at(1);
				4963	RegExpNode* successor_node = successor.node();
				4964	budget = successor_node->ComputeFirstCharacterSet(budget);
				4965	if (budget >= 0) {
				4966	set_first_character_set(successor_node->first_character_set());
				4967	}
				4968	}
				4969	return budget;
				4970	}
				4971
				4972
				4973	// The first character set of an EndNode is unknowable. Just use the
				4974	// default implementation that fails and returns all characters as possible.
				4975
				4976
				4977	int AssertionNode::ComputeFirstCharacterSet(int budget) {
				4978	budget -= 1;
				4979	if (budget >= 0) {
				4980	switch (type_) {
				4981	case AT_END: {
				4982	set_first_character_set(new ZoneList<CharacterRange>(0));
				4983	break;
				4984	}
				4985	case AT_START:
				4986	case AT_BOUNDARY:
				4987	case AT_NON_BOUNDARY:
				4988	case AFTER_NEWLINE:
				4989	case AFTER_NONWORD_CHARACTER:
				4990	case AFTER_WORD_CHARACTER: {
				4991	ASSERT_NOT_NULL(on_success());
				4992	budget = on_success()->ComputeFirstCharacterSet(budget);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	4993	if (budget >= 0) {
				4994	set_first_character_set(on_success()->first_character_set());
				4995	}
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	4996	break;
				4997	}
				4998	}
				4999	}
				5000	return budget;
				5001	}
				5002
				5003
				5004	int ActionNode::ComputeFirstCharacterSet(int budget) {
				5005	if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
				5006	budget--;
				5007	if (budget >= 0) {
				5008	ASSERT_NOT_NULL(on_success());
				5009	budget = on_success()->ComputeFirstCharacterSet(budget);
				5010	if (budget >= 0) {
				5011	set_first_character_set(on_success()->first_character_set());
				5012	}
				5013	}
				5014	return budget;
				5015	}
				5016
				5017
				5018	int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
				5019	// We don't know anything about the first character of a backreference
				5020	// at this point.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5021	// The potential first characters are the first characters of the capture,
				5022	// and the first characters of the on_success node, depending on whether the
				5023	// capture can be empty and whether it is known to be participating or known
				5024	// not to be.
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5025	return kComputeFirstCharacterSetFail;
				5026	}
				5027
				5028
				5029	int TextNode::ComputeFirstCharacterSet(int budget) {
				5030	budget--;
				5031	if (budget >= 0) {
				5032	ASSERT_NE(0, elements()->length());
				5033	TextElement text = elements()->at(0);
				5034	if (text.type == TextElement::ATOM) {
				5035	RegExpAtom* atom = text.data.u_atom;
				5036	ASSERT_NE(0, atom->length());
				5037	uc16 first_char = atom->data()[0];
				5038	ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
				5039	range->Add(CharacterRange(first_char, first_char));
				5040	set_first_character_set(range);
				5041	} else {
				5042	ASSERT(text.type == TextElement::CHAR_CLASS);
				5043	RegExpCharacterClass* char_class = text.data.u_char_class;
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5044	ZoneList<CharacterRange>* ranges = char_class->ranges();
				5045	// TODO(lrn): Canonicalize ranges when they are created
				5046	// instead of waiting until now.
				5047	CharacterRange::Canonicalize(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5048	if (char_class->is_negated()) {
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5049	int length = ranges->length();
				5050	int new_length = length + 1;
				5051	if (length > 0) {
				5052	if (ranges->at(0).from() == 0) new_length--;
				5053	if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
				5054	new_length--;
				5055	}
				5056	}
				5057	ZoneList<CharacterRange>* negated_ranges =
				5058	new ZoneList<CharacterRange>(new_length);
				5059	CharacterRange::Negate(ranges, negated_ranges);
				5060	set_first_character_set(negated_ranges);
				5061	} else {
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5062	set_first_character_set(ranges);
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5063	}
				5064	}
				5065	}
				5066	return budget;
				5067	}
				5068
				5069
				5070
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5071	// -------------------------------------------------------------------
				5072	// Dispatch table construction
				5073
				5074
				5075	void DispatchTableConstructor::VisitEnd(EndNode* that) {
				5076	AddRange(CharacterRange::Everything());
				5077	}
				5078
				5079
				5080	void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
				5081	node->set_being_calculated(true);
				5082	ZoneList<GuardedAlternative>* alternatives = node->alternatives();
				5083	for (int i = 0; i < alternatives->length(); i++) {
				5084	set_choice_index(i);
				5085	alternatives->at(i).node()->Accept(this);
				5086	}
				5087	node->set_being_calculated(false);
				5088	}
				5089
				5090
				5091	class AddDispatchRange {
				5092	public:
				5093	explicit AddDispatchRange(DispatchTableConstructor* constructor)
				5094	: constructor_(constructor) { }
				5095	void Call(uc32 from, DispatchTable::Entry entry);
				5096	private:
				5097	DispatchTableConstructor* constructor_;
				5098	};
				5099
				5100
				5101	void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
				5102	CharacterRange range(from, entry.to());
				5103	constructor_->AddRange(range);
				5104	}
				5105
				5106
				5107	void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
				5108	if (node->being_calculated())
				5109	return;
				5110	DispatchTable* table = node->GetTable(ignore_case_);
				5111	AddDispatchRange adder(this);
				5112	table->ForEach(&adder);
				5113	}
				5114
				5115
				5116	void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
				5117	// TODO(160): Find the node that we refer back to and propagate its start
				5118	// set back to here. For now we just accept anything.
				5119	AddRange(CharacterRange::Everything());
				5120	}
				5121
				5122
				5123	void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
				5124	RegExpNode* target = that->on_success();
				5125	target->Accept(this);
				5126	}
				5127
				5128
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5129	static int CompareRangeByFrom(const CharacterRange* a,
				5130	const CharacterRange* b) {
				5131	return Compare<uc16>(a->from(), b->from());
				5132	}
				5133
				5134
				5135	void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
				5136	ranges->Sort(CompareRangeByFrom);
				5137	uc16 last = 0;
				5138	for (int i = 0; i < ranges->length(); i++) {
				5139	CharacterRange range = ranges->at(i);
				5140	if (last < range.from())
				5141	AddRange(CharacterRange(last, range.from() - 1));
				5142	if (range.to() >= last) {
				5143	if (range.to() == String::kMaxUC16CharCode) {
				5144	return;
				5145	} else {
				5146	last = range.to() + 1;
				5147	}
				5148	}
				5149	}
				5150	AddRange(CharacterRange(last, String::kMaxUC16CharCode));
				5151	}
				5152
				5153
				5154	void DispatchTableConstructor::VisitText(TextNode* that) {
				5155	TextElement elm = that->elements()->at(0);
				5156	switch (elm.type) {
				5157	case TextElement::ATOM: {
				5158	uc16 c = elm.data.u_atom->data()[0];
				5159	AddRange(CharacterRange(c, c));
				5160	break;
				5161	}
				5162	case TextElement::CHAR_CLASS: {
				5163	RegExpCharacterClass* tree = elm.data.u_char_class;
				5164	ZoneList<CharacterRange>* ranges = tree->ranges();
				5165	if (tree->is_negated()) {
				5166	AddInverse(ranges);
				5167	} else {
				5168	for (int i = 0; i < ranges->length(); i++)
				5169	AddRange(ranges->at(i));
				5170	}
				5171	break;
				5172	}
				5173	default: {
				5174	UNIMPLEMENTED();
				5175	}
				5176	}
				5177	}
				5178
				5179
				5180	void DispatchTableConstructor::VisitAction(ActionNode* that) {
				5181	RegExpNode* target = that->on_success();
				5182	target->Accept(this);
				5183	}
				5184
				5185
				5186	RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
				5187	bool ignore_case,
				5188	bool is_multiline,
				5189	Handle<String> pattern,
				5190	bool is_ascii) {
				5191	if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
				5192	return IrregexpRegExpTooBig();
				5193	}
				5194	RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
				5195	// Wrap the body of the regexp in capture #0.
				5196	RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
				5197	0,
				5198	&compiler,
				5199	compiler.accept());
				5200	RegExpNode* node = captured_body;
				5201	if (!data->tree->IsAnchored()) {
				5202	// Add a .*? at the beginning, outside the body capture, unless
				5203	// this expression is anchored at the beginning.
				5204	RegExpNode* loop_node =
				5205	RegExpQuantifier::ToNode(0,
				5206	RegExpTree::kInfinity,
				5207	false,
				5208	new RegExpCharacterClass('*'),
				5209	&compiler,
				5210	captured_body,
				5211	data->contains_anchor);
				5212
				5213	if (data->contains_anchor) {
				5214	// Unroll loop once, to take care of the case that might start
				5215	// at the start of input.
				5216	ChoiceNode* first_step_node = new ChoiceNode(2);
				5217	first_step_node->AddAlternative(GuardedAlternative(captured_body));
				5218	first_step_node->AddAlternative(GuardedAlternative(
				5219	new TextNode(new RegExpCharacterClass('*'), loop_node)));
				5220	node = first_step_node;
				5221	} else {
				5222	node = loop_node;
				5223	}
				5224	}
				5225	data->node = node;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	5226	Analysis analysis(ignore_case, is_ascii);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5227	analysis.EnsureAnalyzed(node);
				5228	if (analysis.has_failed()) {
				5229	const char* error_message = analysis.error_message();
				5230	return CompilationResult(error_message);
				5231	}
				5232
				5233	NodeInfo info = *node->info();
				5234
				5235	// Create the correct assembler for the architecture.
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5236	#ifndef V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5237	// Native regexp implementation.
				5238
				5239	NativeRegExpMacroAssembler::Mode mode =
				5240	is_ascii ? NativeRegExpMacroAssembler::ASCII
				5241	: NativeRegExpMacroAssembler::UC16;
				5242
				5243	#if V8_TARGET_ARCH_IA32
				5244	RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
				5245	#elif V8_TARGET_ARCH_X64
				5246	RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
				5247	#elif V8_TARGET_ARCH_ARM
				5248	RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
				5249	#endif
				5250
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5251	#else // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5252	// Interpreted regexp implementation.
				5253	EmbeddedVector<byte, 1024> codes;
				5254	RegExpMacroAssemblerIrregexp macro_assembler(codes);
Steve Block	6ded16b	2010-05-10 14:33:55 +0100	[diff] [blame]	5255	#endif // V8_INTERPRETED_REGEXP
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5256
				5257	return compiler.Assemble(&macro_assembler,
				5258	node,
				5259	data->capture_count,
				5260	pattern);
				5261	}
				5262
Leon Clarke	e46be81	2010-01-19 14:06:41 +0000	[diff] [blame]	5263
				5264	int OffsetsVector::static_offsets_vector_[
				5265	OffsetsVector::kStaticOffsetsVectorSize];
				5266
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	5267	}} // namespace v8::internal