blob: 06208aa50aebda4d9170781d220dac084b98cb24 [file] [log] [blame]
ager@chromium.org381abbb2009-02-25 13:23:22 +00001// Copyright 2006-2009 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000031#include "compiler.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000032#include "execution.h"
33#include "factory.h"
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +000034#include "jsregexp.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000035#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000036#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000037#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000038#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000039#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000044#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000045
sgjesse@chromium.org911335c2009-08-19 12:59:44 +000046#ifdef V8_NATIVE_REGEXP
kasperl@chromium.org71affb52009-05-26 05:44:31 +000047#if V8_TARGET_ARCH_IA32
ager@chromium.org3a37e9b2009-04-27 09:26:21 +000048#include "ia32/macro-assembler-ia32.h"
49#include "ia32/regexp-macro-assembler-ia32.h"
ager@chromium.org9085a012009-05-11 19:22:57 +000050#elif V8_TARGET_ARCH_X64
51#include "x64/macro-assembler-x64.h"
52#include "x64/regexp-macro-assembler-x64.h"
53#elif V8_TARGET_ARCH_ARM
54#include "arm/regexp-macro-assembler-arm.h"
kasperl@chromium.org2abc4502009-07-02 07:00:29 +000055#else
56#error Unsupported target architecture.
ager@chromium.orga74f0da2008-12-03 16:05:52 +000057#endif
sgjesse@chromium.org911335c2009-08-19 12:59:44 +000058#endif
ager@chromium.orga74f0da2008-12-03 16:05:52 +000059
60#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000061
ager@chromium.orga74f0da2008-12-03 16:05:52 +000062
kasperl@chromium.org71affb52009-05-26 05:44:31 +000063namespace v8 {
64namespace internal {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000065
66
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000067Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
68 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000069 Handle<String> flags,
70 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000071 // Ensure that the constructor function has been loaded.
72 if (!constructor->IsLoaded()) {
73 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +000074 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000075 }
76 // Call the construct code with 2 arguments.
77 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
78 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000079 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000080}
81
82
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000083static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
84 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000085 for (int i = 0; i < str->length(); i++) {
86 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000087 case 'i':
88 flags |= JSRegExp::IGNORE_CASE;
89 break;
90 case 'g':
91 flags |= JSRegExp::GLOBAL;
92 break;
93 case 'm':
94 flags |= JSRegExp::MULTILINE;
95 break;
96 }
97 }
98 return JSRegExp::Flags(flags);
99}
100
101
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000102static inline void ThrowRegExpException(Handle<JSRegExp> re,
103 Handle<String> pattern,
104 Handle<String> error_text,
105 const char* message) {
106 Handle<JSArray> array = Factory::NewJSArray(2);
107 SetElement(array, 0, pattern);
108 SetElement(array, 1, error_text);
109 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
110 Top::Throw(*regexp_err);
111}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000112
113
ager@chromium.org8bb60582008-12-11 12:02:20 +0000114// Generic RegExp methods. Dispatches to implementation specific methods.
115
116
117class OffsetsVector {
118 public:
119 inline OffsetsVector(int num_registers)
120 : offsets_vector_length_(num_registers) {
121 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
122 vector_ = NewArray<int>(offsets_vector_length_);
123 } else {
124 vector_ = static_offsets_vector_;
125 }
126 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000127 inline ~OffsetsVector() {
128 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
129 DeleteArray(vector_);
130 vector_ = NULL;
131 }
132 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000133 inline int* vector() { return vector_; }
134 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000135
136 private:
137 int* vector_;
138 int offsets_vector_length_;
139 static const int kStaticOffsetsVectorSize = 50;
140 static int static_offsets_vector_[kStaticOffsetsVectorSize];
141};
142
143
144int OffsetsVector::static_offsets_vector_[
145 OffsetsVector::kStaticOffsetsVectorSize];
146
147
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000148Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
149 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000150 Handle<String> flag_str) {
151 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
152 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
153 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000154 LOG(RegExpCompileEvent(re, in_cache));
155
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000156 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000157 if (in_cache) {
158 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000159 return re;
160 }
161 FlattenString(pattern);
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000162 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000163 RegExpCompileData parse_result;
164 FlatStringReader reader(pattern);
165 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
166 // Throw an exception if we fail to parse the pattern.
167 ThrowRegExpException(re,
168 pattern,
169 parse_result.error,
170 "malformed_regexp");
171 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000172 }
173
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000174 if (parse_result.simple && !flags.is_ignore_case()) {
175 // Parse-tree is a single atom that is equal to the pattern.
176 AtomCompile(re, pattern, flags, pattern);
177 } else if (parse_result.tree->IsAtom() &&
178 !flags.is_ignore_case() &&
179 parse_result.capture_count == 0) {
180 RegExpAtom* atom = parse_result.tree->AsAtom();
181 Vector<const uc16> atom_pattern = atom->data();
182 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
183 AtomCompile(re, pattern, flags, atom_string);
184 } else {
185 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
186 }
187 ASSERT(re->data()->IsFixedArray());
188 // Compilation succeeded so the data is set on the regexp
189 // and we can store it in the cache.
190 Handle<FixedArray> data(FixedArray::cast(re->data()));
191 CompilationCache::PutRegExp(pattern, flags, data);
192
193 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000194}
195
196
197Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
198 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000199 int index,
200 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000201 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000202 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000203 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000204 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000205 Handle<Object> result =
206 IrregexpExec(regexp, subject, index, last_match_info);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000207 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000208 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000209 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000210 default:
211 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000212 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000213 }
214}
215
216
ager@chromium.org8bb60582008-12-11 12:02:20 +0000217// RegExp Atom implementation: Simple string search using indexOf.
218
219
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000220void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
221 Handle<String> pattern,
222 JSRegExp::Flags flags,
223 Handle<String> match_pattern) {
224 Factory::SetRegExpAtomData(re,
225 JSRegExp::ATOM,
226 pattern,
227 flags,
228 match_pattern);
229}
230
231
232static void SetAtomLastCapture(FixedArray* array,
233 String* subject,
234 int from,
235 int to) {
236 NoHandleAllocation no_handles;
237 RegExpImpl::SetLastCaptureCount(array, 2);
238 RegExpImpl::SetLastSubject(array, subject);
239 RegExpImpl::SetLastInput(array, subject);
240 RegExpImpl::SetCapture(array, 0, from);
241 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000242}
243
244
245Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
246 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000247 int index,
248 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000249 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000250
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000251 uint32_t start_index = index;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000252
ager@chromium.org7c537e22008-10-16 08:43:32 +0000253 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000254 if (value == -1) return Factory::null_value();
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000255 ASSERT(last_match_info->HasFastElements());
ager@chromium.org7c537e22008-10-16 08:43:32 +0000256
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000257 {
258 NoHandleAllocation no_handles;
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000259 FixedArray* array = FixedArray::cast(last_match_info->elements());
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000260 SetAtomLastCapture(array, *subject, value, value + needle->length());
261 }
262 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000263}
264
265
ager@chromium.org8bb60582008-12-11 12:02:20 +0000266// Irregexp implementation.
267
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000268// Ensures that the regexp object contains a compiled version of the
269// source for either ASCII or non-ASCII strings.
270// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000271// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000272// If compilation fails, an exception is thrown and this function
273// returns false.
ager@chromium.org41826e72009-03-30 13:30:57 +0000274bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000275 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000276#ifdef V8_NATIVE_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000277 if (compiled_code->IsCode()) return true;
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000278#else // ! V8_NATIVE_REGEXP (RegExp interpreter code)
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000279 if (compiled_code->IsByteArray()) return true;
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000280#endif
281 return CompileIrregexp(re, is_ascii);
282}
ager@chromium.org8bb60582008-12-11 12:02:20 +0000283
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000284
285bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000286 // Compile the RegExp.
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000287 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000288 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
289 if (entry->IsJSObject()) {
290 // If it's a JSObject, a previous compilation failed and threw this object.
291 // Re-throw the object without trying again.
292 Top::Throw(entry);
293 return false;
294 }
295 ASSERT(entry->IsTheHole());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000296
297 JSRegExp::Flags flags = re->GetFlags();
298
299 Handle<String> pattern(re->Pattern());
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000300 if (!pattern->IsFlat()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000301 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000302 }
303
304 RegExpCompileData compile_data;
305 FlatStringReader reader(pattern);
306 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
307 // Throw an exception if we fail to parse the pattern.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000308 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000309 ThrowRegExpException(re,
310 pattern,
311 compile_data.error,
312 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000313 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000314 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000315 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000316 RegExpEngine::Compile(&compile_data,
317 flags.is_ignore_case(),
318 flags.is_multiline(),
319 pattern,
320 is_ascii);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000321 if (result.error_message != NULL) {
322 // Unable to compile regexp.
323 Handle<JSArray> array = Factory::NewJSArray(2);
324 SetElement(array, 0, pattern);
325 SetElement(array,
326 1,
327 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
328 Handle<Object> regexp_err =
329 Factory::NewSyntaxError("malformed_regexp", array);
330 Top::Throw(*regexp_err);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000331 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000332 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000333 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000334
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000335 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
336 data->set(JSRegExp::code_index(is_ascii), result.code);
337 int register_max = IrregexpMaxRegisterCount(*data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000338 if (result.num_registers > register_max) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000339 SetIrregexpMaxRegisterCount(*data, result.num_registers);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000340 }
341
342 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000343}
344
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000345
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000346int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
347 return Smi::cast(
348 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000349}
350
351
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000352void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
353 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000354}
355
356
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000357int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
358 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000359}
360
361
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000362int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
363 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000364}
365
366
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000367ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000368 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000369}
370
371
372Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000373 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000374}
375
376
377void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
378 Handle<String> pattern,
379 JSRegExp::Flags flags,
380 int capture_count) {
381 // Initialize compiled code entries to null.
382 Factory::SetRegExpIrregexpData(re,
383 JSRegExp::IRREGEXP,
384 pattern,
385 flags,
386 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000387}
388
389
ager@chromium.org41826e72009-03-30 13:30:57 +0000390Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000391 Handle<String> subject,
ager@chromium.org41826e72009-03-30 13:30:57 +0000392 int previous_index,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000393 Handle<JSArray> last_match_info) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000394 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000395
ager@chromium.org8bb60582008-12-11 12:02:20 +0000396 // Prepare space for the return values.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000397 int number_of_capture_registers =
ager@chromium.org41826e72009-03-30 13:30:57 +0000398 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000399
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000400#ifndef V8_NATIVE_REGEXP
ager@chromium.org8bb60582008-12-11 12:02:20 +0000401#ifdef DEBUG
402 if (FLAG_trace_regexp_bytecodes) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000403 String* pattern = jsregexp->Pattern();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000404 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
405 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
406 }
407#endif
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000408#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000409
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000410 if (!subject->IsFlat()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000411 FlattenString(subject);
412 }
413
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000414 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
415
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000416 Handle<FixedArray> array;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000417
ager@chromium.org41826e72009-03-30 13:30:57 +0000418 // Dispatch to the correct RegExp implementation.
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000419 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000420
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000421#ifdef V8_NATIVE_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000422#ifdef V8_TARGET_ARCH_ARM
423 UNIMPLEMENTED();
424#else // Native regexp supported.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000425 OffsetsVector captures(number_of_capture_registers);
426 int* captures_vector = captures.vector();
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000427 NativeRegExpMacroAssembler::Result res;
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000428 do {
ager@chromium.org5ec48922009-05-05 07:25:34 +0000429 bool is_ascii = subject->IsAsciiRepresentation();
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000430 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
431 return Handle<Object>::null();
432 }
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000433 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000434 res = NativeRegExpMacroAssembler::Match(code,
435 subject,
436 captures_vector,
437 captures.length(),
438 previous_index);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000439 // If result is RETRY, the string have changed representation, and we
440 // must restart from scratch.
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000441 } while (res == NativeRegExpMacroAssembler::RETRY);
442 if (res == NativeRegExpMacroAssembler::EXCEPTION) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000443 ASSERT(Top::has_pending_exception());
444 return Handle<Object>::null();
445 }
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000446 ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
447 || res == NativeRegExpMacroAssembler::FAILURE);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000448
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000449 if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000450
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000451 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000452 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
453 // The captures come in (start, end+1) pairs.
454 for (int i = 0; i < number_of_capture_registers; i += 2) {
455 SetCapture(*array, i, captures_vector[i]);
456 SetCapture(*array, i + 1, captures_vector[i + 1]);
457 }
sgjesse@chromium.org911335c2009-08-19 12:59:44 +0000458#endif // Native regexp supported.
459
460#else // ! V8_NATIVE_REGEXP
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000461 bool is_ascii = subject->IsAsciiRepresentation();
462 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
463 return Handle<Object>::null();
464 }
465 // Now that we have done EnsureCompiledIrregexp we can get the number of
466 // registers.
467 int number_of_registers =
468 IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
469 OffsetsVector registers(number_of_registers);
470 int* register_vector = registers.vector();
471 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
472 register_vector[i] = -1;
473 }
474 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
475
476 if (!IrregexpInterpreter::Match(byte_codes,
477 subject,
478 register_vector,
479 previous_index)) {
480 return Factory::null_value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000481 }
482
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000483 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000484 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
485 // The captures come in (start, end+1) pairs.
486 for (int i = 0; i < number_of_capture_registers; i += 2) {
487 SetCapture(*array, i, register_vector[i]);
488 SetCapture(*array, i + 1, register_vector[i + 1]);
489 }
490#endif // V8_NATIVE_REGEXP
491
492 SetLastCaptureCount(*array, number_of_capture_registers);
493 SetLastSubject(*array, *subject);
494 SetLastInput(*array, *subject);
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000495
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000496 return last_match_info;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000497}
498
499
500// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000501// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000502//
503// The Irregexp regular expression engine is intended to be a complete
504// implementation of ECMAScript regular expressions. It generates either
505// bytecodes or native code.
506
507// The Irregexp regexp engine is structured in three steps.
508// 1) The parser generates an abstract syntax tree. See ast.cc.
509// 2) From the AST a node network is created. The nodes are all
510// subclasses of RegExpNode. The nodes represent states when
511// executing a regular expression. Several optimizations are
512// performed on the node network.
513// 3) From the nodes we generate either byte codes or native code
514// that can actually execute the regular expression (perform
515// the search). The code generation step is described in more
516// detail below.
517
518// Code generation.
519//
520// The nodes are divided into four main categories.
521// * Choice nodes
522// These represent places where the regular expression can
523// match in more than one way. For example on entry to an
524// alternation (foo|bar) or a repetition (*, +, ? or {}).
525// * Action nodes
526// These represent places where some action should be
527// performed. Examples include recording the current position
528// in the input string to a register (in order to implement
529// captures) or other actions on register for example in order
530// to implement the counters needed for {} repetitions.
531// * Matching nodes
532// These attempt to match some element part of the input string.
533// Examples of elements include character classes, plain strings
534// or back references.
535// * End nodes
536// These are used to implement the actions required on finding
537// a successful match or failing to find a match.
538//
539// The code generated (whether as byte codes or native code) maintains
540// some state as it runs. This consists of the following elements:
541//
542// * The capture registers. Used for string captures.
543// * Other registers. Used for counters etc.
544// * The current position.
545// * The stack of backtracking information. Used when a matching node
546// fails to find a match and needs to try an alternative.
547//
548// Conceptual regular expression execution model:
549//
550// There is a simple conceptual model of regular expression execution
551// which will be presented first. The actual code generated is a more
552// efficient simulation of the simple conceptual model:
553//
554// * Choice nodes are implemented as follows:
555// For each choice except the last {
556// push current position
557// push backtrack code location
558// <generate code to test for choice>
559// backtrack code location:
560// pop current position
561// }
562// <generate code to test for last choice>
563//
564// * Actions nodes are generated as follows
565// <push affected registers on backtrack stack>
566// <generate code to perform action>
567// push backtrack code location
568// <generate code to test for following nodes>
569// backtrack code location:
570// <pop affected registers to restore their state>
571// <pop backtrack location from stack and go to it>
572//
573// * Matching nodes are generated as follows:
574// if input string matches at current position
575// update current position
576// <generate code to test for following nodes>
577// else
578// <pop backtrack location from stack and go to it>
579//
580// Thus it can be seen that the current position is saved and restored
581// by the choice nodes, whereas the registers are saved and restored by
582// by the action nodes that manipulate them.
583//
584// The other interesting aspect of this model is that nodes are generated
585// at the point where they are needed by a recursive call to Emit(). If
586// the node has already been code generated then the Emit() call will
587// generate a jump to the previously generated code instead. In order to
588// limit recursion it is possible for the Emit() function to put the node
589// on a work list for later generation and instead generate a jump. The
590// destination of the jump is resolved later when the code is generated.
591//
592// Actual regular expression code generation.
593//
594// Code generation is actually more complicated than the above. In order
595// to improve the efficiency of the generated code some optimizations are
596// performed
597//
598// * Choice nodes have 1-character lookahead.
599// A choice node looks at the following character and eliminates some of
600// the choices immediately based on that character. This is not yet
601// implemented.
602// * Simple greedy loops store reduced backtracking information.
603// A quantifier like /.*foo/m will greedily match the whole input. It will
604// then need to backtrack to a point where it can match "foo". The naive
605// implementation of this would push each character position onto the
606// backtracking stack, then pop them off one by one. This would use space
607// proportional to the length of the input string. However since the "."
608// can only match in one way and always has a constant length (in this case
609// of 1) it suffices to store the current position on the top of the stack
610// once. Matching now becomes merely incrementing the current position and
611// backtracking becomes decrementing the current position and checking the
612// result against the stored current position. This is faster and saves
613// space.
614// * The current state is virtualized.
615// This is used to defer expensive operations until it is clear that they
616// are needed and to generate code for a node more than once, allowing
617// specialized an efficient versions of the code to be created. This is
618// explained in the section below.
619//
620// Execution state virtualization.
621//
622// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000623// manipulation in an object called the Trace. The Trace object can record a
624// current position offset, an optional backtrack code location on the top of
625// the virtualized backtrack stack and some register changes. When a node is
626// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000627// will emit code to bring the actual state into line with the virtual state.
628// Avoiding flushing the state can postpone some work (eg updates of capture
629// registers). Postponing work can save time when executing the regular
630// expression since it may be found that the work never has to be done as a
631// failure to match can occur. In addition it is much faster to jump to a
632// known backtrack code location than it is to pop an unknown backtrack
633// location from the stack and jump there.
634//
ager@chromium.org32912102009-01-16 10:38:43 +0000635// The virtual state found in the Trace affects code generation. For example
636// the virtual state contains the difference between the actual current
637// position and the virtual current position, and matching code needs to use
638// this offset to attempt a match in the correct location of the input
639// string. Therefore code generated for a non-trivial trace is specialized
640// to that trace. The code generator therefore has the ability to generate
641// code for each node several times. In order to limit the size of the
642// generated code there is an arbitrary limit on how many specialized sets of
643// code may be generated for a given node. If the limit is reached, the
644// trace is flushed and a generic version of the code for a node is emitted.
645// This is subsequently used for that node. The code emitted for non-generic
646// trace is not recorded in the node and so it cannot currently be reused in
647// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000648
649
650void RegExpTree::AppendToText(RegExpText* text) {
651 UNREACHABLE();
652}
653
654
655void RegExpAtom::AppendToText(RegExpText* text) {
656 text->AddElement(TextElement::Atom(this));
657}
658
659
660void RegExpCharacterClass::AppendToText(RegExpText* text) {
661 text->AddElement(TextElement::CharClass(this));
662}
663
664
665void RegExpText::AppendToText(RegExpText* text) {
666 for (int i = 0; i < elements()->length(); i++)
667 text->AddElement(elements()->at(i));
668}
669
670
671TextElement TextElement::Atom(RegExpAtom* atom) {
672 TextElement result = TextElement(ATOM);
673 result.data.u_atom = atom;
674 return result;
675}
676
677
678TextElement TextElement::CharClass(
679 RegExpCharacterClass* char_class) {
680 TextElement result = TextElement(CHAR_CLASS);
681 result.data.u_char_class = char_class;
682 return result;
683}
684
685
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000686int TextElement::length() {
687 if (type == ATOM) {
688 return data.u_atom->length();
689 } else {
690 ASSERT(type == CHAR_CLASS);
691 return 1;
692 }
693}
694
695
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000696DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
697 if (table_ == NULL) {
698 table_ = new DispatchTable();
699 DispatchTableConstructor cons(table_, ignore_case);
700 cons.BuildTable(this);
701 }
702 return table_;
703}
704
705
706class RegExpCompiler {
707 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000708 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000709
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000710 int AllocateRegister() {
711 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
712 reg_exp_too_big_ = true;
713 return next_register_;
714 }
715 return next_register_++;
716 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000717
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000718 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
719 RegExpNode* start,
720 int capture_count,
721 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000722
723 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
724
725 static const int kImplementationOffset = 0;
726 static const int kNumberOfRegistersOffset = 0;
727 static const int kCodeOffset = 1;
728
729 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
730 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000731
732 static const int kMaxRecursion = 100;
733 inline int recursion_depth() { return recursion_depth_; }
734 inline void IncrementRecursionDepth() { recursion_depth_++; }
735 inline void DecrementRecursionDepth() { recursion_depth_--; }
736
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000737 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
738
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000739 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000740 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000741
ager@chromium.org32912102009-01-16 10:38:43 +0000742 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000743 private:
744 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000745 int next_register_;
746 List<RegExpNode*>* work_list_;
747 int recursion_depth_;
748 RegExpMacroAssembler* macro_assembler_;
749 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000750 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000751 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000752};
753
754
755class RecursionCheck {
756 public:
757 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
758 compiler->IncrementRecursionDepth();
759 }
760 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
761 private:
762 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000763};
764
765
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000766static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
767 return RegExpEngine::CompilationResult("RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000768}
769
770
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000771// Attempts to compile the regexp using an Irregexp code generator. Returns
772// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000773RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000774 : next_register_(2 * (capture_count + 1)),
775 work_list_(NULL),
776 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000777 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000778 ascii_(ascii),
779 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000780 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000781 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000782}
783
784
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000785RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000786 RegExpMacroAssembler* macro_assembler,
787 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000788 int capture_count,
789 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000790#ifdef DEBUG
791 if (FLAG_trace_regexp_assembler)
792 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
793 else
794#endif
795 macro_assembler_ = macro_assembler;
796 List <RegExpNode*> work_list(0);
797 work_list_ = &work_list;
798 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +0000799 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +0000800 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000801 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000802 macro_assembler_->Bind(&fail);
803 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000804 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000805 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000806 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000807 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
808
ager@chromium.org8bb60582008-12-11 12:02:20 +0000809 Handle<Object> code = macro_assembler_->GetCode(pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000810
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000811 work_list_ = NULL;
812#ifdef DEBUG
813 if (FLAG_trace_regexp_assembler) {
814 delete macro_assembler_;
815 }
816#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000817 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000818}
819
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000820
ager@chromium.org32912102009-01-16 10:38:43 +0000821bool Trace::DeferredAction::Mentions(int that) {
822 if (type() == ActionNode::CLEAR_CAPTURES) {
823 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
824 return range.Contains(that);
825 } else {
826 return reg() == that;
827 }
828}
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000829
ager@chromium.org32912102009-01-16 10:38:43 +0000830
831bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000832 for (DeferredAction* action = actions_;
833 action != NULL;
834 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000835 if (action->Mentions(reg))
836 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000837 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000838 return false;
839}
840
841
ager@chromium.org32912102009-01-16 10:38:43 +0000842bool Trace::GetStoredPosition(int reg, int* cp_offset) {
843 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000844 for (DeferredAction* action = actions_;
845 action != NULL;
846 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000847 if (action->Mentions(reg)) {
848 if (action->type() == ActionNode::STORE_POSITION) {
849 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
850 return true;
851 } else {
852 return false;
853 }
854 }
855 }
856 return false;
857}
858
859
860int Trace::FindAffectedRegisters(OutSet* affected_registers) {
861 int max_register = RegExpCompiler::kNoRegister;
862 for (DeferredAction* action = actions_;
863 action != NULL;
864 action = action->next()) {
865 if (action->type() == ActionNode::CLEAR_CAPTURES) {
866 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
867 for (int i = range.from(); i <= range.to(); i++)
868 affected_registers->Set(i);
869 if (range.to() > max_register) max_register = range.to();
870 } else {
871 affected_registers->Set(action->reg());
872 if (action->reg() > max_register) max_register = action->reg();
873 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000874 }
875 return max_register;
876}
877
878
ager@chromium.org32912102009-01-16 10:38:43 +0000879void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
880 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000881 OutSet& registers_to_pop,
882 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000883 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000884 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
885 else if (registers_to_clear.Get(reg)) {
886 int clear_to = reg;
887 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
888 reg--;
889 }
890 assembler->ClearRegisters(reg, clear_to);
891 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000892 }
893}
894
895
ager@chromium.org32912102009-01-16 10:38:43 +0000896void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
897 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000898 OutSet& affected_registers,
899 OutSet* registers_to_pop,
900 OutSet* registers_to_clear) {
901 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
902 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
903
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000904 // Count pushes performed to force a stack limit check occasionally.
905 int pushes = 0;
906
ager@chromium.org8bb60582008-12-11 12:02:20 +0000907 for (int reg = 0; reg <= max_register; reg++) {
908 if (!affected_registers.Get(reg)) {
909 continue;
910 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000911
912 // The chronologically first deferred action in the trace
913 // is used to infer the action needed to restore a register
914 // to its previous state (or not, if it's safe to ignore it).
915 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
916 DeferredActionUndoType undo_action = IGNORE;
917
ager@chromium.org8bb60582008-12-11 12:02:20 +0000918 int value = 0;
919 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +0000920 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000921 int store_position = -1;
922 // This is a little tricky because we are scanning the actions in reverse
923 // historical order (newest first).
924 for (DeferredAction* action = actions_;
925 action != NULL;
926 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000927 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000928 switch (action->type()) {
929 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +0000930 Trace::DeferredSetRegister* psr =
931 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000932 if (!absolute) {
933 value += psr->value();
934 absolute = true;
935 }
936 // SET_REGISTER is currently only used for newly introduced loop
937 // counters. They can have a significant previous value if they
938 // occour in a loop. TODO(lrn): Propagate this information, so
939 // we can set undo_action to IGNORE if we know there is no value to
940 // restore.
941 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000942 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000943 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000944 break;
945 }
946 case ActionNode::INCREMENT_REGISTER:
947 if (!absolute) {
948 value++;
949 }
950 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000951 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000952 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000953 break;
954 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +0000955 Trace::DeferredCapture* pc =
956 static_cast<Trace::DeferredCapture*>(action);
957 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000958 store_position = pc->cp_offset();
959 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000960
961 // For captures we know that stores and clears alternate.
962 // Other register, are never cleared, and if the occur
963 // inside a loop, they might be assigned more than once.
964 if (reg <= 1) {
965 // Registers zero and one, aka "capture zero", is
966 // always set correctly if we succeed. There is no
967 // need to undo a setting on backtrack, because we
968 // will set it again or fail.
969 undo_action = IGNORE;
970 } else {
971 undo_action = pc->is_capture() ? CLEAR : RESTORE;
972 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000973 ASSERT(!absolute);
974 ASSERT_EQ(value, 0);
975 break;
976 }
ager@chromium.org32912102009-01-16 10:38:43 +0000977 case ActionNode::CLEAR_CAPTURES: {
978 // Since we're scanning in reverse order, if we've already
979 // set the position we have to ignore historically earlier
980 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000981 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +0000982 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000983 }
984 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +0000985 ASSERT(!absolute);
986 ASSERT_EQ(value, 0);
987 break;
988 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000989 default:
990 UNREACHABLE();
991 break;
992 }
993 }
994 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000995 // Prepare for the undo-action (e.g., push if it's going to be popped).
996 if (undo_action == RESTORE) {
997 pushes++;
998 RegExpMacroAssembler::StackCheckFlag stack_check =
999 RegExpMacroAssembler::kNoStackLimitCheck;
1000 if (pushes == push_limit) {
1001 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1002 pushes = 0;
1003 }
1004
1005 assembler->PushRegister(reg, stack_check);
1006 registers_to_pop->Set(reg);
1007 } else if (undo_action == CLEAR) {
1008 registers_to_clear->Set(reg);
1009 }
1010 // Perform the chronologically last action (or accumulated increment)
1011 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001012 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001013 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001014 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001015 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001016 } else if (absolute) {
1017 assembler->SetRegister(reg, value);
1018 } else if (value != 0) {
1019 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001020 }
1021 }
1022}
1023
1024
ager@chromium.org8bb60582008-12-11 12:02:20 +00001025// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001026// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001027// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001028void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001029 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001030
iposva@chromium.org245aa852009-02-10 00:49:54 +00001031 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001032
1033 if (actions_ == NULL && backtrack() == NULL) {
1034 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001035 // a normal situation. We may also have to forget some information gained
1036 // through a quick check that was already performed.
1037 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001038 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001039 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001040 successor->Emit(compiler, &new_state);
1041 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001042 }
1043
1044 // Generate deferred actions here along with code to undo them again.
1045 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001046
ager@chromium.org381abbb2009-02-25 13:23:22 +00001047 if (backtrack() != NULL) {
1048 // Here we have a concrete backtrack location. These are set up by choice
1049 // nodes and so they indicate that we have a deferred save of the current
1050 // position which we may need to emit here.
1051 assembler->PushCurrentPosition();
1052 }
1053
ager@chromium.org8bb60582008-12-11 12:02:20 +00001054 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001055 OutSet registers_to_pop;
1056 OutSet registers_to_clear;
1057 PerformDeferredActions(assembler,
1058 max_register,
1059 affected_registers,
1060 &registers_to_pop,
1061 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001062 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001063 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001064 }
1065
1066 // Create a new trivial state and generate the node with that.
1067 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001068 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001069 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001070 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001071
1072 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001073 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001074 RestoreAffectedRegisters(assembler,
1075 max_register,
1076 registers_to_pop,
1077 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001078 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001079 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001080 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001081 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001082 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001083 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001084}
1085
1086
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001087void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001088 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001089
1090 // Omit flushing the trace. We discard the entire stack frame anyway.
1091
ager@chromium.org8bb60582008-12-11 12:02:20 +00001092 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001093 // We are completely independent of the trace, since we ignore it,
1094 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001095 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001096 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001097
1098 // Throw away everything on the backtrack stack since the start
1099 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001100 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1101 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001102 if (clear_capture_count_ > 0) {
1103 // Clear any captures that might have been performed during the success
1104 // of the body of the negative look-ahead.
1105 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1106 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1107 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001108 // Now that we have unwound the stack we find at the top of the stack the
1109 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001110 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001111}
1112
1113
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001114void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001115 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001116 trace->Flush(compiler, this);
1117 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001118 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001119 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001120 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001121 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001122 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001123 switch (action_) {
1124 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001125 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001126 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001127 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001128 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001129 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001130 case NEGATIVE_SUBMATCH_SUCCESS:
1131 // This case is handled in a different virtual method.
1132 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001133 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001134 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001135}
1136
1137
1138void GuardedAlternative::AddGuard(Guard* guard) {
1139 if (guards_ == NULL)
1140 guards_ = new ZoneList<Guard*>(1);
1141 guards_->Add(guard);
1142}
1143
1144
ager@chromium.org8bb60582008-12-11 12:02:20 +00001145ActionNode* ActionNode::SetRegister(int reg,
1146 int val,
1147 RegExpNode* on_success) {
1148 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001149 result->data_.u_store_register.reg = reg;
1150 result->data_.u_store_register.value = val;
1151 return result;
1152}
1153
1154
1155ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1156 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1157 result->data_.u_increment_register.reg = reg;
1158 return result;
1159}
1160
1161
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001162ActionNode* ActionNode::StorePosition(int reg,
1163 bool is_capture,
1164 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001165 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1166 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001167 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001168 return result;
1169}
1170
1171
ager@chromium.org32912102009-01-16 10:38:43 +00001172ActionNode* ActionNode::ClearCaptures(Interval range,
1173 RegExpNode* on_success) {
1174 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1175 result->data_.u_clear_captures.range_from = range.from();
1176 result->data_.u_clear_captures.range_to = range.to();
1177 return result;
1178}
1179
1180
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001181ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1182 int position_reg,
1183 RegExpNode* on_success) {
1184 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1185 result->data_.u_submatch.stack_pointer_register = stack_reg;
1186 result->data_.u_submatch.current_position_register = position_reg;
1187 return result;
1188}
1189
1190
ager@chromium.org8bb60582008-12-11 12:02:20 +00001191ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1192 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001193 int clear_register_count,
1194 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001195 RegExpNode* on_success) {
1196 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001197 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001198 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001199 result->data_.u_submatch.clear_register_count = clear_register_count;
1200 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001201 return result;
1202}
1203
1204
ager@chromium.org32912102009-01-16 10:38:43 +00001205ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1206 int repetition_register,
1207 int repetition_limit,
1208 RegExpNode* on_success) {
1209 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1210 result->data_.u_empty_match_check.start_register = start_register;
1211 result->data_.u_empty_match_check.repetition_register = repetition_register;
1212 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1213 return result;
1214}
1215
1216
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001217#define DEFINE_ACCEPT(Type) \
1218 void Type##Node::Accept(NodeVisitor* visitor) { \
1219 visitor->Visit##Type(this); \
1220 }
1221FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1222#undef DEFINE_ACCEPT
1223
1224
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001225void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1226 visitor->VisitLoopChoice(this);
1227}
1228
1229
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001230// -------------------------------------------------------------------
1231// Emit code.
1232
1233
1234void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1235 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001236 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001237 switch (guard->op()) {
1238 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001239 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001240 macro_assembler->IfRegisterGE(guard->reg(),
1241 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001242 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001243 break;
1244 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001245 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001246 macro_assembler->IfRegisterLT(guard->reg(),
1247 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001248 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001249 break;
1250 }
1251}
1252
1253
1254static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1255static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1256
1257
ager@chromium.org381abbb2009-02-25 13:23:22 +00001258// Returns the number of characters in the equivalence class, omitting those
1259// that cannot occur in the source string because it is ASCII.
1260static int GetCaseIndependentLetters(uc16 character,
1261 bool ascii_subject,
1262 unibrow::uchar* letters) {
1263 int length = uncanonicalize.get(character, '\0', letters);
1264 // Unibrow returns 0 or 1 for characters where case independependence is
1265 // trivial.
1266 if (length == 0) {
1267 letters[0] = character;
1268 length = 1;
1269 }
1270 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1271 return length;
1272 }
1273 // The standard requires that non-ASCII characters cannot have ASCII
1274 // character codes in their equivalence class.
1275 return 0;
1276}
1277
1278
1279static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1280 uc16 c,
1281 Label* on_failure,
1282 int cp_offset,
1283 bool check,
1284 bool preloaded) {
1285 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1286 bool bound_checked = false;
1287 if (!preloaded) {
1288 assembler->LoadCurrentCharacter(
1289 cp_offset,
1290 on_failure,
1291 check);
1292 bound_checked = true;
1293 }
1294 assembler->CheckNotCharacter(c, on_failure);
1295 return bound_checked;
1296}
1297
1298
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001299// Only emits non-letters (things that don't have case). Only used for case
1300// independent matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001301static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1302 uc16 c,
1303 Label* on_failure,
1304 int cp_offset,
1305 bool check,
1306 bool preloaded) {
1307 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1308 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001309 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001310 int length = GetCaseIndependentLetters(c, ascii, chars);
1311 if (length < 1) {
1312 // This can't match. Must be an ASCII subject and a non-ASCII character.
1313 // We do not need to do anything since the ASCII pass already handled this.
1314 return false; // Bounds not checked.
1315 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001316 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001317 // We handle the length > 1 case in a later pass.
1318 if (length == 1) {
1319 if (ascii && c > String::kMaxAsciiCharCodeU) {
1320 // Can't match - see above.
1321 return false; // Bounds not checked.
1322 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001323 if (!preloaded) {
1324 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1325 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001326 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001327 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001328 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001329 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001330}
1331
1332
1333static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001334 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001335 uc16 c1,
1336 uc16 c2,
1337 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001338 uc16 char_mask;
1339 if (ascii) {
1340 char_mask = String::kMaxAsciiCharCode;
1341 } else {
1342 char_mask = String::kMaxUC16CharCode;
1343 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001344 uc16 exor = c1 ^ c2;
1345 // Check whether exor has only one bit set.
1346 if (((exor - 1) & exor) == 0) {
1347 // If c1 and c2 differ only by one bit.
1348 // Ecma262UnCanonicalize always gives the highest number last.
1349 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001350 uc16 mask = char_mask ^ exor;
1351 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001352 return true;
1353 }
1354 ASSERT(c2 > c1);
1355 uc16 diff = c2 - c1;
1356 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1357 // If the characters differ by 2^n but don't differ by one bit then
1358 // subtract the difference from the found character, then do the or
1359 // trick. We avoid the theoretical case where negative numbers are
1360 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001361 uc16 mask = char_mask ^ diff;
1362 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1363 diff,
1364 mask,
1365 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001366 return true;
1367 }
1368 return false;
1369}
1370
1371
ager@chromium.org381abbb2009-02-25 13:23:22 +00001372typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1373 uc16 c,
1374 Label* on_failure,
1375 int cp_offset,
1376 bool check,
1377 bool preloaded);
1378
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001379// Only emits letters (things that have case). Only used for case independent
1380// matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001381static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1382 uc16 c,
1383 Label* on_failure,
1384 int cp_offset,
1385 bool check,
1386 bool preloaded) {
1387 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1388 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001389 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001390 int length = GetCaseIndependentLetters(c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001391 if (length <= 1) return false;
1392 // We may not need to check against the end of the input string
1393 // if this character lies before a character that matched.
1394 if (!preloaded) {
1395 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001396 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001397 Label ok;
1398 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1399 switch (length) {
1400 case 2: {
1401 if (ShortCutEmitCharacterPair(macro_assembler,
1402 ascii,
1403 chars[0],
1404 chars[1],
1405 on_failure)) {
1406 } else {
1407 macro_assembler->CheckCharacter(chars[0], &ok);
1408 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1409 macro_assembler->Bind(&ok);
1410 }
1411 break;
1412 }
1413 case 4:
1414 macro_assembler->CheckCharacter(chars[3], &ok);
1415 // Fall through!
1416 case 3:
1417 macro_assembler->CheckCharacter(chars[0], &ok);
1418 macro_assembler->CheckCharacter(chars[1], &ok);
1419 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1420 macro_assembler->Bind(&ok);
1421 break;
1422 default:
1423 UNREACHABLE();
1424 break;
1425 }
1426 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001427}
1428
1429
1430static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1431 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001432 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001433 Label* on_failure,
1434 int cp_offset,
1435 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001436 bool preloaded) {
1437 if (cc->is_standard() &&
1438 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1439 cp_offset,
1440 check_offset,
1441 on_failure)) {
1442 return;
1443 }
1444
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001445 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001446 int max_char;
1447 if (ascii) {
1448 max_char = String::kMaxAsciiCharCode;
1449 } else {
1450 max_char = String::kMaxUC16CharCode;
1451 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001452
1453 Label success;
1454
1455 Label* char_is_in_class =
1456 cc->is_negated() ? on_failure : &success;
1457
1458 int range_count = ranges->length();
1459
ager@chromium.org8bb60582008-12-11 12:02:20 +00001460 int last_valid_range = range_count - 1;
1461 while (last_valid_range >= 0) {
1462 CharacterRange& range = ranges->at(last_valid_range);
1463 if (range.from() <= max_char) {
1464 break;
1465 }
1466 last_valid_range--;
1467 }
1468
1469 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001470 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001471 // TODO(plesner): We can remove this when the node level does our
1472 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001473 macro_assembler->GoTo(on_failure);
1474 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001475 if (check_offset) {
1476 macro_assembler->CheckPosition(cp_offset, on_failure);
1477 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001478 return;
1479 }
1480
ager@chromium.org8bb60582008-12-11 12:02:20 +00001481 if (last_valid_range == 0 &&
1482 !cc->is_negated() &&
1483 ranges->at(0).IsEverything(max_char)) {
1484 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001485 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001486 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001487 }
1488 return;
1489 }
1490
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001491 if (!preloaded) {
1492 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001493 }
1494
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001495 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001496 CharacterRange& range = ranges->at(i);
1497 Label next_range;
1498 uc16 from = range.from();
1499 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001500 if (from > max_char) {
1501 continue;
1502 }
1503 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001504 if (to == from) {
1505 macro_assembler->CheckCharacter(to, char_is_in_class);
1506 } else {
1507 if (from != 0) {
1508 macro_assembler->CheckCharacterLT(from, &next_range);
1509 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001510 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001511 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1512 } else {
1513 macro_assembler->GoTo(char_is_in_class);
1514 }
1515 }
1516 macro_assembler->Bind(&next_range);
1517 }
1518
ager@chromium.org8bb60582008-12-11 12:02:20 +00001519 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001520 uc16 from = range.from();
1521 uc16 to = range.to();
1522
ager@chromium.org8bb60582008-12-11 12:02:20 +00001523 if (to > max_char) to = max_char;
1524 ASSERT(to >= from);
1525
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001526 if (to == from) {
1527 if (cc->is_negated()) {
1528 macro_assembler->CheckCharacter(to, on_failure);
1529 } else {
1530 macro_assembler->CheckNotCharacter(to, on_failure);
1531 }
1532 } else {
1533 if (from != 0) {
1534 if (cc->is_negated()) {
1535 macro_assembler->CheckCharacterLT(from, &success);
1536 } else {
1537 macro_assembler->CheckCharacterLT(from, on_failure);
1538 }
1539 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001540 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001541 if (cc->is_negated()) {
1542 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1543 } else {
1544 macro_assembler->CheckCharacterGT(to, on_failure);
1545 }
1546 } else {
1547 if (cc->is_negated()) {
1548 macro_assembler->GoTo(on_failure);
1549 }
1550 }
1551 }
1552 macro_assembler->Bind(&success);
1553}
1554
1555
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001556RegExpNode::~RegExpNode() {
1557}
1558
1559
ager@chromium.org8bb60582008-12-11 12:02:20 +00001560RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001561 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001562 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001563 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001564 return CONTINUE;
1565 }
1566
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001567 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001568 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001569 if (label_.is_bound()) {
1570 // We are being asked to generate a generic version, but that's already
1571 // been done so just go to it.
1572 macro_assembler->GoTo(&label_);
1573 return DONE;
1574 }
1575 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1576 // To avoid too deep recursion we push the node to the work queue and just
1577 // generate a goto here.
1578 compiler->AddWork(this);
1579 macro_assembler->GoTo(&label_);
1580 return DONE;
1581 }
1582 // Generate generic version of the node and bind the label for later use.
1583 macro_assembler->Bind(&label_);
1584 return CONTINUE;
1585 }
1586
1587 // We are being asked to make a non-generic version. Keep track of how many
1588 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00001589 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001590 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00001591 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00001592 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1593 return CONTINUE;
1594 }
1595
ager@chromium.org32912102009-01-16 10:38:43 +00001596 // If we get here code has been generated for this node too many times or
1597 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00001598 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001599 trace->Flush(compiler, this);
1600 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001601}
1602
1603
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001604int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001605 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1606 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001607 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001608}
1609
1610
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001611int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1612 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1613 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1614}
1615
1616
1617int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1618 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1619 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1620}
1621
1622
1623int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001624 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001625 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001626 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001627 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1628 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001629}
1630
1631
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001632int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1633 int recursion_depth) {
1634 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1635 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1636 // afterwards.
1637 RegExpNode* node = alternatives_->at(1).node();
1638 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1639}
1640
1641
1642void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1643 QuickCheckDetails* details,
1644 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001645 int filled_in,
1646 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001647 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1648 // afterwards.
1649 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00001650 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001651}
1652
1653
1654int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1655 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001656 RegExpNode* ignore_this_node) {
1657 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1658 int min = 100;
1659 int choice_count = alternatives_->length();
1660 for (int i = 0; i < choice_count; i++) {
1661 RegExpNode* node = alternatives_->at(i).node();
1662 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001663 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1664 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001665 if (node_eats_at_least < min) min = node_eats_at_least;
1666 }
1667 return min;
1668}
1669
1670
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001671int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1672 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001673}
1674
1675
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001676int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1677 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001678}
1679
1680
1681// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1682static inline uint32_t SmearBitsRight(uint32_t v) {
1683 v |= v >> 1;
1684 v |= v >> 2;
1685 v |= v >> 4;
1686 v |= v >> 8;
1687 v |= v >> 16;
1688 return v;
1689}
1690
1691
1692bool QuickCheckDetails::Rationalize(bool asc) {
1693 bool found_useful_op = false;
1694 uint32_t char_mask;
1695 if (asc) {
1696 char_mask = String::kMaxAsciiCharCode;
1697 } else {
1698 char_mask = String::kMaxUC16CharCode;
1699 }
1700 mask_ = 0;
1701 value_ = 0;
1702 int char_shift = 0;
1703 for (int i = 0; i < characters_; i++) {
1704 Position* pos = &positions_[i];
1705 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1706 found_useful_op = true;
1707 }
1708 mask_ |= (pos->mask & char_mask) << char_shift;
1709 value_ |= (pos->value & char_mask) << char_shift;
1710 char_shift += asc ? 8 : 16;
1711 }
1712 return found_useful_op;
1713}
1714
1715
1716bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001717 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001718 bool preload_has_checked_bounds,
1719 Label* on_possible_success,
1720 QuickCheckDetails* details,
1721 bool fall_through_on_failure) {
1722 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001723 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1724 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001725 if (!details->Rationalize(compiler->ascii())) return false;
1726 uint32_t mask = details->mask();
1727 uint32_t value = details->value();
1728
1729 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1730
ager@chromium.org32912102009-01-16 10:38:43 +00001731 if (trace->characters_preloaded() != details->characters()) {
1732 assembler->LoadCurrentCharacter(trace->cp_offset(),
1733 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001734 !preload_has_checked_bounds,
1735 details->characters());
1736 }
1737
1738
1739 bool need_mask = true;
1740
1741 if (details->characters() == 1) {
1742 // If number of characters preloaded is 1 then we used a byte or 16 bit
1743 // load so the value is already masked down.
1744 uint32_t char_mask;
1745 if (compiler->ascii()) {
1746 char_mask = String::kMaxAsciiCharCode;
1747 } else {
1748 char_mask = String::kMaxUC16CharCode;
1749 }
1750 if ((mask & char_mask) == char_mask) need_mask = false;
1751 mask &= char_mask;
1752 } else {
1753 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1754 // zero extend.
1755 if (details->characters() == 2 && compiler->ascii()) {
1756 if ((mask & 0xffff) == 0xffff) need_mask = false;
1757 } else {
1758 if (mask == 0xffffffff) need_mask = false;
1759 }
1760 }
1761
1762 if (fall_through_on_failure) {
1763 if (need_mask) {
1764 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1765 } else {
1766 assembler->CheckCharacter(value, on_possible_success);
1767 }
1768 } else {
1769 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00001770 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001771 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00001772 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001773 }
1774 }
1775 return true;
1776}
1777
1778
1779// Here is the meat of GetQuickCheckDetails (see also the comment on the
1780// super-class in the .h file).
1781//
1782// We iterate along the text object, building up for each character a
1783// mask and value that can be used to test for a quick failure to match.
1784// The masks and values for the positions will be combined into a single
1785// machine word for the current character width in order to be used in
1786// generating a quick check.
1787void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1788 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001789 int characters_filled_in,
1790 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001791 ASSERT(characters_filled_in < details->characters());
1792 int characters = details->characters();
1793 int char_mask;
1794 int char_shift;
1795 if (compiler->ascii()) {
1796 char_mask = String::kMaxAsciiCharCode;
1797 char_shift = 8;
1798 } else {
1799 char_mask = String::kMaxUC16CharCode;
1800 char_shift = 16;
1801 }
1802 for (int k = 0; k < elms_->length(); k++) {
1803 TextElement elm = elms_->at(k);
1804 if (elm.type == TextElement::ATOM) {
1805 Vector<const uc16> quarks = elm.data.u_atom->data();
1806 for (int i = 0; i < characters && i < quarks.length(); i++) {
1807 QuickCheckDetails::Position* pos =
1808 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00001809 uc16 c = quarks[i];
1810 if (c > char_mask) {
1811 // If we expect a non-ASCII character from an ASCII string,
1812 // there is no way we can match. Not even case independent
1813 // matching can turn an ASCII character into non-ASCII or
1814 // vice versa.
1815 details->set_cannot_match();
ager@chromium.org381abbb2009-02-25 13:23:22 +00001816 pos->determines_perfectly = false;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001817 return;
1818 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001819 if (compiler->ignore_case()) {
1820 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001821 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1822 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1823 if (length == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001824 // This letter has no case equivalents, so it's nice and simple
1825 // and the mask-compare will determine definitely whether we have
1826 // a match at this character position.
1827 pos->mask = char_mask;
1828 pos->value = c;
1829 pos->determines_perfectly = true;
1830 } else {
1831 uint32_t common_bits = char_mask;
1832 uint32_t bits = chars[0];
1833 for (int j = 1; j < length; j++) {
1834 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1835 common_bits ^= differing_bits;
1836 bits &= common_bits;
1837 }
1838 // If length is 2 and common bits has only one zero in it then
1839 // our mask and compare instruction will determine definitely
1840 // whether we have a match at this character position. Otherwise
1841 // it can only be an approximate check.
1842 uint32_t one_zero = (common_bits | ~char_mask);
1843 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1844 pos->determines_perfectly = true;
1845 }
1846 pos->mask = common_bits;
1847 pos->value = bits;
1848 }
1849 } else {
1850 // Don't ignore case. Nice simple case where the mask-compare will
1851 // determine definitely whether we have a match at this character
1852 // position.
1853 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001854 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001855 pos->determines_perfectly = true;
1856 }
1857 characters_filled_in++;
1858 ASSERT(characters_filled_in <= details->characters());
1859 if (characters_filled_in == details->characters()) {
1860 return;
1861 }
1862 }
1863 } else {
1864 QuickCheckDetails::Position* pos =
1865 details->positions(characters_filled_in);
1866 RegExpCharacterClass* tree = elm.data.u_char_class;
1867 ZoneList<CharacterRange>* ranges = tree->ranges();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001868 if (tree->is_negated()) {
1869 // A quick check uses multi-character mask and compare. There is no
1870 // useful way to incorporate a negative char class into this scheme
1871 // so we just conservatively create a mask and value that will always
1872 // succeed.
1873 pos->mask = 0;
1874 pos->value = 0;
1875 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001876 int first_range = 0;
1877 while (ranges->at(first_range).from() > char_mask) {
1878 first_range++;
1879 if (first_range == ranges->length()) {
1880 details->set_cannot_match();
1881 pos->determines_perfectly = false;
1882 return;
1883 }
1884 }
1885 CharacterRange range = ranges->at(first_range);
1886 uc16 from = range.from();
1887 uc16 to = range.to();
1888 if (to > char_mask) {
1889 to = char_mask;
1890 }
1891 uint32_t differing_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001892 // A mask and compare is only perfect if the differing bits form a
1893 // number like 00011111 with one single block of trailing 1s.
ager@chromium.org5aa501c2009-06-23 07:57:28 +00001894 if ((differing_bits & (differing_bits + 1)) == 0 &&
1895 from + differing_bits == to) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001896 pos->determines_perfectly = true;
1897 }
1898 uint32_t common_bits = ~SmearBitsRight(differing_bits);
ager@chromium.org381abbb2009-02-25 13:23:22 +00001899 uint32_t bits = (from & common_bits);
1900 for (int i = first_range + 1; i < ranges->length(); i++) {
1901 CharacterRange range = ranges->at(i);
1902 uc16 from = range.from();
1903 uc16 to = range.to();
1904 if (from > char_mask) continue;
1905 if (to > char_mask) to = char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001906 // Here we are combining more ranges into the mask and compare
1907 // value. With each new range the mask becomes more sparse and
1908 // so the chances of a false positive rise. A character class
1909 // with multiple ranges is assumed never to be equivalent to a
1910 // mask and compare operation.
1911 pos->determines_perfectly = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001912 uint32_t new_common_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001913 new_common_bits = ~SmearBitsRight(new_common_bits);
1914 common_bits &= new_common_bits;
1915 bits &= new_common_bits;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001916 uint32_t differing_bits = (from & common_bits) ^ bits;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001917 common_bits ^= differing_bits;
1918 bits &= common_bits;
1919 }
1920 pos->mask = common_bits;
1921 pos->value = bits;
1922 }
1923 characters_filled_in++;
1924 ASSERT(characters_filled_in <= details->characters());
1925 if (characters_filled_in == details->characters()) {
1926 return;
1927 }
1928 }
1929 }
1930 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00001931 on_success()-> GetQuickCheckDetails(details,
1932 compiler,
1933 characters_filled_in,
1934 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001935}
1936
1937
1938void QuickCheckDetails::Clear() {
1939 for (int i = 0; i < characters_; i++) {
1940 positions_[i].mask = 0;
1941 positions_[i].value = 0;
1942 positions_[i].determines_perfectly = false;
1943 }
1944 characters_ = 0;
1945}
1946
1947
1948void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001949 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001950 if (by >= characters_) {
1951 Clear();
1952 return;
1953 }
1954 for (int i = 0; i < characters_ - by; i++) {
1955 positions_[i] = positions_[by + i];
1956 }
1957 for (int i = characters_ - by; i < characters_; i++) {
1958 positions_[i].mask = 0;
1959 positions_[i].value = 0;
1960 positions_[i].determines_perfectly = false;
1961 }
1962 characters_ -= by;
1963 // We could change mask_ and value_ here but we would never advance unless
1964 // they had already been used in a check and they won't be used again because
1965 // it would gain us nothing. So there's no point.
1966}
1967
1968
1969void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1970 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00001971 if (other->cannot_match_) {
1972 return;
1973 }
1974 if (cannot_match_) {
1975 *this = *other;
1976 return;
1977 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001978 for (int i = from_index; i < characters_; i++) {
1979 QuickCheckDetails::Position* pos = positions(i);
1980 QuickCheckDetails::Position* other_pos = other->positions(i);
1981 if (pos->mask != other_pos->mask ||
1982 pos->value != other_pos->value ||
1983 !other_pos->determines_perfectly) {
1984 // Our mask-compare operation will be approximate unless we have the
1985 // exact same operation on both sides of the alternation.
1986 pos->determines_perfectly = false;
1987 }
1988 pos->mask &= other_pos->mask;
1989 pos->value &= pos->mask;
1990 other_pos->value &= pos->mask;
1991 uc16 differing_bits = (pos->value ^ other_pos->value);
1992 pos->mask &= ~differing_bits;
1993 pos->value &= pos->mask;
1994 }
1995}
1996
1997
ager@chromium.org32912102009-01-16 10:38:43 +00001998class VisitMarker {
1999 public:
2000 explicit VisitMarker(NodeInfo* info) : info_(info) {
2001 ASSERT(!info->visited);
2002 info->visited = true;
2003 }
2004 ~VisitMarker() {
2005 info_->visited = false;
2006 }
2007 private:
2008 NodeInfo* info_;
2009};
2010
2011
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002012void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2013 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002014 int characters_filled_in,
2015 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002016 if (body_can_be_zero_length_ || info()->visited) return;
2017 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002018 return ChoiceNode::GetQuickCheckDetails(details,
2019 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002020 characters_filled_in,
2021 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002022}
2023
2024
2025void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2026 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002027 int characters_filled_in,
2028 bool not_at_start) {
2029 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002030 int choice_count = alternatives_->length();
2031 ASSERT(choice_count > 0);
2032 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2033 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002034 characters_filled_in,
2035 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002036 for (int i = 1; i < choice_count; i++) {
2037 QuickCheckDetails new_details(details->characters());
2038 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002039 node->GetQuickCheckDetails(&new_details, compiler,
2040 characters_filled_in,
2041 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002042 // Here we merge the quick match details of the two branches.
2043 details->Merge(&new_details, characters_filled_in);
2044 }
2045}
2046
2047
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002048// Check for [0-9A-Z_a-z].
2049static void EmitWordCheck(RegExpMacroAssembler* assembler,
2050 Label* word,
2051 Label* non_word,
2052 bool fall_through_on_word) {
2053 assembler->CheckCharacterGT('z', non_word);
2054 assembler->CheckCharacterLT('0', non_word);
2055 assembler->CheckCharacterGT('a' - 1, word);
2056 assembler->CheckCharacterLT('9' + 1, word);
2057 assembler->CheckCharacterLT('A', non_word);
2058 assembler->CheckCharacterLT('Z' + 1, word);
2059 if (fall_through_on_word) {
2060 assembler->CheckNotCharacter('_', non_word);
2061 } else {
2062 assembler->CheckCharacter('_', word);
2063 }
2064}
2065
2066
2067// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2068// that matches newline or the start of input).
2069static void EmitHat(RegExpCompiler* compiler,
2070 RegExpNode* on_success,
2071 Trace* trace) {
2072 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2073 // We will be loading the previous character into the current character
2074 // register.
2075 Trace new_trace(*trace);
2076 new_trace.InvalidateCurrentCharacter();
2077
2078 Label ok;
2079 if (new_trace.cp_offset() == 0) {
2080 // The start of input counts as a newline in this context, so skip to
2081 // ok if we are at the start.
2082 assembler->CheckAtStart(&ok);
2083 }
2084 // We already checked that we are not at the start of input so it must be
2085 // OK to load the previous character.
2086 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2087 new_trace.backtrack(),
2088 false);
2089 // Newline means \n, \r, 0x2028 or 0x2029.
2090 if (!compiler->ascii()) {
2091 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2092 }
2093 assembler->CheckCharacter('\n', &ok);
2094 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2095 assembler->Bind(&ok);
2096 on_success->Emit(compiler, &new_trace);
2097}
2098
2099
2100// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2101static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2102 RegExpCompiler* compiler,
2103 RegExpNode* on_success,
2104 Trace* trace) {
2105 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2106 Label before_non_word;
2107 Label before_word;
2108 if (trace->characters_preloaded() != 1) {
2109 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2110 }
2111 // Fall through on non-word.
2112 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2113
2114 // We will be loading the previous character into the current character
2115 // register.
2116 Trace new_trace(*trace);
2117 new_trace.InvalidateCurrentCharacter();
2118
2119 Label ok;
2120 Label* boundary;
2121 Label* not_boundary;
2122 if (type == AssertionNode::AT_BOUNDARY) {
2123 boundary = &ok;
2124 not_boundary = new_trace.backtrack();
2125 } else {
2126 not_boundary = &ok;
2127 boundary = new_trace.backtrack();
2128 }
2129
2130 // Next character is not a word character.
2131 assembler->Bind(&before_non_word);
2132 if (new_trace.cp_offset() == 0) {
2133 // The start of input counts as a non-word character, so the question is
2134 // decided if we are at the start.
2135 assembler->CheckAtStart(not_boundary);
2136 }
2137 // We already checked that we are not at the start of input so it must be
2138 // OK to load the previous character.
2139 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2140 &ok, // Unused dummy label in this call.
2141 false);
2142 // Fall through on non-word.
2143 EmitWordCheck(assembler, boundary, not_boundary, false);
2144 assembler->GoTo(not_boundary);
2145
2146 // Next character is a word character.
2147 assembler->Bind(&before_word);
2148 if (new_trace.cp_offset() == 0) {
2149 // The start of input counts as a non-word character, so the question is
2150 // decided if we are at the start.
2151 assembler->CheckAtStart(boundary);
2152 }
2153 // We already checked that we are not at the start of input so it must be
2154 // OK to load the previous character.
2155 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2156 &ok, // Unused dummy label in this call.
2157 false);
2158 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2159 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2160
2161 assembler->Bind(&ok);
2162
2163 on_success->Emit(compiler, &new_trace);
2164}
2165
2166
iposva@chromium.org245aa852009-02-10 00:49:54 +00002167void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2168 RegExpCompiler* compiler,
2169 int filled_in,
2170 bool not_at_start) {
2171 if (type_ == AT_START && not_at_start) {
2172 details->set_cannot_match();
2173 return;
2174 }
2175 return on_success()->GetQuickCheckDetails(details,
2176 compiler,
2177 filled_in,
2178 not_at_start);
2179}
2180
2181
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002182void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2183 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2184 switch (type_) {
2185 case AT_END: {
2186 Label ok;
2187 assembler->CheckPosition(trace->cp_offset(), &ok);
2188 assembler->GoTo(trace->backtrack());
2189 assembler->Bind(&ok);
2190 break;
2191 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002192 case AT_START: {
2193 if (trace->at_start() == Trace::FALSE) {
2194 assembler->GoTo(trace->backtrack());
2195 return;
2196 }
2197 if (trace->at_start() == Trace::UNKNOWN) {
2198 assembler->CheckNotAtStart(trace->backtrack());
2199 Trace at_start_trace = *trace;
2200 at_start_trace.set_at_start(true);
2201 on_success()->Emit(compiler, &at_start_trace);
2202 return;
2203 }
2204 }
2205 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002206 case AFTER_NEWLINE:
2207 EmitHat(compiler, on_success(), trace);
2208 return;
2209 case AT_NON_BOUNDARY:
2210 case AT_BOUNDARY:
2211 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2212 return;
2213 }
2214 on_success()->Emit(compiler, trace);
2215}
2216
2217
ager@chromium.org381abbb2009-02-25 13:23:22 +00002218static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2219 if (quick_check == NULL) return false;
2220 if (offset >= quick_check->characters()) return false;
2221 return quick_check->positions(offset)->determines_perfectly;
2222}
2223
2224
2225static void UpdateBoundsCheck(int index, int* checked_up_to) {
2226 if (index > *checked_up_to) {
2227 *checked_up_to = index;
2228 }
2229}
2230
2231
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002232// We call this repeatedly to generate code for each pass over the text node.
2233// The passes are in increasing order of difficulty because we hope one
2234// of the first passes will fail in which case we are saved the work of the
2235// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2236// we will check the '%' in the first pass, the case independent 'a' in the
2237// second pass and the character class in the last pass.
2238//
2239// The passes are done from right to left, so for example to test for /bar/
2240// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2241// and then a 'b' with offset 0. This means we can avoid the end-of-input
2242// bounds check most of the time. In the example we only need to check for
2243// end-of-input when loading the putative 'r'.
2244//
2245// A slight complication involves the fact that the first character may already
2246// be fetched into a register by the previous node. In this case we want to
2247// do the test for that character first. We do this in separate passes. The
2248// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2249// pass has been performed then subsequent passes will have true in
2250// first_element_checked to indicate that that character does not need to be
2251// checked again.
2252//
ager@chromium.org32912102009-01-16 10:38:43 +00002253// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002254// contain an AlternativeGeneration object. In this AlternativeGeneration
2255// object we can see details of any quick check that was already passed in
2256// order to get to the code we are now generating. The quick check can involve
2257// loading characters, which means we do not need to recheck the bounds
2258// up to the limit the quick check already checked. In addition the quick
2259// check can have involved a mask and compare operation which may simplify
2260// or obviate the need for further checks at some character positions.
2261void TextNode::TextEmitPass(RegExpCompiler* compiler,
2262 TextEmitPassType pass,
2263 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002264 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002265 bool first_element_checked,
2266 int* checked_up_to) {
2267 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2268 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002269 Label* backtrack = trace->backtrack();
2270 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002271 int element_count = elms_->length();
2272 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2273 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002274 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002275 if (elm.type == TextElement::ATOM) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002276 Vector<const uc16> quarks = elm.data.u_atom->data();
2277 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2278 if (first_element_checked && i == 0 && j == 0) continue;
2279 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2280 EmitCharacterFunction* emit_function = NULL;
2281 switch (pass) {
2282 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002283 ASSERT(ascii);
2284 if (quarks[j] > String::kMaxAsciiCharCode) {
2285 assembler->GoTo(backtrack);
2286 return;
2287 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002288 break;
2289 case NON_LETTER_CHARACTER_MATCH:
2290 emit_function = &EmitAtomNonLetter;
2291 break;
2292 case SIMPLE_CHARACTER_MATCH:
2293 emit_function = &EmitSimpleCharacter;
2294 break;
2295 case CASE_CHARACTER_MATCH:
2296 emit_function = &EmitAtomLetter;
2297 break;
2298 default:
2299 break;
2300 }
2301 if (emit_function != NULL) {
2302 bool bound_checked = emit_function(compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00002303 quarks[j],
2304 backtrack,
2305 cp_offset + j,
2306 *checked_up_to < cp_offset + j,
2307 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002308 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002309 }
2310 }
2311 } else {
2312 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002313 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002314 if (first_element_checked && i == 0) continue;
2315 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002316 RegExpCharacterClass* cc = elm.data.u_char_class;
2317 EmitCharClass(assembler,
2318 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002319 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002320 backtrack,
2321 cp_offset,
2322 *checked_up_to < cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002323 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002324 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002325 }
2326 }
2327 }
2328}
2329
2330
2331int TextNode::Length() {
2332 TextElement elm = elms_->last();
2333 ASSERT(elm.cp_offset >= 0);
2334 if (elm.type == TextElement::ATOM) {
2335 return elm.cp_offset + elm.data.u_atom->data().length();
2336 } else {
2337 return elm.cp_offset + 1;
2338 }
2339}
2340
2341
ager@chromium.org381abbb2009-02-25 13:23:22 +00002342bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2343 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2344 if (ignore_case) {
2345 return pass == SIMPLE_CHARACTER_MATCH;
2346 } else {
2347 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2348 }
2349}
2350
2351
ager@chromium.org8bb60582008-12-11 12:02:20 +00002352// This generates the code to match a text node. A text node can contain
2353// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002354// way) and character classes. For efficiency we do not do this in a single
2355// pass from left to right. Instead we pass over the text node several times,
2356// emitting code for some character positions every time. See the comment on
2357// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002358void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002359 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002360 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002361 ASSERT(limit_result == CONTINUE);
2362
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002363 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2364 compiler->SetRegExpTooBig();
2365 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002366 }
2367
2368 if (compiler->ascii()) {
2369 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002370 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002371 }
2372
2373 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002374 int bound_checked_to = trace->cp_offset() - 1;
2375 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002376
2377 // If a character is preloaded into the current character register then
2378 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002379 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002380 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2381 if (!SkipPass(pass, compiler->ignore_case())) {
2382 TextEmitPass(compiler,
2383 static_cast<TextEmitPassType>(pass),
2384 true,
2385 trace,
2386 false,
2387 &bound_checked_to);
2388 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002389 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002390 first_elt_done = true;
2391 }
2392
ager@chromium.org381abbb2009-02-25 13:23:22 +00002393 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2394 if (!SkipPass(pass, compiler->ignore_case())) {
2395 TextEmitPass(compiler,
2396 static_cast<TextEmitPassType>(pass),
2397 false,
2398 trace,
2399 first_elt_done,
2400 &bound_checked_to);
2401 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002402 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002403
ager@chromium.org32912102009-01-16 10:38:43 +00002404 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002405 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002406 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002407 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002408 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002409}
2410
2411
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002412void Trace::InvalidateCurrentCharacter() {
2413 characters_preloaded_ = 0;
2414}
2415
2416
2417void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002418 ASSERT(by > 0);
2419 // We don't have an instruction for shifting the current character register
2420 // down or for using a shifted value for anything so lets just forget that
2421 // we preloaded any characters into it.
2422 characters_preloaded_ = 0;
2423 // Adjust the offsets of the quick check performed information. This
2424 // information is used to find out what we already determined about the
2425 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002426 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002427 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002428 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2429 compiler->SetRegExpTooBig();
2430 cp_offset_ = 0;
2431 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002432 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002433}
2434
2435
2436void TextNode::MakeCaseIndependent() {
2437 int element_count = elms_->length();
2438 for (int i = 0; i < element_count; i++) {
2439 TextElement elm = elms_->at(i);
2440 if (elm.type == TextElement::CHAR_CLASS) {
2441 RegExpCharacterClass* cc = elm.data.u_char_class;
2442 ZoneList<CharacterRange>* ranges = cc->ranges();
2443 int range_count = ranges->length();
2444 for (int i = 0; i < range_count; i++) {
2445 ranges->at(i).AddCaseEquivalents(ranges);
2446 }
2447 }
2448 }
2449}
2450
2451
ager@chromium.org8bb60582008-12-11 12:02:20 +00002452int TextNode::GreedyLoopTextLength() {
2453 TextElement elm = elms_->at(elms_->length() - 1);
2454 if (elm.type == TextElement::CHAR_CLASS) {
2455 return elm.cp_offset + 1;
2456 } else {
2457 return elm.cp_offset + elm.data.u_atom->data().length();
2458 }
2459}
2460
2461
2462// Finds the fixed match length of a sequence of nodes that goes from
2463// this alternative and back to this choice node. If there are variable
2464// length nodes or other complications in the way then return a sentinel
2465// value indicating that a greedy loop cannot be constructed.
2466int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2467 int length = 0;
2468 RegExpNode* node = alternative->node();
2469 // Later we will generate code for all these text nodes using recursion
2470 // so we have to limit the max number.
2471 int recursion_depth = 0;
2472 while (node != this) {
2473 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2474 return kNodeIsTooComplexForGreedyLoops;
2475 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002476 int node_length = node->GreedyLoopTextLength();
2477 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2478 return kNodeIsTooComplexForGreedyLoops;
2479 }
2480 length += node_length;
2481 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2482 node = seq_node->on_success();
2483 }
2484 return length;
2485}
2486
2487
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002488void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2489 ASSERT_EQ(loop_node_, NULL);
2490 AddAlternative(alt);
2491 loop_node_ = alt.node();
2492}
2493
2494
2495void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2496 ASSERT_EQ(continue_node_, NULL);
2497 AddAlternative(alt);
2498 continue_node_ = alt.node();
2499}
2500
2501
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002502void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002503 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002504 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002505 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2506 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2507 // Update the counter-based backtracking info on the stack. This is an
2508 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002509 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002510 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002511 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002512 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002513 }
ager@chromium.org32912102009-01-16 10:38:43 +00002514 ASSERT(trace->stop_node() == NULL);
2515 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002516 trace->Flush(compiler, this);
2517 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002518 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002519 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002520}
2521
2522
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002523int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002524 int preload_characters = EatsAtLeast(4, 0);
ager@chromium.org9085a012009-05-11 19:22:57 +00002525#ifdef V8_HOST_CAN_READ_UNALIGNED
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002526 bool ascii = compiler->ascii();
2527 if (ascii) {
2528 if (preload_characters > 4) preload_characters = 4;
2529 // We can't preload 3 characters because there is no machine instruction
2530 // to do that. We can't just load 4 because we could be reading
2531 // beyond the end of the string, which could cause a memory fault.
2532 if (preload_characters == 3) preload_characters = 2;
2533 } else {
2534 if (preload_characters > 2) preload_characters = 2;
2535 }
2536#else
2537 if (preload_characters > 1) preload_characters = 1;
2538#endif
2539 return preload_characters;
2540}
2541
2542
2543// This class is used when generating the alternatives in a choice node. It
2544// records the way the alternative is being code generated.
2545class AlternativeGeneration: public Malloced {
2546 public:
2547 AlternativeGeneration()
2548 : possible_success(),
2549 expects_preload(false),
2550 after(),
2551 quick_check_details() { }
2552 Label possible_success;
2553 bool expects_preload;
2554 Label after;
2555 QuickCheckDetails quick_check_details;
2556};
2557
2558
2559// Creates a list of AlternativeGenerations. If the list has a reasonable
2560// size then it is on the stack, otherwise the excess is on the heap.
2561class AlternativeGenerationList {
2562 public:
2563 explicit AlternativeGenerationList(int count)
2564 : alt_gens_(count) {
2565 for (int i = 0; i < count && i < kAFew; i++) {
2566 alt_gens_.Add(a_few_alt_gens_ + i);
2567 }
2568 for (int i = kAFew; i < count; i++) {
2569 alt_gens_.Add(new AlternativeGeneration());
2570 }
2571 }
2572 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002573 for (int i = kAFew; i < alt_gens_.length(); i++) {
2574 delete alt_gens_[i];
2575 alt_gens_[i] = NULL;
2576 }
2577 }
2578
2579 AlternativeGeneration* at(int i) {
2580 return alt_gens_[i];
2581 }
2582 private:
2583 static const int kAFew = 10;
2584 ZoneList<AlternativeGeneration*> alt_gens_;
2585 AlternativeGeneration a_few_alt_gens_[kAFew];
2586};
2587
2588
2589/* Code generation for choice nodes.
2590 *
2591 * We generate quick checks that do a mask and compare to eliminate a
2592 * choice. If the quick check succeeds then it jumps to the continuation to
2593 * do slow checks and check subsequent nodes. If it fails (the common case)
2594 * it falls through to the next choice.
2595 *
2596 * Here is the desired flow graph. Nodes directly below each other imply
2597 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2598 * 3 doesn't have a quick check so we have to call the slow check.
2599 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2600 * regexp continuation is generated directly after the Sn node, up to the
2601 * next GoTo if we decide to reuse some already generated code. Some
2602 * nodes expect preload_characters to be preloaded into the current
2603 * character register. R nodes do this preloading. Vertices are marked
2604 * F for failures and S for success (possible success in the case of quick
2605 * nodes). L, V, < and > are used as arrow heads.
2606 *
2607 * ----------> R
2608 * |
2609 * V
2610 * Q1 -----> S1
2611 * | S /
2612 * F| /
2613 * | F/
2614 * | /
2615 * | R
2616 * | /
2617 * V L
2618 * Q2 -----> S2
2619 * | S /
2620 * F| /
2621 * | F/
2622 * | /
2623 * | R
2624 * | /
2625 * V L
2626 * S3
2627 * |
2628 * F|
2629 * |
2630 * R
2631 * |
2632 * backtrack V
2633 * <----------Q4
2634 * \ F |
2635 * \ |S
2636 * \ F V
2637 * \-----S4
2638 *
2639 * For greedy loops we reverse our expectation and expect to match rather
2640 * than fail. Therefore we want the loop code to look like this (U is the
2641 * unwind code that steps back in the greedy loop). The following alternatives
2642 * look the same as above.
2643 * _____
2644 * / \
2645 * V |
2646 * ----------> S1 |
2647 * /| |
2648 * / |S |
2649 * F/ \_____/
2650 * /
2651 * |<-----------
2652 * | \
2653 * V \
2654 * Q2 ---> S2 \
2655 * | S / |
2656 * F| / |
2657 * | F/ |
2658 * | / |
2659 * | R |
2660 * | / |
2661 * F VL |
2662 * <------U |
2663 * back |S |
2664 * \______________/
2665 */
2666
2667
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002668void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002669 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2670 int choice_count = alternatives_->length();
2671#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002672 for (int i = 0; i < choice_count - 1; i++) {
2673 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002674 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002675 int guard_count = (guards == NULL) ? 0 : guards->length();
2676 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002677 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002678 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002679 }
2680#endif
2681
ager@chromium.org32912102009-01-16 10:38:43 +00002682 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002683 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002684 ASSERT(limit_result == CONTINUE);
2685
ager@chromium.org381abbb2009-02-25 13:23:22 +00002686 int new_flush_budget = trace->flush_budget() / choice_count;
2687 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2688 trace->Flush(compiler, this);
2689 return;
2690 }
2691
ager@chromium.org8bb60582008-12-11 12:02:20 +00002692 RecursionCheck rc(compiler);
2693
ager@chromium.org32912102009-01-16 10:38:43 +00002694 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002695
2696 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2697 bool greedy_loop = false;
2698 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00002699 Trace counter_backtrack_trace;
2700 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002701 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2702
ager@chromium.org8bb60582008-12-11 12:02:20 +00002703 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2704 // Here we have special handling for greedy loops containing only text nodes
2705 // and other simple nodes. These are handled by pushing the current
2706 // position on the stack and then incrementing the current position each
2707 // time around the switch. On backtrack we decrement the current position
2708 // and check it against the pushed value. This avoids pushing backtrack
2709 // information for each iteration of the loop, which could take up a lot of
2710 // space.
2711 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00002712 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002713 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00002714 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002715 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00002716 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002717 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00002718 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002719 Label loop_label;
2720 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00002721 greedy_match_trace.set_stop_node(this);
2722 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002723 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002724 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002725 }
2726
2727 Label second_choice; // For use in greedy matches.
2728 macro_assembler->Bind(&second_choice);
2729
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002730 int first_normal_choice = greedy_loop ? 1 : 0;
2731
2732 int preload_characters = CalculatePreloadCharacters(compiler);
2733 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00002734 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002735 bool preload_has_checked_bounds = preload_is_current;
2736
2737 AlternativeGenerationList alt_gens(choice_count);
2738
ager@chromium.org8bb60582008-12-11 12:02:20 +00002739 // For now we just call all choices one after the other. The idea ultimately
2740 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002741 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002742 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002743 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002744 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002745 ZoneList<Guard*>* guards = alternative.guards();
2746 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00002747 Trace new_trace(*current_trace);
2748 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002749 preload_characters :
2750 0);
2751 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00002752 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002753 }
ager@chromium.org32912102009-01-16 10:38:43 +00002754 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002755 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002756 alt_gen->expects_preload = preload_is_current;
2757 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002758 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002759 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002760 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002761 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002762 preload_has_checked_bounds,
2763 &alt_gen->possible_success,
2764 &alt_gen->quick_check_details,
2765 i < choice_count - 1)) {
2766 // Quick check was generated for this choice.
2767 preload_is_current = true;
2768 preload_has_checked_bounds = true;
2769 // On the last choice in the ChoiceNode we generated the quick
2770 // check to fall through on possible success. So now we need to
2771 // generate the full check inline.
2772 if (i == choice_count - 1) {
2773 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002774 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2775 new_trace.set_characters_preloaded(preload_characters);
2776 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002777 generate_full_check_inline = true;
2778 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002779 } else if (alt_gen->quick_check_details.cannot_match()) {
2780 if (i == choice_count - 1 && !greedy_loop) {
2781 macro_assembler->GoTo(trace->backtrack());
2782 }
2783 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002784 } else {
2785 // No quick check was generated. Put the full code here.
2786 // If this is not the first choice then there could be slow checks from
2787 // previous cases that go here when they fail. There's no reason to
2788 // insist that they preload characters since the slow check we are about
2789 // to generate probably can't use it.
2790 if (i != first_normal_choice) {
2791 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002792 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002793 }
2794 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00002795 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002796 }
2797 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002798 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002799 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002800 if (new_trace.actions() != NULL) {
2801 new_trace.set_flush_budget(new_flush_budget);
2802 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002803 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002804 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002805 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002806 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002807 preload_is_current = false;
2808 }
2809 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002810 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002811 if (greedy_loop) {
2812 macro_assembler->Bind(&greedy_loop_label);
2813 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00002814 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002815 // Otherwise try the second priority at an earlier position.
2816 macro_assembler->AdvanceCurrentPosition(-text_length);
2817 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002818 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002819
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002820 // At this point we need to generate slow checks for the alternatives where
2821 // the quick check was inlined. We can recognize these because the associated
2822 // label was bound.
2823 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2824 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002825 Trace new_trace(*current_trace);
2826 // If there are actions to be flushed we have to limit how many times
2827 // they are flushed. Take the budget of the parent trace and distribute
2828 // it fairly amongst the children.
2829 if (new_trace.actions() != NULL) {
2830 new_trace.set_flush_budget(new_flush_budget);
2831 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002832 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002833 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002834 alternatives_->at(i),
2835 alt_gen,
2836 preload_characters,
2837 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002838 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002839}
2840
2841
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002842void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002843 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002844 GuardedAlternative alternative,
2845 AlternativeGeneration* alt_gen,
2846 int preload_characters,
2847 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002848 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002849
2850 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2851 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002852 Trace out_of_line_trace(*trace);
2853 out_of_line_trace.set_characters_preloaded(preload_characters);
2854 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002855 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002856 ZoneList<Guard*>* guards = alternative.guards();
2857 int guard_count = (guards == NULL) ? 0 : guards->length();
2858 if (next_expects_preload) {
2859 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00002860 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002861 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002862 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002863 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002864 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002865 macro_assembler->Bind(&reload_current_char);
2866 // Reload the current character, since the next quick check expects that.
2867 // We don't need to check bounds here because we only get into this
2868 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00002869 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002870 NULL,
2871 false,
2872 preload_characters);
2873 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002874 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002875 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002876 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002877 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002878 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002879 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002880 }
2881}
2882
2883
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002884void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002885 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002886 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002887 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002888 ASSERT(limit_result == CONTINUE);
2889
2890 RecursionCheck rc(compiler);
2891
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002892 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002893 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00002894 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002895 new_capture(data_.u_position_register.reg,
2896 data_.u_position_register.is_capture,
2897 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002898 Trace new_trace = *trace;
2899 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002900 on_success()->Emit(compiler, &new_trace);
2901 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002902 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002903 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002904 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002905 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00002906 Trace new_trace = *trace;
2907 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002908 on_success()->Emit(compiler, &new_trace);
2909 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002910 }
2911 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002912 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002913 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00002914 Trace new_trace = *trace;
2915 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002916 on_success()->Emit(compiler, &new_trace);
2917 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002918 }
2919 case CLEAR_CAPTURES: {
2920 Trace::DeferredClearCaptures
2921 new_capture(Interval(data_.u_clear_captures.range_from,
2922 data_.u_clear_captures.range_to));
2923 Trace new_trace = *trace;
2924 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002925 on_success()->Emit(compiler, &new_trace);
2926 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002927 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002928 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002929 if (!trace->is_trivial()) {
2930 trace->Flush(compiler, this);
2931 } else {
2932 assembler->WriteCurrentPositionToRegister(
2933 data_.u_submatch.current_position_register, 0);
2934 assembler->WriteStackPointerToRegister(
2935 data_.u_submatch.stack_pointer_register);
2936 on_success()->Emit(compiler, trace);
2937 }
2938 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002939 case EMPTY_MATCH_CHECK: {
2940 int start_pos_reg = data_.u_empty_match_check.start_register;
2941 int stored_pos = 0;
2942 int rep_reg = data_.u_empty_match_check.repetition_register;
2943 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2944 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2945 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2946 // If we know we haven't advanced and there is no minimum we
2947 // can just backtrack immediately.
2948 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00002949 } else if (know_dist && stored_pos < trace->cp_offset()) {
2950 // If we know we've advanced we can generate the continuation
2951 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002952 on_success()->Emit(compiler, trace);
2953 } else if (!trace->is_trivial()) {
2954 trace->Flush(compiler, this);
2955 } else {
2956 Label skip_empty_check;
2957 // If we have a minimum number of repetitions we check the current
2958 // number first and skip the empty check if it's not enough.
2959 if (has_minimum) {
2960 int limit = data_.u_empty_match_check.repetition_limit;
2961 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2962 }
2963 // If the match is empty we bail out, otherwise we fall through
2964 // to the on-success continuation.
2965 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2966 trace->backtrack());
2967 assembler->Bind(&skip_empty_check);
2968 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002969 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002970 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002971 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002972 case POSITIVE_SUBMATCH_SUCCESS: {
2973 if (!trace->is_trivial()) {
2974 trace->Flush(compiler, this);
2975 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002976 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002977 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002978 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002979 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002980 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002981 int clear_register_count = data_.u_submatch.clear_register_count;
2982 if (clear_register_count == 0) {
2983 on_success()->Emit(compiler, trace);
2984 return;
2985 }
2986 int clear_registers_from = data_.u_submatch.clear_register_from;
2987 Label clear_registers_backtrack;
2988 Trace new_trace = *trace;
2989 new_trace.set_backtrack(&clear_registers_backtrack);
2990 on_success()->Emit(compiler, &new_trace);
2991
2992 assembler->Bind(&clear_registers_backtrack);
2993 int clear_registers_to = clear_registers_from + clear_register_count - 1;
2994 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
2995
2996 ASSERT(trace->backtrack() == NULL);
2997 assembler->Backtrack();
2998 return;
2999 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003000 default:
3001 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003002 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003003}
3004
3005
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003006void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003007 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003008 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003009 trace->Flush(compiler, this);
3010 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003011 }
3012
ager@chromium.org32912102009-01-16 10:38:43 +00003013 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003014 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003015 ASSERT(limit_result == CONTINUE);
3016
3017 RecursionCheck rc(compiler);
3018
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003019 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003020 if (compiler->ignore_case()) {
3021 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3022 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003023 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003024 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003025 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003026 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003027}
3028
3029
3030// -------------------------------------------------------------------
3031// Dot/dotty output
3032
3033
3034#ifdef DEBUG
3035
3036
3037class DotPrinter: public NodeVisitor {
3038 public:
3039 explicit DotPrinter(bool ignore_case)
3040 : ignore_case_(ignore_case),
3041 stream_(&alloc_) { }
3042 void PrintNode(const char* label, RegExpNode* node);
3043 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003044 void PrintAttributes(RegExpNode* from);
3045 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003046 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003047#define DECLARE_VISIT(Type) \
3048 virtual void Visit##Type(Type##Node* that);
3049FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3050#undef DECLARE_VISIT
3051 private:
3052 bool ignore_case_;
3053 HeapStringAllocator alloc_;
3054 StringStream stream_;
3055};
3056
3057
3058void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3059 stream()->Add("digraph G {\n graph [label=\"");
3060 for (int i = 0; label[i]; i++) {
3061 switch (label[i]) {
3062 case '\\':
3063 stream()->Add("\\\\");
3064 break;
3065 case '"':
3066 stream()->Add("\"");
3067 break;
3068 default:
3069 stream()->Put(label[i]);
3070 break;
3071 }
3072 }
3073 stream()->Add("\"];\n");
3074 Visit(node);
3075 stream()->Add("}\n");
3076 printf("%s", *(stream()->ToCString()));
3077}
3078
3079
3080void DotPrinter::Visit(RegExpNode* node) {
3081 if (node->info()->visited) return;
3082 node->info()->visited = true;
3083 node->Accept(this);
3084}
3085
3086
3087void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003088 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3089 Visit(on_failure);
3090}
3091
3092
3093class TableEntryBodyPrinter {
3094 public:
3095 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3096 : stream_(stream), choice_(choice) { }
3097 void Call(uc16 from, DispatchTable::Entry entry) {
3098 OutSet* out_set = entry.out_set();
3099 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3100 if (out_set->Get(i)) {
3101 stream()->Add(" n%p:s%io%i -> n%p;\n",
3102 choice(),
3103 from,
3104 i,
3105 choice()->alternatives()->at(i).node());
3106 }
3107 }
3108 }
3109 private:
3110 StringStream* stream() { return stream_; }
3111 ChoiceNode* choice() { return choice_; }
3112 StringStream* stream_;
3113 ChoiceNode* choice_;
3114};
3115
3116
3117class TableEntryHeaderPrinter {
3118 public:
3119 explicit TableEntryHeaderPrinter(StringStream* stream)
3120 : first_(true), stream_(stream) { }
3121 void Call(uc16 from, DispatchTable::Entry entry) {
3122 if (first_) {
3123 first_ = false;
3124 } else {
3125 stream()->Add("|");
3126 }
3127 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3128 OutSet* out_set = entry.out_set();
3129 int priority = 0;
3130 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3131 if (out_set->Get(i)) {
3132 if (priority > 0) stream()->Add("|");
3133 stream()->Add("<s%io%i> %i", from, i, priority);
3134 priority++;
3135 }
3136 }
3137 stream()->Add("}}");
3138 }
3139 private:
3140 bool first_;
3141 StringStream* stream() { return stream_; }
3142 StringStream* stream_;
3143};
3144
3145
3146class AttributePrinter {
3147 public:
3148 explicit AttributePrinter(DotPrinter* out)
3149 : out_(out), first_(true) { }
3150 void PrintSeparator() {
3151 if (first_) {
3152 first_ = false;
3153 } else {
3154 out_->stream()->Add("|");
3155 }
3156 }
3157 void PrintBit(const char* name, bool value) {
3158 if (!value) return;
3159 PrintSeparator();
3160 out_->stream()->Add("{%s}", name);
3161 }
3162 void PrintPositive(const char* name, int value) {
3163 if (value < 0) return;
3164 PrintSeparator();
3165 out_->stream()->Add("{%s|%x}", name, value);
3166 }
3167 private:
3168 DotPrinter* out_;
3169 bool first_;
3170};
3171
3172
3173void DotPrinter::PrintAttributes(RegExpNode* that) {
3174 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3175 "margin=0.1, fontsize=10, label=\"{",
3176 that);
3177 AttributePrinter printer(this);
3178 NodeInfo* info = that->info();
3179 printer.PrintBit("NI", info->follows_newline_interest);
3180 printer.PrintBit("WI", info->follows_word_interest);
3181 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003182 Label* label = that->label();
3183 if (label->is_bound())
3184 printer.PrintPositive("@", label->pos());
3185 stream()->Add("}\"];\n");
3186 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3187 "arrowhead=none];\n", that, that);
3188}
3189
3190
3191static const bool kPrintDispatchTable = false;
3192void DotPrinter::VisitChoice(ChoiceNode* that) {
3193 if (kPrintDispatchTable) {
3194 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3195 TableEntryHeaderPrinter header_printer(stream());
3196 that->GetTable(ignore_case_)->ForEach(&header_printer);
3197 stream()->Add("\"]\n", that);
3198 PrintAttributes(that);
3199 TableEntryBodyPrinter body_printer(stream(), that);
3200 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003201 } else {
3202 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3203 for (int i = 0; i < that->alternatives()->length(); i++) {
3204 GuardedAlternative alt = that->alternatives()->at(i);
3205 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3206 }
3207 }
3208 for (int i = 0; i < that->alternatives()->length(); i++) {
3209 GuardedAlternative alt = that->alternatives()->at(i);
3210 alt.node()->Accept(this);
3211 }
3212}
3213
3214
3215void DotPrinter::VisitText(TextNode* that) {
3216 stream()->Add(" n%p [label=\"", that);
3217 for (int i = 0; i < that->elements()->length(); i++) {
3218 if (i > 0) stream()->Add(" ");
3219 TextElement elm = that->elements()->at(i);
3220 switch (elm.type) {
3221 case TextElement::ATOM: {
3222 stream()->Add("'%w'", elm.data.u_atom->data());
3223 break;
3224 }
3225 case TextElement::CHAR_CLASS: {
3226 RegExpCharacterClass* node = elm.data.u_char_class;
3227 stream()->Add("[");
3228 if (node->is_negated())
3229 stream()->Add("^");
3230 for (int j = 0; j < node->ranges()->length(); j++) {
3231 CharacterRange range = node->ranges()->at(j);
3232 stream()->Add("%k-%k", range.from(), range.to());
3233 }
3234 stream()->Add("]");
3235 break;
3236 }
3237 default:
3238 UNREACHABLE();
3239 }
3240 }
3241 stream()->Add("\", shape=box, peripheries=2];\n");
3242 PrintAttributes(that);
3243 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3244 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003245}
3246
3247
3248void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3249 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3250 that,
3251 that->start_register(),
3252 that->end_register());
3253 PrintAttributes(that);
3254 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3255 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003256}
3257
3258
3259void DotPrinter::VisitEnd(EndNode* that) {
3260 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3261 PrintAttributes(that);
3262}
3263
3264
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003265void DotPrinter::VisitAssertion(AssertionNode* that) {
3266 stream()->Add(" n%p [", that);
3267 switch (that->type()) {
3268 case AssertionNode::AT_END:
3269 stream()->Add("label=\"$\", shape=septagon");
3270 break;
3271 case AssertionNode::AT_START:
3272 stream()->Add("label=\"^\", shape=septagon");
3273 break;
3274 case AssertionNode::AT_BOUNDARY:
3275 stream()->Add("label=\"\\b\", shape=septagon");
3276 break;
3277 case AssertionNode::AT_NON_BOUNDARY:
3278 stream()->Add("label=\"\\B\", shape=septagon");
3279 break;
3280 case AssertionNode::AFTER_NEWLINE:
3281 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3282 break;
3283 }
3284 stream()->Add("];\n");
3285 PrintAttributes(that);
3286 RegExpNode* successor = that->on_success();
3287 stream()->Add(" n%p -> n%p;\n", that, successor);
3288 Visit(successor);
3289}
3290
3291
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003292void DotPrinter::VisitAction(ActionNode* that) {
3293 stream()->Add(" n%p [", that);
3294 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003295 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003296 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3297 that->data_.u_store_register.reg,
3298 that->data_.u_store_register.value);
3299 break;
3300 case ActionNode::INCREMENT_REGISTER:
3301 stream()->Add("label=\"$%i++\", shape=octagon",
3302 that->data_.u_increment_register.reg);
3303 break;
3304 case ActionNode::STORE_POSITION:
3305 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3306 that->data_.u_position_register.reg);
3307 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003308 case ActionNode::BEGIN_SUBMATCH:
3309 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3310 that->data_.u_submatch.current_position_register);
3311 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003312 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003313 stream()->Add("label=\"escape\", shape=septagon");
3314 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003315 case ActionNode::EMPTY_MATCH_CHECK:
3316 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3317 that->data_.u_empty_match_check.start_register,
3318 that->data_.u_empty_match_check.repetition_register,
3319 that->data_.u_empty_match_check.repetition_limit);
3320 break;
3321 case ActionNode::CLEAR_CAPTURES: {
3322 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3323 that->data_.u_clear_captures.range_from,
3324 that->data_.u_clear_captures.range_to);
3325 break;
3326 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003327 }
3328 stream()->Add("];\n");
3329 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003330 RegExpNode* successor = that->on_success();
3331 stream()->Add(" n%p -> n%p;\n", that, successor);
3332 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003333}
3334
3335
3336class DispatchTableDumper {
3337 public:
3338 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3339 void Call(uc16 key, DispatchTable::Entry entry);
3340 StringStream* stream() { return stream_; }
3341 private:
3342 StringStream* stream_;
3343};
3344
3345
3346void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3347 stream()->Add("[%k-%k]: {", key, entry.to());
3348 OutSet* set = entry.out_set();
3349 bool first = true;
3350 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3351 if (set->Get(i)) {
3352 if (first) {
3353 first = false;
3354 } else {
3355 stream()->Add(", ");
3356 }
3357 stream()->Add("%i", i);
3358 }
3359 }
3360 stream()->Add("}\n");
3361}
3362
3363
3364void DispatchTable::Dump() {
3365 HeapStringAllocator alloc;
3366 StringStream stream(&alloc);
3367 DispatchTableDumper dumper(&stream);
3368 tree()->ForEach(&dumper);
3369 OS::PrintError("%s", *stream.ToCString());
3370}
3371
3372
3373void RegExpEngine::DotPrint(const char* label,
3374 RegExpNode* node,
3375 bool ignore_case) {
3376 DotPrinter printer(ignore_case);
3377 printer.PrintNode(label, node);
3378}
3379
3380
3381#endif // DEBUG
3382
3383
3384// -------------------------------------------------------------------
3385// Tree to graph conversion
3386
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003387static const int kSpaceRangeCount = 20;
3388static const int kSpaceRangeAsciiCount = 4;
3389static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3390 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3391 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3392
3393static const int kWordRangeCount = 8;
3394static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3395 '_', 'a', 'z' };
3396
3397static const int kDigitRangeCount = 2;
3398static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3399
3400static const int kLineTerminatorRangeCount = 6;
3401static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3402 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003403
3404RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003405 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003406 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3407 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003408 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003409}
3410
3411
3412RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003413 RegExpNode* on_success) {
3414 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003415}
3416
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003417static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3418 const uc16* special_class,
3419 int length) {
3420 ASSERT(ranges->length() != 0);
3421 ASSERT(length != 0);
3422 ASSERT(special_class[0] != 0);
3423 if (ranges->length() != (length >> 1) + 1) {
3424 return false;
3425 }
3426 CharacterRange range = ranges->at(0);
3427 if (range.from() != 0) {
3428 return false;
3429 }
3430 for (int i = 0; i < length; i += 2) {
3431 if (special_class[i] != (range.to() + 1)) {
3432 return false;
3433 }
3434 range = ranges->at((i >> 1) + 1);
3435 if (special_class[i+1] != range.from() - 1) {
3436 return false;
3437 }
3438 }
3439 if (range.to() != 0xffff) {
3440 return false;
3441 }
3442 return true;
3443}
3444
3445
3446static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3447 const uc16* special_class,
3448 int length) {
3449 if (ranges->length() * 2 != length) {
3450 return false;
3451 }
3452 for (int i = 0; i < length; i += 2) {
3453 CharacterRange range = ranges->at(i >> 1);
3454 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3455 return false;
3456 }
3457 }
3458 return true;
3459}
3460
3461
3462bool RegExpCharacterClass::is_standard() {
3463 // TODO(lrn): Remove need for this function, by not throwing away information
3464 // along the way.
3465 if (is_negated_) {
3466 return false;
3467 }
3468 if (set_.is_standard()) {
3469 return true;
3470 }
3471 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3472 set_.set_standard_set_type('s');
3473 return true;
3474 }
3475 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3476 set_.set_standard_set_type('S');
3477 return true;
3478 }
3479 if (CompareInverseRanges(set_.ranges(),
3480 kLineTerminatorRanges,
3481 kLineTerminatorRangeCount)) {
3482 set_.set_standard_set_type('.');
3483 return true;
3484 }
3485 return false;
3486}
3487
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003488
3489RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003490 RegExpNode* on_success) {
3491 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003492}
3493
3494
3495RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003496 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003497 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3498 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003499 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003500 for (int i = 0; i < length; i++) {
3501 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003502 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003503 result->AddAlternative(alternative);
3504 }
3505 return result;
3506}
3507
3508
3509RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003510 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003511 return ToNode(min(),
3512 max(),
3513 is_greedy(),
3514 body(),
3515 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003516 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003517}
3518
3519
3520RegExpNode* RegExpQuantifier::ToNode(int min,
3521 int max,
3522 bool is_greedy,
3523 RegExpTree* body,
3524 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003525 RegExpNode* on_success,
3526 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003527 // x{f, t} becomes this:
3528 //
3529 // (r++)<-.
3530 // | `
3531 // | (x)
3532 // v ^
3533 // (r=0)-->(?)---/ [if r < t]
3534 // |
3535 // [if r >= f] \----> ...
3536 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003537
3538 // 15.10.2.5 RepeatMatcher algorithm.
3539 // The parser has already eliminated the case where max is 0. In the case
3540 // where max_match is zero the parser has removed the quantifier if min was
3541 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3542
3543 // If we know that we cannot match zero length then things are a little
3544 // simpler since we don't need to make the special zero length match check
3545 // from step 2.1. If the min and max are small we can unroll a little in
3546 // this case.
3547 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3548 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3549 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003550 bool body_can_be_empty = (body->min_match() == 0);
3551 int body_start_reg = RegExpCompiler::kNoRegister;
3552 Interval capture_registers = body->CaptureRegisters();
3553 bool needs_capture_clearing = !capture_registers.is_empty();
3554 if (body_can_be_empty) {
3555 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00003556 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003557 // Only unroll if there are no captures and the body can't be
3558 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003559 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3560 int new_max = (max == kInfinity) ? max : max - min;
3561 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003562 RegExpNode* answer = ToNode(
3563 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003564 // Unroll the forced matches from 0 to min. This can cause chains of
3565 // TextNodes (which the parser does not generate). These should be
3566 // combined if it turns out they hinder good code generation.
3567 for (int i = 0; i < min; i++) {
3568 answer = body->ToNode(compiler, answer);
3569 }
3570 return answer;
3571 }
3572 if (max <= kMaxUnrolledMaxMatches) {
3573 ASSERT(min == 0);
3574 // Unroll the optional matches up to max.
3575 RegExpNode* answer = on_success;
3576 for (int i = 0; i < max; i++) {
3577 ChoiceNode* alternation = new ChoiceNode(2);
3578 if (is_greedy) {
3579 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3580 answer)));
3581 alternation->AddAlternative(GuardedAlternative(on_success));
3582 } else {
3583 alternation->AddAlternative(GuardedAlternative(on_success));
3584 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3585 answer)));
3586 }
3587 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003588 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003589 }
3590 return answer;
3591 }
3592 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003593 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003594 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003595 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003596 int reg_ctr = needs_counter
3597 ? compiler->AllocateRegister()
3598 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003599 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003600 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003601 RegExpNode* loop_return = needs_counter
3602 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3603 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003604 if (body_can_be_empty) {
3605 // If the body can be empty we need to check if it was and then
3606 // backtrack.
3607 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3608 reg_ctr,
3609 min,
3610 loop_return);
3611 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003612 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003613 if (body_can_be_empty) {
3614 // If the body can be empty we need to store the start position
3615 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003616 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00003617 }
3618 if (needs_capture_clearing) {
3619 // Before entering the body of this loop we need to clear captures.
3620 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3621 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003622 GuardedAlternative body_alt(body_node);
3623 if (has_max) {
3624 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3625 body_alt.AddGuard(body_guard);
3626 }
3627 GuardedAlternative rest_alt(on_success);
3628 if (has_min) {
3629 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3630 rest_alt.AddGuard(rest_guard);
3631 }
3632 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003633 center->AddLoopAlternative(body_alt);
3634 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003635 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003636 center->AddContinueAlternative(rest_alt);
3637 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003638 }
3639 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003640 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003641 } else {
3642 return center;
3643 }
3644}
3645
3646
3647RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003648 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003649 NodeInfo info;
3650 switch (type()) {
3651 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003652 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003653 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003654 return AssertionNode::AtStart(on_success);
3655 case BOUNDARY:
3656 return AssertionNode::AtBoundary(on_success);
3657 case NON_BOUNDARY:
3658 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003659 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003660 return AssertionNode::AtEnd(on_success);
3661 case END_OF_LINE: {
3662 // Compile $ in multiline regexps as an alternation with a positive
3663 // lookahead in one side and an end-of-input on the other side.
3664 // We need two registers for the lookahead.
3665 int stack_pointer_register = compiler->AllocateRegister();
3666 int position_register = compiler->AllocateRegister();
3667 // The ChoiceNode to distinguish between a newline and end-of-input.
3668 ChoiceNode* result = new ChoiceNode(2);
3669 // Create a newline atom.
3670 ZoneList<CharacterRange>* newline_ranges =
3671 new ZoneList<CharacterRange>(3);
3672 CharacterRange::AddClassEscape('n', newline_ranges);
3673 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3674 TextNode* newline_matcher = new TextNode(
3675 newline_atom,
3676 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3677 position_register,
3678 0, // No captures inside.
3679 -1, // Ignored if no captures.
3680 on_success));
3681 // Create an end-of-input matcher.
3682 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3683 stack_pointer_register,
3684 position_register,
3685 newline_matcher);
3686 // Add the two alternatives to the ChoiceNode.
3687 GuardedAlternative eol_alternative(end_of_line);
3688 result->AddAlternative(eol_alternative);
3689 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3690 result->AddAlternative(end_alternative);
3691 return result;
3692 }
3693 default:
3694 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003695 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003696 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003697}
3698
3699
3700RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003701 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003702 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3703 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003704 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003705}
3706
3707
3708RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003709 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003710 return on_success;
3711}
3712
3713
3714RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003715 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003716 int stack_pointer_register = compiler->AllocateRegister();
3717 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003718
3719 const int registers_per_capture = 2;
3720 const int register_of_first_capture = 2;
3721 int register_count = capture_count_ * registers_per_capture;
3722 int register_start =
3723 register_of_first_capture + capture_from_ * registers_per_capture;
3724
ager@chromium.org8bb60582008-12-11 12:02:20 +00003725 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003726 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003727 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003728 stack_pointer_register,
3729 position_register,
3730 body()->ToNode(
3731 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003732 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3733 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003734 register_count,
3735 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003736 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003737 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003738 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003739 // We use a ChoiceNode for a negative lookahead because it has most of
3740 // the characteristics we need. It has the body of the lookahead as its
3741 // first alternative and the expression after the lookahead of the second
3742 // alternative. If the first alternative succeeds then the
3743 // NegativeSubmatchSuccess will unwind the stack including everything the
3744 // choice node set up and backtrack. If the first alternative fails then
3745 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003746 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3747 // ChoiceNode that knows to ignore the first exit when calculating quick
3748 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003749 GuardedAlternative body_alt(
3750 body()->ToNode(
3751 compiler,
3752 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003753 position_register,
3754 register_count,
3755 register_start)));
3756 ChoiceNode* choice_node =
3757 new NegativeLookaheadChoiceNode(body_alt,
3758 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003759 return ActionNode::BeginSubmatch(stack_pointer_register,
3760 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003761 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003762 }
3763}
3764
3765
3766RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003767 RegExpNode* on_success) {
3768 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003769}
3770
3771
3772RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3773 int index,
3774 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003775 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003776 int start_reg = RegExpCapture::StartRegister(index);
3777 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003778 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003779 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003780 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003781}
3782
3783
3784RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003785 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003786 ZoneList<RegExpTree*>* children = nodes();
3787 RegExpNode* current = on_success;
3788 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003789 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003790 }
3791 return current;
3792}
3793
3794
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003795static void AddClass(const uc16* elmv,
3796 int elmc,
3797 ZoneList<CharacterRange>* ranges) {
3798 for (int i = 0; i < elmc; i += 2) {
3799 ASSERT(elmv[i] <= elmv[i + 1]);
3800 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3801 }
3802}
3803
3804
3805static void AddClassNegated(const uc16 *elmv,
3806 int elmc,
3807 ZoneList<CharacterRange>* ranges) {
3808 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003809 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003810 uc16 last = 0x0000;
3811 for (int i = 0; i < elmc; i += 2) {
3812 ASSERT(last <= elmv[i] - 1);
3813 ASSERT(elmv[i] <= elmv[i + 1]);
3814 ranges->Add(CharacterRange(last, elmv[i] - 1));
3815 last = elmv[i + 1] + 1;
3816 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003817 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003818}
3819
3820
3821void CharacterRange::AddClassEscape(uc16 type,
3822 ZoneList<CharacterRange>* ranges) {
3823 switch (type) {
3824 case 's':
3825 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3826 break;
3827 case 'S':
3828 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3829 break;
3830 case 'w':
3831 AddClass(kWordRanges, kWordRangeCount, ranges);
3832 break;
3833 case 'W':
3834 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3835 break;
3836 case 'd':
3837 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3838 break;
3839 case 'D':
3840 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3841 break;
3842 case '.':
3843 AddClassNegated(kLineTerminatorRanges,
3844 kLineTerminatorRangeCount,
3845 ranges);
3846 break;
3847 // This is not a character range as defined by the spec but a
3848 // convenient shorthand for a character class that matches any
3849 // character.
3850 case '*':
3851 ranges->Add(CharacterRange::Everything());
3852 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003853 // This is the set of characters matched by the $ and ^ symbols
3854 // in multiline mode.
3855 case 'n':
3856 AddClass(kLineTerminatorRanges,
3857 kLineTerminatorRangeCount,
3858 ranges);
3859 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003860 default:
3861 UNREACHABLE();
3862 }
3863}
3864
3865
3866Vector<const uc16> CharacterRange::GetWordBounds() {
3867 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3868}
3869
3870
3871class CharacterRangeSplitter {
3872 public:
3873 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3874 ZoneList<CharacterRange>** excluded)
3875 : included_(included),
3876 excluded_(excluded) { }
3877 void Call(uc16 from, DispatchTable::Entry entry);
3878
3879 static const int kInBase = 0;
3880 static const int kInOverlay = 1;
3881
3882 private:
3883 ZoneList<CharacterRange>** included_;
3884 ZoneList<CharacterRange>** excluded_;
3885};
3886
3887
3888void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3889 if (!entry.out_set()->Get(kInBase)) return;
3890 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3891 ? included_
3892 : excluded_;
3893 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3894 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3895}
3896
3897
3898void CharacterRange::Split(ZoneList<CharacterRange>* base,
3899 Vector<const uc16> overlay,
3900 ZoneList<CharacterRange>** included,
3901 ZoneList<CharacterRange>** excluded) {
3902 ASSERT_EQ(NULL, *included);
3903 ASSERT_EQ(NULL, *excluded);
3904 DispatchTable table;
3905 for (int i = 0; i < base->length(); i++)
3906 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3907 for (int i = 0; i < overlay.length(); i += 2) {
3908 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3909 CharacterRangeSplitter::kInOverlay);
3910 }
3911 CharacterRangeSplitter callback(included, excluded);
3912 table.ForEach(&callback);
3913}
3914
3915
3916void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3917 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3918 if (IsSingleton()) {
3919 // If this is a singleton we just expand the one character.
3920 int length = uncanonicalize.get(from(), '\0', chars);
3921 for (int i = 0; i < length; i++) {
3922 uc32 chr = chars[i];
3923 if (chr != from()) {
3924 ranges->Add(CharacterRange::Singleton(chars[i]));
3925 }
3926 }
3927 } else if (from() <= kRangeCanonicalizeMax &&
3928 to() <= kRangeCanonicalizeMax) {
3929 // If this is a range we expand the characters block by block,
3930 // expanding contiguous subranges (blocks) one at a time.
3931 // The approach is as follows. For a given start character we
3932 // look up the block that contains it, for instance 'a' if the
3933 // start character is 'c'. A block is characterized by the property
3934 // that all characters uncanonicalize in the same way as the first
3935 // element, except that each entry in the result is incremented
3936 // by the distance from the first element. So a-z is a block
3937 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3938 // uncanonicalizes to ['a' + k, 'A' + k].
3939 // Once we've found the start point we look up its uncanonicalization
3940 // and produce a range for each element. For instance for [c-f]
3941 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3942 // add a range if it is not already contained in the input, so [c-f]
3943 // will be skipped but [C-F] will be added. If this range is not
3944 // completely contained in a block we do this for all the blocks
3945 // covered by the range.
3946 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3947 // First, look up the block that contains the 'from' character.
3948 int length = canonrange.get(from(), '\0', range);
3949 if (length == 0) {
3950 range[0] = from();
3951 } else {
3952 ASSERT_EQ(1, length);
3953 }
3954 int pos = from();
3955 // The start of the current block. Note that except for the first
3956 // iteration 'start' is always equal to 'pos'.
3957 int start;
3958 // If it is not the start point of a block the entry contains the
3959 // offset of the character from the start point.
3960 if ((range[0] & kStartMarker) == 0) {
3961 start = pos - range[0];
3962 } else {
3963 start = pos;
3964 }
3965 // Then we add the ranges on at a time, incrementing the current
3966 // position to be after the last block each time. The position
3967 // always points to the start of a block.
3968 while (pos < to()) {
3969 length = canonrange.get(start, '\0', range);
3970 if (length == 0) {
3971 range[0] = start;
3972 } else {
3973 ASSERT_EQ(1, length);
3974 }
3975 ASSERT((range[0] & kStartMarker) != 0);
3976 // The start point of a block contains the distance to the end
3977 // of the range.
3978 int block_end = start + (range[0] & kPayloadMask) - 1;
3979 int end = (block_end > to()) ? to() : block_end;
3980 length = uncanonicalize.get(start, '\0', range);
3981 for (int i = 0; i < length; i++) {
3982 uc32 c = range[i];
3983 uc16 range_from = c + (pos - start);
3984 uc16 range_to = c + (end - start);
3985 if (!(from() <= range_from && range_to <= to())) {
3986 ranges->Add(CharacterRange(range_from, range_to));
3987 }
3988 }
3989 start = pos = block_end + 1;
3990 }
3991 } else {
3992 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3993 }
3994}
3995
3996
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003997ZoneList<CharacterRange>* CharacterSet::ranges() {
3998 if (ranges_ == NULL) {
3999 ranges_ = new ZoneList<CharacterRange>(2);
4000 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4001 }
4002 return ranges_;
4003}
4004
4005
4006
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004007// -------------------------------------------------------------------
4008// Interest propagation
4009
4010
4011RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4012 for (int i = 0; i < siblings_.length(); i++) {
4013 RegExpNode* sibling = siblings_.Get(i);
4014 if (sibling->info()->Matches(info))
4015 return sibling;
4016 }
4017 return NULL;
4018}
4019
4020
4021RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4022 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004023 siblings_.Ensure(this);
4024 RegExpNode* result = TryGetSibling(info);
4025 if (result != NULL) return result;
4026 result = this->Clone();
4027 NodeInfo* new_info = result->info();
4028 new_info->ResetCompilationState();
4029 new_info->AddFromPreceding(info);
4030 AddSibling(result);
4031 *cloned = true;
4032 return result;
4033}
4034
4035
4036template <class C>
4037static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4038 NodeInfo full_info(*node->info());
4039 full_info.AddFromPreceding(info);
4040 bool cloned = false;
4041 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4042}
4043
4044
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004045// -------------------------------------------------------------------
4046// Splay tree
4047
4048
4049OutSet* OutSet::Extend(unsigned value) {
4050 if (Get(value))
4051 return this;
4052 if (successors() != NULL) {
4053 for (int i = 0; i < successors()->length(); i++) {
4054 OutSet* successor = successors()->at(i);
4055 if (successor->Get(value))
4056 return successor;
4057 }
4058 } else {
4059 successors_ = new ZoneList<OutSet*>(2);
4060 }
4061 OutSet* result = new OutSet(first_, remaining_);
4062 result->Set(value);
4063 successors()->Add(result);
4064 return result;
4065}
4066
4067
4068void OutSet::Set(unsigned value) {
4069 if (value < kFirstLimit) {
4070 first_ |= (1 << value);
4071 } else {
4072 if (remaining_ == NULL)
4073 remaining_ = new ZoneList<unsigned>(1);
4074 if (remaining_->is_empty() || !remaining_->Contains(value))
4075 remaining_->Add(value);
4076 }
4077}
4078
4079
4080bool OutSet::Get(unsigned value) {
4081 if (value < kFirstLimit) {
4082 return (first_ & (1 << value)) != 0;
4083 } else if (remaining_ == NULL) {
4084 return false;
4085 } else {
4086 return remaining_->Contains(value);
4087 }
4088}
4089
4090
4091const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4092const DispatchTable::Entry DispatchTable::Config::kNoValue;
4093
4094
4095void DispatchTable::AddRange(CharacterRange full_range, int value) {
4096 CharacterRange current = full_range;
4097 if (tree()->is_empty()) {
4098 // If this is the first range we just insert into the table.
4099 ZoneSplayTree<Config>::Locator loc;
4100 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4101 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4102 return;
4103 }
4104 // First see if there is a range to the left of this one that
4105 // overlaps.
4106 ZoneSplayTree<Config>::Locator loc;
4107 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4108 Entry* entry = &loc.value();
4109 // If we've found a range that overlaps with this one, and it
4110 // starts strictly to the left of this one, we have to fix it
4111 // because the following code only handles ranges that start on
4112 // or after the start point of the range we're adding.
4113 if (entry->from() < current.from() && entry->to() >= current.from()) {
4114 // Snap the overlapping range in half around the start point of
4115 // the range we're adding.
4116 CharacterRange left(entry->from(), current.from() - 1);
4117 CharacterRange right(current.from(), entry->to());
4118 // The left part of the overlapping range doesn't overlap.
4119 // Truncate the whole entry to be just the left part.
4120 entry->set_to(left.to());
4121 // The right part is the one that overlaps. We add this part
4122 // to the map and let the next step deal with merging it with
4123 // the range we're adding.
4124 ZoneSplayTree<Config>::Locator loc;
4125 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4126 loc.set_value(Entry(right.from(),
4127 right.to(),
4128 entry->out_set()));
4129 }
4130 }
4131 while (current.is_valid()) {
4132 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4133 (loc.value().from() <= current.to()) &&
4134 (loc.value().to() >= current.from())) {
4135 Entry* entry = &loc.value();
4136 // We have overlap. If there is space between the start point of
4137 // the range we're adding and where the overlapping range starts
4138 // then we have to add a range covering just that space.
4139 if (current.from() < entry->from()) {
4140 ZoneSplayTree<Config>::Locator ins;
4141 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4142 ins.set_value(Entry(current.from(),
4143 entry->from() - 1,
4144 empty()->Extend(value)));
4145 current.set_from(entry->from());
4146 }
4147 ASSERT_EQ(current.from(), entry->from());
4148 // If the overlapping range extends beyond the one we want to add
4149 // we have to snap the right part off and add it separately.
4150 if (entry->to() > current.to()) {
4151 ZoneSplayTree<Config>::Locator ins;
4152 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4153 ins.set_value(Entry(current.to() + 1,
4154 entry->to(),
4155 entry->out_set()));
4156 entry->set_to(current.to());
4157 }
4158 ASSERT(entry->to() <= current.to());
4159 // The overlapping range is now completely contained by the range
4160 // we're adding so we can just update it and move the start point
4161 // of the range we're adding just past it.
4162 entry->AddValue(value);
4163 // Bail out if the last interval ended at 0xFFFF since otherwise
4164 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004165 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004166 break;
4167 ASSERT(entry->to() + 1 > current.from());
4168 current.set_from(entry->to() + 1);
4169 } else {
4170 // There is no overlap so we can just add the range
4171 ZoneSplayTree<Config>::Locator ins;
4172 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4173 ins.set_value(Entry(current.from(),
4174 current.to(),
4175 empty()->Extend(value)));
4176 break;
4177 }
4178 }
4179}
4180
4181
4182OutSet* DispatchTable::Get(uc16 value) {
4183 ZoneSplayTree<Config>::Locator loc;
4184 if (!tree()->FindGreatestLessThan(value, &loc))
4185 return empty();
4186 Entry* entry = &loc.value();
4187 if (value <= entry->to())
4188 return entry->out_set();
4189 else
4190 return empty();
4191}
4192
4193
4194// -------------------------------------------------------------------
4195// Analysis
4196
4197
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004198void Analysis::EnsureAnalyzed(RegExpNode* that) {
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004199 StackLimitCheck check;
4200 if (check.HasOverflowed()) {
4201 fail("Stack overflow");
4202 return;
4203 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004204 if (that->info()->been_analyzed || that->info()->being_analyzed)
4205 return;
4206 that->info()->being_analyzed = true;
4207 that->Accept(this);
4208 that->info()->being_analyzed = false;
4209 that->info()->been_analyzed = true;
4210}
4211
4212
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004213void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004214 // nothing to do
4215}
4216
4217
ager@chromium.org8bb60582008-12-11 12:02:20 +00004218void TextNode::CalculateOffsets() {
4219 int element_count = elements()->length();
4220 // Set up the offsets of the elements relative to the start. This is a fixed
4221 // quantity since a TextNode can only contain fixed-width things.
4222 int cp_offset = 0;
4223 for (int i = 0; i < element_count; i++) {
4224 TextElement& elm = elements()->at(i);
4225 elm.cp_offset = cp_offset;
4226 if (elm.type == TextElement::ATOM) {
4227 cp_offset += elm.data.u_atom->data().length();
4228 } else {
4229 cp_offset++;
4230 Vector<const uc16> quarks = elm.data.u_atom->data();
4231 }
4232 }
4233}
4234
4235
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004236void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004237 if (ignore_case_) {
4238 that->MakeCaseIndependent();
4239 }
4240 EnsureAnalyzed(that->on_success());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004241 if (!has_failed()) {
4242 that->CalculateOffsets();
4243 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004244}
4245
4246
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004247void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004248 RegExpNode* target = that->on_success();
4249 EnsureAnalyzed(target);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004250 if (!has_failed()) {
4251 // If the next node is interested in what it follows then this node
4252 // has to be interested too so it can pass the information on.
4253 that->info()->AddFromFollowing(target->info());
4254 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004255}
4256
4257
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004258void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004259 NodeInfo* info = that->info();
4260 for (int i = 0; i < that->alternatives()->length(); i++) {
4261 RegExpNode* node = that->alternatives()->at(i).node();
4262 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004263 if (has_failed()) return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004264 // Anything the following nodes need to know has to be known by
4265 // this node also, so it can pass it on.
4266 info->AddFromFollowing(node->info());
4267 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004268}
4269
4270
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004271void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4272 NodeInfo* info = that->info();
4273 for (int i = 0; i < that->alternatives()->length(); i++) {
4274 RegExpNode* node = that->alternatives()->at(i).node();
4275 if (node != that->loop_node()) {
4276 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004277 if (has_failed()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004278 info->AddFromFollowing(node->info());
4279 }
4280 }
4281 // Check the loop last since it may need the value of this node
4282 // to get a correct result.
4283 EnsureAnalyzed(that->loop_node());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004284 if (!has_failed()) {
4285 info->AddFromFollowing(that->loop_node()->info());
4286 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004287}
4288
4289
4290void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004291 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004292}
4293
4294
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004295void Analysis::VisitAssertion(AssertionNode* that) {
4296 EnsureAnalyzed(that->on_success());
4297}
4298
4299
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004300// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004301// Dispatch table construction
4302
4303
4304void DispatchTableConstructor::VisitEnd(EndNode* that) {
4305 AddRange(CharacterRange::Everything());
4306}
4307
4308
4309void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4310 node->set_being_calculated(true);
4311 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4312 for (int i = 0; i < alternatives->length(); i++) {
4313 set_choice_index(i);
4314 alternatives->at(i).node()->Accept(this);
4315 }
4316 node->set_being_calculated(false);
4317}
4318
4319
4320class AddDispatchRange {
4321 public:
4322 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4323 : constructor_(constructor) { }
4324 void Call(uc32 from, DispatchTable::Entry entry);
4325 private:
4326 DispatchTableConstructor* constructor_;
4327};
4328
4329
4330void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4331 CharacterRange range(from, entry.to());
4332 constructor_->AddRange(range);
4333}
4334
4335
4336void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4337 if (node->being_calculated())
4338 return;
4339 DispatchTable* table = node->GetTable(ignore_case_);
4340 AddDispatchRange adder(this);
4341 table->ForEach(&adder);
4342}
4343
4344
4345void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4346 // TODO(160): Find the node that we refer back to and propagate its start
4347 // set back to here. For now we just accept anything.
4348 AddRange(CharacterRange::Everything());
4349}
4350
4351
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004352void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4353 RegExpNode* target = that->on_success();
4354 target->Accept(this);
4355}
4356
4357
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004358
4359static int CompareRangeByFrom(const CharacterRange* a,
4360 const CharacterRange* b) {
4361 return Compare<uc16>(a->from(), b->from());
4362}
4363
4364
4365void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4366 ranges->Sort(CompareRangeByFrom);
4367 uc16 last = 0;
4368 for (int i = 0; i < ranges->length(); i++) {
4369 CharacterRange range = ranges->at(i);
4370 if (last < range.from())
4371 AddRange(CharacterRange(last, range.from() - 1));
4372 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004373 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004374 return;
4375 } else {
4376 last = range.to() + 1;
4377 }
4378 }
4379 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004380 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004381}
4382
4383
4384void DispatchTableConstructor::VisitText(TextNode* that) {
4385 TextElement elm = that->elements()->at(0);
4386 switch (elm.type) {
4387 case TextElement::ATOM: {
4388 uc16 c = elm.data.u_atom->data()[0];
4389 AddRange(CharacterRange(c, c));
4390 break;
4391 }
4392 case TextElement::CHAR_CLASS: {
4393 RegExpCharacterClass* tree = elm.data.u_char_class;
4394 ZoneList<CharacterRange>* ranges = tree->ranges();
4395 if (tree->is_negated()) {
4396 AddInverse(ranges);
4397 } else {
4398 for (int i = 0; i < ranges->length(); i++)
4399 AddRange(ranges->at(i));
4400 }
4401 break;
4402 }
4403 default: {
4404 UNIMPLEMENTED();
4405 }
4406 }
4407}
4408
4409
4410void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004411 RegExpNode* target = that->on_success();
4412 target->Accept(this);
4413}
4414
4415
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004416RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4417 bool ignore_case,
4418 bool is_multiline,
4419 Handle<String> pattern,
4420 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004421 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004422 return IrregexpRegExpTooBig();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004423 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004424 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004425 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004426 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004427 0,
4428 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004429 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004430 RegExpNode* node = captured_body;
4431 if (!data->tree->IsAnchored()) {
4432 // Add a .*? at the beginning, outside the body capture, unless
4433 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004434 RegExpNode* loop_node =
4435 RegExpQuantifier::ToNode(0,
4436 RegExpTree::kInfinity,
4437 false,
4438 new RegExpCharacterClass('*'),
4439 &compiler,
4440 captured_body,
4441 data->contains_anchor);
4442
4443 if (data->contains_anchor) {
4444 // Unroll loop once, to take care of the case that might start
4445 // at the start of input.
4446 ChoiceNode* first_step_node = new ChoiceNode(2);
4447 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4448 first_step_node->AddAlternative(GuardedAlternative(
4449 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4450 node = first_step_node;
4451 } else {
4452 node = loop_node;
4453 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004454 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004455 data->node = node;
4456 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004457 analysis.EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004458 if (analysis.has_failed()) {
4459 const char* error_message = analysis.error_message();
4460 return CompilationResult(error_message);
4461 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004462
4463 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004464
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00004465 // Create the correct assembler for the architecture.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004466#ifdef V8_NATIVE_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00004467 // Native regexp implementation.
4468
4469 NativeRegExpMacroAssembler::Mode mode =
4470 is_ascii ? NativeRegExpMacroAssembler::ASCII
4471 : NativeRegExpMacroAssembler::UC16;
4472
ager@chromium.org9085a012009-05-11 19:22:57 +00004473#ifdef V8_TARGET_ARCH_IA32
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004474 RegExpMacroAssemblerIA32 macro_assembler(mode,
4475 (data->capture_count + 1) * 2);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004476#endif
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00004477#ifdef V8_TARGET_ARCH_X64
4478 RegExpMacroAssemblerX64 macro_assembler(mode,
4479 (data->capture_count + 1) * 2);
4480#endif
4481#ifdef V8_TARGET_ARCH_ARM
4482 UNIMPLEMENTED();
4483#endif
4484
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004485#else // ! V8_NATIVE_REGEXP
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00004486 // Interpreted regexp implementation.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004487 EmbeddedVector<byte, 1024> codes;
4488 RegExpMacroAssemblerIrregexp macro_assembler(codes);
sgjesse@chromium.org911335c2009-08-19 12:59:44 +00004489#endif
4490
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004491 return compiler.Assemble(&macro_assembler,
4492 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004493 data->capture_count,
4494 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004495}
4496
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004497}} // namespace v8::internal