blob: e1171894335aebb5ef9a2079b67d814924512cf5 [file] [log] [blame]
ager@chromium.org381abbb2009-02-25 13:23:22 +00001// Copyright 2006-2009 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000043#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000044
45#ifdef ARM
46#include "regexp-macro-assembler-arm.h"
47#else // IA32
48#include "macro-assembler-ia32.h"
49#include "regexp-macro-assembler-ia32.h"
50#endif
51
52#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000053
ager@chromium.orga74f0da2008-12-03 16:05:52 +000054
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000055namespace v8 { namespace internal {
56
57
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000058Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
59 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000060 Handle<String> flags,
61 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000062 // Ensure that the constructor function has been loaded.
63 if (!constructor->IsLoaded()) {
64 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +000065 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000066 }
67 // Call the construct code with 2 arguments.
68 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
69 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000070 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000071}
72
73
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000074static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
75 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000076 for (int i = 0; i < str->length(); i++) {
77 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000078 case 'i':
79 flags |= JSRegExp::IGNORE_CASE;
80 break;
81 case 'g':
82 flags |= JSRegExp::GLOBAL;
83 break;
84 case 'm':
85 flags |= JSRegExp::MULTILINE;
86 break;
87 }
88 }
89 return JSRegExp::Flags(flags);
90}
91
92
ager@chromium.orga74f0da2008-12-03 16:05:52 +000093static inline void ThrowRegExpException(Handle<JSRegExp> re,
94 Handle<String> pattern,
95 Handle<String> error_text,
96 const char* message) {
97 Handle<JSArray> array = Factory::NewJSArray(2);
98 SetElement(array, 0, pattern);
99 SetElement(array, 1, error_text);
100 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
101 Top::Throw(*regexp_err);
102}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000103
104
ager@chromium.org8bb60582008-12-11 12:02:20 +0000105// Generic RegExp methods. Dispatches to implementation specific methods.
106
107
108class OffsetsVector {
109 public:
110 inline OffsetsVector(int num_registers)
111 : offsets_vector_length_(num_registers) {
112 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
113 vector_ = NewArray<int>(offsets_vector_length_);
114 } else {
115 vector_ = static_offsets_vector_;
116 }
117 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000118 inline ~OffsetsVector() {
119 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
120 DeleteArray(vector_);
121 vector_ = NULL;
122 }
123 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000124 inline int* vector() { return vector_; }
125 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000126
127 private:
128 int* vector_;
129 int offsets_vector_length_;
130 static const int kStaticOffsetsVectorSize = 50;
131 static int static_offsets_vector_[kStaticOffsetsVectorSize];
132};
133
134
135int OffsetsVector::static_offsets_vector_[
136 OffsetsVector::kStaticOffsetsVectorSize];
137
138
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000139Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
140 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000141 Handle<String> flag_str) {
142 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
143 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
144 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000145 LOG(RegExpCompileEvent(re, in_cache));
146
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000147 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000148 if (in_cache) {
149 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000150 return re;
151 }
152 FlattenString(pattern);
153 ZoneScope zone_scope(DELETE_ON_EXIT);
154 RegExpCompileData parse_result;
155 FlatStringReader reader(pattern);
156 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
157 // Throw an exception if we fail to parse the pattern.
158 ThrowRegExpException(re,
159 pattern,
160 parse_result.error,
161 "malformed_regexp");
162 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000163 }
164
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000165 if (parse_result.simple && !flags.is_ignore_case()) {
166 // Parse-tree is a single atom that is equal to the pattern.
167 AtomCompile(re, pattern, flags, pattern);
168 } else if (parse_result.tree->IsAtom() &&
169 !flags.is_ignore_case() &&
170 parse_result.capture_count == 0) {
171 RegExpAtom* atom = parse_result.tree->AsAtom();
172 Vector<const uc16> atom_pattern = atom->data();
173 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
174 AtomCompile(re, pattern, flags, atom_string);
175 } else {
176 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
177 }
178 ASSERT(re->data()->IsFixedArray());
179 // Compilation succeeded so the data is set on the regexp
180 // and we can store it in the cache.
181 Handle<FixedArray> data(FixedArray::cast(re->data()));
182 CompilationCache::PutRegExp(pattern, flags, data);
183
184 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000185}
186
187
188Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
189 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000190 int index,
191 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000192 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000193 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000194 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000195 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000196 Handle<Object> result =
197 IrregexpExec(regexp, subject, index, last_match_info);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000198 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000199 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000200 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000201 default:
202 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000203 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000204 }
205}
206
207
208Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000209 Handle<String> subject,
210 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000211 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000212 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000213 return AtomExecGlobal(regexp, subject, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000214 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000215 Handle<Object> result =
216 IrregexpExecGlobal(regexp, subject, last_match_info);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000217 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000218 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000219 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000220 default:
221 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000222 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000223 }
224}
225
226
ager@chromium.org8bb60582008-12-11 12:02:20 +0000227// RegExp Atom implementation: Simple string search using indexOf.
228
229
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000230void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
231 Handle<String> pattern,
232 JSRegExp::Flags flags,
233 Handle<String> match_pattern) {
234 Factory::SetRegExpAtomData(re,
235 JSRegExp::ATOM,
236 pattern,
237 flags,
238 match_pattern);
239}
240
241
242static void SetAtomLastCapture(FixedArray* array,
243 String* subject,
244 int from,
245 int to) {
246 NoHandleAllocation no_handles;
247 RegExpImpl::SetLastCaptureCount(array, 2);
248 RegExpImpl::SetLastSubject(array, subject);
249 RegExpImpl::SetLastInput(array, subject);
250 RegExpImpl::SetCapture(array, 0, from);
251 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000252}
253
254
255Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
256 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000257 int index,
258 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000259 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000260
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000261 uint32_t start_index = index;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000262
ager@chromium.org7c537e22008-10-16 08:43:32 +0000263 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000264 if (value == -1) return Factory::null_value();
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000265 ASSERT(last_match_info->HasFastElements());
ager@chromium.org7c537e22008-10-16 08:43:32 +0000266
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000267 {
268 NoHandleAllocation no_handles;
269 FixedArray* array = last_match_info->elements();
270 SetAtomLastCapture(array, *subject, value, value + needle->length());
271 }
272 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000273}
274
275
276Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000277 Handle<String> subject,
278 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000279 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000280 ASSERT(last_match_info->HasFastElements());
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000281 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000282 int index = 0;
283 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000284 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000285 int needle_length = needle->length();
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000286 int last_value = -1;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000287 while (true) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000288 HandleScope scope;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000289 int value = -1;
290 if (index + needle_length <= subject_length) {
291 value = Runtime::StringMatch(subject, needle, index);
292 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000293 if (value == -1) {
294 if (last_value != -1) {
295 Handle<FixedArray> array(last_match_info->elements());
296 SetAtomLastCapture(*array,
297 *subject,
298 last_value,
299 last_value + needle->length());
300 }
301 break;
302 }
303
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000304 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000305
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000306 // Create an array that looks like the static last_match_info array
307 // that is attached to the global RegExp object. We will be returning
308 // an array of these.
309 Handle<FixedArray> array = Factory::NewFixedArray(kFirstCapture + 2);
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000310 SetLastCaptureCount(*array, 2);
311 // Ignore subject and input fields.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000312 SetCapture(*array, 0, value);
313 SetCapture(*array, 1, end);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000314 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000315 SetElement(result, match_count, pair);
316 match_count++;
317 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000318 if (needle_length == 0) index++;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000319 last_value = value;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000320 }
321 return result;
322}
323
324
ager@chromium.org8bb60582008-12-11 12:02:20 +0000325// Irregexp implementation.
326
327
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000328// Ensures that the regexp object contains a compiled version of the
329// source for either ASCII or non-ASCII strings.
330// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000331// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000332// If compilation fails, an exception is thrown and this function
333// returns false.
334bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
335 bool is_ascii) {
336 int index;
337 if (is_ascii) {
338 index = JSRegExp::kIrregexpASCIICodeIndex;
339 } else {
340 index = JSRegExp::kIrregexpUC16CodeIndex;
341 }
342 Object* entry = re->DataAt(index);
343 if (!entry->IsTheHole()) {
344 // A value has already been compiled.
345 if (entry->IsJSObject()) {
346 // If it's a JS value, it's an error.
347 Top::Throw(entry);
348 return false;
349 }
350 return true;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000351 }
352
353 // Compile the RegExp.
354 ZoneScope zone_scope(DELETE_ON_EXIT);
355
356 JSRegExp::Flags flags = re->GetFlags();
357
358 Handle<String> pattern(re->Pattern());
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000359 if (!pattern->IsFlat()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000360 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000361 }
362
363 RegExpCompileData compile_data;
364 FlatStringReader reader(pattern);
365 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
366 // Throw an exception if we fail to parse the pattern.
367 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
368 ThrowRegExpException(re,
369 pattern,
370 compile_data.error,
371 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000372 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000373 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000374 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000375 RegExpEngine::Compile(&compile_data,
376 flags.is_ignore_case(),
377 flags.is_multiline(),
378 pattern,
379 is_ascii);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000380 if (result.error_message != NULL) {
381 // Unable to compile regexp.
382 Handle<JSArray> array = Factory::NewJSArray(2);
383 SetElement(array, 0, pattern);
384 SetElement(array,
385 1,
386 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
387 Handle<Object> regexp_err =
388 Factory::NewSyntaxError("malformed_regexp", array);
389 Top::Throw(*regexp_err);
390 re->SetDataAt(index, *regexp_err);
391 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000392 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000393
394 NoHandleAllocation no_handles;
395
396 FixedArray* data = FixedArray::cast(re->data());
397 data->set(index, result.code);
398 int register_max = IrregexpMaxRegisterCount(data);
399 if (result.num_registers > register_max) {
400 SetIrregexpMaxRegisterCount(data, result.num_registers);
401 }
402
403 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000404}
405
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000406
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000407int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
408 return Smi::cast(
409 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000410}
411
412
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000413void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
414 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000415}
416
417
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000418int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
419 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000420}
421
422
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000423int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
424 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000425}
426
427
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000428ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
429 int index;
430 if (is_ascii) {
431 index = JSRegExp::kIrregexpASCIICodeIndex;
432 } else {
433 index = JSRegExp::kIrregexpUC16CodeIndex;
434 }
435 return ByteArray::cast(re->get(index));
436}
437
438
439Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
440 int index;
441 if (is_ascii) {
442 index = JSRegExp::kIrregexpASCIICodeIndex;
443 } else {
444 index = JSRegExp::kIrregexpUC16CodeIndex;
445 }
446 return Code::cast(re->get(index));
447}
448
449
450void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
451 Handle<String> pattern,
452 JSRegExp::Flags flags,
453 int capture_count) {
454 // Initialize compiled code entries to null.
455 Factory::SetRegExpIrregexpData(re,
456 JSRegExp::IRREGEXP,
457 pattern,
458 flags,
459 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000460}
461
462
463Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
464 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000465 int index,
466 Handle<JSArray> last_match_info) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000467 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000468
ager@chromium.org8bb60582008-12-11 12:02:20 +0000469 // Prepare space for the return values.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000470 int number_of_capture_registers =
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000471 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000472 OffsetsVector offsets(number_of_capture_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000473
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000474 int previous_index = index;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000475
476#ifdef DEBUG
477 if (FLAG_trace_regexp_bytecodes) {
478 String* pattern = regexp->Pattern();
479 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
480 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
481 }
482#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000483
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000484 if (!subject->IsFlat()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000485 FlattenString(subject);
486 }
487
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000488 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
489
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000490 return IrregexpExecOnce(regexp,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000491 number_of_capture_registers,
492 last_match_info,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000493 subject,
494 previous_index,
495 offsets.vector(),
496 offsets.length());
497}
498
499
500Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000501 Handle<String> subject,
502 Handle<JSArray> last_match_info) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000503 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000504
505 // Prepare space for the return values.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000506 int number_of_capture_registers =
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000507 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000508 OffsetsVector offsets(number_of_capture_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000509
510 int previous_index = 0;
511
512 Handle<JSArray> result = Factory::NewJSArray(0);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000513 int result_length = 0;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000514 Handle<Object> matches;
515
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000516 if (!subject->IsFlat()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000517 FlattenString(subject);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000518 }
519
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000520 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
521
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000522 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523 if (previous_index > subject->length() || previous_index < 0) {
524 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
525 // string length, there is no match.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000526 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000527 } else {
528#ifdef DEBUG
529 if (FLAG_trace_regexp_bytecodes) {
530 String* pattern = regexp->Pattern();
531 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
532 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
533 }
534#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000535 HandleScope scope;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000536 matches = IrregexpExecOnce(regexp,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000537 number_of_capture_registers,
538 last_match_info,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000539 subject,
540 previous_index,
541 offsets.vector(),
542 offsets.length());
543
ager@chromium.org6f10e412009-02-13 10:11:16 +0000544 if (matches.is_null()) {
545 ASSERT(Top::has_pending_exception());
546 return matches;
547 }
548
ager@chromium.org8bb60582008-12-11 12:02:20 +0000549 if (matches->IsJSArray()) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000550 // Create an array that looks like the static last_match_info array
551 // that is attached to the global RegExp object. We will be returning
552 // an array of these.
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000553 int match_length = kFirstCapture + number_of_capture_registers;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000554 Handle<JSArray> latest_match =
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000555 Factory::NewJSArray(match_length);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000556
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000557 AssertNoAllocation no_allocation;
558 FixedArray* match_array = JSArray::cast(*matches)->elements();
559 match_array->CopyTo(0,
560 latest_match->elements(),
561 0,
562 match_length);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000563 SetElement(result, result_length, latest_match);
564 result_length++;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000565 previous_index = GetCapture(match_array, 1);
566 if (GetCapture(match_array, 0) == previous_index) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000567 previous_index++;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000568 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000569 } else {
ager@chromium.org6f10e412009-02-13 10:11:16 +0000570 ASSERT(matches->IsNull());
571 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000572 }
573 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000574 }
575}
576
577
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000578Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> jsregexp,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000579 int number_of_capture_registers,
580 Handle<JSArray> last_match_info,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000581 Handle<String> subject,
582 int previous_index,
583 int* offsets_vector,
584 int offsets_vector_length) {
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000585 ASSERT(subject->IsFlat());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000586 bool rc;
587
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000588 Handle<String> original_subject = subject;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000589 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
590 if (UseNativeRegexp()) {
591#ifdef ARM
592 UNREACHABLE();
593#else
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000594 RegExpMacroAssemblerIA32::Result res;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000595 do {
596 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
597 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
598 return Handle<Object>::null();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000599 }
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000600 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
601 res = RegExpMacroAssemblerIA32::Match(code,
602 subject,
603 offsets_vector,
604 offsets_vector_length,
605 previous_index);
606 // If result is RETRY, the string have changed representation, and we
607 // must restart from scratch.
608 } while (res == RegExpMacroAssemblerIA32::RETRY);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000609 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
610 ASSERT(Top::has_pending_exception());
611 return Handle<Object>::null();
612 }
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000613 ASSERT(res == RegExpMacroAssemblerIA32::SUCCESS
614 || res == RegExpMacroAssemblerIA32::FAILURE);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000615
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000616 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000617#endif
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000618 } else {
619 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
620 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
621 return Handle<Object>::null();
622 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000623 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
624 offsets_vector[i] = -1;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000625 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000626 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000627
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000628 rc = IrregexpInterpreter::Match(byte_codes,
629 subject,
630 offsets_vector,
631 previous_index);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000632 }
633
634 if (!rc) {
635 return Factory::null_value();
636 }
637
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000638 FixedArray* array = last_match_info->elements();
639 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000640 // The captures come in (start, end+1) pairs.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000641 SetLastCaptureCount(array, number_of_capture_registers);
642 SetLastSubject(array, *original_subject);
643 SetLastInput(array, *original_subject);
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000644 for (int i = 0; i < number_of_capture_registers; i+=2) {
645 SetCapture(array, i, offsets_vector[i]);
646 SetCapture(array, i + 1, offsets_vector[i + 1]);
647 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000648 return last_match_info;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000649}
650
651
652// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000653// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000654//
655// The Irregexp regular expression engine is intended to be a complete
656// implementation of ECMAScript regular expressions. It generates either
657// bytecodes or native code.
658
659// The Irregexp regexp engine is structured in three steps.
660// 1) The parser generates an abstract syntax tree. See ast.cc.
661// 2) From the AST a node network is created. The nodes are all
662// subclasses of RegExpNode. The nodes represent states when
663// executing a regular expression. Several optimizations are
664// performed on the node network.
665// 3) From the nodes we generate either byte codes or native code
666// that can actually execute the regular expression (perform
667// the search). The code generation step is described in more
668// detail below.
669
670// Code generation.
671//
672// The nodes are divided into four main categories.
673// * Choice nodes
674// These represent places where the regular expression can
675// match in more than one way. For example on entry to an
676// alternation (foo|bar) or a repetition (*, +, ? or {}).
677// * Action nodes
678// These represent places where some action should be
679// performed. Examples include recording the current position
680// in the input string to a register (in order to implement
681// captures) or other actions on register for example in order
682// to implement the counters needed for {} repetitions.
683// * Matching nodes
684// These attempt to match some element part of the input string.
685// Examples of elements include character classes, plain strings
686// or back references.
687// * End nodes
688// These are used to implement the actions required on finding
689// a successful match or failing to find a match.
690//
691// The code generated (whether as byte codes or native code) maintains
692// some state as it runs. This consists of the following elements:
693//
694// * The capture registers. Used for string captures.
695// * Other registers. Used for counters etc.
696// * The current position.
697// * The stack of backtracking information. Used when a matching node
698// fails to find a match and needs to try an alternative.
699//
700// Conceptual regular expression execution model:
701//
702// There is a simple conceptual model of regular expression execution
703// which will be presented first. The actual code generated is a more
704// efficient simulation of the simple conceptual model:
705//
706// * Choice nodes are implemented as follows:
707// For each choice except the last {
708// push current position
709// push backtrack code location
710// <generate code to test for choice>
711// backtrack code location:
712// pop current position
713// }
714// <generate code to test for last choice>
715//
716// * Actions nodes are generated as follows
717// <push affected registers on backtrack stack>
718// <generate code to perform action>
719// push backtrack code location
720// <generate code to test for following nodes>
721// backtrack code location:
722// <pop affected registers to restore their state>
723// <pop backtrack location from stack and go to it>
724//
725// * Matching nodes are generated as follows:
726// if input string matches at current position
727// update current position
728// <generate code to test for following nodes>
729// else
730// <pop backtrack location from stack and go to it>
731//
732// Thus it can be seen that the current position is saved and restored
733// by the choice nodes, whereas the registers are saved and restored by
734// by the action nodes that manipulate them.
735//
736// The other interesting aspect of this model is that nodes are generated
737// at the point where they are needed by a recursive call to Emit(). If
738// the node has already been code generated then the Emit() call will
739// generate a jump to the previously generated code instead. In order to
740// limit recursion it is possible for the Emit() function to put the node
741// on a work list for later generation and instead generate a jump. The
742// destination of the jump is resolved later when the code is generated.
743//
744// Actual regular expression code generation.
745//
746// Code generation is actually more complicated than the above. In order
747// to improve the efficiency of the generated code some optimizations are
748// performed
749//
750// * Choice nodes have 1-character lookahead.
751// A choice node looks at the following character and eliminates some of
752// the choices immediately based on that character. This is not yet
753// implemented.
754// * Simple greedy loops store reduced backtracking information.
755// A quantifier like /.*foo/m will greedily match the whole input. It will
756// then need to backtrack to a point where it can match "foo". The naive
757// implementation of this would push each character position onto the
758// backtracking stack, then pop them off one by one. This would use space
759// proportional to the length of the input string. However since the "."
760// can only match in one way and always has a constant length (in this case
761// of 1) it suffices to store the current position on the top of the stack
762// once. Matching now becomes merely incrementing the current position and
763// backtracking becomes decrementing the current position and checking the
764// result against the stored current position. This is faster and saves
765// space.
766// * The current state is virtualized.
767// This is used to defer expensive operations until it is clear that they
768// are needed and to generate code for a node more than once, allowing
769// specialized an efficient versions of the code to be created. This is
770// explained in the section below.
771//
772// Execution state virtualization.
773//
774// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000775// manipulation in an object called the Trace. The Trace object can record a
776// current position offset, an optional backtrack code location on the top of
777// the virtualized backtrack stack and some register changes. When a node is
778// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000779// will emit code to bring the actual state into line with the virtual state.
780// Avoiding flushing the state can postpone some work (eg updates of capture
781// registers). Postponing work can save time when executing the regular
782// expression since it may be found that the work never has to be done as a
783// failure to match can occur. In addition it is much faster to jump to a
784// known backtrack code location than it is to pop an unknown backtrack
785// location from the stack and jump there.
786//
ager@chromium.org32912102009-01-16 10:38:43 +0000787// The virtual state found in the Trace affects code generation. For example
788// the virtual state contains the difference between the actual current
789// position and the virtual current position, and matching code needs to use
790// this offset to attempt a match in the correct location of the input
791// string. Therefore code generated for a non-trivial trace is specialized
792// to that trace. The code generator therefore has the ability to generate
793// code for each node several times. In order to limit the size of the
794// generated code there is an arbitrary limit on how many specialized sets of
795// code may be generated for a given node. If the limit is reached, the
796// trace is flushed and a generic version of the code for a node is emitted.
797// This is subsequently used for that node. The code emitted for non-generic
798// trace is not recorded in the node and so it cannot currently be reused in
799// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000800
801
802void RegExpTree::AppendToText(RegExpText* text) {
803 UNREACHABLE();
804}
805
806
807void RegExpAtom::AppendToText(RegExpText* text) {
808 text->AddElement(TextElement::Atom(this));
809}
810
811
812void RegExpCharacterClass::AppendToText(RegExpText* text) {
813 text->AddElement(TextElement::CharClass(this));
814}
815
816
817void RegExpText::AppendToText(RegExpText* text) {
818 for (int i = 0; i < elements()->length(); i++)
819 text->AddElement(elements()->at(i));
820}
821
822
823TextElement TextElement::Atom(RegExpAtom* atom) {
824 TextElement result = TextElement(ATOM);
825 result.data.u_atom = atom;
826 return result;
827}
828
829
830TextElement TextElement::CharClass(
831 RegExpCharacterClass* char_class) {
832 TextElement result = TextElement(CHAR_CLASS);
833 result.data.u_char_class = char_class;
834 return result;
835}
836
837
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000838int TextElement::length() {
839 if (type == ATOM) {
840 return data.u_atom->length();
841 } else {
842 ASSERT(type == CHAR_CLASS);
843 return 1;
844 }
845}
846
847
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000848DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
849 if (table_ == NULL) {
850 table_ = new DispatchTable();
851 DispatchTableConstructor cons(table_, ignore_case);
852 cons.BuildTable(this);
853 }
854 return table_;
855}
856
857
858class RegExpCompiler {
859 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000860 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000861
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000862 int AllocateRegister() {
863 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
864 reg_exp_too_big_ = true;
865 return next_register_;
866 }
867 return next_register_++;
868 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000869
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000870 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
871 RegExpNode* start,
872 int capture_count,
873 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000874
875 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
876
877 static const int kImplementationOffset = 0;
878 static const int kNumberOfRegistersOffset = 0;
879 static const int kCodeOffset = 1;
880
881 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
882 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000883
884 static const int kMaxRecursion = 100;
885 inline int recursion_depth() { return recursion_depth_; }
886 inline void IncrementRecursionDepth() { recursion_depth_++; }
887 inline void DecrementRecursionDepth() { recursion_depth_--; }
888
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000889 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
890
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000891 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000892 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000893
ager@chromium.org32912102009-01-16 10:38:43 +0000894 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000895 private:
896 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000897 int next_register_;
898 List<RegExpNode*>* work_list_;
899 int recursion_depth_;
900 RegExpMacroAssembler* macro_assembler_;
901 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000902 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000903 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000904};
905
906
907class RecursionCheck {
908 public:
909 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
910 compiler->IncrementRecursionDepth();
911 }
912 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
913 private:
914 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000915};
916
917
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000918static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
919 return RegExpEngine::CompilationResult("RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000920}
921
922
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000923// Attempts to compile the regexp using an Irregexp code generator. Returns
924// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000925RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000926 : next_register_(2 * (capture_count + 1)),
927 work_list_(NULL),
928 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000929 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000930 ascii_(ascii),
931 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000932 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000933 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000934}
935
936
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000937RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000938 RegExpMacroAssembler* macro_assembler,
939 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000940 int capture_count,
941 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000942#ifdef DEBUG
943 if (FLAG_trace_regexp_assembler)
944 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
945 else
946#endif
947 macro_assembler_ = macro_assembler;
948 List <RegExpNode*> work_list(0);
949 work_list_ = &work_list;
950 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +0000951 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +0000952 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000953 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000954 macro_assembler_->Bind(&fail);
955 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000956 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000957 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000958 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000959 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
960
ager@chromium.org8bb60582008-12-11 12:02:20 +0000961 Handle<Object> code = macro_assembler_->GetCode(pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000962
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000963 work_list_ = NULL;
964#ifdef DEBUG
965 if (FLAG_trace_regexp_assembler) {
966 delete macro_assembler_;
967 }
968#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000969 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000970}
971
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000972
ager@chromium.org32912102009-01-16 10:38:43 +0000973bool Trace::DeferredAction::Mentions(int that) {
974 if (type() == ActionNode::CLEAR_CAPTURES) {
975 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
976 return range.Contains(that);
977 } else {
978 return reg() == that;
979 }
980}
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000981
ager@chromium.org32912102009-01-16 10:38:43 +0000982
983bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000984 for (DeferredAction* action = actions_;
985 action != NULL;
986 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000987 if (action->Mentions(reg))
988 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000989 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000990 return false;
991}
992
993
ager@chromium.org32912102009-01-16 10:38:43 +0000994bool Trace::GetStoredPosition(int reg, int* cp_offset) {
995 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000996 for (DeferredAction* action = actions_;
997 action != NULL;
998 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000999 if (action->Mentions(reg)) {
1000 if (action->type() == ActionNode::STORE_POSITION) {
1001 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1002 return true;
1003 } else {
1004 return false;
1005 }
1006 }
1007 }
1008 return false;
1009}
1010
1011
1012int Trace::FindAffectedRegisters(OutSet* affected_registers) {
1013 int max_register = RegExpCompiler::kNoRegister;
1014 for (DeferredAction* action = actions_;
1015 action != NULL;
1016 action = action->next()) {
1017 if (action->type() == ActionNode::CLEAR_CAPTURES) {
1018 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1019 for (int i = range.from(); i <= range.to(); i++)
1020 affected_registers->Set(i);
1021 if (range.to() > max_register) max_register = range.to();
1022 } else {
1023 affected_registers->Set(action->reg());
1024 if (action->reg() > max_register) max_register = action->reg();
1025 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001026 }
1027 return max_register;
1028}
1029
1030
ager@chromium.org32912102009-01-16 10:38:43 +00001031void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1032 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001033 OutSet& registers_to_pop,
1034 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001035 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001036 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1037 else if (registers_to_clear.Get(reg)) {
1038 int clear_to = reg;
1039 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1040 reg--;
1041 }
1042 assembler->ClearRegisters(reg, clear_to);
1043 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001044 }
1045}
1046
1047
ager@chromium.org32912102009-01-16 10:38:43 +00001048void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1049 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001050 OutSet& affected_registers,
1051 OutSet* registers_to_pop,
1052 OutSet* registers_to_clear) {
1053 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1054 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1055
ager@chromium.org8bb60582008-12-11 12:02:20 +00001056 for (int reg = 0; reg <= max_register; reg++) {
1057 if (!affected_registers.Get(reg)) {
1058 continue;
1059 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001060 // Count pushes performed to force a stack limit check occasionally.
1061 int pushes = 0;
1062
1063 // The chronologically first deferred action in the trace
1064 // is used to infer the action needed to restore a register
1065 // to its previous state (or not, if it's safe to ignore it).
1066 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1067 DeferredActionUndoType undo_action = IGNORE;
1068
ager@chromium.org8bb60582008-12-11 12:02:20 +00001069 int value = 0;
1070 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001071 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001072 int store_position = -1;
1073 // This is a little tricky because we are scanning the actions in reverse
1074 // historical order (newest first).
1075 for (DeferredAction* action = actions_;
1076 action != NULL;
1077 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001078 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001079 switch (action->type()) {
1080 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001081 Trace::DeferredSetRegister* psr =
1082 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001083 if (!absolute) {
1084 value += psr->value();
1085 absolute = true;
1086 }
1087 // SET_REGISTER is currently only used for newly introduced loop
1088 // counters. They can have a significant previous value if they
1089 // occour in a loop. TODO(lrn): Propagate this information, so
1090 // we can set undo_action to IGNORE if we know there is no value to
1091 // restore.
1092 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001093 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001094 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001095 break;
1096 }
1097 case ActionNode::INCREMENT_REGISTER:
1098 if (!absolute) {
1099 value++;
1100 }
1101 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001102 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001103 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001104 break;
1105 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001106 Trace::DeferredCapture* pc =
1107 static_cast<Trace::DeferredCapture*>(action);
1108 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001109 store_position = pc->cp_offset();
1110 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001111
1112 // For captures we know that stores and clears alternate.
1113 // Other register, are never cleared, and if the occur
1114 // inside a loop, they might be assigned more than once.
1115 if (reg <= 1) {
1116 // Registers zero and one, aka "capture zero", is
1117 // always set correctly if we succeed. There is no
1118 // need to undo a setting on backtrack, because we
1119 // will set it again or fail.
1120 undo_action = IGNORE;
1121 } else {
1122 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1123 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001124 ASSERT(!absolute);
1125 ASSERT_EQ(value, 0);
1126 break;
1127 }
ager@chromium.org32912102009-01-16 10:38:43 +00001128 case ActionNode::CLEAR_CAPTURES: {
1129 // Since we're scanning in reverse order, if we've already
1130 // set the position we have to ignore historically earlier
1131 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001132 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001133 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001134 }
1135 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001136 ASSERT(!absolute);
1137 ASSERT_EQ(value, 0);
1138 break;
1139 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001140 default:
1141 UNREACHABLE();
1142 break;
1143 }
1144 }
1145 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001146 // Prepare for the undo-action (e.g., push if it's going to be popped).
1147 if (undo_action == RESTORE) {
1148 pushes++;
1149 RegExpMacroAssembler::StackCheckFlag stack_check =
1150 RegExpMacroAssembler::kNoStackLimitCheck;
1151 if (pushes == push_limit) {
1152 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1153 pushes = 0;
1154 }
1155
1156 assembler->PushRegister(reg, stack_check);
1157 registers_to_pop->Set(reg);
1158 } else if (undo_action == CLEAR) {
1159 registers_to_clear->Set(reg);
1160 }
1161 // Perform the chronologically last action (or accumulated increment)
1162 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001163 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001164 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001165 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001166 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001167 } else if (absolute) {
1168 assembler->SetRegister(reg, value);
1169 } else if (value != 0) {
1170 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001171 }
1172 }
1173}
1174
1175
ager@chromium.org8bb60582008-12-11 12:02:20 +00001176// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001177// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001178// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001179void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001180 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001181
iposva@chromium.org245aa852009-02-10 00:49:54 +00001182 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001183
1184 if (actions_ == NULL && backtrack() == NULL) {
1185 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001186 // a normal situation. We may also have to forget some information gained
1187 // through a quick check that was already performed.
1188 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001189 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001190 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001191 successor->Emit(compiler, &new_state);
1192 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001193 }
1194
1195 // Generate deferred actions here along with code to undo them again.
1196 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001197
ager@chromium.org381abbb2009-02-25 13:23:22 +00001198 if (backtrack() != NULL) {
1199 // Here we have a concrete backtrack location. These are set up by choice
1200 // nodes and so they indicate that we have a deferred save of the current
1201 // position which we may need to emit here.
1202 assembler->PushCurrentPosition();
1203 }
1204
ager@chromium.org8bb60582008-12-11 12:02:20 +00001205 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001206 OutSet registers_to_pop;
1207 OutSet registers_to_clear;
1208 PerformDeferredActions(assembler,
1209 max_register,
1210 affected_registers,
1211 &registers_to_pop,
1212 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001213 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001214 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001215 }
1216
1217 // Create a new trivial state and generate the node with that.
1218 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001219 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001220 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001221 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001222
1223 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001224 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001225 RestoreAffectedRegisters(assembler,
1226 max_register,
1227 registers_to_pop,
1228 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001229 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001230 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001231 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001232 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001233 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001234 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001235}
1236
1237
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001238void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001239 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001240
1241 // Omit flushing the trace. We discard the entire stack frame anyway.
1242
ager@chromium.org8bb60582008-12-11 12:02:20 +00001243 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001244 // We are completely independent of the trace, since we ignore it,
1245 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001246 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001247 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001248
1249 // Throw away everything on the backtrack stack since the start
1250 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001251 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1252 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001253 if (clear_capture_count_ > 0) {
1254 // Clear any captures that might have been performed during the success
1255 // of the body of the negative look-ahead.
1256 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1257 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1258 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001259 // Now that we have unwound the stack we find at the top of the stack the
1260 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001261 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001262}
1263
1264
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001265void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001266 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001267 trace->Flush(compiler, this);
1268 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001269 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001270 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001271 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001272 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001273 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001274 switch (action_) {
1275 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001276 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001277 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001278 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001279 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001280 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001281 case NEGATIVE_SUBMATCH_SUCCESS:
1282 // This case is handled in a different virtual method.
1283 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001284 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001285 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001286}
1287
1288
1289void GuardedAlternative::AddGuard(Guard* guard) {
1290 if (guards_ == NULL)
1291 guards_ = new ZoneList<Guard*>(1);
1292 guards_->Add(guard);
1293}
1294
1295
ager@chromium.org8bb60582008-12-11 12:02:20 +00001296ActionNode* ActionNode::SetRegister(int reg,
1297 int val,
1298 RegExpNode* on_success) {
1299 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001300 result->data_.u_store_register.reg = reg;
1301 result->data_.u_store_register.value = val;
1302 return result;
1303}
1304
1305
1306ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1307 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1308 result->data_.u_increment_register.reg = reg;
1309 return result;
1310}
1311
1312
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001313ActionNode* ActionNode::StorePosition(int reg,
1314 bool is_capture,
1315 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001316 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1317 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001318 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001319 return result;
1320}
1321
1322
ager@chromium.org32912102009-01-16 10:38:43 +00001323ActionNode* ActionNode::ClearCaptures(Interval range,
1324 RegExpNode* on_success) {
1325 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1326 result->data_.u_clear_captures.range_from = range.from();
1327 result->data_.u_clear_captures.range_to = range.to();
1328 return result;
1329}
1330
1331
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001332ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1333 int position_reg,
1334 RegExpNode* on_success) {
1335 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1336 result->data_.u_submatch.stack_pointer_register = stack_reg;
1337 result->data_.u_submatch.current_position_register = position_reg;
1338 return result;
1339}
1340
1341
ager@chromium.org8bb60582008-12-11 12:02:20 +00001342ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1343 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001344 int clear_register_count,
1345 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001346 RegExpNode* on_success) {
1347 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001348 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001349 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001350 result->data_.u_submatch.clear_register_count = clear_register_count;
1351 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001352 return result;
1353}
1354
1355
ager@chromium.org32912102009-01-16 10:38:43 +00001356ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1357 int repetition_register,
1358 int repetition_limit,
1359 RegExpNode* on_success) {
1360 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1361 result->data_.u_empty_match_check.start_register = start_register;
1362 result->data_.u_empty_match_check.repetition_register = repetition_register;
1363 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1364 return result;
1365}
1366
1367
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001368#define DEFINE_ACCEPT(Type) \
1369 void Type##Node::Accept(NodeVisitor* visitor) { \
1370 visitor->Visit##Type(this); \
1371 }
1372FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1373#undef DEFINE_ACCEPT
1374
1375
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001376void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1377 visitor->VisitLoopChoice(this);
1378}
1379
1380
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001381// -------------------------------------------------------------------
1382// Emit code.
1383
1384
1385void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1386 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001387 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001388 switch (guard->op()) {
1389 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001390 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001391 macro_assembler->IfRegisterGE(guard->reg(),
1392 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001393 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001394 break;
1395 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001396 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001397 macro_assembler->IfRegisterLT(guard->reg(),
1398 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001399 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001400 break;
1401 }
1402}
1403
1404
1405static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1406static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1407
1408
ager@chromium.org381abbb2009-02-25 13:23:22 +00001409// Returns the number of characters in the equivalence class, omitting those
1410// that cannot occur in the source string because it is ASCII.
1411static int GetCaseIndependentLetters(uc16 character,
1412 bool ascii_subject,
1413 unibrow::uchar* letters) {
1414 int length = uncanonicalize.get(character, '\0', letters);
1415 // Unibrow returns 0 or 1 for characters where case independependence is
1416 // trivial.
1417 if (length == 0) {
1418 letters[0] = character;
1419 length = 1;
1420 }
1421 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1422 return length;
1423 }
1424 // The standard requires that non-ASCII characters cannot have ASCII
1425 // character codes in their equivalence class.
1426 return 0;
1427}
1428
1429
1430static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1431 uc16 c,
1432 Label* on_failure,
1433 int cp_offset,
1434 bool check,
1435 bool preloaded) {
1436 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1437 bool bound_checked = false;
1438 if (!preloaded) {
1439 assembler->LoadCurrentCharacter(
1440 cp_offset,
1441 on_failure,
1442 check);
1443 bound_checked = true;
1444 }
1445 assembler->CheckNotCharacter(c, on_failure);
1446 return bound_checked;
1447}
1448
1449
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001450// Only emits non-letters (things that don't have case). Only used for case
1451// independent matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001452static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1453 uc16 c,
1454 Label* on_failure,
1455 int cp_offset,
1456 bool check,
1457 bool preloaded) {
1458 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1459 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001460 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001461 int length = GetCaseIndependentLetters(c, ascii, chars);
1462 if (length < 1) {
1463 // This can't match. Must be an ASCII subject and a non-ASCII character.
1464 // We do not need to do anything since the ASCII pass already handled this.
1465 return false; // Bounds not checked.
1466 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001467 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001468 // We handle the length > 1 case in a later pass.
1469 if (length == 1) {
1470 if (ascii && c > String::kMaxAsciiCharCodeU) {
1471 // Can't match - see above.
1472 return false; // Bounds not checked.
1473 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001474 if (!preloaded) {
1475 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1476 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001477 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001478 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001479 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001480 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001481}
1482
1483
1484static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001485 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001486 uc16 c1,
1487 uc16 c2,
1488 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001489 uc16 char_mask;
1490 if (ascii) {
1491 char_mask = String::kMaxAsciiCharCode;
1492 } else {
1493 char_mask = String::kMaxUC16CharCode;
1494 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001495 uc16 exor = c1 ^ c2;
1496 // Check whether exor has only one bit set.
1497 if (((exor - 1) & exor) == 0) {
1498 // If c1 and c2 differ only by one bit.
1499 // Ecma262UnCanonicalize always gives the highest number last.
1500 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001501 uc16 mask = char_mask ^ exor;
1502 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001503 return true;
1504 }
1505 ASSERT(c2 > c1);
1506 uc16 diff = c2 - c1;
1507 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1508 // If the characters differ by 2^n but don't differ by one bit then
1509 // subtract the difference from the found character, then do the or
1510 // trick. We avoid the theoretical case where negative numbers are
1511 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001512 uc16 mask = char_mask ^ diff;
1513 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1514 diff,
1515 mask,
1516 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001517 return true;
1518 }
1519 return false;
1520}
1521
1522
ager@chromium.org381abbb2009-02-25 13:23:22 +00001523typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1524 uc16 c,
1525 Label* on_failure,
1526 int cp_offset,
1527 bool check,
1528 bool preloaded);
1529
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001530// Only emits letters (things that have case). Only used for case independent
1531// matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001532static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1533 uc16 c,
1534 Label* on_failure,
1535 int cp_offset,
1536 bool check,
1537 bool preloaded) {
1538 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1539 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001540 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001541 int length = GetCaseIndependentLetters(c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001542 if (length <= 1) return false;
1543 // We may not need to check against the end of the input string
1544 // if this character lies before a character that matched.
1545 if (!preloaded) {
1546 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001547 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001548 Label ok;
1549 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1550 switch (length) {
1551 case 2: {
1552 if (ShortCutEmitCharacterPair(macro_assembler,
1553 ascii,
1554 chars[0],
1555 chars[1],
1556 on_failure)) {
1557 } else {
1558 macro_assembler->CheckCharacter(chars[0], &ok);
1559 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1560 macro_assembler->Bind(&ok);
1561 }
1562 break;
1563 }
1564 case 4:
1565 macro_assembler->CheckCharacter(chars[3], &ok);
1566 // Fall through!
1567 case 3:
1568 macro_assembler->CheckCharacter(chars[0], &ok);
1569 macro_assembler->CheckCharacter(chars[1], &ok);
1570 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1571 macro_assembler->Bind(&ok);
1572 break;
1573 default:
1574 UNREACHABLE();
1575 break;
1576 }
1577 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001578}
1579
1580
1581static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1582 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001583 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001584 Label* on_failure,
1585 int cp_offset,
1586 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001587 bool preloaded) {
1588 if (cc->is_standard() &&
1589 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1590 cp_offset,
1591 check_offset,
1592 on_failure)) {
1593 return;
1594 }
1595
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001596 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001597 int max_char;
1598 if (ascii) {
1599 max_char = String::kMaxAsciiCharCode;
1600 } else {
1601 max_char = String::kMaxUC16CharCode;
1602 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001603
1604 Label success;
1605
1606 Label* char_is_in_class =
1607 cc->is_negated() ? on_failure : &success;
1608
1609 int range_count = ranges->length();
1610
ager@chromium.org8bb60582008-12-11 12:02:20 +00001611 int last_valid_range = range_count - 1;
1612 while (last_valid_range >= 0) {
1613 CharacterRange& range = ranges->at(last_valid_range);
1614 if (range.from() <= max_char) {
1615 break;
1616 }
1617 last_valid_range--;
1618 }
1619
1620 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001621 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001622 // TODO(plesner): We can remove this when the node level does our
1623 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001624 macro_assembler->GoTo(on_failure);
1625 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001626 if (check_offset) {
1627 macro_assembler->CheckPosition(cp_offset, on_failure);
1628 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001629 return;
1630 }
1631
ager@chromium.org8bb60582008-12-11 12:02:20 +00001632 if (last_valid_range == 0 &&
1633 !cc->is_negated() &&
1634 ranges->at(0).IsEverything(max_char)) {
1635 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001636 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001637 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001638 }
1639 return;
1640 }
1641
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001642 if (!preloaded) {
1643 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001644 }
1645
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001646 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001647 CharacterRange& range = ranges->at(i);
1648 Label next_range;
1649 uc16 from = range.from();
1650 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001651 if (from > max_char) {
1652 continue;
1653 }
1654 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001655 if (to == from) {
1656 macro_assembler->CheckCharacter(to, char_is_in_class);
1657 } else {
1658 if (from != 0) {
1659 macro_assembler->CheckCharacterLT(from, &next_range);
1660 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001661 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001662 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1663 } else {
1664 macro_assembler->GoTo(char_is_in_class);
1665 }
1666 }
1667 macro_assembler->Bind(&next_range);
1668 }
1669
ager@chromium.org8bb60582008-12-11 12:02:20 +00001670 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001671 uc16 from = range.from();
1672 uc16 to = range.to();
1673
ager@chromium.org8bb60582008-12-11 12:02:20 +00001674 if (to > max_char) to = max_char;
1675 ASSERT(to >= from);
1676
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001677 if (to == from) {
1678 if (cc->is_negated()) {
1679 macro_assembler->CheckCharacter(to, on_failure);
1680 } else {
1681 macro_assembler->CheckNotCharacter(to, on_failure);
1682 }
1683 } else {
1684 if (from != 0) {
1685 if (cc->is_negated()) {
1686 macro_assembler->CheckCharacterLT(from, &success);
1687 } else {
1688 macro_assembler->CheckCharacterLT(from, on_failure);
1689 }
1690 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001691 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001692 if (cc->is_negated()) {
1693 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1694 } else {
1695 macro_assembler->CheckCharacterGT(to, on_failure);
1696 }
1697 } else {
1698 if (cc->is_negated()) {
1699 macro_assembler->GoTo(on_failure);
1700 }
1701 }
1702 }
1703 macro_assembler->Bind(&success);
1704}
1705
1706
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001707RegExpNode::~RegExpNode() {
1708}
1709
1710
ager@chromium.org8bb60582008-12-11 12:02:20 +00001711RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001712 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001713 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001714 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001715 return CONTINUE;
1716 }
1717
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001718 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001719 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001720 if (label_.is_bound()) {
1721 // We are being asked to generate a generic version, but that's already
1722 // been done so just go to it.
1723 macro_assembler->GoTo(&label_);
1724 return DONE;
1725 }
1726 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1727 // To avoid too deep recursion we push the node to the work queue and just
1728 // generate a goto here.
1729 compiler->AddWork(this);
1730 macro_assembler->GoTo(&label_);
1731 return DONE;
1732 }
1733 // Generate generic version of the node and bind the label for later use.
1734 macro_assembler->Bind(&label_);
1735 return CONTINUE;
1736 }
1737
1738 // We are being asked to make a non-generic version. Keep track of how many
1739 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00001740 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001741 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00001742 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00001743 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1744 return CONTINUE;
1745 }
1746
ager@chromium.org32912102009-01-16 10:38:43 +00001747 // If we get here code has been generated for this node too many times or
1748 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00001749 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001750 trace->Flush(compiler, this);
1751 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001752}
1753
1754
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001755int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001756 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1757 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001758 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001759}
1760
1761
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001762int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1763 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1764 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1765}
1766
1767
1768int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1769 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1770 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1771}
1772
1773
1774int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001775 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001776 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001777 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001778 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1779 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001780}
1781
1782
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001783int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1784 int recursion_depth) {
1785 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1786 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1787 // afterwards.
1788 RegExpNode* node = alternatives_->at(1).node();
1789 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1790}
1791
1792
1793void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1794 QuickCheckDetails* details,
1795 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001796 int filled_in,
1797 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001798 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1799 // afterwards.
1800 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00001801 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001802}
1803
1804
1805int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1806 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001807 RegExpNode* ignore_this_node) {
1808 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1809 int min = 100;
1810 int choice_count = alternatives_->length();
1811 for (int i = 0; i < choice_count; i++) {
1812 RegExpNode* node = alternatives_->at(i).node();
1813 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001814 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1815 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001816 if (node_eats_at_least < min) min = node_eats_at_least;
1817 }
1818 return min;
1819}
1820
1821
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001822int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1823 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001824}
1825
1826
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001827int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1828 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001829}
1830
1831
1832// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1833static inline uint32_t SmearBitsRight(uint32_t v) {
1834 v |= v >> 1;
1835 v |= v >> 2;
1836 v |= v >> 4;
1837 v |= v >> 8;
1838 v |= v >> 16;
1839 return v;
1840}
1841
1842
1843bool QuickCheckDetails::Rationalize(bool asc) {
1844 bool found_useful_op = false;
1845 uint32_t char_mask;
1846 if (asc) {
1847 char_mask = String::kMaxAsciiCharCode;
1848 } else {
1849 char_mask = String::kMaxUC16CharCode;
1850 }
1851 mask_ = 0;
1852 value_ = 0;
1853 int char_shift = 0;
1854 for (int i = 0; i < characters_; i++) {
1855 Position* pos = &positions_[i];
1856 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1857 found_useful_op = true;
1858 }
1859 mask_ |= (pos->mask & char_mask) << char_shift;
1860 value_ |= (pos->value & char_mask) << char_shift;
1861 char_shift += asc ? 8 : 16;
1862 }
1863 return found_useful_op;
1864}
1865
1866
1867bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001868 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001869 bool preload_has_checked_bounds,
1870 Label* on_possible_success,
1871 QuickCheckDetails* details,
1872 bool fall_through_on_failure) {
1873 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001874 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1875 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001876 if (!details->Rationalize(compiler->ascii())) return false;
1877 uint32_t mask = details->mask();
1878 uint32_t value = details->value();
1879
1880 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1881
ager@chromium.org32912102009-01-16 10:38:43 +00001882 if (trace->characters_preloaded() != details->characters()) {
1883 assembler->LoadCurrentCharacter(trace->cp_offset(),
1884 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001885 !preload_has_checked_bounds,
1886 details->characters());
1887 }
1888
1889
1890 bool need_mask = true;
1891
1892 if (details->characters() == 1) {
1893 // If number of characters preloaded is 1 then we used a byte or 16 bit
1894 // load so the value is already masked down.
1895 uint32_t char_mask;
1896 if (compiler->ascii()) {
1897 char_mask = String::kMaxAsciiCharCode;
1898 } else {
1899 char_mask = String::kMaxUC16CharCode;
1900 }
1901 if ((mask & char_mask) == char_mask) need_mask = false;
1902 mask &= char_mask;
1903 } else {
1904 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1905 // zero extend.
1906 if (details->characters() == 2 && compiler->ascii()) {
1907 if ((mask & 0xffff) == 0xffff) need_mask = false;
1908 } else {
1909 if (mask == 0xffffffff) need_mask = false;
1910 }
1911 }
1912
1913 if (fall_through_on_failure) {
1914 if (need_mask) {
1915 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1916 } else {
1917 assembler->CheckCharacter(value, on_possible_success);
1918 }
1919 } else {
1920 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00001921 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001922 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00001923 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001924 }
1925 }
1926 return true;
1927}
1928
1929
1930// Here is the meat of GetQuickCheckDetails (see also the comment on the
1931// super-class in the .h file).
1932//
1933// We iterate along the text object, building up for each character a
1934// mask and value that can be used to test for a quick failure to match.
1935// The masks and values for the positions will be combined into a single
1936// machine word for the current character width in order to be used in
1937// generating a quick check.
1938void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1939 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001940 int characters_filled_in,
1941 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001942 ASSERT(characters_filled_in < details->characters());
1943 int characters = details->characters();
1944 int char_mask;
1945 int char_shift;
1946 if (compiler->ascii()) {
1947 char_mask = String::kMaxAsciiCharCode;
1948 char_shift = 8;
1949 } else {
1950 char_mask = String::kMaxUC16CharCode;
1951 char_shift = 16;
1952 }
1953 for (int k = 0; k < elms_->length(); k++) {
1954 TextElement elm = elms_->at(k);
1955 if (elm.type == TextElement::ATOM) {
1956 Vector<const uc16> quarks = elm.data.u_atom->data();
1957 for (int i = 0; i < characters && i < quarks.length(); i++) {
1958 QuickCheckDetails::Position* pos =
1959 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00001960 uc16 c = quarks[i];
1961 if (c > char_mask) {
1962 // If we expect a non-ASCII character from an ASCII string,
1963 // there is no way we can match. Not even case independent
1964 // matching can turn an ASCII character into non-ASCII or
1965 // vice versa.
1966 details->set_cannot_match();
ager@chromium.org381abbb2009-02-25 13:23:22 +00001967 pos->determines_perfectly = false;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001968 return;
1969 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001970 if (compiler->ignore_case()) {
1971 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001972 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1973 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1974 if (length == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001975 // This letter has no case equivalents, so it's nice and simple
1976 // and the mask-compare will determine definitely whether we have
1977 // a match at this character position.
1978 pos->mask = char_mask;
1979 pos->value = c;
1980 pos->determines_perfectly = true;
1981 } else {
1982 uint32_t common_bits = char_mask;
1983 uint32_t bits = chars[0];
1984 for (int j = 1; j < length; j++) {
1985 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1986 common_bits ^= differing_bits;
1987 bits &= common_bits;
1988 }
1989 // If length is 2 and common bits has only one zero in it then
1990 // our mask and compare instruction will determine definitely
1991 // whether we have a match at this character position. Otherwise
1992 // it can only be an approximate check.
1993 uint32_t one_zero = (common_bits | ~char_mask);
1994 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1995 pos->determines_perfectly = true;
1996 }
1997 pos->mask = common_bits;
1998 pos->value = bits;
1999 }
2000 } else {
2001 // Don't ignore case. Nice simple case where the mask-compare will
2002 // determine definitely whether we have a match at this character
2003 // position.
2004 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00002005 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002006 pos->determines_perfectly = true;
2007 }
2008 characters_filled_in++;
2009 ASSERT(characters_filled_in <= details->characters());
2010 if (characters_filled_in == details->characters()) {
2011 return;
2012 }
2013 }
2014 } else {
2015 QuickCheckDetails::Position* pos =
2016 details->positions(characters_filled_in);
2017 RegExpCharacterClass* tree = elm.data.u_char_class;
2018 ZoneList<CharacterRange>* ranges = tree->ranges();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002019 if (tree->is_negated()) {
2020 // A quick check uses multi-character mask and compare. There is no
2021 // useful way to incorporate a negative char class into this scheme
2022 // so we just conservatively create a mask and value that will always
2023 // succeed.
2024 pos->mask = 0;
2025 pos->value = 0;
2026 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002027 int first_range = 0;
2028 while (ranges->at(first_range).from() > char_mask) {
2029 first_range++;
2030 if (first_range == ranges->length()) {
2031 details->set_cannot_match();
2032 pos->determines_perfectly = false;
2033 return;
2034 }
2035 }
2036 CharacterRange range = ranges->at(first_range);
2037 uc16 from = range.from();
2038 uc16 to = range.to();
2039 if (to > char_mask) {
2040 to = char_mask;
2041 }
2042 uint32_t differing_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002043 // A mask and compare is only perfect if the differing bits form a
2044 // number like 00011111 with one single block of trailing 1s.
2045 if ((differing_bits & (differing_bits + 1)) == 0) {
2046 pos->determines_perfectly = true;
2047 }
2048 uint32_t common_bits = ~SmearBitsRight(differing_bits);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002049 uint32_t bits = (from & common_bits);
2050 for (int i = first_range + 1; i < ranges->length(); i++) {
2051 CharacterRange range = ranges->at(i);
2052 uc16 from = range.from();
2053 uc16 to = range.to();
2054 if (from > char_mask) continue;
2055 if (to > char_mask) to = char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002056 // Here we are combining more ranges into the mask and compare
2057 // value. With each new range the mask becomes more sparse and
2058 // so the chances of a false positive rise. A character class
2059 // with multiple ranges is assumed never to be equivalent to a
2060 // mask and compare operation.
2061 pos->determines_perfectly = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002062 uint32_t new_common_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002063 new_common_bits = ~SmearBitsRight(new_common_bits);
2064 common_bits &= new_common_bits;
2065 bits &= new_common_bits;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002066 uint32_t differing_bits = (from & common_bits) ^ bits;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002067 common_bits ^= differing_bits;
2068 bits &= common_bits;
2069 }
2070 pos->mask = common_bits;
2071 pos->value = bits;
2072 }
2073 characters_filled_in++;
2074 ASSERT(characters_filled_in <= details->characters());
2075 if (characters_filled_in == details->characters()) {
2076 return;
2077 }
2078 }
2079 }
2080 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00002081 on_success()-> GetQuickCheckDetails(details,
2082 compiler,
2083 characters_filled_in,
2084 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002085}
2086
2087
2088void QuickCheckDetails::Clear() {
2089 for (int i = 0; i < characters_; i++) {
2090 positions_[i].mask = 0;
2091 positions_[i].value = 0;
2092 positions_[i].determines_perfectly = false;
2093 }
2094 characters_ = 0;
2095}
2096
2097
2098void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002099 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002100 if (by >= characters_) {
2101 Clear();
2102 return;
2103 }
2104 for (int i = 0; i < characters_ - by; i++) {
2105 positions_[i] = positions_[by + i];
2106 }
2107 for (int i = characters_ - by; i < characters_; i++) {
2108 positions_[i].mask = 0;
2109 positions_[i].value = 0;
2110 positions_[i].determines_perfectly = false;
2111 }
2112 characters_ -= by;
2113 // We could change mask_ and value_ here but we would never advance unless
2114 // they had already been used in a check and they won't be used again because
2115 // it would gain us nothing. So there's no point.
2116}
2117
2118
2119void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2120 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002121 if (other->cannot_match_) {
2122 return;
2123 }
2124 if (cannot_match_) {
2125 *this = *other;
2126 return;
2127 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002128 for (int i = from_index; i < characters_; i++) {
2129 QuickCheckDetails::Position* pos = positions(i);
2130 QuickCheckDetails::Position* other_pos = other->positions(i);
2131 if (pos->mask != other_pos->mask ||
2132 pos->value != other_pos->value ||
2133 !other_pos->determines_perfectly) {
2134 // Our mask-compare operation will be approximate unless we have the
2135 // exact same operation on both sides of the alternation.
2136 pos->determines_perfectly = false;
2137 }
2138 pos->mask &= other_pos->mask;
2139 pos->value &= pos->mask;
2140 other_pos->value &= pos->mask;
2141 uc16 differing_bits = (pos->value ^ other_pos->value);
2142 pos->mask &= ~differing_bits;
2143 pos->value &= pos->mask;
2144 }
2145}
2146
2147
ager@chromium.org32912102009-01-16 10:38:43 +00002148class VisitMarker {
2149 public:
2150 explicit VisitMarker(NodeInfo* info) : info_(info) {
2151 ASSERT(!info->visited);
2152 info->visited = true;
2153 }
2154 ~VisitMarker() {
2155 info_->visited = false;
2156 }
2157 private:
2158 NodeInfo* info_;
2159};
2160
2161
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002162void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2163 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002164 int characters_filled_in,
2165 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002166 if (body_can_be_zero_length_ || info()->visited) return;
2167 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002168 return ChoiceNode::GetQuickCheckDetails(details,
2169 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002170 characters_filled_in,
2171 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002172}
2173
2174
2175void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2176 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002177 int characters_filled_in,
2178 bool not_at_start) {
2179 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002180 int choice_count = alternatives_->length();
2181 ASSERT(choice_count > 0);
2182 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2183 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002184 characters_filled_in,
2185 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002186 for (int i = 1; i < choice_count; i++) {
2187 QuickCheckDetails new_details(details->characters());
2188 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002189 node->GetQuickCheckDetails(&new_details, compiler,
2190 characters_filled_in,
2191 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002192 // Here we merge the quick match details of the two branches.
2193 details->Merge(&new_details, characters_filled_in);
2194 }
2195}
2196
2197
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002198// Check for [0-9A-Z_a-z].
2199static void EmitWordCheck(RegExpMacroAssembler* assembler,
2200 Label* word,
2201 Label* non_word,
2202 bool fall_through_on_word) {
2203 assembler->CheckCharacterGT('z', non_word);
2204 assembler->CheckCharacterLT('0', non_word);
2205 assembler->CheckCharacterGT('a' - 1, word);
2206 assembler->CheckCharacterLT('9' + 1, word);
2207 assembler->CheckCharacterLT('A', non_word);
2208 assembler->CheckCharacterLT('Z' + 1, word);
2209 if (fall_through_on_word) {
2210 assembler->CheckNotCharacter('_', non_word);
2211 } else {
2212 assembler->CheckCharacter('_', word);
2213 }
2214}
2215
2216
2217// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2218// that matches newline or the start of input).
2219static void EmitHat(RegExpCompiler* compiler,
2220 RegExpNode* on_success,
2221 Trace* trace) {
2222 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2223 // We will be loading the previous character into the current character
2224 // register.
2225 Trace new_trace(*trace);
2226 new_trace.InvalidateCurrentCharacter();
2227
2228 Label ok;
2229 if (new_trace.cp_offset() == 0) {
2230 // The start of input counts as a newline in this context, so skip to
2231 // ok if we are at the start.
2232 assembler->CheckAtStart(&ok);
2233 }
2234 // We already checked that we are not at the start of input so it must be
2235 // OK to load the previous character.
2236 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2237 new_trace.backtrack(),
2238 false);
2239 // Newline means \n, \r, 0x2028 or 0x2029.
2240 if (!compiler->ascii()) {
2241 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2242 }
2243 assembler->CheckCharacter('\n', &ok);
2244 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2245 assembler->Bind(&ok);
2246 on_success->Emit(compiler, &new_trace);
2247}
2248
2249
2250// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2251static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2252 RegExpCompiler* compiler,
2253 RegExpNode* on_success,
2254 Trace* trace) {
2255 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2256 Label before_non_word;
2257 Label before_word;
2258 if (trace->characters_preloaded() != 1) {
2259 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2260 }
2261 // Fall through on non-word.
2262 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2263
2264 // We will be loading the previous character into the current character
2265 // register.
2266 Trace new_trace(*trace);
2267 new_trace.InvalidateCurrentCharacter();
2268
2269 Label ok;
2270 Label* boundary;
2271 Label* not_boundary;
2272 if (type == AssertionNode::AT_BOUNDARY) {
2273 boundary = &ok;
2274 not_boundary = new_trace.backtrack();
2275 } else {
2276 not_boundary = &ok;
2277 boundary = new_trace.backtrack();
2278 }
2279
2280 // Next character is not a word character.
2281 assembler->Bind(&before_non_word);
2282 if (new_trace.cp_offset() == 0) {
2283 // The start of input counts as a non-word character, so the question is
2284 // decided if we are at the start.
2285 assembler->CheckAtStart(not_boundary);
2286 }
2287 // We already checked that we are not at the start of input so it must be
2288 // OK to load the previous character.
2289 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2290 &ok, // Unused dummy label in this call.
2291 false);
2292 // Fall through on non-word.
2293 EmitWordCheck(assembler, boundary, not_boundary, false);
2294 assembler->GoTo(not_boundary);
2295
2296 // Next character is a word character.
2297 assembler->Bind(&before_word);
2298 if (new_trace.cp_offset() == 0) {
2299 // The start of input counts as a non-word character, so the question is
2300 // decided if we are at the start.
2301 assembler->CheckAtStart(boundary);
2302 }
2303 // We already checked that we are not at the start of input so it must be
2304 // OK to load the previous character.
2305 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2306 &ok, // Unused dummy label in this call.
2307 false);
2308 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2309 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2310
2311 assembler->Bind(&ok);
2312
2313 on_success->Emit(compiler, &new_trace);
2314}
2315
2316
iposva@chromium.org245aa852009-02-10 00:49:54 +00002317void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2318 RegExpCompiler* compiler,
2319 int filled_in,
2320 bool not_at_start) {
2321 if (type_ == AT_START && not_at_start) {
2322 details->set_cannot_match();
2323 return;
2324 }
2325 return on_success()->GetQuickCheckDetails(details,
2326 compiler,
2327 filled_in,
2328 not_at_start);
2329}
2330
2331
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002332void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2333 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2334 switch (type_) {
2335 case AT_END: {
2336 Label ok;
2337 assembler->CheckPosition(trace->cp_offset(), &ok);
2338 assembler->GoTo(trace->backtrack());
2339 assembler->Bind(&ok);
2340 break;
2341 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002342 case AT_START: {
2343 if (trace->at_start() == Trace::FALSE) {
2344 assembler->GoTo(trace->backtrack());
2345 return;
2346 }
2347 if (trace->at_start() == Trace::UNKNOWN) {
2348 assembler->CheckNotAtStart(trace->backtrack());
2349 Trace at_start_trace = *trace;
2350 at_start_trace.set_at_start(true);
2351 on_success()->Emit(compiler, &at_start_trace);
2352 return;
2353 }
2354 }
2355 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002356 case AFTER_NEWLINE:
2357 EmitHat(compiler, on_success(), trace);
2358 return;
2359 case AT_NON_BOUNDARY:
2360 case AT_BOUNDARY:
2361 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2362 return;
2363 }
2364 on_success()->Emit(compiler, trace);
2365}
2366
2367
ager@chromium.org381abbb2009-02-25 13:23:22 +00002368static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2369 if (quick_check == NULL) return false;
2370 if (offset >= quick_check->characters()) return false;
2371 return quick_check->positions(offset)->determines_perfectly;
2372}
2373
2374
2375static void UpdateBoundsCheck(int index, int* checked_up_to) {
2376 if (index > *checked_up_to) {
2377 *checked_up_to = index;
2378 }
2379}
2380
2381
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002382// We call this repeatedly to generate code for each pass over the text node.
2383// The passes are in increasing order of difficulty because we hope one
2384// of the first passes will fail in which case we are saved the work of the
2385// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2386// we will check the '%' in the first pass, the case independent 'a' in the
2387// second pass and the character class in the last pass.
2388//
2389// The passes are done from right to left, so for example to test for /bar/
2390// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2391// and then a 'b' with offset 0. This means we can avoid the end-of-input
2392// bounds check most of the time. In the example we only need to check for
2393// end-of-input when loading the putative 'r'.
2394//
2395// A slight complication involves the fact that the first character may already
2396// be fetched into a register by the previous node. In this case we want to
2397// do the test for that character first. We do this in separate passes. The
2398// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2399// pass has been performed then subsequent passes will have true in
2400// first_element_checked to indicate that that character does not need to be
2401// checked again.
2402//
ager@chromium.org32912102009-01-16 10:38:43 +00002403// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002404// contain an AlternativeGeneration object. In this AlternativeGeneration
2405// object we can see details of any quick check that was already passed in
2406// order to get to the code we are now generating. The quick check can involve
2407// loading characters, which means we do not need to recheck the bounds
2408// up to the limit the quick check already checked. In addition the quick
2409// check can have involved a mask and compare operation which may simplify
2410// or obviate the need for further checks at some character positions.
2411void TextNode::TextEmitPass(RegExpCompiler* compiler,
2412 TextEmitPassType pass,
2413 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002414 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002415 bool first_element_checked,
2416 int* checked_up_to) {
2417 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2418 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002419 Label* backtrack = trace->backtrack();
2420 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002421 int element_count = elms_->length();
2422 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2423 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002424 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002425 if (elm.type == TextElement::ATOM) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002426 Vector<const uc16> quarks = elm.data.u_atom->data();
2427 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2428 if (first_element_checked && i == 0 && j == 0) continue;
2429 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2430 EmitCharacterFunction* emit_function = NULL;
2431 switch (pass) {
2432 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002433 ASSERT(ascii);
2434 if (quarks[j] > String::kMaxAsciiCharCode) {
2435 assembler->GoTo(backtrack);
2436 return;
2437 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002438 break;
2439 case NON_LETTER_CHARACTER_MATCH:
2440 emit_function = &EmitAtomNonLetter;
2441 break;
2442 case SIMPLE_CHARACTER_MATCH:
2443 emit_function = &EmitSimpleCharacter;
2444 break;
2445 case CASE_CHARACTER_MATCH:
2446 emit_function = &EmitAtomLetter;
2447 break;
2448 default:
2449 break;
2450 }
2451 if (emit_function != NULL) {
2452 bool bound_checked = emit_function(compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00002453 quarks[j],
2454 backtrack,
2455 cp_offset + j,
2456 *checked_up_to < cp_offset + j,
2457 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002458 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002459 }
2460 }
2461 } else {
2462 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002463 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002464 if (first_element_checked && i == 0) continue;
2465 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002466 RegExpCharacterClass* cc = elm.data.u_char_class;
2467 EmitCharClass(assembler,
2468 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002469 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002470 backtrack,
2471 cp_offset,
2472 *checked_up_to < cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002473 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002474 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002475 }
2476 }
2477 }
2478}
2479
2480
2481int TextNode::Length() {
2482 TextElement elm = elms_->last();
2483 ASSERT(elm.cp_offset >= 0);
2484 if (elm.type == TextElement::ATOM) {
2485 return elm.cp_offset + elm.data.u_atom->data().length();
2486 } else {
2487 return elm.cp_offset + 1;
2488 }
2489}
2490
2491
ager@chromium.org381abbb2009-02-25 13:23:22 +00002492bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2493 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2494 if (ignore_case) {
2495 return pass == SIMPLE_CHARACTER_MATCH;
2496 } else {
2497 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2498 }
2499}
2500
2501
ager@chromium.org8bb60582008-12-11 12:02:20 +00002502// This generates the code to match a text node. A text node can contain
2503// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002504// way) and character classes. For efficiency we do not do this in a single
2505// pass from left to right. Instead we pass over the text node several times,
2506// emitting code for some character positions every time. See the comment on
2507// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002508void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002509 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002510 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002511 ASSERT(limit_result == CONTINUE);
2512
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002513 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2514 compiler->SetRegExpTooBig();
2515 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002516 }
2517
2518 if (compiler->ascii()) {
2519 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002520 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002521 }
2522
2523 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002524 int bound_checked_to = trace->cp_offset() - 1;
2525 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002526
2527 // If a character is preloaded into the current character register then
2528 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002529 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002530 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2531 if (!SkipPass(pass, compiler->ignore_case())) {
2532 TextEmitPass(compiler,
2533 static_cast<TextEmitPassType>(pass),
2534 true,
2535 trace,
2536 false,
2537 &bound_checked_to);
2538 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002539 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002540 first_elt_done = true;
2541 }
2542
ager@chromium.org381abbb2009-02-25 13:23:22 +00002543 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2544 if (!SkipPass(pass, compiler->ignore_case())) {
2545 TextEmitPass(compiler,
2546 static_cast<TextEmitPassType>(pass),
2547 false,
2548 trace,
2549 first_elt_done,
2550 &bound_checked_to);
2551 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002552 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002553
ager@chromium.org32912102009-01-16 10:38:43 +00002554 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002555 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002556 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002557 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002558 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002559}
2560
2561
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002562void Trace::InvalidateCurrentCharacter() {
2563 characters_preloaded_ = 0;
2564}
2565
2566
2567void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002568 ASSERT(by > 0);
2569 // We don't have an instruction for shifting the current character register
2570 // down or for using a shifted value for anything so lets just forget that
2571 // we preloaded any characters into it.
2572 characters_preloaded_ = 0;
2573 // Adjust the offsets of the quick check performed information. This
2574 // information is used to find out what we already determined about the
2575 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002576 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002577 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002578 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2579 compiler->SetRegExpTooBig();
2580 cp_offset_ = 0;
2581 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002582 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002583}
2584
2585
2586void TextNode::MakeCaseIndependent() {
2587 int element_count = elms_->length();
2588 for (int i = 0; i < element_count; i++) {
2589 TextElement elm = elms_->at(i);
2590 if (elm.type == TextElement::CHAR_CLASS) {
2591 RegExpCharacterClass* cc = elm.data.u_char_class;
2592 ZoneList<CharacterRange>* ranges = cc->ranges();
2593 int range_count = ranges->length();
2594 for (int i = 0; i < range_count; i++) {
2595 ranges->at(i).AddCaseEquivalents(ranges);
2596 }
2597 }
2598 }
2599}
2600
2601
ager@chromium.org8bb60582008-12-11 12:02:20 +00002602int TextNode::GreedyLoopTextLength() {
2603 TextElement elm = elms_->at(elms_->length() - 1);
2604 if (elm.type == TextElement::CHAR_CLASS) {
2605 return elm.cp_offset + 1;
2606 } else {
2607 return elm.cp_offset + elm.data.u_atom->data().length();
2608 }
2609}
2610
2611
2612// Finds the fixed match length of a sequence of nodes that goes from
2613// this alternative and back to this choice node. If there are variable
2614// length nodes or other complications in the way then return a sentinel
2615// value indicating that a greedy loop cannot be constructed.
2616int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2617 int length = 0;
2618 RegExpNode* node = alternative->node();
2619 // Later we will generate code for all these text nodes using recursion
2620 // so we have to limit the max number.
2621 int recursion_depth = 0;
2622 while (node != this) {
2623 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2624 return kNodeIsTooComplexForGreedyLoops;
2625 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002626 int node_length = node->GreedyLoopTextLength();
2627 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2628 return kNodeIsTooComplexForGreedyLoops;
2629 }
2630 length += node_length;
2631 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2632 node = seq_node->on_success();
2633 }
2634 return length;
2635}
2636
2637
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002638void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2639 ASSERT_EQ(loop_node_, NULL);
2640 AddAlternative(alt);
2641 loop_node_ = alt.node();
2642}
2643
2644
2645void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2646 ASSERT_EQ(continue_node_, NULL);
2647 AddAlternative(alt);
2648 continue_node_ = alt.node();
2649}
2650
2651
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002652void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002653 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002654 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002655 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2656 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2657 // Update the counter-based backtracking info on the stack. This is an
2658 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002659 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002660 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002661 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002662 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002663 }
ager@chromium.org32912102009-01-16 10:38:43 +00002664 ASSERT(trace->stop_node() == NULL);
2665 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002666 trace->Flush(compiler, this);
2667 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002668 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002669 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002670}
2671
2672
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002673int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002674 int preload_characters = EatsAtLeast(4, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002675#ifdef CAN_READ_UNALIGNED
2676 bool ascii = compiler->ascii();
2677 if (ascii) {
2678 if (preload_characters > 4) preload_characters = 4;
2679 // We can't preload 3 characters because there is no machine instruction
2680 // to do that. We can't just load 4 because we could be reading
2681 // beyond the end of the string, which could cause a memory fault.
2682 if (preload_characters == 3) preload_characters = 2;
2683 } else {
2684 if (preload_characters > 2) preload_characters = 2;
2685 }
2686#else
2687 if (preload_characters > 1) preload_characters = 1;
2688#endif
2689 return preload_characters;
2690}
2691
2692
2693// This class is used when generating the alternatives in a choice node. It
2694// records the way the alternative is being code generated.
2695class AlternativeGeneration: public Malloced {
2696 public:
2697 AlternativeGeneration()
2698 : possible_success(),
2699 expects_preload(false),
2700 after(),
2701 quick_check_details() { }
2702 Label possible_success;
2703 bool expects_preload;
2704 Label after;
2705 QuickCheckDetails quick_check_details;
2706};
2707
2708
2709// Creates a list of AlternativeGenerations. If the list has a reasonable
2710// size then it is on the stack, otherwise the excess is on the heap.
2711class AlternativeGenerationList {
2712 public:
2713 explicit AlternativeGenerationList(int count)
2714 : alt_gens_(count) {
2715 for (int i = 0; i < count && i < kAFew; i++) {
2716 alt_gens_.Add(a_few_alt_gens_ + i);
2717 }
2718 for (int i = kAFew; i < count; i++) {
2719 alt_gens_.Add(new AlternativeGeneration());
2720 }
2721 }
2722 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002723 for (int i = kAFew; i < alt_gens_.length(); i++) {
2724 delete alt_gens_[i];
2725 alt_gens_[i] = NULL;
2726 }
2727 }
2728
2729 AlternativeGeneration* at(int i) {
2730 return alt_gens_[i];
2731 }
2732 private:
2733 static const int kAFew = 10;
2734 ZoneList<AlternativeGeneration*> alt_gens_;
2735 AlternativeGeneration a_few_alt_gens_[kAFew];
2736};
2737
2738
2739/* Code generation for choice nodes.
2740 *
2741 * We generate quick checks that do a mask and compare to eliminate a
2742 * choice. If the quick check succeeds then it jumps to the continuation to
2743 * do slow checks and check subsequent nodes. If it fails (the common case)
2744 * it falls through to the next choice.
2745 *
2746 * Here is the desired flow graph. Nodes directly below each other imply
2747 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2748 * 3 doesn't have a quick check so we have to call the slow check.
2749 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2750 * regexp continuation is generated directly after the Sn node, up to the
2751 * next GoTo if we decide to reuse some already generated code. Some
2752 * nodes expect preload_characters to be preloaded into the current
2753 * character register. R nodes do this preloading. Vertices are marked
2754 * F for failures and S for success (possible success in the case of quick
2755 * nodes). L, V, < and > are used as arrow heads.
2756 *
2757 * ----------> R
2758 * |
2759 * V
2760 * Q1 -----> S1
2761 * | S /
2762 * F| /
2763 * | F/
2764 * | /
2765 * | R
2766 * | /
2767 * V L
2768 * Q2 -----> S2
2769 * | S /
2770 * F| /
2771 * | F/
2772 * | /
2773 * | R
2774 * | /
2775 * V L
2776 * S3
2777 * |
2778 * F|
2779 * |
2780 * R
2781 * |
2782 * backtrack V
2783 * <----------Q4
2784 * \ F |
2785 * \ |S
2786 * \ F V
2787 * \-----S4
2788 *
2789 * For greedy loops we reverse our expectation and expect to match rather
2790 * than fail. Therefore we want the loop code to look like this (U is the
2791 * unwind code that steps back in the greedy loop). The following alternatives
2792 * look the same as above.
2793 * _____
2794 * / \
2795 * V |
2796 * ----------> S1 |
2797 * /| |
2798 * / |S |
2799 * F/ \_____/
2800 * /
2801 * |<-----------
2802 * | \
2803 * V \
2804 * Q2 ---> S2 \
2805 * | S / |
2806 * F| / |
2807 * | F/ |
2808 * | / |
2809 * | R |
2810 * | / |
2811 * F VL |
2812 * <------U |
2813 * back |S |
2814 * \______________/
2815 */
2816
2817
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002818void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002819 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2820 int choice_count = alternatives_->length();
2821#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002822 for (int i = 0; i < choice_count - 1; i++) {
2823 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002824 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002825 int guard_count = (guards == NULL) ? 0 : guards->length();
2826 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002827 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002828 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002829 }
2830#endif
2831
ager@chromium.org32912102009-01-16 10:38:43 +00002832 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002833 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002834 ASSERT(limit_result == CONTINUE);
2835
ager@chromium.org381abbb2009-02-25 13:23:22 +00002836 int new_flush_budget = trace->flush_budget() / choice_count;
2837 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2838 trace->Flush(compiler, this);
2839 return;
2840 }
2841
ager@chromium.org8bb60582008-12-11 12:02:20 +00002842 RecursionCheck rc(compiler);
2843
ager@chromium.org32912102009-01-16 10:38:43 +00002844 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002845
2846 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2847 bool greedy_loop = false;
2848 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00002849 Trace counter_backtrack_trace;
2850 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002851 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2852
ager@chromium.org8bb60582008-12-11 12:02:20 +00002853 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2854 // Here we have special handling for greedy loops containing only text nodes
2855 // and other simple nodes. These are handled by pushing the current
2856 // position on the stack and then incrementing the current position each
2857 // time around the switch. On backtrack we decrement the current position
2858 // and check it against the pushed value. This avoids pushing backtrack
2859 // information for each iteration of the loop, which could take up a lot of
2860 // space.
2861 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00002862 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002863 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00002864 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002865 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00002866 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002867 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00002868 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002869 Label loop_label;
2870 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00002871 greedy_match_trace.set_stop_node(this);
2872 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002873 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002874 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002875 }
2876
2877 Label second_choice; // For use in greedy matches.
2878 macro_assembler->Bind(&second_choice);
2879
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002880 int first_normal_choice = greedy_loop ? 1 : 0;
2881
2882 int preload_characters = CalculatePreloadCharacters(compiler);
2883 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00002884 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002885 bool preload_has_checked_bounds = preload_is_current;
2886
2887 AlternativeGenerationList alt_gens(choice_count);
2888
ager@chromium.org8bb60582008-12-11 12:02:20 +00002889 // For now we just call all choices one after the other. The idea ultimately
2890 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002891 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002892 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002893 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002894 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002895 ZoneList<Guard*>* guards = alternative.guards();
2896 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00002897 Trace new_trace(*current_trace);
2898 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002899 preload_characters :
2900 0);
2901 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00002902 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002903 }
ager@chromium.org32912102009-01-16 10:38:43 +00002904 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002905 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002906 alt_gen->expects_preload = preload_is_current;
2907 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002908 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002909 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002910 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002911 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002912 preload_has_checked_bounds,
2913 &alt_gen->possible_success,
2914 &alt_gen->quick_check_details,
2915 i < choice_count - 1)) {
2916 // Quick check was generated for this choice.
2917 preload_is_current = true;
2918 preload_has_checked_bounds = true;
2919 // On the last choice in the ChoiceNode we generated the quick
2920 // check to fall through on possible success. So now we need to
2921 // generate the full check inline.
2922 if (i == choice_count - 1) {
2923 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002924 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2925 new_trace.set_characters_preloaded(preload_characters);
2926 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002927 generate_full_check_inline = true;
2928 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002929 } else if (alt_gen->quick_check_details.cannot_match()) {
2930 if (i == choice_count - 1 && !greedy_loop) {
2931 macro_assembler->GoTo(trace->backtrack());
2932 }
2933 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002934 } else {
2935 // No quick check was generated. Put the full code here.
2936 // If this is not the first choice then there could be slow checks from
2937 // previous cases that go here when they fail. There's no reason to
2938 // insist that they preload characters since the slow check we are about
2939 // to generate probably can't use it.
2940 if (i != first_normal_choice) {
2941 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002942 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002943 }
2944 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00002945 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002946 }
2947 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002948 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002949 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002950 if (new_trace.actions() != NULL) {
2951 new_trace.set_flush_budget(new_flush_budget);
2952 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002953 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002954 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002955 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002956 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002957 preload_is_current = false;
2958 }
2959 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002960 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002961 if (greedy_loop) {
2962 macro_assembler->Bind(&greedy_loop_label);
2963 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00002964 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002965 // Otherwise try the second priority at an earlier position.
2966 macro_assembler->AdvanceCurrentPosition(-text_length);
2967 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002968 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002969
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002970 // At this point we need to generate slow checks for the alternatives where
2971 // the quick check was inlined. We can recognize these because the associated
2972 // label was bound.
2973 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2974 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002975 Trace new_trace(*current_trace);
2976 // If there are actions to be flushed we have to limit how many times
2977 // they are flushed. Take the budget of the parent trace and distribute
2978 // it fairly amongst the children.
2979 if (new_trace.actions() != NULL) {
2980 new_trace.set_flush_budget(new_flush_budget);
2981 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002982 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002983 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002984 alternatives_->at(i),
2985 alt_gen,
2986 preload_characters,
2987 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002988 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002989}
2990
2991
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002992void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002993 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002994 GuardedAlternative alternative,
2995 AlternativeGeneration* alt_gen,
2996 int preload_characters,
2997 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002998 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002999
3000 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3001 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003002 Trace out_of_line_trace(*trace);
3003 out_of_line_trace.set_characters_preloaded(preload_characters);
3004 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003005 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003006 ZoneList<Guard*>* guards = alternative.guards();
3007 int guard_count = (guards == NULL) ? 0 : guards->length();
3008 if (next_expects_preload) {
3009 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00003010 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003011 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003012 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003013 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003014 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003015 macro_assembler->Bind(&reload_current_char);
3016 // Reload the current character, since the next quick check expects that.
3017 // We don't need to check bounds here because we only get into this
3018 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00003019 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003020 NULL,
3021 false,
3022 preload_characters);
3023 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003024 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003025 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003026 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003027 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003028 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003029 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003030 }
3031}
3032
3033
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003034void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003035 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003036 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003037 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003038 ASSERT(limit_result == CONTINUE);
3039
3040 RecursionCheck rc(compiler);
3041
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003042 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003043 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00003044 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003045 new_capture(data_.u_position_register.reg,
3046 data_.u_position_register.is_capture,
3047 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003048 Trace new_trace = *trace;
3049 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003050 on_success()->Emit(compiler, &new_trace);
3051 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003052 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003053 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003054 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003055 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00003056 Trace new_trace = *trace;
3057 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003058 on_success()->Emit(compiler, &new_trace);
3059 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003060 }
3061 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003062 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003063 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00003064 Trace new_trace = *trace;
3065 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003066 on_success()->Emit(compiler, &new_trace);
3067 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003068 }
3069 case CLEAR_CAPTURES: {
3070 Trace::DeferredClearCaptures
3071 new_capture(Interval(data_.u_clear_captures.range_from,
3072 data_.u_clear_captures.range_to));
3073 Trace new_trace = *trace;
3074 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003075 on_success()->Emit(compiler, &new_trace);
3076 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003077 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003078 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003079 if (!trace->is_trivial()) {
3080 trace->Flush(compiler, this);
3081 } else {
3082 assembler->WriteCurrentPositionToRegister(
3083 data_.u_submatch.current_position_register, 0);
3084 assembler->WriteStackPointerToRegister(
3085 data_.u_submatch.stack_pointer_register);
3086 on_success()->Emit(compiler, trace);
3087 }
3088 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003089 case EMPTY_MATCH_CHECK: {
3090 int start_pos_reg = data_.u_empty_match_check.start_register;
3091 int stored_pos = 0;
3092 int rep_reg = data_.u_empty_match_check.repetition_register;
3093 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3094 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3095 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3096 // If we know we haven't advanced and there is no minimum we
3097 // can just backtrack immediately.
3098 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00003099 } else if (know_dist && stored_pos < trace->cp_offset()) {
3100 // If we know we've advanced we can generate the continuation
3101 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003102 on_success()->Emit(compiler, trace);
3103 } else if (!trace->is_trivial()) {
3104 trace->Flush(compiler, this);
3105 } else {
3106 Label skip_empty_check;
3107 // If we have a minimum number of repetitions we check the current
3108 // number first and skip the empty check if it's not enough.
3109 if (has_minimum) {
3110 int limit = data_.u_empty_match_check.repetition_limit;
3111 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3112 }
3113 // If the match is empty we bail out, otherwise we fall through
3114 // to the on-success continuation.
3115 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3116 trace->backtrack());
3117 assembler->Bind(&skip_empty_check);
3118 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003119 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003120 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003121 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003122 case POSITIVE_SUBMATCH_SUCCESS: {
3123 if (!trace->is_trivial()) {
3124 trace->Flush(compiler, this);
3125 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003126 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003127 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003128 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003129 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003130 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003131 int clear_register_count = data_.u_submatch.clear_register_count;
3132 if (clear_register_count == 0) {
3133 on_success()->Emit(compiler, trace);
3134 return;
3135 }
3136 int clear_registers_from = data_.u_submatch.clear_register_from;
3137 Label clear_registers_backtrack;
3138 Trace new_trace = *trace;
3139 new_trace.set_backtrack(&clear_registers_backtrack);
3140 on_success()->Emit(compiler, &new_trace);
3141
3142 assembler->Bind(&clear_registers_backtrack);
3143 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3144 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3145
3146 ASSERT(trace->backtrack() == NULL);
3147 assembler->Backtrack();
3148 return;
3149 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003150 default:
3151 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003152 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003153}
3154
3155
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003156void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003157 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003158 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003159 trace->Flush(compiler, this);
3160 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003161 }
3162
ager@chromium.org32912102009-01-16 10:38:43 +00003163 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003164 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003165 ASSERT(limit_result == CONTINUE);
3166
3167 RecursionCheck rc(compiler);
3168
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003169 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003170 if (compiler->ignore_case()) {
3171 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3172 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003173 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003174 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003175 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003176 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003177}
3178
3179
3180// -------------------------------------------------------------------
3181// Dot/dotty output
3182
3183
3184#ifdef DEBUG
3185
3186
3187class DotPrinter: public NodeVisitor {
3188 public:
3189 explicit DotPrinter(bool ignore_case)
3190 : ignore_case_(ignore_case),
3191 stream_(&alloc_) { }
3192 void PrintNode(const char* label, RegExpNode* node);
3193 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003194 void PrintAttributes(RegExpNode* from);
3195 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003196 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003197#define DECLARE_VISIT(Type) \
3198 virtual void Visit##Type(Type##Node* that);
3199FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3200#undef DECLARE_VISIT
3201 private:
3202 bool ignore_case_;
3203 HeapStringAllocator alloc_;
3204 StringStream stream_;
3205};
3206
3207
3208void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3209 stream()->Add("digraph G {\n graph [label=\"");
3210 for (int i = 0; label[i]; i++) {
3211 switch (label[i]) {
3212 case '\\':
3213 stream()->Add("\\\\");
3214 break;
3215 case '"':
3216 stream()->Add("\"");
3217 break;
3218 default:
3219 stream()->Put(label[i]);
3220 break;
3221 }
3222 }
3223 stream()->Add("\"];\n");
3224 Visit(node);
3225 stream()->Add("}\n");
3226 printf("%s", *(stream()->ToCString()));
3227}
3228
3229
3230void DotPrinter::Visit(RegExpNode* node) {
3231 if (node->info()->visited) return;
3232 node->info()->visited = true;
3233 node->Accept(this);
3234}
3235
3236
3237void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003238 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3239 Visit(on_failure);
3240}
3241
3242
3243class TableEntryBodyPrinter {
3244 public:
3245 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3246 : stream_(stream), choice_(choice) { }
3247 void Call(uc16 from, DispatchTable::Entry entry) {
3248 OutSet* out_set = entry.out_set();
3249 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3250 if (out_set->Get(i)) {
3251 stream()->Add(" n%p:s%io%i -> n%p;\n",
3252 choice(),
3253 from,
3254 i,
3255 choice()->alternatives()->at(i).node());
3256 }
3257 }
3258 }
3259 private:
3260 StringStream* stream() { return stream_; }
3261 ChoiceNode* choice() { return choice_; }
3262 StringStream* stream_;
3263 ChoiceNode* choice_;
3264};
3265
3266
3267class TableEntryHeaderPrinter {
3268 public:
3269 explicit TableEntryHeaderPrinter(StringStream* stream)
3270 : first_(true), stream_(stream) { }
3271 void Call(uc16 from, DispatchTable::Entry entry) {
3272 if (first_) {
3273 first_ = false;
3274 } else {
3275 stream()->Add("|");
3276 }
3277 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3278 OutSet* out_set = entry.out_set();
3279 int priority = 0;
3280 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3281 if (out_set->Get(i)) {
3282 if (priority > 0) stream()->Add("|");
3283 stream()->Add("<s%io%i> %i", from, i, priority);
3284 priority++;
3285 }
3286 }
3287 stream()->Add("}}");
3288 }
3289 private:
3290 bool first_;
3291 StringStream* stream() { return stream_; }
3292 StringStream* stream_;
3293};
3294
3295
3296class AttributePrinter {
3297 public:
3298 explicit AttributePrinter(DotPrinter* out)
3299 : out_(out), first_(true) { }
3300 void PrintSeparator() {
3301 if (first_) {
3302 first_ = false;
3303 } else {
3304 out_->stream()->Add("|");
3305 }
3306 }
3307 void PrintBit(const char* name, bool value) {
3308 if (!value) return;
3309 PrintSeparator();
3310 out_->stream()->Add("{%s}", name);
3311 }
3312 void PrintPositive(const char* name, int value) {
3313 if (value < 0) return;
3314 PrintSeparator();
3315 out_->stream()->Add("{%s|%x}", name, value);
3316 }
3317 private:
3318 DotPrinter* out_;
3319 bool first_;
3320};
3321
3322
3323void DotPrinter::PrintAttributes(RegExpNode* that) {
3324 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3325 "margin=0.1, fontsize=10, label=\"{",
3326 that);
3327 AttributePrinter printer(this);
3328 NodeInfo* info = that->info();
3329 printer.PrintBit("NI", info->follows_newline_interest);
3330 printer.PrintBit("WI", info->follows_word_interest);
3331 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003332 Label* label = that->label();
3333 if (label->is_bound())
3334 printer.PrintPositive("@", label->pos());
3335 stream()->Add("}\"];\n");
3336 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3337 "arrowhead=none];\n", that, that);
3338}
3339
3340
3341static const bool kPrintDispatchTable = false;
3342void DotPrinter::VisitChoice(ChoiceNode* that) {
3343 if (kPrintDispatchTable) {
3344 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3345 TableEntryHeaderPrinter header_printer(stream());
3346 that->GetTable(ignore_case_)->ForEach(&header_printer);
3347 stream()->Add("\"]\n", that);
3348 PrintAttributes(that);
3349 TableEntryBodyPrinter body_printer(stream(), that);
3350 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003351 } else {
3352 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3353 for (int i = 0; i < that->alternatives()->length(); i++) {
3354 GuardedAlternative alt = that->alternatives()->at(i);
3355 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3356 }
3357 }
3358 for (int i = 0; i < that->alternatives()->length(); i++) {
3359 GuardedAlternative alt = that->alternatives()->at(i);
3360 alt.node()->Accept(this);
3361 }
3362}
3363
3364
3365void DotPrinter::VisitText(TextNode* that) {
3366 stream()->Add(" n%p [label=\"", that);
3367 for (int i = 0; i < that->elements()->length(); i++) {
3368 if (i > 0) stream()->Add(" ");
3369 TextElement elm = that->elements()->at(i);
3370 switch (elm.type) {
3371 case TextElement::ATOM: {
3372 stream()->Add("'%w'", elm.data.u_atom->data());
3373 break;
3374 }
3375 case TextElement::CHAR_CLASS: {
3376 RegExpCharacterClass* node = elm.data.u_char_class;
3377 stream()->Add("[");
3378 if (node->is_negated())
3379 stream()->Add("^");
3380 for (int j = 0; j < node->ranges()->length(); j++) {
3381 CharacterRange range = node->ranges()->at(j);
3382 stream()->Add("%k-%k", range.from(), range.to());
3383 }
3384 stream()->Add("]");
3385 break;
3386 }
3387 default:
3388 UNREACHABLE();
3389 }
3390 }
3391 stream()->Add("\", shape=box, peripheries=2];\n");
3392 PrintAttributes(that);
3393 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3394 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003395}
3396
3397
3398void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3399 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3400 that,
3401 that->start_register(),
3402 that->end_register());
3403 PrintAttributes(that);
3404 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3405 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003406}
3407
3408
3409void DotPrinter::VisitEnd(EndNode* that) {
3410 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3411 PrintAttributes(that);
3412}
3413
3414
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003415void DotPrinter::VisitAssertion(AssertionNode* that) {
3416 stream()->Add(" n%p [", that);
3417 switch (that->type()) {
3418 case AssertionNode::AT_END:
3419 stream()->Add("label=\"$\", shape=septagon");
3420 break;
3421 case AssertionNode::AT_START:
3422 stream()->Add("label=\"^\", shape=septagon");
3423 break;
3424 case AssertionNode::AT_BOUNDARY:
3425 stream()->Add("label=\"\\b\", shape=septagon");
3426 break;
3427 case AssertionNode::AT_NON_BOUNDARY:
3428 stream()->Add("label=\"\\B\", shape=septagon");
3429 break;
3430 case AssertionNode::AFTER_NEWLINE:
3431 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3432 break;
3433 }
3434 stream()->Add("];\n");
3435 PrintAttributes(that);
3436 RegExpNode* successor = that->on_success();
3437 stream()->Add(" n%p -> n%p;\n", that, successor);
3438 Visit(successor);
3439}
3440
3441
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003442void DotPrinter::VisitAction(ActionNode* that) {
3443 stream()->Add(" n%p [", that);
3444 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003445 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003446 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3447 that->data_.u_store_register.reg,
3448 that->data_.u_store_register.value);
3449 break;
3450 case ActionNode::INCREMENT_REGISTER:
3451 stream()->Add("label=\"$%i++\", shape=octagon",
3452 that->data_.u_increment_register.reg);
3453 break;
3454 case ActionNode::STORE_POSITION:
3455 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3456 that->data_.u_position_register.reg);
3457 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003458 case ActionNode::BEGIN_SUBMATCH:
3459 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3460 that->data_.u_submatch.current_position_register);
3461 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003462 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003463 stream()->Add("label=\"escape\", shape=septagon");
3464 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003465 case ActionNode::EMPTY_MATCH_CHECK:
3466 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3467 that->data_.u_empty_match_check.start_register,
3468 that->data_.u_empty_match_check.repetition_register,
3469 that->data_.u_empty_match_check.repetition_limit);
3470 break;
3471 case ActionNode::CLEAR_CAPTURES: {
3472 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3473 that->data_.u_clear_captures.range_from,
3474 that->data_.u_clear_captures.range_to);
3475 break;
3476 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003477 }
3478 stream()->Add("];\n");
3479 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003480 RegExpNode* successor = that->on_success();
3481 stream()->Add(" n%p -> n%p;\n", that, successor);
3482 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003483}
3484
3485
3486class DispatchTableDumper {
3487 public:
3488 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3489 void Call(uc16 key, DispatchTable::Entry entry);
3490 StringStream* stream() { return stream_; }
3491 private:
3492 StringStream* stream_;
3493};
3494
3495
3496void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3497 stream()->Add("[%k-%k]: {", key, entry.to());
3498 OutSet* set = entry.out_set();
3499 bool first = true;
3500 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3501 if (set->Get(i)) {
3502 if (first) {
3503 first = false;
3504 } else {
3505 stream()->Add(", ");
3506 }
3507 stream()->Add("%i", i);
3508 }
3509 }
3510 stream()->Add("}\n");
3511}
3512
3513
3514void DispatchTable::Dump() {
3515 HeapStringAllocator alloc;
3516 StringStream stream(&alloc);
3517 DispatchTableDumper dumper(&stream);
3518 tree()->ForEach(&dumper);
3519 OS::PrintError("%s", *stream.ToCString());
3520}
3521
3522
3523void RegExpEngine::DotPrint(const char* label,
3524 RegExpNode* node,
3525 bool ignore_case) {
3526 DotPrinter printer(ignore_case);
3527 printer.PrintNode(label, node);
3528}
3529
3530
3531#endif // DEBUG
3532
3533
3534// -------------------------------------------------------------------
3535// Tree to graph conversion
3536
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003537static const int kSpaceRangeCount = 20;
3538static const int kSpaceRangeAsciiCount = 4;
3539static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3540 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3541 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3542
3543static const int kWordRangeCount = 8;
3544static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3545 '_', 'a', 'z' };
3546
3547static const int kDigitRangeCount = 2;
3548static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3549
3550static const int kLineTerminatorRangeCount = 6;
3551static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3552 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003553
3554RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003555 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003556 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3557 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003558 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003559}
3560
3561
3562RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003563 RegExpNode* on_success) {
3564 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003565}
3566
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003567static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3568 const uc16* special_class,
3569 int length) {
3570 ASSERT(ranges->length() != 0);
3571 ASSERT(length != 0);
3572 ASSERT(special_class[0] != 0);
3573 if (ranges->length() != (length >> 1) + 1) {
3574 return false;
3575 }
3576 CharacterRange range = ranges->at(0);
3577 if (range.from() != 0) {
3578 return false;
3579 }
3580 for (int i = 0; i < length; i += 2) {
3581 if (special_class[i] != (range.to() + 1)) {
3582 return false;
3583 }
3584 range = ranges->at((i >> 1) + 1);
3585 if (special_class[i+1] != range.from() - 1) {
3586 return false;
3587 }
3588 }
3589 if (range.to() != 0xffff) {
3590 return false;
3591 }
3592 return true;
3593}
3594
3595
3596static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3597 const uc16* special_class,
3598 int length) {
3599 if (ranges->length() * 2 != length) {
3600 return false;
3601 }
3602 for (int i = 0; i < length; i += 2) {
3603 CharacterRange range = ranges->at(i >> 1);
3604 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3605 return false;
3606 }
3607 }
3608 return true;
3609}
3610
3611
3612bool RegExpCharacterClass::is_standard() {
3613 // TODO(lrn): Remove need for this function, by not throwing away information
3614 // along the way.
3615 if (is_negated_) {
3616 return false;
3617 }
3618 if (set_.is_standard()) {
3619 return true;
3620 }
3621 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3622 set_.set_standard_set_type('s');
3623 return true;
3624 }
3625 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3626 set_.set_standard_set_type('S');
3627 return true;
3628 }
3629 if (CompareInverseRanges(set_.ranges(),
3630 kLineTerminatorRanges,
3631 kLineTerminatorRangeCount)) {
3632 set_.set_standard_set_type('.');
3633 return true;
3634 }
3635 return false;
3636}
3637
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003638
3639RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003640 RegExpNode* on_success) {
3641 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003642}
3643
3644
3645RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003646 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003647 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3648 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003649 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003650 for (int i = 0; i < length; i++) {
3651 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003652 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003653 result->AddAlternative(alternative);
3654 }
3655 return result;
3656}
3657
3658
3659RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003660 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003661 return ToNode(min(),
3662 max(),
3663 is_greedy(),
3664 body(),
3665 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003666 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003667}
3668
3669
3670RegExpNode* RegExpQuantifier::ToNode(int min,
3671 int max,
3672 bool is_greedy,
3673 RegExpTree* body,
3674 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003675 RegExpNode* on_success,
3676 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003677 // x{f, t} becomes this:
3678 //
3679 // (r++)<-.
3680 // | `
3681 // | (x)
3682 // v ^
3683 // (r=0)-->(?)---/ [if r < t]
3684 // |
3685 // [if r >= f] \----> ...
3686 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003687
3688 // 15.10.2.5 RepeatMatcher algorithm.
3689 // The parser has already eliminated the case where max is 0. In the case
3690 // where max_match is zero the parser has removed the quantifier if min was
3691 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3692
3693 // If we know that we cannot match zero length then things are a little
3694 // simpler since we don't need to make the special zero length match check
3695 // from step 2.1. If the min and max are small we can unroll a little in
3696 // this case.
3697 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3698 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3699 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003700 bool body_can_be_empty = (body->min_match() == 0);
3701 int body_start_reg = RegExpCompiler::kNoRegister;
3702 Interval capture_registers = body->CaptureRegisters();
3703 bool needs_capture_clearing = !capture_registers.is_empty();
3704 if (body_can_be_empty) {
3705 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00003706 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003707 // Only unroll if there are no captures and the body can't be
3708 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003709 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3710 int new_max = (max == kInfinity) ? max : max - min;
3711 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003712 RegExpNode* answer = ToNode(
3713 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003714 // Unroll the forced matches from 0 to min. This can cause chains of
3715 // TextNodes (which the parser does not generate). These should be
3716 // combined if it turns out they hinder good code generation.
3717 for (int i = 0; i < min; i++) {
3718 answer = body->ToNode(compiler, answer);
3719 }
3720 return answer;
3721 }
3722 if (max <= kMaxUnrolledMaxMatches) {
3723 ASSERT(min == 0);
3724 // Unroll the optional matches up to max.
3725 RegExpNode* answer = on_success;
3726 for (int i = 0; i < max; i++) {
3727 ChoiceNode* alternation = new ChoiceNode(2);
3728 if (is_greedy) {
3729 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3730 answer)));
3731 alternation->AddAlternative(GuardedAlternative(on_success));
3732 } else {
3733 alternation->AddAlternative(GuardedAlternative(on_success));
3734 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3735 answer)));
3736 }
3737 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003738 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003739 }
3740 return answer;
3741 }
3742 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003743 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003744 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003745 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003746 int reg_ctr = needs_counter
3747 ? compiler->AllocateRegister()
3748 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003749 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003750 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003751 RegExpNode* loop_return = needs_counter
3752 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3753 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003754 if (body_can_be_empty) {
3755 // If the body can be empty we need to check if it was and then
3756 // backtrack.
3757 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3758 reg_ctr,
3759 min,
3760 loop_return);
3761 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003762 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003763 if (body_can_be_empty) {
3764 // If the body can be empty we need to store the start position
3765 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003766 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00003767 }
3768 if (needs_capture_clearing) {
3769 // Before entering the body of this loop we need to clear captures.
3770 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3771 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003772 GuardedAlternative body_alt(body_node);
3773 if (has_max) {
3774 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3775 body_alt.AddGuard(body_guard);
3776 }
3777 GuardedAlternative rest_alt(on_success);
3778 if (has_min) {
3779 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3780 rest_alt.AddGuard(rest_guard);
3781 }
3782 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003783 center->AddLoopAlternative(body_alt);
3784 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003785 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003786 center->AddContinueAlternative(rest_alt);
3787 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003788 }
3789 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003790 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003791 } else {
3792 return center;
3793 }
3794}
3795
3796
3797RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003798 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003799 NodeInfo info;
3800 switch (type()) {
3801 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003802 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003803 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003804 return AssertionNode::AtStart(on_success);
3805 case BOUNDARY:
3806 return AssertionNode::AtBoundary(on_success);
3807 case NON_BOUNDARY:
3808 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003809 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003810 return AssertionNode::AtEnd(on_success);
3811 case END_OF_LINE: {
3812 // Compile $ in multiline regexps as an alternation with a positive
3813 // lookahead in one side and an end-of-input on the other side.
3814 // We need two registers for the lookahead.
3815 int stack_pointer_register = compiler->AllocateRegister();
3816 int position_register = compiler->AllocateRegister();
3817 // The ChoiceNode to distinguish between a newline and end-of-input.
3818 ChoiceNode* result = new ChoiceNode(2);
3819 // Create a newline atom.
3820 ZoneList<CharacterRange>* newline_ranges =
3821 new ZoneList<CharacterRange>(3);
3822 CharacterRange::AddClassEscape('n', newline_ranges);
3823 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3824 TextNode* newline_matcher = new TextNode(
3825 newline_atom,
3826 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3827 position_register,
3828 0, // No captures inside.
3829 -1, // Ignored if no captures.
3830 on_success));
3831 // Create an end-of-input matcher.
3832 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3833 stack_pointer_register,
3834 position_register,
3835 newline_matcher);
3836 // Add the two alternatives to the ChoiceNode.
3837 GuardedAlternative eol_alternative(end_of_line);
3838 result->AddAlternative(eol_alternative);
3839 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3840 result->AddAlternative(end_alternative);
3841 return result;
3842 }
3843 default:
3844 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003845 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003846 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003847}
3848
3849
3850RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003851 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003852 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3853 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003854 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003855}
3856
3857
3858RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003859 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003860 return on_success;
3861}
3862
3863
3864RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003865 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003866 int stack_pointer_register = compiler->AllocateRegister();
3867 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003868
3869 const int registers_per_capture = 2;
3870 const int register_of_first_capture = 2;
3871 int register_count = capture_count_ * registers_per_capture;
3872 int register_start =
3873 register_of_first_capture + capture_from_ * registers_per_capture;
3874
ager@chromium.org8bb60582008-12-11 12:02:20 +00003875 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003876 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003877 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003878 stack_pointer_register,
3879 position_register,
3880 body()->ToNode(
3881 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003882 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3883 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003884 register_count,
3885 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003886 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003887 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003888 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003889 // We use a ChoiceNode for a negative lookahead because it has most of
3890 // the characteristics we need. It has the body of the lookahead as its
3891 // first alternative and the expression after the lookahead of the second
3892 // alternative. If the first alternative succeeds then the
3893 // NegativeSubmatchSuccess will unwind the stack including everything the
3894 // choice node set up and backtrack. If the first alternative fails then
3895 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003896 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3897 // ChoiceNode that knows to ignore the first exit when calculating quick
3898 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003899 GuardedAlternative body_alt(
3900 body()->ToNode(
3901 compiler,
3902 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003903 position_register,
3904 register_count,
3905 register_start)));
3906 ChoiceNode* choice_node =
3907 new NegativeLookaheadChoiceNode(body_alt,
3908 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003909 return ActionNode::BeginSubmatch(stack_pointer_register,
3910 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003911 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003912 }
3913}
3914
3915
3916RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003917 RegExpNode* on_success) {
3918 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003919}
3920
3921
3922RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3923 int index,
3924 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003925 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003926 int start_reg = RegExpCapture::StartRegister(index);
3927 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003928 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003929 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003930 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003931}
3932
3933
3934RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003935 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003936 ZoneList<RegExpTree*>* children = nodes();
3937 RegExpNode* current = on_success;
3938 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003939 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003940 }
3941 return current;
3942}
3943
3944
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003945static void AddClass(const uc16* elmv,
3946 int elmc,
3947 ZoneList<CharacterRange>* ranges) {
3948 for (int i = 0; i < elmc; i += 2) {
3949 ASSERT(elmv[i] <= elmv[i + 1]);
3950 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3951 }
3952}
3953
3954
3955static void AddClassNegated(const uc16 *elmv,
3956 int elmc,
3957 ZoneList<CharacterRange>* ranges) {
3958 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003959 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003960 uc16 last = 0x0000;
3961 for (int i = 0; i < elmc; i += 2) {
3962 ASSERT(last <= elmv[i] - 1);
3963 ASSERT(elmv[i] <= elmv[i + 1]);
3964 ranges->Add(CharacterRange(last, elmv[i] - 1));
3965 last = elmv[i + 1] + 1;
3966 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003967 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003968}
3969
3970
3971void CharacterRange::AddClassEscape(uc16 type,
3972 ZoneList<CharacterRange>* ranges) {
3973 switch (type) {
3974 case 's':
3975 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3976 break;
3977 case 'S':
3978 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3979 break;
3980 case 'w':
3981 AddClass(kWordRanges, kWordRangeCount, ranges);
3982 break;
3983 case 'W':
3984 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3985 break;
3986 case 'd':
3987 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3988 break;
3989 case 'D':
3990 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3991 break;
3992 case '.':
3993 AddClassNegated(kLineTerminatorRanges,
3994 kLineTerminatorRangeCount,
3995 ranges);
3996 break;
3997 // This is not a character range as defined by the spec but a
3998 // convenient shorthand for a character class that matches any
3999 // character.
4000 case '*':
4001 ranges->Add(CharacterRange::Everything());
4002 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004003 // This is the set of characters matched by the $ and ^ symbols
4004 // in multiline mode.
4005 case 'n':
4006 AddClass(kLineTerminatorRanges,
4007 kLineTerminatorRangeCount,
4008 ranges);
4009 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004010 default:
4011 UNREACHABLE();
4012 }
4013}
4014
4015
4016Vector<const uc16> CharacterRange::GetWordBounds() {
4017 return Vector<const uc16>(kWordRanges, kWordRangeCount);
4018}
4019
4020
4021class CharacterRangeSplitter {
4022 public:
4023 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4024 ZoneList<CharacterRange>** excluded)
4025 : included_(included),
4026 excluded_(excluded) { }
4027 void Call(uc16 from, DispatchTable::Entry entry);
4028
4029 static const int kInBase = 0;
4030 static const int kInOverlay = 1;
4031
4032 private:
4033 ZoneList<CharacterRange>** included_;
4034 ZoneList<CharacterRange>** excluded_;
4035};
4036
4037
4038void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4039 if (!entry.out_set()->Get(kInBase)) return;
4040 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4041 ? included_
4042 : excluded_;
4043 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4044 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4045}
4046
4047
4048void CharacterRange::Split(ZoneList<CharacterRange>* base,
4049 Vector<const uc16> overlay,
4050 ZoneList<CharacterRange>** included,
4051 ZoneList<CharacterRange>** excluded) {
4052 ASSERT_EQ(NULL, *included);
4053 ASSERT_EQ(NULL, *excluded);
4054 DispatchTable table;
4055 for (int i = 0; i < base->length(); i++)
4056 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4057 for (int i = 0; i < overlay.length(); i += 2) {
4058 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4059 CharacterRangeSplitter::kInOverlay);
4060 }
4061 CharacterRangeSplitter callback(included, excluded);
4062 table.ForEach(&callback);
4063}
4064
4065
4066void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
4067 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4068 if (IsSingleton()) {
4069 // If this is a singleton we just expand the one character.
4070 int length = uncanonicalize.get(from(), '\0', chars);
4071 for (int i = 0; i < length; i++) {
4072 uc32 chr = chars[i];
4073 if (chr != from()) {
4074 ranges->Add(CharacterRange::Singleton(chars[i]));
4075 }
4076 }
4077 } else if (from() <= kRangeCanonicalizeMax &&
4078 to() <= kRangeCanonicalizeMax) {
4079 // If this is a range we expand the characters block by block,
4080 // expanding contiguous subranges (blocks) one at a time.
4081 // The approach is as follows. For a given start character we
4082 // look up the block that contains it, for instance 'a' if the
4083 // start character is 'c'. A block is characterized by the property
4084 // that all characters uncanonicalize in the same way as the first
4085 // element, except that each entry in the result is incremented
4086 // by the distance from the first element. So a-z is a block
4087 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
4088 // uncanonicalizes to ['a' + k, 'A' + k].
4089 // Once we've found the start point we look up its uncanonicalization
4090 // and produce a range for each element. For instance for [c-f]
4091 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
4092 // add a range if it is not already contained in the input, so [c-f]
4093 // will be skipped but [C-F] will be added. If this range is not
4094 // completely contained in a block we do this for all the blocks
4095 // covered by the range.
4096 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4097 // First, look up the block that contains the 'from' character.
4098 int length = canonrange.get(from(), '\0', range);
4099 if (length == 0) {
4100 range[0] = from();
4101 } else {
4102 ASSERT_EQ(1, length);
4103 }
4104 int pos = from();
4105 // The start of the current block. Note that except for the first
4106 // iteration 'start' is always equal to 'pos'.
4107 int start;
4108 // If it is not the start point of a block the entry contains the
4109 // offset of the character from the start point.
4110 if ((range[0] & kStartMarker) == 0) {
4111 start = pos - range[0];
4112 } else {
4113 start = pos;
4114 }
4115 // Then we add the ranges on at a time, incrementing the current
4116 // position to be after the last block each time. The position
4117 // always points to the start of a block.
4118 while (pos < to()) {
4119 length = canonrange.get(start, '\0', range);
4120 if (length == 0) {
4121 range[0] = start;
4122 } else {
4123 ASSERT_EQ(1, length);
4124 }
4125 ASSERT((range[0] & kStartMarker) != 0);
4126 // The start point of a block contains the distance to the end
4127 // of the range.
4128 int block_end = start + (range[0] & kPayloadMask) - 1;
4129 int end = (block_end > to()) ? to() : block_end;
4130 length = uncanonicalize.get(start, '\0', range);
4131 for (int i = 0; i < length; i++) {
4132 uc32 c = range[i];
4133 uc16 range_from = c + (pos - start);
4134 uc16 range_to = c + (end - start);
4135 if (!(from() <= range_from && range_to <= to())) {
4136 ranges->Add(CharacterRange(range_from, range_to));
4137 }
4138 }
4139 start = pos = block_end + 1;
4140 }
4141 } else {
4142 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
4143 }
4144}
4145
4146
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004147ZoneList<CharacterRange>* CharacterSet::ranges() {
4148 if (ranges_ == NULL) {
4149 ranges_ = new ZoneList<CharacterRange>(2);
4150 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4151 }
4152 return ranges_;
4153}
4154
4155
4156
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004157// -------------------------------------------------------------------
4158// Interest propagation
4159
4160
4161RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4162 for (int i = 0; i < siblings_.length(); i++) {
4163 RegExpNode* sibling = siblings_.Get(i);
4164 if (sibling->info()->Matches(info))
4165 return sibling;
4166 }
4167 return NULL;
4168}
4169
4170
4171RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4172 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004173 siblings_.Ensure(this);
4174 RegExpNode* result = TryGetSibling(info);
4175 if (result != NULL) return result;
4176 result = this->Clone();
4177 NodeInfo* new_info = result->info();
4178 new_info->ResetCompilationState();
4179 new_info->AddFromPreceding(info);
4180 AddSibling(result);
4181 *cloned = true;
4182 return result;
4183}
4184
4185
4186template <class C>
4187static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4188 NodeInfo full_info(*node->info());
4189 full_info.AddFromPreceding(info);
4190 bool cloned = false;
4191 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4192}
4193
4194
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004195// -------------------------------------------------------------------
4196// Splay tree
4197
4198
4199OutSet* OutSet::Extend(unsigned value) {
4200 if (Get(value))
4201 return this;
4202 if (successors() != NULL) {
4203 for (int i = 0; i < successors()->length(); i++) {
4204 OutSet* successor = successors()->at(i);
4205 if (successor->Get(value))
4206 return successor;
4207 }
4208 } else {
4209 successors_ = new ZoneList<OutSet*>(2);
4210 }
4211 OutSet* result = new OutSet(first_, remaining_);
4212 result->Set(value);
4213 successors()->Add(result);
4214 return result;
4215}
4216
4217
4218void OutSet::Set(unsigned value) {
4219 if (value < kFirstLimit) {
4220 first_ |= (1 << value);
4221 } else {
4222 if (remaining_ == NULL)
4223 remaining_ = new ZoneList<unsigned>(1);
4224 if (remaining_->is_empty() || !remaining_->Contains(value))
4225 remaining_->Add(value);
4226 }
4227}
4228
4229
4230bool OutSet::Get(unsigned value) {
4231 if (value < kFirstLimit) {
4232 return (first_ & (1 << value)) != 0;
4233 } else if (remaining_ == NULL) {
4234 return false;
4235 } else {
4236 return remaining_->Contains(value);
4237 }
4238}
4239
4240
4241const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4242const DispatchTable::Entry DispatchTable::Config::kNoValue;
4243
4244
4245void DispatchTable::AddRange(CharacterRange full_range, int value) {
4246 CharacterRange current = full_range;
4247 if (tree()->is_empty()) {
4248 // If this is the first range we just insert into the table.
4249 ZoneSplayTree<Config>::Locator loc;
4250 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4251 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4252 return;
4253 }
4254 // First see if there is a range to the left of this one that
4255 // overlaps.
4256 ZoneSplayTree<Config>::Locator loc;
4257 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4258 Entry* entry = &loc.value();
4259 // If we've found a range that overlaps with this one, and it
4260 // starts strictly to the left of this one, we have to fix it
4261 // because the following code only handles ranges that start on
4262 // or after the start point of the range we're adding.
4263 if (entry->from() < current.from() && entry->to() >= current.from()) {
4264 // Snap the overlapping range in half around the start point of
4265 // the range we're adding.
4266 CharacterRange left(entry->from(), current.from() - 1);
4267 CharacterRange right(current.from(), entry->to());
4268 // The left part of the overlapping range doesn't overlap.
4269 // Truncate the whole entry to be just the left part.
4270 entry->set_to(left.to());
4271 // The right part is the one that overlaps. We add this part
4272 // to the map and let the next step deal with merging it with
4273 // the range we're adding.
4274 ZoneSplayTree<Config>::Locator loc;
4275 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4276 loc.set_value(Entry(right.from(),
4277 right.to(),
4278 entry->out_set()));
4279 }
4280 }
4281 while (current.is_valid()) {
4282 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4283 (loc.value().from() <= current.to()) &&
4284 (loc.value().to() >= current.from())) {
4285 Entry* entry = &loc.value();
4286 // We have overlap. If there is space between the start point of
4287 // the range we're adding and where the overlapping range starts
4288 // then we have to add a range covering just that space.
4289 if (current.from() < entry->from()) {
4290 ZoneSplayTree<Config>::Locator ins;
4291 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4292 ins.set_value(Entry(current.from(),
4293 entry->from() - 1,
4294 empty()->Extend(value)));
4295 current.set_from(entry->from());
4296 }
4297 ASSERT_EQ(current.from(), entry->from());
4298 // If the overlapping range extends beyond the one we want to add
4299 // we have to snap the right part off and add it separately.
4300 if (entry->to() > current.to()) {
4301 ZoneSplayTree<Config>::Locator ins;
4302 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4303 ins.set_value(Entry(current.to() + 1,
4304 entry->to(),
4305 entry->out_set()));
4306 entry->set_to(current.to());
4307 }
4308 ASSERT(entry->to() <= current.to());
4309 // The overlapping range is now completely contained by the range
4310 // we're adding so we can just update it and move the start point
4311 // of the range we're adding just past it.
4312 entry->AddValue(value);
4313 // Bail out if the last interval ended at 0xFFFF since otherwise
4314 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004315 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004316 break;
4317 ASSERT(entry->to() + 1 > current.from());
4318 current.set_from(entry->to() + 1);
4319 } else {
4320 // There is no overlap so we can just add the range
4321 ZoneSplayTree<Config>::Locator ins;
4322 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4323 ins.set_value(Entry(current.from(),
4324 current.to(),
4325 empty()->Extend(value)));
4326 break;
4327 }
4328 }
4329}
4330
4331
4332OutSet* DispatchTable::Get(uc16 value) {
4333 ZoneSplayTree<Config>::Locator loc;
4334 if (!tree()->FindGreatestLessThan(value, &loc))
4335 return empty();
4336 Entry* entry = &loc.value();
4337 if (value <= entry->to())
4338 return entry->out_set();
4339 else
4340 return empty();
4341}
4342
4343
4344// -------------------------------------------------------------------
4345// Analysis
4346
4347
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004348void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004349 if (that->info()->been_analyzed || that->info()->being_analyzed)
4350 return;
4351 that->info()->being_analyzed = true;
4352 that->Accept(this);
4353 that->info()->being_analyzed = false;
4354 that->info()->been_analyzed = true;
4355}
4356
4357
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004358void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004359 // nothing to do
4360}
4361
4362
ager@chromium.org8bb60582008-12-11 12:02:20 +00004363void TextNode::CalculateOffsets() {
4364 int element_count = elements()->length();
4365 // Set up the offsets of the elements relative to the start. This is a fixed
4366 // quantity since a TextNode can only contain fixed-width things.
4367 int cp_offset = 0;
4368 for (int i = 0; i < element_count; i++) {
4369 TextElement& elm = elements()->at(i);
4370 elm.cp_offset = cp_offset;
4371 if (elm.type == TextElement::ATOM) {
4372 cp_offset += elm.data.u_atom->data().length();
4373 } else {
4374 cp_offset++;
4375 Vector<const uc16> quarks = elm.data.u_atom->data();
4376 }
4377 }
4378}
4379
4380
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004381void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004382 if (ignore_case_) {
4383 that->MakeCaseIndependent();
4384 }
4385 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004386 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004387}
4388
4389
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004390void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004391 RegExpNode* target = that->on_success();
4392 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004393 // If the next node is interested in what it follows then this node
4394 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004395 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004396}
4397
4398
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004399void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004400 NodeInfo* info = that->info();
4401 for (int i = 0; i < that->alternatives()->length(); i++) {
4402 RegExpNode* node = that->alternatives()->at(i).node();
4403 EnsureAnalyzed(node);
4404 // Anything the following nodes need to know has to be known by
4405 // this node also, so it can pass it on.
4406 info->AddFromFollowing(node->info());
4407 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004408}
4409
4410
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004411void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4412 NodeInfo* info = that->info();
4413 for (int i = 0; i < that->alternatives()->length(); i++) {
4414 RegExpNode* node = that->alternatives()->at(i).node();
4415 if (node != that->loop_node()) {
4416 EnsureAnalyzed(node);
4417 info->AddFromFollowing(node->info());
4418 }
4419 }
4420 // Check the loop last since it may need the value of this node
4421 // to get a correct result.
4422 EnsureAnalyzed(that->loop_node());
4423 info->AddFromFollowing(that->loop_node()->info());
4424}
4425
4426
4427void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004428 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004429}
4430
4431
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004432void Analysis::VisitAssertion(AssertionNode* that) {
4433 EnsureAnalyzed(that->on_success());
4434}
4435
4436
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004437// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004438// Dispatch table construction
4439
4440
4441void DispatchTableConstructor::VisitEnd(EndNode* that) {
4442 AddRange(CharacterRange::Everything());
4443}
4444
4445
4446void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4447 node->set_being_calculated(true);
4448 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4449 for (int i = 0; i < alternatives->length(); i++) {
4450 set_choice_index(i);
4451 alternatives->at(i).node()->Accept(this);
4452 }
4453 node->set_being_calculated(false);
4454}
4455
4456
4457class AddDispatchRange {
4458 public:
4459 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4460 : constructor_(constructor) { }
4461 void Call(uc32 from, DispatchTable::Entry entry);
4462 private:
4463 DispatchTableConstructor* constructor_;
4464};
4465
4466
4467void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4468 CharacterRange range(from, entry.to());
4469 constructor_->AddRange(range);
4470}
4471
4472
4473void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4474 if (node->being_calculated())
4475 return;
4476 DispatchTable* table = node->GetTable(ignore_case_);
4477 AddDispatchRange adder(this);
4478 table->ForEach(&adder);
4479}
4480
4481
4482void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4483 // TODO(160): Find the node that we refer back to and propagate its start
4484 // set back to here. For now we just accept anything.
4485 AddRange(CharacterRange::Everything());
4486}
4487
4488
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004489void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4490 RegExpNode* target = that->on_success();
4491 target->Accept(this);
4492}
4493
4494
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004495
4496static int CompareRangeByFrom(const CharacterRange* a,
4497 const CharacterRange* b) {
4498 return Compare<uc16>(a->from(), b->from());
4499}
4500
4501
4502void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4503 ranges->Sort(CompareRangeByFrom);
4504 uc16 last = 0;
4505 for (int i = 0; i < ranges->length(); i++) {
4506 CharacterRange range = ranges->at(i);
4507 if (last < range.from())
4508 AddRange(CharacterRange(last, range.from() - 1));
4509 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004510 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004511 return;
4512 } else {
4513 last = range.to() + 1;
4514 }
4515 }
4516 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004517 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004518}
4519
4520
4521void DispatchTableConstructor::VisitText(TextNode* that) {
4522 TextElement elm = that->elements()->at(0);
4523 switch (elm.type) {
4524 case TextElement::ATOM: {
4525 uc16 c = elm.data.u_atom->data()[0];
4526 AddRange(CharacterRange(c, c));
4527 break;
4528 }
4529 case TextElement::CHAR_CLASS: {
4530 RegExpCharacterClass* tree = elm.data.u_char_class;
4531 ZoneList<CharacterRange>* ranges = tree->ranges();
4532 if (tree->is_negated()) {
4533 AddInverse(ranges);
4534 } else {
4535 for (int i = 0; i < ranges->length(); i++)
4536 AddRange(ranges->at(i));
4537 }
4538 break;
4539 }
4540 default: {
4541 UNIMPLEMENTED();
4542 }
4543 }
4544}
4545
4546
4547void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004548 RegExpNode* target = that->on_success();
4549 target->Accept(this);
4550}
4551
4552
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004553RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4554 bool ignore_case,
4555 bool is_multiline,
4556 Handle<String> pattern,
4557 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004558 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004559 return IrregexpRegExpTooBig();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004560 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004561 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004562 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004563 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004564 0,
4565 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004566 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004567 RegExpNode* node = captured_body;
4568 if (!data->tree->IsAnchored()) {
4569 // Add a .*? at the beginning, outside the body capture, unless
4570 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004571 RegExpNode* loop_node =
4572 RegExpQuantifier::ToNode(0,
4573 RegExpTree::kInfinity,
4574 false,
4575 new RegExpCharacterClass('*'),
4576 &compiler,
4577 captured_body,
4578 data->contains_anchor);
4579
4580 if (data->contains_anchor) {
4581 // Unroll loop once, to take care of the case that might start
4582 // at the start of input.
4583 ChoiceNode* first_step_node = new ChoiceNode(2);
4584 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4585 first_step_node->AddAlternative(GuardedAlternative(
4586 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4587 node = first_step_node;
4588 } else {
4589 node = loop_node;
4590 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004591 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004592 data->node = node;
4593 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004594 analysis.EnsureAnalyzed(node);
4595
4596 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004597
ager@chromium.orgbb29dc92009-03-24 13:25:23 +00004598 if (RegExpImpl::UseNativeRegexp()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004599#ifdef ARM
ager@chromium.orgbb29dc92009-03-24 13:25:23 +00004600 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004601#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004602 RegExpMacroAssemblerIA32::Mode mode;
4603 if (is_ascii) {
4604 mode = RegExpMacroAssemblerIA32::ASCII;
4605 } else {
4606 mode = RegExpMacroAssemblerIA32::UC16;
4607 }
4608 RegExpMacroAssemblerIA32 macro_assembler(mode,
4609 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004610 return compiler.Assemble(&macro_assembler,
4611 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004612 data->capture_count,
4613 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004614#endif
4615 }
4616 EmbeddedVector<byte, 1024> codes;
4617 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4618 return compiler.Assemble(&macro_assembler,
4619 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004620 data->capture_count,
4621 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004622}
4623
4624
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004625}} // namespace v8::internal