blob: b6165c482a6f73dbae8223965bb502cc3aae7e90 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
43
44#ifdef ARM
45#include "regexp-macro-assembler-arm.h"
46#else // IA32
47#include "macro-assembler-ia32.h"
48#include "regexp-macro-assembler-ia32.h"
49#endif
50
51#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000052
53// Including pcre.h undefines DEBUG to avoid getting debug output from
54// the JSCRE implementation. Make sure to redefine it in debug mode
55// after having included the header file.
56#ifdef DEBUG
57#include "third_party/jscre/pcre.h"
58#define DEBUG
59#else
60#include "third_party/jscre/pcre.h"
61#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000062
ager@chromium.orga74f0da2008-12-03 16:05:52 +000063
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000064namespace v8 { namespace internal {
65
66
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000067static Failure* malloc_failure;
68
69static void* JSREMalloc(size_t size) {
70 Object* obj = Heap::AllocateByteArray(size);
71
72 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
73 // will return NULL to the caller, performs GC there.
74 // Also pass failure information to the caller.
75 if (obj->IsFailure()) {
76 malloc_failure = Failure::cast(obj);
77 return NULL;
78 }
79
80 // Note: object is unrooted, the caller of jsRegExpCompile must
81 // create a handle for the return value before doing heap allocation.
82 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
83}
84
85
86static void JSREFree(void* p) {
87 USE(p); // Do nothing, memory is garbage collected.
88}
89
90
91String* RegExpImpl::last_ascii_string_ = NULL;
92String* RegExpImpl::two_byte_cached_string_ = NULL;
93
94
95void RegExpImpl::NewSpaceCollectionPrologue() {
96 // The two byte string is always in the old space. The Ascii string may be
97 // in either place. If it is in the old space we don't need to do anything.
98 if (Heap::InNewSpace(last_ascii_string_)) {
99 // Invalidate the cache.
100 last_ascii_string_ = NULL;
101 two_byte_cached_string_ = NULL;
102 }
103}
104
105
106void RegExpImpl::OldSpaceCollectionPrologue() {
107 last_ascii_string_ = NULL;
108 two_byte_cached_string_ = NULL;
109}
110
111
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000112Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
113 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000114 Handle<String> flags,
115 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000116 // Ensure that the constructor function has been loaded.
117 if (!constructor->IsLoaded()) {
118 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000119 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000120 }
121 // Call the construct code with 2 arguments.
122 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
123 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000124 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000125}
126
127
128// Converts a source string to a 16 bit flat string or a SlicedString containing
129// a 16 bit flat string).
130Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
131 if (*subject == last_ascii_string_) {
132 ASSERT(two_byte_cached_string_ != NULL);
133 return Handle<String>(String::cast(two_byte_cached_string_));
134 }
135 Handle<String> two_byte_string = StringToTwoByte(subject);
136 last_ascii_string_ = *subject;
137 two_byte_cached_string_ = *two_byte_string;
138 return two_byte_string;
139}
140
141
142// Converts a source string to a 16 bit flat string or a SlicedString containing
143// a 16 bit flat string).
144Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000145 StringShape shape(*pattern);
146 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000147 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000148 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000149 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000150 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000151 String::cast(ConsString::cast(*pattern)->first()) :
152 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000153 ASSERT(flat_string->IsString());
154 StringShape flat_shape(*flat_string);
155 ASSERT(!flat_shape.IsCons());
156 ASSERT(flat_shape.IsSequential() ||
157 flat_shape.IsSliced() ||
158 flat_shape.IsExternal());
159 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000160 return flat_string;
161 }
162
ager@chromium.org870a0b62008-11-04 11:43:05 +0000163 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000164 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000165 Factory::NewRawTwoByteString(len, TENURED);
166 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
167 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000168 return two_byte_string;
169}
170
171
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000172static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
173 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000174 StringShape shape(*str);
175 for (int i = 0; i < str->length(shape); i++) {
176 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000177 case 'i':
178 flags |= JSRegExp::IGNORE_CASE;
179 break;
180 case 'g':
181 flags |= JSRegExp::GLOBAL;
182 break;
183 case 'm':
184 flags |= JSRegExp::MULTILINE;
185 break;
186 }
187 }
188 return JSRegExp::Flags(flags);
189}
190
191
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000192static inline void ThrowRegExpException(Handle<JSRegExp> re,
193 Handle<String> pattern,
194 Handle<String> error_text,
195 const char* message) {
196 Handle<JSArray> array = Factory::NewJSArray(2);
197 SetElement(array, 0, pattern);
198 SetElement(array, 1, error_text);
199 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
200 Top::Throw(*regexp_err);
201}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000202
203
ager@chromium.org8bb60582008-12-11 12:02:20 +0000204// Generic RegExp methods. Dispatches to implementation specific methods.
205
206
207class OffsetsVector {
208 public:
209 inline OffsetsVector(int num_registers)
210 : offsets_vector_length_(num_registers) {
211 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
212 vector_ = NewArray<int>(offsets_vector_length_);
213 } else {
214 vector_ = static_offsets_vector_;
215 }
216 }
217
218
219 inline ~OffsetsVector() {
220 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
221 DeleteArray(vector_);
222 vector_ = NULL;
223 }
224 }
225
226
227 inline int* vector() {
228 return vector_;
229 }
230
231
232 inline int length() {
233 return offsets_vector_length_;
234 }
235
236 private:
237 int* vector_;
238 int offsets_vector_length_;
239 static const int kStaticOffsetsVectorSize = 50;
240 static int static_offsets_vector_[kStaticOffsetsVectorSize];
241};
242
243
244int OffsetsVector::static_offsets_vector_[
245 OffsetsVector::kStaticOffsetsVectorSize];
246
247
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000248Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
249 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000250 Handle<String> flag_str) {
251 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
252 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
253 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000254 LOG(RegExpCompileEvent(re, in_cache));
255
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000256 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000257 if (in_cache) {
258 re->set_data(*cached);
259 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000260 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000261 FlattenString(pattern);
262 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000263 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000264 FlatStringReader reader(pattern);
265 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
266 // Throw an exception if we fail to parse the pattern.
267 ThrowRegExpException(re,
268 pattern,
269 parse_result.error,
270 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000271 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000272 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000273 RegExpAtom* atom = parse_result.tree->AsAtom();
274 if (atom != NULL && !flags.is_ignore_case()) {
275 if (parse_result.has_character_escapes) {
276 Vector<const uc16> atom_pattern = atom->data();
277 Handle<String> atom_string =
278 Factory::NewStringFromTwoByte(atom_pattern);
279 result = AtomCompile(re, pattern, flags, atom_string);
280 } else {
281 result = AtomCompile(re, pattern, flags, pattern);
282 }
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000283 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000284 if (FLAG_irregexp) {
285 result = IrregexpPrepare(re, pattern, flags);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000286 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000287 result = JscrePrepare(re, pattern, flags);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000288 }
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000289 }
290 Object* data = re->data();
291 if (data->IsFixedArray()) {
292 // If compilation succeeded then the data is set on the regexp
293 // and we can store it in the cache.
294 Handle<FixedArray> data(FixedArray::cast(re->data()));
295 CompilationCache::PutRegExp(pattern, flags, data);
296 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000297 }
298
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000299 return result;
300}
301
302
303Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
304 Handle<String> subject,
305 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000306 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000307 case JSRegExp::ATOM:
308 return AtomExec(regexp, subject, index);
309 case JSRegExp::IRREGEXP: {
310 Handle<Object> result = IrregexpExec(regexp, subject, index);
311 if (!result.is_null()) {
312 return result;
313 }
314 // We couldn't handle the regexp using Irregexp, so fall back
315 // on JSCRE.
316 // Reset the JSRegExp to use JSCRE.
317 JscrePrepare(regexp,
318 Handle<String>(regexp->Pattern()),
319 regexp->GetFlags());
320 // Fall-through to JSCRE.
321 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000322 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000323 if (FLAG_disable_jscre) {
324 UNIMPLEMENTED();
325 }
326 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000327 default:
328 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000329 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000330 }
331}
332
333
334Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
335 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000336 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000337 case JSRegExp::ATOM:
338 return AtomExecGlobal(regexp, subject);
339 case JSRegExp::IRREGEXP: {
340 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
341 if (!result.is_null()) {
342 return result;
343 }
344 // We couldn't handle the regexp using Irregexp, so fall back
345 // on JSCRE.
346 // Reset the JSRegExp to use JSCRE.
347 JscrePrepare(regexp,
348 Handle<String>(regexp->Pattern()),
349 regexp->GetFlags());
350 // Fall-through to JSCRE.
351 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000352 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000353 if (FLAG_disable_jscre) {
354 UNIMPLEMENTED();
355 }
356 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000357 default:
358 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000359 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000360 }
361}
362
363
ager@chromium.org8bb60582008-12-11 12:02:20 +0000364// RegExp Atom implementation: Simple string search using indexOf.
365
366
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000367Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000368 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000369 JSRegExp::Flags flags,
370 Handle<String> match_pattern) {
371 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000372 return re;
373}
374
375
376Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
377 Handle<String> subject,
378 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000379 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000380
381 uint32_t start_index;
382 if (!Array::IndexFromObject(*index, &start_index)) {
383 return Handle<Smi>(Smi::FromInt(-1));
384 }
385
386 LOG(RegExpExecEvent(re, start_index, subject));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000387 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000388 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000389
390 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000391 array->set(0, Smi::FromInt(value));
392 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000393 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000394}
395
396
397Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
398 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000399 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000400 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000401 int index = 0;
402 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000403 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000404 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000405 while (true) {
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000406 LOG(RegExpExecEvent(re, index, subject));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000407 int value = -1;
408 if (index + needle_length <= subject_length) {
409 value = Runtime::StringMatch(subject, needle, index);
410 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000411 if (value == -1) break;
412 HandleScope scope;
413 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000414
415 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000416 array->set(0, Smi::FromInt(value));
417 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000418 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000419 SetElement(result, match_count, pair);
420 match_count++;
421 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000422 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000423 }
424 return result;
425}
426
427
ager@chromium.org8bb60582008-12-11 12:02:20 +0000428// JSCRE implementation.
429
430
431int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
432 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
433 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
434}
435
436
437ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
438 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
439 return ByteArray::cast(value->get(kJscreInternalIndex));
440}
441
442
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000443Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
444 Handle<String> pattern,
445 JSRegExp::Flags flags) {
446 Handle<Object> value(Heap::undefined_value());
447 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
448 return re;
449}
450
451
ager@chromium.org8bb60582008-12-11 12:02:20 +0000452static inline Object* JscreDoCompile(String* pattern,
453 JSRegExp::Flags flags,
454 unsigned* number_of_captures,
455 const char** error_message,
456 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000457 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
458 ? v8::jscre::JSRegExpIgnoreCase
459 : v8::jscre::JSRegExpDoNotIgnoreCase;
460 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
461 ? v8::jscre::JSRegExpMultiline
462 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000463 *error_message = NULL;
464 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000465 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
466 pattern->length(),
467 case_option,
468 multiline_option,
469 number_of_captures,
470 error_message,
471 &JSREMalloc,
472 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000473 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
474 malloc_failure->IsOutOfMemoryFailure())) {
475 return malloc_failure;
476 } else {
477 // It doesn't matter which object we return here, we just need to return
478 // a non-failure to indicate to the GC-retry code that there was no
479 // allocation failure.
480 return pattern;
481 }
482}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000483
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000484
ager@chromium.org8bb60582008-12-11 12:02:20 +0000485static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
486 JSRegExp::Flags flags,
487 unsigned* number_of_captures,
488 const char** error_message,
489 v8::jscre::JscreRegExp** code) {
490 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
491 flags,
492 number_of_captures,
493 error_message,
494 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000495}
496
497
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000498Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
499 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
500 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
501
502 Handle<String> pattern(re->Pattern());
503 JSRegExp::Flags flags = re->GetFlags();
504
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000505 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
506
507 unsigned number_of_captures;
508 const char* error_message = NULL;
509
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000510 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000511 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000512
ager@chromium.org8bb60582008-12-11 12:02:20 +0000513 JscreCompileWithRetryAfterGC(two_byte_pattern,
514 flags,
515 &number_of_captures,
516 &error_message,
517 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000518
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000519 if (code == NULL) {
520 // Throw an exception.
521 Handle<JSArray> array = Factory::NewJSArray(2);
522 SetElement(array, 0, pattern);
523 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(
524 (error_message == NULL) ? "Unknown regexp error" : error_message)));
525 Handle<Object> regexp_err =
526 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000527 Top::Throw(*regexp_err);
528 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000529 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000530
531 // Convert the return address to a ByteArray pointer.
532 Handle<ByteArray> internal(
533 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
534
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000535 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
536 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
537 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000538 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
539
540 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000541}
542
543
ager@chromium.org8bb60582008-12-11 12:02:20 +0000544Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
545 Handle<String> subject,
546 Handle<Object> index) {
547 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
548 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
549 Handle<Object> compile_result = JscreCompile(regexp);
550 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000551 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000552 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000553
ager@chromium.org8bb60582008-12-11 12:02:20 +0000554 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000555
ager@chromium.org8bb60582008-12-11 12:02:20 +0000556 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000557
ager@chromium.org8bb60582008-12-11 12:02:20 +0000558 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000559
ager@chromium.org8bb60582008-12-11 12:02:20 +0000560 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000561
ager@chromium.org8bb60582008-12-11 12:02:20 +0000562 return JscreExecOnce(regexp,
563 num_captures,
564 subject,
565 previous_index,
566 subject16->GetTwoByteData(),
567 offsets.vector(),
568 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000569}
570
571
572Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
573 int num_captures,
574 Handle<String> subject,
575 int previous_index,
576 const uc16* two_byte_subject,
577 int* offsets_vector,
578 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000579 int rc;
580 {
581 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000582 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000583 const v8::jscre::JscreRegExp* js_regexp =
584 reinterpret_cast<v8::jscre::JscreRegExp*>(
585 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000586
kasperl@chromium.orgb9123622008-09-17 14:05:56 +0000587 LOG(RegExpExecEvent(regexp, previous_index, subject));
588
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000589 rc = v8::jscre::jsRegExpExecute(js_regexp,
590 two_byte_subject,
591 subject->length(),
592 previous_index,
593 offsets_vector,
594 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000595 }
596
597 // The KJS JavaScript engine returns null (ie, a failed match) when
598 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000599 if (rc == v8::jscre::JSRegExpErrorNoMatch
600 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000601 return Factory::null_value();
602 }
603
604 // Other JSRE errors:
605 if (rc < 0) {
606 // Throw an exception.
607 Handle<Object> code(Smi::FromInt(rc));
608 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
609 Handle<Object> regexp_err(
610 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
611 return Handle<Object>(Top::Throw(*regexp_err));
612 }
613
ager@chromium.org7c537e22008-10-16 08:43:32 +0000614 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000615 // The captures come in (start, end+1) pairs.
616 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000617 array->set(i, Smi::FromInt(offsets_vector[i]));
618 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000619 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000620 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000621}
622
623
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000624Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
625 Handle<String> subject) {
626 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
627 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
628 Handle<Object> compile_result = JscreCompile(regexp);
629 if (compile_result.is_null()) return compile_result;
630 }
631 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
632
633 // Prepare space for the return values.
634 int num_captures = JscreNumberOfCaptures(regexp);
635
636 OffsetsVector offsets((num_captures + 1) * 3);
637
638 int previous_index = 0;
639
640 Handle<JSArray> result = Factory::NewJSArray(0);
641 int i = 0;
642 Handle<Object> matches;
643
644 Handle<String> subject16 = CachedStringToTwoByte(subject);
645
646 do {
647 if (previous_index > subject->length() || previous_index < 0) {
648 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
649 // string length, there is no match.
650 matches = Factory::null_value();
651 } else {
652 matches = JscreExecOnce(regexp,
653 num_captures,
654 subject,
655 previous_index,
656 subject16->GetTwoByteData(),
657 offsets.vector(),
658 offsets.length());
659
660 if (matches->IsJSArray()) {
661 SetElement(result, i, matches);
662 i++;
663 previous_index = offsets.vector()[1];
664 if (offsets.vector()[0] == offsets.vector()[1]) {
665 previous_index++;
666 }
667 }
668 }
669 } while (matches->IsJSArray());
670
671 // If we exited the loop with an exception, throw it.
672 if (matches->IsNull()) {
673 // Exited loop normally.
674 return result;
675 } else {
676 // Exited loop with the exception in matches.
677 return matches;
678 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000679}
680
681
ager@chromium.org8bb60582008-12-11 12:02:20 +0000682// Irregexp implementation.
683
684
685static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
686 bool is_ascii) {
687 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
688 Handle<FixedArray> alternatives(
689 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
690 ASSERT_EQ(2, alternatives->length());
691
692 int index = is_ascii ? 0 : 1;
693 Object* entry = alternatives->get(index);
694 if (!entry->IsNull()) {
695 return Handle<FixedArray>(FixedArray::cast(entry));
696 }
697
698 // Compile the RegExp.
699 ZoneScope zone_scope(DELETE_ON_EXIT);
700
701 JSRegExp::Flags flags = re->GetFlags();
702
703 Handle<String> pattern(re->Pattern());
704 StringShape shape(*pattern);
705 if (!pattern->IsFlat(shape)) {
706 pattern->Flatten(shape);
707 }
708
709 RegExpCompileData compile_data;
710 FlatStringReader reader(pattern);
711 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
712 // Throw an exception if we fail to parse the pattern.
713 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
714 ThrowRegExpException(re,
715 pattern,
716 compile_data.error,
717 "malformed_regexp");
718 return Handle<FixedArray>::null();
719 }
720 Handle<FixedArray> compiled_entry =
721 RegExpEngine::Compile(&compile_data,
722 flags.is_ignore_case(),
723 flags.is_multiline(),
724 pattern,
725 is_ascii);
726 if (!compiled_entry.is_null()) {
727 alternatives->set(index, *compiled_entry);
728 }
729 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000730}
731
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000732
ager@chromium.org8bb60582008-12-11 12:02:20 +0000733int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
734 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000735}
736
737
ager@chromium.org8bb60582008-12-11 12:02:20 +0000738int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
739 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000740}
741
742
ager@chromium.org8bb60582008-12-11 12:02:20 +0000743Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
744 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
745 == RegExpMacroAssembler::kBytecodeImplementation);
746 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000747}
748
749
ager@chromium.org8bb60582008-12-11 12:02:20 +0000750Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
751 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
752 != RegExpMacroAssembler::kBytecodeImplementation);
753 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
754}
755
756
757Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
758 Handle<String> pattern,
759 JSRegExp::Flags flags) {
760 // Make space for ASCII and UC16 versions.
761 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
762 alternatives->set_null(0);
763 alternatives->set_null(1);
764 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
765 return re;
766}
767
768
769Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
770 Handle<String> subject,
771 Handle<Object> index) {
772 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
773 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
774
775 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
776 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
777 if (irregexp.is_null()) {
778 // We can't handle the RegExp with IRRegExp.
779 return Handle<Object>::null();
780 }
781
782 // Prepare space for the return values.
783 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
784 OffsetsVector offsets(number_of_registers);
785
786 int num_captures = IrregexpNumberOfCaptures(irregexp);
787
788 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
789
790#ifdef DEBUG
791 if (FLAG_trace_regexp_bytecodes) {
792 String* pattern = regexp->Pattern();
793 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
794 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
795 }
796#endif
797 LOG(RegExpExecEvent(regexp, previous_index, subject));
798 return IrregexpExecOnce(irregexp,
799 num_captures,
800 subject,
801 previous_index,
802 offsets.vector(),
803 offsets.length());
804}
805
806
807Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
808 Handle<String> subject) {
809 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
810
811 StringShape shape(*subject);
812 bool is_ascii = shape.IsAsciiRepresentation();
813 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
814 if (irregexp.is_null()) {
815 return Handle<Object>::null();
816 }
817
818 // Prepare space for the return values.
819 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
820 OffsetsVector offsets(number_of_registers);
821
822 int previous_index = 0;
823
824 Handle<JSArray> result = Factory::NewJSArray(0);
825 int i = 0;
826 Handle<Object> matches;
827
828 if (!subject->IsFlat(shape)) {
829 subject->Flatten(shape);
830 }
831
832 do {
833 if (previous_index > subject->length() || previous_index < 0) {
834 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
835 // string length, there is no match.
836 matches = Factory::null_value();
837 } else {
838#ifdef DEBUG
839 if (FLAG_trace_regexp_bytecodes) {
840 String* pattern = regexp->Pattern();
841 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
842 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
843 }
844#endif
845 LOG(RegExpExecEvent(regexp, previous_index, subject));
846 matches = IrregexpExecOnce(irregexp,
847 IrregexpNumberOfCaptures(irregexp),
848 subject,
849 previous_index,
850 offsets.vector(),
851 offsets.length());
852
853 if (matches->IsJSArray()) {
854 SetElement(result, i, matches);
855 i++;
856 previous_index = offsets.vector()[1];
857 if (offsets.vector()[0] == offsets.vector()[1]) {
858 previous_index++;
859 }
860 }
861 }
862 } while (matches->IsJSArray());
863
864 // If we exited the loop with an exception, throw it.
865 if (matches->IsNull()) {
866 // Exited loop normally.
867 return result;
868 } else {
869 // Exited loop with the exception in matches.
870 return matches;
871 }
872}
873
874
875Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
876 int num_captures,
877 Handle<String> subject,
878 int previous_index,
879 int* offsets_vector,
880 int offsets_vector_length) {
881 bool rc;
882
883 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
884
885 if (!subject->IsFlat(StringShape(*subject))) {
886 FlattenString(subject);
887 }
888
889 switch (tag) {
890 case RegExpMacroAssembler::kIA32Implementation: {
891#ifndef ARM
892 Handle<Code> code = IrregexpNativeCode(irregexp);
893
894 StringShape shape(*subject);
895
896 // Character offsets into string.
897 int start_offset = previous_index;
898 int end_offset = subject->length(shape);
899
900 if (shape.IsCons()) {
901 subject = Handle<String>(ConsString::cast(*subject)->first());
902 } else if (shape.IsSliced()) {
903 SlicedString* slice = SlicedString::cast(*subject);
904 start_offset += slice->start();
905 end_offset += slice->start();
906 subject = Handle<String>(slice->buffer());
907 }
908
909 // String is now either Sequential or External
910 StringShape flatshape(*subject);
911 bool is_ascii = flatshape.IsAsciiRepresentation();
912 int char_size_shift = is_ascii ? 0 : 1;
913
914 if (flatshape.IsExternal()) {
915 const byte* address;
916 if (is_ascii) {
917 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
918 address = reinterpret_cast<const byte*>(ext->resource()->data());
919 } else {
920 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
921 address = reinterpret_cast<const byte*>(ext->resource()->data());
922 }
923 rc = RegExpMacroAssemblerIA32::Execute(
924 *code,
925 &address,
926 start_offset << char_size_shift,
927 end_offset << char_size_shift,
928 offsets_vector,
929 previous_index == 0);
930 } else { // Sequential string
931 Address char_address =
932 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
933 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
934 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
935 rc = RegExpMacroAssemblerIA32::Execute(
936 *code,
937 subject.location(),
938 byte_offset + (start_offset << char_size_shift),
939 byte_offset + (end_offset << char_size_shift),
940 offsets_vector,
941 previous_index == 0);
942 }
943
944 if (rc) {
945 // Capture values are relative to start_offset only.
946 for (int i = 0; i < offsets_vector_length; i++) {
947 if (offsets_vector[i] >= 0) {
948 offsets_vector[i] += previous_index;
949 }
950 }
951 }
952 break;
953#else
954 UNIMPLEMENTED();
955 rc = false;
956 break;
957#endif
958 }
959 case RegExpMacroAssembler::kBytecodeImplementation: {
960 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
961 offsets_vector[i] = -1;
962 }
963 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
964
965 rc = IrregexpInterpreter::Match(byte_codes,
966 subject,
967 offsets_vector,
968 previous_index);
969 break;
970 }
971 case RegExpMacroAssembler::kARMImplementation:
972 default:
973 UNREACHABLE();
974 rc = false;
975 break;
976 }
977
978 if (!rc) {
979 return Factory::null_value();
980 }
981
982 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
983 // The captures come in (start, end+1) pairs.
984 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
985 array->set(i, Smi::FromInt(offsets_vector[i]));
986 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
987 }
988 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000989}
990
991
992// -------------------------------------------------------------------
993// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000994//
995// The Irregexp regular expression engine is intended to be a complete
996// implementation of ECMAScript regular expressions. It generates either
997// bytecodes or native code.
998
999// The Irregexp regexp engine is structured in three steps.
1000// 1) The parser generates an abstract syntax tree. See ast.cc.
1001// 2) From the AST a node network is created. The nodes are all
1002// subclasses of RegExpNode. The nodes represent states when
1003// executing a regular expression. Several optimizations are
1004// performed on the node network.
1005// 3) From the nodes we generate either byte codes or native code
1006// that can actually execute the regular expression (perform
1007// the search). The code generation step is described in more
1008// detail below.
1009
1010// Code generation.
1011//
1012// The nodes are divided into four main categories.
1013// * Choice nodes
1014// These represent places where the regular expression can
1015// match in more than one way. For example on entry to an
1016// alternation (foo|bar) or a repetition (*, +, ? or {}).
1017// * Action nodes
1018// These represent places where some action should be
1019// performed. Examples include recording the current position
1020// in the input string to a register (in order to implement
1021// captures) or other actions on register for example in order
1022// to implement the counters needed for {} repetitions.
1023// * Matching nodes
1024// These attempt to match some element part of the input string.
1025// Examples of elements include character classes, plain strings
1026// or back references.
1027// * End nodes
1028// These are used to implement the actions required on finding
1029// a successful match or failing to find a match.
1030//
1031// The code generated (whether as byte codes or native code) maintains
1032// some state as it runs. This consists of the following elements:
1033//
1034// * The capture registers. Used for string captures.
1035// * Other registers. Used for counters etc.
1036// * The current position.
1037// * The stack of backtracking information. Used when a matching node
1038// fails to find a match and needs to try an alternative.
1039//
1040// Conceptual regular expression execution model:
1041//
1042// There is a simple conceptual model of regular expression execution
1043// which will be presented first. The actual code generated is a more
1044// efficient simulation of the simple conceptual model:
1045//
1046// * Choice nodes are implemented as follows:
1047// For each choice except the last {
1048// push current position
1049// push backtrack code location
1050// <generate code to test for choice>
1051// backtrack code location:
1052// pop current position
1053// }
1054// <generate code to test for last choice>
1055//
1056// * Actions nodes are generated as follows
1057// <push affected registers on backtrack stack>
1058// <generate code to perform action>
1059// push backtrack code location
1060// <generate code to test for following nodes>
1061// backtrack code location:
1062// <pop affected registers to restore their state>
1063// <pop backtrack location from stack and go to it>
1064//
1065// * Matching nodes are generated as follows:
1066// if input string matches at current position
1067// update current position
1068// <generate code to test for following nodes>
1069// else
1070// <pop backtrack location from stack and go to it>
1071//
1072// Thus it can be seen that the current position is saved and restored
1073// by the choice nodes, whereas the registers are saved and restored by
1074// by the action nodes that manipulate them.
1075//
1076// The other interesting aspect of this model is that nodes are generated
1077// at the point where they are needed by a recursive call to Emit(). If
1078// the node has already been code generated then the Emit() call will
1079// generate a jump to the previously generated code instead. In order to
1080// limit recursion it is possible for the Emit() function to put the node
1081// on a work list for later generation and instead generate a jump. The
1082// destination of the jump is resolved later when the code is generated.
1083//
1084// Actual regular expression code generation.
1085//
1086// Code generation is actually more complicated than the above. In order
1087// to improve the efficiency of the generated code some optimizations are
1088// performed
1089//
1090// * Choice nodes have 1-character lookahead.
1091// A choice node looks at the following character and eliminates some of
1092// the choices immediately based on that character. This is not yet
1093// implemented.
1094// * Simple greedy loops store reduced backtracking information.
1095// A quantifier like /.*foo/m will greedily match the whole input. It will
1096// then need to backtrack to a point where it can match "foo". The naive
1097// implementation of this would push each character position onto the
1098// backtracking stack, then pop them off one by one. This would use space
1099// proportional to the length of the input string. However since the "."
1100// can only match in one way and always has a constant length (in this case
1101// of 1) it suffices to store the current position on the top of the stack
1102// once. Matching now becomes merely incrementing the current position and
1103// backtracking becomes decrementing the current position and checking the
1104// result against the stored current position. This is faster and saves
1105// space.
1106// * The current state is virtualized.
1107// This is used to defer expensive operations until it is clear that they
1108// are needed and to generate code for a node more than once, allowing
1109// specialized an efficient versions of the code to be created. This is
1110// explained in the section below.
1111//
1112// Execution state virtualization.
1113//
1114// Instead of emitting code, nodes that manipulate the state can record their
1115// manipulation in an object called the GenerationVariant. The
1116// GenerationVariant object can record a current position offset, an
1117// optional backtrack code location on the top of the virtualized backtrack
1118// stack and some register changes. When a node is to be emitted it can flush
1119// the GenerationVariant or update it. Flushing the GenerationVariant
1120// will emit code to bring the actual state into line with the virtual state.
1121// Avoiding flushing the state can postpone some work (eg updates of capture
1122// registers). Postponing work can save time when executing the regular
1123// expression since it may be found that the work never has to be done as a
1124// failure to match can occur. In addition it is much faster to jump to a
1125// known backtrack code location than it is to pop an unknown backtrack
1126// location from the stack and jump there.
1127//
1128// The virtual state found in the GenerationVariant affects code generation.
1129// For example the virtual state contains the difference between the actual
1130// current position and the virtual current position, and matching code needs
1131// to use this offset to attempt a match in the correct location of the input
1132// string. Therefore code generated for a non-trivial GenerationVariant is
1133// specialized to that GenerationVariant. The code generator therefore
1134// has the ability to generate code for each node several times. In order to
1135// limit the size of the generated code there is an arbitrary limit on how
1136// many specialized sets of code may be generated for a given node. If the
1137// limit is reached, the GenerationVariant is flushed and a generic version of
1138// the code for a node is emitted. This is subsequently used for that node.
1139// The code emitted for non-generic GenerationVariants is not recorded in the
1140// node and so it cannot currently be reused in the event that code generation
1141// is requested for an identical GenerationVariant.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001142
1143
1144void RegExpTree::AppendToText(RegExpText* text) {
1145 UNREACHABLE();
1146}
1147
1148
1149void RegExpAtom::AppendToText(RegExpText* text) {
1150 text->AddElement(TextElement::Atom(this));
1151}
1152
1153
1154void RegExpCharacterClass::AppendToText(RegExpText* text) {
1155 text->AddElement(TextElement::CharClass(this));
1156}
1157
1158
1159void RegExpText::AppendToText(RegExpText* text) {
1160 for (int i = 0; i < elements()->length(); i++)
1161 text->AddElement(elements()->at(i));
1162}
1163
1164
1165TextElement TextElement::Atom(RegExpAtom* atom) {
1166 TextElement result = TextElement(ATOM);
1167 result.data.u_atom = atom;
1168 return result;
1169}
1170
1171
1172TextElement TextElement::CharClass(
1173 RegExpCharacterClass* char_class) {
1174 TextElement result = TextElement(CHAR_CLASS);
1175 result.data.u_char_class = char_class;
1176 return result;
1177}
1178
1179
1180DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1181 if (table_ == NULL) {
1182 table_ = new DispatchTable();
1183 DispatchTableConstructor cons(table_, ignore_case);
1184 cons.BuildTable(this);
1185 }
1186 return table_;
1187}
1188
1189
1190class RegExpCompiler {
1191 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001192 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001193
1194 int AllocateRegister() { return next_register_++; }
1195
1196 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1197 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001198 int capture_count,
1199 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001200
1201 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1202
1203 static const int kImplementationOffset = 0;
1204 static const int kNumberOfRegistersOffset = 0;
1205 static const int kCodeOffset = 1;
1206
1207 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1208 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001209
1210 static const int kMaxRecursion = 100;
1211 inline int recursion_depth() { return recursion_depth_; }
1212 inline void IncrementRecursionDepth() { recursion_depth_++; }
1213 inline void DecrementRecursionDepth() { recursion_depth_--; }
1214
1215 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001216 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001217
1218 private:
1219 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001220 int next_register_;
1221 List<RegExpNode*>* work_list_;
1222 int recursion_depth_;
1223 RegExpMacroAssembler* macro_assembler_;
1224 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001225 bool ascii_;
1226};
1227
1228
1229class RecursionCheck {
1230 public:
1231 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1232 compiler->IncrementRecursionDepth();
1233 }
1234 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1235 private:
1236 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001237};
1238
1239
1240// Attempts to compile the regexp using an Irregexp code generator. Returns
1241// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001242RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001243 : next_register_(2 * (capture_count + 1)),
1244 work_list_(NULL),
1245 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001246 ignore_case_(ignore_case),
1247 ascii_(ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001248 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001249}
1250
1251
1252Handle<FixedArray> RegExpCompiler::Assemble(
1253 RegExpMacroAssembler* macro_assembler,
1254 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001255 int capture_count,
1256 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001257#ifdef DEBUG
1258 if (FLAG_trace_regexp_assembler)
1259 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1260 else
1261#endif
1262 macro_assembler_ = macro_assembler;
1263 List <RegExpNode*> work_list(0);
1264 work_list_ = &work_list;
1265 Label fail;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001266 macro_assembler->PushBacktrack(&fail);
1267 GenerationVariant generic_variant;
1268 if (!start->Emit(this, &generic_variant)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001269 fail.Unuse();
1270 return Handle<FixedArray>::null();
1271 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001272 macro_assembler_->Bind(&fail);
1273 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001274 while (!work_list.is_empty()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001275 if (!work_list.RemoveLast()->Emit(this, &generic_variant)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001276 return Handle<FixedArray>::null();
1277 }
1278 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001279 Handle<FixedArray> array =
1280 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1281 array->set(RegExpImpl::kIrregexpImplementationIndex,
1282 Smi::FromInt(macro_assembler_->Implementation()));
1283 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1284 Smi::FromInt(next_register_));
1285 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1286 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001287 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001288 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1289 work_list_ = NULL;
1290#ifdef DEBUG
1291 if (FLAG_trace_regexp_assembler) {
1292 delete macro_assembler_;
1293 }
1294#endif
1295 return array;
1296}
1297
1298
ager@chromium.org8bb60582008-12-11 12:02:20 +00001299bool GenerationVariant::mentions_reg(int reg) {
1300 for (DeferredAction* action = actions_;
1301 action != NULL;
1302 action = action->next()) {
1303 if (reg == action->reg()) return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001304 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001305 return false;
1306}
1307
1308
1309int GenerationVariant::FindAffectedRegisters(OutSet* affected_registers) {
1310 int max_register = -1;
1311 for (DeferredAction* action = actions_;
1312 action != NULL;
1313 action = action->next()) {
1314 affected_registers->Set(action->reg());
1315 if (action->reg() > max_register) max_register = action->reg();
1316 }
1317 return max_register;
1318}
1319
1320
1321void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* macro,
1322 int max_register,
1323 OutSet& affected_registers) {
1324 for (int reg = 0; reg <= max_register; reg++) {
1325 if (affected_registers.Get(reg)) macro->PushRegister(reg);
1326 }
1327}
1328
1329
1330void GenerationVariant::RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1331 int max_register,
1332 OutSet& affected_registers) {
1333 for (int reg = max_register; reg >= 0; reg--) {
1334 if (affected_registers.Get(reg)) macro->PopRegister(reg);
1335 }
1336}
1337
1338
1339void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* macro,
1340 int max_register,
1341 OutSet& affected_registers) {
1342 for (int reg = 0; reg <= max_register; reg++) {
1343 if (!affected_registers.Get(reg)) {
1344 continue;
1345 }
1346 int value = 0;
1347 bool absolute = false;
1348 int store_position = -1;
1349 // This is a little tricky because we are scanning the actions in reverse
1350 // historical order (newest first).
1351 for (DeferredAction* action = actions_;
1352 action != NULL;
1353 action = action->next()) {
1354 if (action->reg() == reg) {
1355 switch (action->type()) {
1356 case ActionNode::SET_REGISTER: {
1357 GenerationVariant::DeferredSetRegister* psr =
1358 static_cast<GenerationVariant::DeferredSetRegister*>(action);
1359 value += psr->value();
1360 absolute = true;
1361 ASSERT_EQ(store_position, -1);
1362 break;
1363 }
1364 case ActionNode::INCREMENT_REGISTER:
1365 if (!absolute) {
1366 value++;
1367 }
1368 ASSERT_EQ(store_position, -1);
1369 break;
1370 case ActionNode::STORE_POSITION: {
1371 GenerationVariant::DeferredCapture* pc =
1372 static_cast<GenerationVariant::DeferredCapture*>(action);
1373 if (store_position == -1) {
1374 store_position = pc->cp_offset();
1375 }
1376 ASSERT(!absolute);
1377 ASSERT_EQ(value, 0);
1378 break;
1379 }
1380 default:
1381 UNREACHABLE();
1382 break;
1383 }
1384 }
1385 }
1386 if (store_position != -1) {
1387 macro->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001388 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001389 if (absolute) {
1390 macro->SetRegister(reg, value);
1391 } else {
1392 if (value != 0) {
1393 macro->AdvanceRegister(reg, value);
1394 }
1395 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001396 }
1397 }
1398}
1399
1400
ager@chromium.org8bb60582008-12-11 12:02:20 +00001401// This is called as we come into a loop choice node and some other tricky
1402// nodes. It normalises the state of the code generator to ensure we can
1403// generate generic code.
1404bool GenerationVariant::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001405 RegExpMacroAssembler* macro = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001406
1407 ASSERT(actions_ != NULL || cp_offset_ != 0 || backtrack() != NULL);
1408
1409 if (actions_ == NULL && backtrack() == NULL) {
1410 // Here we just have some deferred cp advances to fix and we are back to
1411 // a normal situation.
1412 macro->AdvanceCurrentPosition(cp_offset_);
1413 // Create a new trivial state and generate the node with that.
1414 GenerationVariant new_state;
1415 return successor->Emit(compiler, &new_state);
1416 }
1417
1418 // Generate deferred actions here along with code to undo them again.
1419 OutSet affected_registers;
1420 int max_register = FindAffectedRegisters(&affected_registers);
1421 PushAffectedRegisters(macro, max_register, affected_registers);
1422 PerformDeferredActions(macro, max_register, affected_registers);
1423 if (backtrack() != NULL) {
1424 // Here we have a concrete backtrack location. These are set up by choice
1425 // nodes and so they indicate that we have a deferred save of the current
1426 // position which we may need to emit here.
1427 macro->PushCurrentPosition();
1428 }
1429 if (cp_offset_ != 0) {
1430 macro->AdvanceCurrentPosition(cp_offset_);
1431 }
1432
1433 // Create a new trivial state and generate the node with that.
1434 Label undo;
1435 macro->PushBacktrack(&undo);
1436 GenerationVariant new_state;
1437 bool ok = successor->Emit(compiler, &new_state);
1438
1439 // On backtrack we need to restore state.
1440 macro->Bind(&undo);
1441 if (!ok) return false;
1442 if (backtrack() != NULL) {
1443 macro->PopCurrentPosition();
1444 }
1445 RestoreAffectedRegisters(macro, max_register, affected_registers);
1446 if (backtrack() == NULL) {
1447 macro->Backtrack();
1448 } else {
1449 macro->GoTo(backtrack());
1450 }
1451
1452 return true;
1453}
1454
1455
1456void EndNode::EmitInfoChecks(RegExpMacroAssembler* macro,
1457 GenerationVariant* variant) {
1458 if (info()->at_end) {
1459 Label succeed;
1460 // LoadCurrentCharacter will go to the label if we are at the end of the
1461 // input string.
1462 macro->LoadCurrentCharacter(0, &succeed);
1463 macro->GoTo(variant->backtrack());
1464 macro->Bind(&succeed);
1465 }
1466}
1467
1468
1469bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler,
1470 GenerationVariant* variant) {
1471 if (!variant->is_trivial()) {
1472 return variant->Flush(compiler, this);
1473 }
1474 RegExpMacroAssembler* macro = compiler->macro_assembler();
1475 if (!label()->is_bound()) {
1476 macro->Bind(label());
1477 }
1478 EmitInfoChecks(macro, variant);
1479 macro->ReadCurrentPositionFromRegister(current_position_register_);
1480 macro->ReadStackPointerFromRegister(stack_pointer_register_);
1481 // Now that we have unwound the stack we find at the top of the stack the
1482 // backtrack that the BeginSubmatch node got.
1483 macro->Backtrack();
1484 return true;
1485}
1486
1487
1488bool EndNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
1489 if (!variant->is_trivial()) {
1490 return variant->Flush(compiler, this);
1491 }
1492 RegExpMacroAssembler* macro = compiler->macro_assembler();
1493 if (!label()->is_bound()) {
1494 macro->Bind(label());
1495 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001496 switch (action_) {
1497 case ACCEPT:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001498 EmitInfoChecks(macro, variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001499 macro->Succeed();
1500 return true;
1501 case BACKTRACK:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001502 ASSERT(!info()->at_end);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001503 macro->GoTo(variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001504 return true;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001505 case NEGATIVE_SUBMATCH_SUCCESS:
1506 // This case is handled in a different virtual method.
1507 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001508 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001509 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001510 return false;
1511}
1512
1513
1514void GuardedAlternative::AddGuard(Guard* guard) {
1515 if (guards_ == NULL)
1516 guards_ = new ZoneList<Guard*>(1);
1517 guards_->Add(guard);
1518}
1519
1520
ager@chromium.org8bb60582008-12-11 12:02:20 +00001521ActionNode* ActionNode::SetRegister(int reg,
1522 int val,
1523 RegExpNode* on_success) {
1524 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001525 result->data_.u_store_register.reg = reg;
1526 result->data_.u_store_register.value = val;
1527 return result;
1528}
1529
1530
1531ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1532 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1533 result->data_.u_increment_register.reg = reg;
1534 return result;
1535}
1536
1537
1538ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1539 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1540 result->data_.u_position_register.reg = reg;
1541 return result;
1542}
1543
1544
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001545ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1546 int position_reg,
1547 RegExpNode* on_success) {
1548 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1549 result->data_.u_submatch.stack_pointer_register = stack_reg;
1550 result->data_.u_submatch.current_position_register = position_reg;
1551 return result;
1552}
1553
1554
ager@chromium.org8bb60582008-12-11 12:02:20 +00001555ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1556 int position_reg,
1557 RegExpNode* on_success) {
1558 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001559 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001560 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001561 return result;
1562}
1563
1564
1565#define DEFINE_ACCEPT(Type) \
1566 void Type##Node::Accept(NodeVisitor* visitor) { \
1567 visitor->Visit##Type(this); \
1568 }
1569FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1570#undef DEFINE_ACCEPT
1571
1572
1573// -------------------------------------------------------------------
1574// Emit code.
1575
1576
1577void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1578 Guard* guard,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001579 GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001580 switch (guard->op()) {
1581 case Guard::LT:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001582 ASSERT(!variant->mentions_reg(guard->reg()));
1583 macro_assembler->IfRegisterGE(guard->reg(),
1584 guard->value(),
1585 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001586 break;
1587 case Guard::GEQ:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001588 ASSERT(!variant->mentions_reg(guard->reg()));
1589 macro_assembler->IfRegisterLT(guard->reg(),
1590 guard->value(),
1591 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001592 break;
1593 }
1594}
1595
1596
1597static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1598static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1599
1600
1601static inline void EmitAtomNonLetters(
1602 RegExpMacroAssembler* macro_assembler,
1603 TextElement elm,
1604 Vector<const uc16> quarks,
1605 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001606 int cp_offset,
1607 bool check_offset) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001608 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org8bb60582008-12-11 12:02:20 +00001609 // It is vital that this loop is backwards due to the unchecked character
1610 // load below.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001611 for (int i = quarks.length() - 1; i >= 0; i--) {
1612 uc16 c = quarks[i];
1613 int length = uncanonicalize.get(c, '\0', chars);
1614 if (length <= 1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001615 if (check_offset && i == quarks.length() - 1) {
1616 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1617 } else {
1618 // Here we don't need to check against the end of the input string
1619 // since this character lies before a character that matched.
1620 macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i);
1621 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001622 macro_assembler->CheckNotCharacter(c, on_failure);
1623 }
1624 }
1625}
1626
1627
1628static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1629 uc16 c1,
1630 uc16 c2,
1631 Label* on_failure) {
1632 uc16 exor = c1 ^ c2;
1633 // Check whether exor has only one bit set.
1634 if (((exor - 1) & exor) == 0) {
1635 // If c1 and c2 differ only by one bit.
1636 // Ecma262UnCanonicalize always gives the highest number last.
1637 ASSERT(c2 > c1);
1638 macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
1639 return true;
1640 }
1641 ASSERT(c2 > c1);
1642 uc16 diff = c2 - c1;
1643 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1644 // If the characters differ by 2^n but don't differ by one bit then
1645 // subtract the difference from the found character, then do the or
1646 // trick. We avoid the theoretical case where negative numbers are
1647 // involved in order to simplify code generation.
1648 macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
1649 diff,
1650 on_failure);
1651 return true;
1652 }
1653 return false;
1654}
1655
1656
1657static inline void EmitAtomLetters(
1658 RegExpMacroAssembler* macro_assembler,
1659 TextElement elm,
1660 Vector<const uc16> quarks,
1661 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001662 int cp_offset,
1663 bool check_offset) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org8bb60582008-12-11 12:02:20 +00001665 // It is vital that this loop is backwards due to the unchecked character
1666 // load below.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001667 for (int i = quarks.length() - 1; i >= 0; i--) {
1668 uc16 c = quarks[i];
1669 int length = uncanonicalize.get(c, '\0', chars);
1670 if (length <= 1) continue;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001671 if (check_offset && i == quarks.length() - 1) {
1672 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1673 } else {
1674 // Here we don't need to check against the end of the input string
1675 // since this character lies before a character that matched.
1676 macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i);
1677 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001678 Label ok;
1679 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1680 switch (length) {
1681 case 2: {
1682 if (ShortCutEmitCharacterPair(macro_assembler,
1683 chars[0],
1684 chars[1],
1685 on_failure)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001686 } else {
1687 macro_assembler->CheckCharacter(chars[0], &ok);
1688 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1689 macro_assembler->Bind(&ok);
1690 }
1691 break;
1692 }
1693 case 4:
1694 macro_assembler->CheckCharacter(chars[3], &ok);
1695 // Fall through!
1696 case 3:
1697 macro_assembler->CheckCharacter(chars[0], &ok);
1698 macro_assembler->CheckCharacter(chars[1], &ok);
1699 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1700 macro_assembler->Bind(&ok);
1701 break;
1702 default:
1703 UNREACHABLE();
1704 break;
1705 }
1706 }
1707}
1708
1709
1710static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1711 RegExpCharacterClass* cc,
1712 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001713 Label* on_failure,
1714 bool check_offset,
1715 bool ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001716 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001717 int max_char;
1718 if (ascii) {
1719 max_char = String::kMaxAsciiCharCode;
1720 } else {
1721 max_char = String::kMaxUC16CharCode;
1722 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001723
1724 Label success;
1725
1726 Label* char_is_in_class =
1727 cc->is_negated() ? on_failure : &success;
1728
1729 int range_count = ranges->length();
1730
ager@chromium.org8bb60582008-12-11 12:02:20 +00001731 int last_valid_range = range_count - 1;
1732 while (last_valid_range >= 0) {
1733 CharacterRange& range = ranges->at(last_valid_range);
1734 if (range.from() <= max_char) {
1735 break;
1736 }
1737 last_valid_range--;
1738 }
1739
1740 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001741 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001742 // TODO(plesner): We can remove this when the node level does our
1743 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001744 macro_assembler->GoTo(on_failure);
1745 }
1746 return;
1747 }
1748
ager@chromium.org8bb60582008-12-11 12:02:20 +00001749 if (last_valid_range == 0 &&
1750 !cc->is_negated() &&
1751 ranges->at(0).IsEverything(max_char)) {
1752 // This is a common case hit by non-anchored expressions.
1753 // TODO(erikcorry): We should have a macro assembler instruction that just
1754 // checks for end of string without loading the character.
1755 if (check_offset) {
1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1757 }
1758 return;
1759 }
1760
1761 if (check_offset) {
1762 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1763 } else {
1764 // Here we don't need to check against the end of the input string
1765 // since this character lies before a character that matched.
1766 macro_assembler->LoadCurrentCharacterUnchecked(cp_offset);
1767 }
1768
1769 for (int i = 0; i <= last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001770 CharacterRange& range = ranges->at(i);
1771 Label next_range;
1772 uc16 from = range.from();
1773 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001774 if (from > max_char) {
1775 continue;
1776 }
1777 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001778 if (to == from) {
1779 macro_assembler->CheckCharacter(to, char_is_in_class);
1780 } else {
1781 if (from != 0) {
1782 macro_assembler->CheckCharacterLT(from, &next_range);
1783 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001784 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001785 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1786 } else {
1787 macro_assembler->GoTo(char_is_in_class);
1788 }
1789 }
1790 macro_assembler->Bind(&next_range);
1791 }
1792
ager@chromium.org8bb60582008-12-11 12:02:20 +00001793 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001794 uc16 from = range.from();
1795 uc16 to = range.to();
1796
ager@chromium.org8bb60582008-12-11 12:02:20 +00001797 if (to > max_char) to = max_char;
1798 ASSERT(to >= from);
1799
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001800 if (to == from) {
1801 if (cc->is_negated()) {
1802 macro_assembler->CheckCharacter(to, on_failure);
1803 } else {
1804 macro_assembler->CheckNotCharacter(to, on_failure);
1805 }
1806 } else {
1807 if (from != 0) {
1808 if (cc->is_negated()) {
1809 macro_assembler->CheckCharacterLT(from, &success);
1810 } else {
1811 macro_assembler->CheckCharacterLT(from, on_failure);
1812 }
1813 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001814 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001815 if (cc->is_negated()) {
1816 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1817 } else {
1818 macro_assembler->CheckCharacterGT(to, on_failure);
1819 }
1820 } else {
1821 if (cc->is_negated()) {
1822 macro_assembler->GoTo(on_failure);
1823 }
1824 }
1825 }
1826 macro_assembler->Bind(&success);
1827}
1828
1829
ager@chromium.org8bb60582008-12-11 12:02:20 +00001830RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1831 GenerationVariant* variant) {
1832 // TODO(erikcorry): Implement support.
1833 if (info_.follows_word_interest ||
1834 info_.follows_newline_interest ||
1835 info_.follows_start_interest) {
1836 return FAIL;
1837 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001838
ager@chromium.org8bb60582008-12-11 12:02:20 +00001839 // If we are generating a greedy loop then don't stop and don't reuse code.
1840 if (variant->stop_node() != NULL) {
1841 return CONTINUE;
1842 }
1843
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001844 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001845 if (variant->is_trivial()) {
1846 if (label_.is_bound()) {
1847 // We are being asked to generate a generic version, but that's already
1848 // been done so just go to it.
1849 macro_assembler->GoTo(&label_);
1850 return DONE;
1851 }
1852 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1853 // To avoid too deep recursion we push the node to the work queue and just
1854 // generate a goto here.
1855 compiler->AddWork(this);
1856 macro_assembler->GoTo(&label_);
1857 return DONE;
1858 }
1859 // Generate generic version of the node and bind the label for later use.
1860 macro_assembler->Bind(&label_);
1861 return CONTINUE;
1862 }
1863
1864 // We are being asked to make a non-generic version. Keep track of how many
1865 // non-generic versions we generate so as not to overdo it.
1866 variants_generated_++;
1867 if (variants_generated_ < kMaxVariantsGenerated &&
1868 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1869 return CONTINUE;
1870 }
1871
1872 // If we get here there have been too many variants generated or recursion
1873 // is too deep. Time to switch to a generic version. The code for
1874 // generic versions above can handle deep recursion properly.
1875 bool ok = variant->Flush(compiler, this);
1876 return ok ? DONE : FAIL;
1877}
1878
1879
1880// This generates the code to match a text node. A text node can contain
1881// straight character sequences (possibly to be matched in a case-independent
1882// way) and character classes. In order to be most efficient we test for the
1883// simple things first and then move on to the more complicated things. The
1884// simplest thing is a non-letter or a letter if we are matching case. The
1885// next-most simple thing is a case-independent letter. The least simple is
1886// a character class. Another optimization is that we test the last one first.
1887// If that succeeds we don't need to test for the end of the string when we
1888// load other characters.
1889bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
1890 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1891 Label *backtrack = variant->backtrack();
1892 LimitResult limit_result = LimitVersions(compiler, variant);
1893 if (limit_result == FAIL) return false;
1894 if (limit_result == DONE) return true;
1895 ASSERT(limit_result == CONTINUE);
1896
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001897 int element_count = elms_->length();
1898 ASSERT(element_count != 0);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001899 if (info()->at_end) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001900 macro_assembler->GoTo(backtrack);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001901 return true;
1902 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001903 // First check for non-ASCII text.
1904 // TODO(plesner): We should do this at node level.
1905 if (compiler->ascii()) {
1906 for (int i = element_count - 1; i >= 0; i--) {
1907 TextElement elm = elms_->at(i);
1908 if (elm.type == TextElement::ATOM) {
1909 Vector<const uc16> quarks = elm.data.u_atom->data();
1910 for (int j = quarks.length() - 1; j >= 0; j--) {
1911 if (quarks[j] > String::kMaxAsciiCharCode) {
1912 macro_assembler->GoTo(backtrack);
1913 return true;
1914 }
1915 }
1916 } else {
1917 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
1918 }
1919 }
1920 }
1921 // Second, handle straight character matches.
1922 int checked_up_to = -1;
1923 for (int i = element_count - 1; i >= 0; i--) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001924 TextElement elm = elms_->at(i);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001925 ASSERT(elm.cp_offset >= 0);
1926 int cp_offset = variant->cp_offset() + elm.cp_offset;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001927 if (elm.type == TextElement::ATOM) {
1928 Vector<const uc16> quarks = elm.data.u_atom->data();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001929 int last_cp_offset = cp_offset + quarks.length();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001930 if (compiler->ignore_case()) {
1931 EmitAtomNonLetters(macro_assembler,
1932 elm,
1933 quarks,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001934 backtrack,
1935 cp_offset,
1936 checked_up_to < last_cp_offset);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001937 } else {
1938 macro_assembler->CheckCharacters(quarks,
1939 cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001940 backtrack,
1941 checked_up_to < last_cp_offset);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001942 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001943 if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001944 } else {
1945 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001946 }
1947 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001948 // Third, handle case independent letter matches if any.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001949 if (compiler->ignore_case()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001950 for (int i = element_count - 1; i >= 0; i--) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001951 TextElement elm = elms_->at(i);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001952 int cp_offset = variant->cp_offset() + elm.cp_offset;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001953 if (elm.type == TextElement::ATOM) {
1954 Vector<const uc16> quarks = elm.data.u_atom->data();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001955 int last_cp_offset = cp_offset + quarks.length();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001956 EmitAtomLetters(macro_assembler,
1957 elm,
1958 quarks,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001959 backtrack,
1960 cp_offset,
1961 checked_up_to < last_cp_offset);
1962 if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001963 }
1964 }
1965 }
1966 // If the fast character matches passed then do the character classes.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001967 for (int i = element_count - 1; i >= 0; i--) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001968 TextElement elm = elms_->at(i);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001969 int cp_offset = variant->cp_offset() + elm.cp_offset;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001970 if (elm.type == TextElement::CHAR_CLASS) {
1971 RegExpCharacterClass* cc = elm.data.u_char_class;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001972 EmitCharClass(macro_assembler,
1973 cc,
1974 cp_offset,
1975 backtrack,
1976 checked_up_to < cp_offset,
1977 compiler->ascii());
1978 if (cp_offset > checked_up_to) checked_up_to = cp_offset;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001979 }
1980 }
1981
ager@chromium.org8bb60582008-12-11 12:02:20 +00001982 GenerationVariant new_variant(*variant);
1983 new_variant.set_cp_offset(checked_up_to + 1);
1984 RecursionCheck rc(compiler);
1985 return on_success()->Emit(compiler, &new_variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001986}
1987
1988
1989void TextNode::MakeCaseIndependent() {
1990 int element_count = elms_->length();
1991 for (int i = 0; i < element_count; i++) {
1992 TextElement elm = elms_->at(i);
1993 if (elm.type == TextElement::CHAR_CLASS) {
1994 RegExpCharacterClass* cc = elm.data.u_char_class;
1995 ZoneList<CharacterRange>* ranges = cc->ranges();
1996 int range_count = ranges->length();
1997 for (int i = 0; i < range_count; i++) {
1998 ranges->at(i).AddCaseEquivalents(ranges);
1999 }
2000 }
2001 }
2002}
2003
2004
ager@chromium.org8bb60582008-12-11 12:02:20 +00002005int TextNode::GreedyLoopTextLength() {
2006 TextElement elm = elms_->at(elms_->length() - 1);
2007 if (elm.type == TextElement::CHAR_CLASS) {
2008 return elm.cp_offset + 1;
2009 } else {
2010 return elm.cp_offset + elm.data.u_atom->data().length();
2011 }
2012}
2013
2014
2015// Finds the fixed match length of a sequence of nodes that goes from
2016// this alternative and back to this choice node. If there are variable
2017// length nodes or other complications in the way then return a sentinel
2018// value indicating that a greedy loop cannot be constructed.
2019int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2020 int length = 0;
2021 RegExpNode* node = alternative->node();
2022 // Later we will generate code for all these text nodes using recursion
2023 // so we have to limit the max number.
2024 int recursion_depth = 0;
2025 while (node != this) {
2026 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2027 return kNodeIsTooComplexForGreedyLoops;
2028 }
2029 NodeInfo* info = node->info();
2030 if (info->follows_word_interest ||
2031 info->follows_newline_interest ||
2032 info->follows_start_interest) {
2033 return kNodeIsTooComplexForGreedyLoops;
2034 }
2035 int node_length = node->GreedyLoopTextLength();
2036 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2037 return kNodeIsTooComplexForGreedyLoops;
2038 }
2039 length += node_length;
2040 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2041 node = seq_node->on_success();
2042 }
2043 return length;
2044}
2045
2046
2047bool LoopChoiceNode::Emit(RegExpCompiler* compiler,
2048 GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002049 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002050 if (variant->stop_node() == this) {
2051 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2052 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2053 // Update the counter-based backtracking info on the stack. This is an
2054 // optimization for greedy loops (see below).
2055 ASSERT(variant->cp_offset() == text_length);
2056 macro_assembler->AdvanceCurrentPosition(text_length);
2057 macro_assembler->GoTo(variant->loop_label());
2058 return true;
2059 }
2060 ASSERT(variant->stop_node() == NULL);
2061 if (!variant->is_trivial()) {
2062 return variant->Flush(compiler, this);
2063 }
2064 return ChoiceNode::Emit(compiler, variant);
2065}
2066
2067
2068bool ChoiceNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
2069 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2070 int choice_count = alternatives_->length();
2071#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002072 for (int i = 0; i < choice_count - 1; i++) {
2073 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002074 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002075 int guard_count = (guards == NULL) ? 0 : guards->length();
2076 for (int j = 0; j < guard_count; j++) {
2077 ASSERT(!variant->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002078 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002079 }
2080#endif
2081
2082 LimitResult limit_result = LimitVersions(compiler, variant);
2083 if (limit_result == DONE) return true;
2084 if (limit_result == FAIL) return false;
2085 ASSERT(limit_result == CONTINUE);
2086
2087 RecursionCheck rc(compiler);
2088
2089 GenerationVariant* current_variant = variant;
2090
2091 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2092 bool greedy_loop = false;
2093 Label greedy_loop_label;
2094 GenerationVariant counter_backtrack_variant(&greedy_loop_label);
2095 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2096 // Here we have special handling for greedy loops containing only text nodes
2097 // and other simple nodes. These are handled by pushing the current
2098 // position on the stack and then incrementing the current position each
2099 // time around the switch. On backtrack we decrement the current position
2100 // and check it against the pushed value. This avoids pushing backtrack
2101 // information for each iteration of the loop, which could take up a lot of
2102 // space.
2103 greedy_loop = true;
2104 ASSERT(variant->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002105 macro_assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002106 current_variant = &counter_backtrack_variant;
2107 Label greedy_match_failed;
2108 GenerationVariant greedy_match_variant(&greedy_match_failed);
2109 Label loop_label;
2110 macro_assembler->Bind(&loop_label);
2111 greedy_match_variant.set_stop_node(this);
2112 greedy_match_variant.set_loop_label(&loop_label);
2113 bool ok = alternatives_->at(0).node()->Emit(compiler,
2114 &greedy_match_variant);
2115 macro_assembler->Bind(&greedy_match_failed);
2116 if (!ok) {
2117 greedy_loop_label.Unuse();
2118 return false;
2119 }
2120 }
2121
2122 Label second_choice; // For use in greedy matches.
2123 macro_assembler->Bind(&second_choice);
2124
2125 // For now we just call all choices one after the other. The idea ultimately
2126 // is to use the Dispatch table to try only the relevant ones.
2127 for (int i = greedy_loop ? 1 : 0; i < choice_count - 1; i++) {
2128 GuardedAlternative alternative = alternatives_->at(i);
2129 Label after;
2130 ZoneList<Guard*>* guards = alternative.guards();
2131 int guard_count = (guards == NULL) ? 0 : guards->length();
2132 GenerationVariant new_variant(*current_variant);
2133 new_variant.set_backtrack(&after);
2134 for (int j = 0; j < guard_count; j++) {
2135 GenerateGuard(macro_assembler, guards->at(j), &new_variant);
2136 }
2137 if (!alternative.node()->Emit(compiler, &new_variant)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002138 after.Unuse();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002139 return false;
2140 }
2141 macro_assembler->Bind(&after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002142 }
2143 GuardedAlternative alternative = alternatives_->at(choice_count - 1);
2144 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002145 int guard_count = (guards == NULL) ? 0 : guards->length();
2146 for (int j = 0; j < guard_count; j++) {
2147 GenerateGuard(macro_assembler, guards->at(j), current_variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002148 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002149 bool ok = alternative.node()->Emit(compiler, current_variant);
2150 if (!ok) return false;
2151 if (greedy_loop) {
2152 macro_assembler->Bind(&greedy_loop_label);
2153 // If we have unwound to the bottom then backtrack.
2154 macro_assembler->CheckGreedyLoop(variant->backtrack());
2155 // Otherwise try the second priority at an earlier position.
2156 macro_assembler->AdvanceCurrentPosition(-text_length);
2157 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002158 }
2159 return true;
2160}
2161
2162
ager@chromium.org8bb60582008-12-11 12:02:20 +00002163bool ActionNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002164 RegExpMacroAssembler* macro = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002165 LimitResult limit_result = LimitVersions(compiler, variant);
2166 if (limit_result == DONE) return true;
2167 if (limit_result == FAIL) return false;
2168 ASSERT(limit_result == CONTINUE);
2169
2170 RecursionCheck rc(compiler);
2171
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002172 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002173 case STORE_POSITION: {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002174 GenerationVariant::DeferredCapture
2175 new_capture(data_.u_position_register.reg, variant);
2176 GenerationVariant new_variant = *variant;
2177 new_variant.add_action(&new_capture);
2178 return on_success()->Emit(compiler, &new_variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002179 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002180 case INCREMENT_REGISTER: {
2181 GenerationVariant::DeferredIncrementRegister
2182 new_increment(data_.u_increment_register.reg);
2183 GenerationVariant new_variant = *variant;
2184 new_variant.add_action(&new_increment);
2185 return on_success()->Emit(compiler, &new_variant);
2186 }
2187 case SET_REGISTER: {
2188 GenerationVariant::DeferredSetRegister
2189 new_set(data_.u_store_register.reg, data_.u_store_register.value);
2190 GenerationVariant new_variant = *variant;
2191 new_variant.add_action(&new_set);
2192 return on_success()->Emit(compiler, &new_variant);
2193 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002194 case BEGIN_SUBMATCH:
ager@chromium.org8bb60582008-12-11 12:02:20 +00002195 if (!variant->is_trivial()) return variant->Flush(compiler, this);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002196 macro->WriteCurrentPositionToRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002197 data_.u_submatch.current_position_register, 0);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002198 macro->WriteStackPointerToRegister(
2199 data_.u_submatch.stack_pointer_register);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002200 return on_success()->Emit(compiler, variant);
2201 case POSITIVE_SUBMATCH_SUCCESS:
2202 if (!variant->is_trivial()) return variant->Flush(compiler, this);
2203 // TODO(erikcorry): Implement support.
2204 if (info()->follows_word_interest ||
2205 info()->follows_newline_interest ||
2206 info()->follows_start_interest) {
2207 return false;
2208 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002209 if (info()->at_end) {
2210 Label at_end;
2211 // Load current character jumps to the label if we are beyond the string
2212 // end.
2213 macro->LoadCurrentCharacter(0, &at_end);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002214 macro->GoTo(variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002215 macro->Bind(&at_end);
2216 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002217 macro->ReadCurrentPositionFromRegister(
2218 data_.u_submatch.current_position_register);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002219 macro->ReadStackPointerFromRegister(
2220 data_.u_submatch.stack_pointer_register);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002221 return on_success()->Emit(compiler, variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002222 default:
2223 UNREACHABLE();
2224 return false;
2225 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002226}
2227
2228
ager@chromium.org8bb60582008-12-11 12:02:20 +00002229bool BackReferenceNode::Emit(RegExpCompiler* compiler,
2230 GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002231 RegExpMacroAssembler* macro = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002232 if (!variant->is_trivial()) {
2233 return variant->Flush(compiler, this);
2234 }
2235
2236 LimitResult limit_result = LimitVersions(compiler, variant);
2237 if (limit_result == DONE) return true;
2238 if (limit_result == FAIL) return false;
2239 ASSERT(limit_result == CONTINUE);
2240
2241 RecursionCheck rc(compiler);
2242
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002243 ASSERT_EQ(start_reg_ + 1, end_reg_);
2244 if (info()->at_end) {
2245 // If we are constrained to match at the end of the input then succeed
2246 // iff the back reference is empty.
ager@chromium.org8bb60582008-12-11 12:02:20 +00002247 macro->CheckNotRegistersEqual(start_reg_, end_reg_, variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002248 } else {
2249 if (compiler->ignore_case()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002250 macro->CheckNotBackReferenceIgnoreCase(start_reg_, variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002251 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002252 macro->CheckNotBackReference(start_reg_, variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002253 }
2254 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002255 return on_success()->Emit(compiler, variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002256}
2257
2258
2259// -------------------------------------------------------------------
2260// Dot/dotty output
2261
2262
2263#ifdef DEBUG
2264
2265
2266class DotPrinter: public NodeVisitor {
2267 public:
2268 explicit DotPrinter(bool ignore_case)
2269 : ignore_case_(ignore_case),
2270 stream_(&alloc_) { }
2271 void PrintNode(const char* label, RegExpNode* node);
2272 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002273 void PrintAttributes(RegExpNode* from);
2274 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002275 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002276#define DECLARE_VISIT(Type) \
2277 virtual void Visit##Type(Type##Node* that);
2278FOR_EACH_NODE_TYPE(DECLARE_VISIT)
2279#undef DECLARE_VISIT
2280 private:
2281 bool ignore_case_;
2282 HeapStringAllocator alloc_;
2283 StringStream stream_;
2284};
2285
2286
2287void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
2288 stream()->Add("digraph G {\n graph [label=\"");
2289 for (int i = 0; label[i]; i++) {
2290 switch (label[i]) {
2291 case '\\':
2292 stream()->Add("\\\\");
2293 break;
2294 case '"':
2295 stream()->Add("\"");
2296 break;
2297 default:
2298 stream()->Put(label[i]);
2299 break;
2300 }
2301 }
2302 stream()->Add("\"];\n");
2303 Visit(node);
2304 stream()->Add("}\n");
2305 printf("%s", *(stream()->ToCString()));
2306}
2307
2308
2309void DotPrinter::Visit(RegExpNode* node) {
2310 if (node->info()->visited) return;
2311 node->info()->visited = true;
2312 node->Accept(this);
2313}
2314
2315
2316void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002317 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
2318 Visit(on_failure);
2319}
2320
2321
2322class TableEntryBodyPrinter {
2323 public:
2324 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
2325 : stream_(stream), choice_(choice) { }
2326 void Call(uc16 from, DispatchTable::Entry entry) {
2327 OutSet* out_set = entry.out_set();
2328 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
2329 if (out_set->Get(i)) {
2330 stream()->Add(" n%p:s%io%i -> n%p;\n",
2331 choice(),
2332 from,
2333 i,
2334 choice()->alternatives()->at(i).node());
2335 }
2336 }
2337 }
2338 private:
2339 StringStream* stream() { return stream_; }
2340 ChoiceNode* choice() { return choice_; }
2341 StringStream* stream_;
2342 ChoiceNode* choice_;
2343};
2344
2345
2346class TableEntryHeaderPrinter {
2347 public:
2348 explicit TableEntryHeaderPrinter(StringStream* stream)
2349 : first_(true), stream_(stream) { }
2350 void Call(uc16 from, DispatchTable::Entry entry) {
2351 if (first_) {
2352 first_ = false;
2353 } else {
2354 stream()->Add("|");
2355 }
2356 stream()->Add("{\\%k-\\%k|{", from, entry.to());
2357 OutSet* out_set = entry.out_set();
2358 int priority = 0;
2359 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
2360 if (out_set->Get(i)) {
2361 if (priority > 0) stream()->Add("|");
2362 stream()->Add("<s%io%i> %i", from, i, priority);
2363 priority++;
2364 }
2365 }
2366 stream()->Add("}}");
2367 }
2368 private:
2369 bool first_;
2370 StringStream* stream() { return stream_; }
2371 StringStream* stream_;
2372};
2373
2374
2375class AttributePrinter {
2376 public:
2377 explicit AttributePrinter(DotPrinter* out)
2378 : out_(out), first_(true) { }
2379 void PrintSeparator() {
2380 if (first_) {
2381 first_ = false;
2382 } else {
2383 out_->stream()->Add("|");
2384 }
2385 }
2386 void PrintBit(const char* name, bool value) {
2387 if (!value) return;
2388 PrintSeparator();
2389 out_->stream()->Add("{%s}", name);
2390 }
2391 void PrintPositive(const char* name, int value) {
2392 if (value < 0) return;
2393 PrintSeparator();
2394 out_->stream()->Add("{%s|%x}", name, value);
2395 }
2396 private:
2397 DotPrinter* out_;
2398 bool first_;
2399};
2400
2401
2402void DotPrinter::PrintAttributes(RegExpNode* that) {
2403 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
2404 "margin=0.1, fontsize=10, label=\"{",
2405 that);
2406 AttributePrinter printer(this);
2407 NodeInfo* info = that->info();
2408 printer.PrintBit("NI", info->follows_newline_interest);
2409 printer.PrintBit("WI", info->follows_word_interest);
2410 printer.PrintBit("SI", info->follows_start_interest);
2411 printer.PrintBit("DN", info->determine_newline);
2412 printer.PrintBit("DW", info->determine_word);
2413 printer.PrintBit("DS", info->determine_start);
2414 printer.PrintBit("DDN", info->does_determine_newline);
2415 printer.PrintBit("DDW", info->does_determine_word);
2416 printer.PrintBit("DDS", info->does_determine_start);
2417 printer.PrintPositive("IW", info->is_word);
2418 printer.PrintPositive("IN", info->is_newline);
2419 printer.PrintPositive("FN", info->follows_newline);
2420 printer.PrintPositive("FW", info->follows_word);
2421 printer.PrintPositive("FS", info->follows_start);
2422 Label* label = that->label();
2423 if (label->is_bound())
2424 printer.PrintPositive("@", label->pos());
2425 stream()->Add("}\"];\n");
2426 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
2427 "arrowhead=none];\n", that, that);
2428}
2429
2430
2431static const bool kPrintDispatchTable = false;
2432void DotPrinter::VisitChoice(ChoiceNode* that) {
2433 if (kPrintDispatchTable) {
2434 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
2435 TableEntryHeaderPrinter header_printer(stream());
2436 that->GetTable(ignore_case_)->ForEach(&header_printer);
2437 stream()->Add("\"]\n", that);
2438 PrintAttributes(that);
2439 TableEntryBodyPrinter body_printer(stream(), that);
2440 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002441 } else {
2442 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
2443 for (int i = 0; i < that->alternatives()->length(); i++) {
2444 GuardedAlternative alt = that->alternatives()->at(i);
2445 stream()->Add(" n%p -> n%p;\n", that, alt.node());
2446 }
2447 }
2448 for (int i = 0; i < that->alternatives()->length(); i++) {
2449 GuardedAlternative alt = that->alternatives()->at(i);
2450 alt.node()->Accept(this);
2451 }
2452}
2453
2454
2455void DotPrinter::VisitText(TextNode* that) {
2456 stream()->Add(" n%p [label=\"", that);
2457 for (int i = 0; i < that->elements()->length(); i++) {
2458 if (i > 0) stream()->Add(" ");
2459 TextElement elm = that->elements()->at(i);
2460 switch (elm.type) {
2461 case TextElement::ATOM: {
2462 stream()->Add("'%w'", elm.data.u_atom->data());
2463 break;
2464 }
2465 case TextElement::CHAR_CLASS: {
2466 RegExpCharacterClass* node = elm.data.u_char_class;
2467 stream()->Add("[");
2468 if (node->is_negated())
2469 stream()->Add("^");
2470 for (int j = 0; j < node->ranges()->length(); j++) {
2471 CharacterRange range = node->ranges()->at(j);
2472 stream()->Add("%k-%k", range.from(), range.to());
2473 }
2474 stream()->Add("]");
2475 break;
2476 }
2477 default:
2478 UNREACHABLE();
2479 }
2480 }
2481 stream()->Add("\", shape=box, peripheries=2];\n");
2482 PrintAttributes(that);
2483 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
2484 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002485}
2486
2487
2488void DotPrinter::VisitBackReference(BackReferenceNode* that) {
2489 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
2490 that,
2491 that->start_register(),
2492 that->end_register());
2493 PrintAttributes(that);
2494 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
2495 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002496}
2497
2498
2499void DotPrinter::VisitEnd(EndNode* that) {
2500 stream()->Add(" n%p [style=bold, shape=point];\n", that);
2501 PrintAttributes(that);
2502}
2503
2504
2505void DotPrinter::VisitAction(ActionNode* that) {
2506 stream()->Add(" n%p [", that);
2507 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002508 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002509 stream()->Add("label=\"$%i:=%i\", shape=octagon",
2510 that->data_.u_store_register.reg,
2511 that->data_.u_store_register.value);
2512 break;
2513 case ActionNode::INCREMENT_REGISTER:
2514 stream()->Add("label=\"$%i++\", shape=octagon",
2515 that->data_.u_increment_register.reg);
2516 break;
2517 case ActionNode::STORE_POSITION:
2518 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
2519 that->data_.u_position_register.reg);
2520 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002521 case ActionNode::BEGIN_SUBMATCH:
2522 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
2523 that->data_.u_submatch.current_position_register);
2524 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002525 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002526 stream()->Add("label=\"escape\", shape=septagon");
2527 break;
2528 }
2529 stream()->Add("];\n");
2530 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002531 RegExpNode* successor = that->on_success();
2532 stream()->Add(" n%p -> n%p;\n", that, successor);
2533 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002534}
2535
2536
2537class DispatchTableDumper {
2538 public:
2539 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
2540 void Call(uc16 key, DispatchTable::Entry entry);
2541 StringStream* stream() { return stream_; }
2542 private:
2543 StringStream* stream_;
2544};
2545
2546
2547void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
2548 stream()->Add("[%k-%k]: {", key, entry.to());
2549 OutSet* set = entry.out_set();
2550 bool first = true;
2551 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
2552 if (set->Get(i)) {
2553 if (first) {
2554 first = false;
2555 } else {
2556 stream()->Add(", ");
2557 }
2558 stream()->Add("%i", i);
2559 }
2560 }
2561 stream()->Add("}\n");
2562}
2563
2564
2565void DispatchTable::Dump() {
2566 HeapStringAllocator alloc;
2567 StringStream stream(&alloc);
2568 DispatchTableDumper dumper(&stream);
2569 tree()->ForEach(&dumper);
2570 OS::PrintError("%s", *stream.ToCString());
2571}
2572
2573
2574void RegExpEngine::DotPrint(const char* label,
2575 RegExpNode* node,
2576 bool ignore_case) {
2577 DotPrinter printer(ignore_case);
2578 printer.PrintNode(label, node);
2579}
2580
2581
2582#endif // DEBUG
2583
2584
2585// -------------------------------------------------------------------
2586// Tree to graph conversion
2587
2588
2589RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002590 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002591 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
2592 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00002593 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002594}
2595
2596
2597RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002598 RegExpNode* on_success) {
2599 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002600}
2601
2602
2603RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002604 RegExpNode* on_success) {
2605 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002606}
2607
2608
2609RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002610 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002611 ZoneList<RegExpTree*>* alternatives = this->alternatives();
2612 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002613 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002614 for (int i = 0; i < length; i++) {
2615 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002616 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002617 result->AddAlternative(alternative);
2618 }
2619 return result;
2620}
2621
2622
2623RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002624 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002625 return ToNode(min(),
2626 max(),
2627 is_greedy(),
2628 body(),
2629 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002630 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002631}
2632
2633
2634RegExpNode* RegExpQuantifier::ToNode(int min,
2635 int max,
2636 bool is_greedy,
2637 RegExpTree* body,
2638 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002639 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002640 // x{f, t} becomes this:
2641 //
2642 // (r++)<-.
2643 // | `
2644 // | (x)
2645 // v ^
2646 // (r=0)-->(?)---/ [if r < t]
2647 // |
2648 // [if r >= f] \----> ...
2649 //
2650 //
2651 // TODO(someone): clear captures on repetition and handle empty
2652 // matches.
2653 bool has_min = min > 0;
2654 bool has_max = max < RegExpQuantifier::kInfinity;
2655 bool needs_counter = has_min || has_max;
2656 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002657 ChoiceNode* center = new LoopChoiceNode(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002658 RegExpNode* loop_return = needs_counter
2659 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
2660 : static_cast<RegExpNode*>(center);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002661 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002662 GuardedAlternative body_alt(body_node);
2663 if (has_max) {
2664 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
2665 body_alt.AddGuard(body_guard);
2666 }
2667 GuardedAlternative rest_alt(on_success);
2668 if (has_min) {
2669 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
2670 rest_alt.AddGuard(rest_guard);
2671 }
2672 if (is_greedy) {
2673 center->AddAlternative(body_alt);
2674 center->AddAlternative(rest_alt);
2675 } else {
2676 center->AddAlternative(rest_alt);
2677 center->AddAlternative(body_alt);
2678 }
2679 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002680 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002681 } else {
2682 return center;
2683 }
2684}
2685
2686
2687RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002688 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002689 NodeInfo info;
2690 switch (type()) {
2691 case START_OF_LINE:
2692 info.follows_newline_interest = true;
2693 break;
2694 case START_OF_INPUT:
2695 info.follows_start_interest = true;
2696 break;
2697 case BOUNDARY: case NON_BOUNDARY:
2698 info.follows_word_interest = true;
2699 break;
2700 case END_OF_INPUT:
2701 info.at_end = true;
2702 break;
2703 case END_OF_LINE:
2704 // This is wrong but has the effect of making the compiler abort.
2705 info.at_end = true;
2706 }
2707 return on_success->PropagateForward(&info);
2708}
2709
2710
2711RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002712 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002713 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
2714 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00002715 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002716}
2717
2718
2719RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002720 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002721 return on_success;
2722}
2723
2724
2725RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002726 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002727 int stack_pointer_register = compiler->AllocateRegister();
2728 int position_register = compiler->AllocateRegister();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002729 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002730 if (is_positive()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002731 return ActionNode::BeginSubmatch(
2732 stack_pointer_register,
2733 position_register,
2734 body()->ToNode(
2735 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002736 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
2737 position_register,
2738 on_success)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002739 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002740 // We use a ChoiceNode for a negative lookahead because it has most of
2741 // the characteristics we need. It has the body of the lookahead as its
2742 // first alternative and the expression after the lookahead of the second
2743 // alternative. If the first alternative succeeds then the
2744 // NegativeSubmatchSuccess will unwind the stack including everything the
2745 // choice node set up and backtrack. If the first alternative fails then
2746 // the second alternative is tried, which is exactly the desired result
2747 // for a negative lookahead. In the case where the dispatch table
2748 // determines that the first alternative cannot match we will save time
2749 // by not trying it. Things are not quite so well-optimized if the
2750 // dispatch table determines that the second alternative cannot match.
2751 // In this case we could optimize by immediately backtracking.
2752 ChoiceNode* choice_node = new ChoiceNode(2);
2753 GuardedAlternative body_alt(
2754 body()->ToNode(
2755 compiler,
2756 success = new NegativeSubmatchSuccess(stack_pointer_register,
2757 position_register)));
2758 choice_node->AddAlternative(body_alt);
2759 choice_node->AddAlternative(GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002760 return ActionNode::BeginSubmatch(stack_pointer_register,
2761 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002762 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002763 }
2764}
2765
2766
2767RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002768 RegExpNode* on_success) {
2769 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002770}
2771
2772
2773RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
2774 int index,
2775 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002776 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002777 int start_reg = RegExpCapture::StartRegister(index);
2778 int end_reg = RegExpCapture::EndRegister(index);
2779 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002780 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002781 return ActionNode::StorePosition(start_reg, body_node);
2782}
2783
2784
2785RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00002786 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002787 ZoneList<RegExpTree*>* children = nodes();
2788 RegExpNode* current = on_success;
2789 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002790 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002791 }
2792 return current;
2793}
2794
2795
2796static const int kSpaceRangeCount = 20;
2797static const uc16 kSpaceRanges[kSpaceRangeCount] = {
2798 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
2799 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
2800 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
2801};
2802
2803
2804static const int kWordRangeCount = 8;
2805static const uc16 kWordRanges[kWordRangeCount] = {
2806 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
2807};
2808
2809
2810static const int kDigitRangeCount = 2;
2811static const uc16 kDigitRanges[kDigitRangeCount] = {
2812 '0', '9'
2813};
2814
2815
2816static const int kLineTerminatorRangeCount = 6;
2817static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
2818 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
2819};
2820
2821
2822static void AddClass(const uc16* elmv,
2823 int elmc,
2824 ZoneList<CharacterRange>* ranges) {
2825 for (int i = 0; i < elmc; i += 2) {
2826 ASSERT(elmv[i] <= elmv[i + 1]);
2827 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
2828 }
2829}
2830
2831
2832static void AddClassNegated(const uc16 *elmv,
2833 int elmc,
2834 ZoneList<CharacterRange>* ranges) {
2835 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002836 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002837 uc16 last = 0x0000;
2838 for (int i = 0; i < elmc; i += 2) {
2839 ASSERT(last <= elmv[i] - 1);
2840 ASSERT(elmv[i] <= elmv[i + 1]);
2841 ranges->Add(CharacterRange(last, elmv[i] - 1));
2842 last = elmv[i + 1] + 1;
2843 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002844 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002845}
2846
2847
2848void CharacterRange::AddClassEscape(uc16 type,
2849 ZoneList<CharacterRange>* ranges) {
2850 switch (type) {
2851 case 's':
2852 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
2853 break;
2854 case 'S':
2855 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
2856 break;
2857 case 'w':
2858 AddClass(kWordRanges, kWordRangeCount, ranges);
2859 break;
2860 case 'W':
2861 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
2862 break;
2863 case 'd':
2864 AddClass(kDigitRanges, kDigitRangeCount, ranges);
2865 break;
2866 case 'D':
2867 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
2868 break;
2869 case '.':
2870 AddClassNegated(kLineTerminatorRanges,
2871 kLineTerminatorRangeCount,
2872 ranges);
2873 break;
2874 // This is not a character range as defined by the spec but a
2875 // convenient shorthand for a character class that matches any
2876 // character.
2877 case '*':
2878 ranges->Add(CharacterRange::Everything());
2879 break;
2880 default:
2881 UNREACHABLE();
2882 }
2883}
2884
2885
2886Vector<const uc16> CharacterRange::GetWordBounds() {
2887 return Vector<const uc16>(kWordRanges, kWordRangeCount);
2888}
2889
2890
2891class CharacterRangeSplitter {
2892 public:
2893 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
2894 ZoneList<CharacterRange>** excluded)
2895 : included_(included),
2896 excluded_(excluded) { }
2897 void Call(uc16 from, DispatchTable::Entry entry);
2898
2899 static const int kInBase = 0;
2900 static const int kInOverlay = 1;
2901
2902 private:
2903 ZoneList<CharacterRange>** included_;
2904 ZoneList<CharacterRange>** excluded_;
2905};
2906
2907
2908void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
2909 if (!entry.out_set()->Get(kInBase)) return;
2910 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
2911 ? included_
2912 : excluded_;
2913 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
2914 (*target)->Add(CharacterRange(entry.from(), entry.to()));
2915}
2916
2917
2918void CharacterRange::Split(ZoneList<CharacterRange>* base,
2919 Vector<const uc16> overlay,
2920 ZoneList<CharacterRange>** included,
2921 ZoneList<CharacterRange>** excluded) {
2922 ASSERT_EQ(NULL, *included);
2923 ASSERT_EQ(NULL, *excluded);
2924 DispatchTable table;
2925 for (int i = 0; i < base->length(); i++)
2926 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
2927 for (int i = 0; i < overlay.length(); i += 2) {
2928 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
2929 CharacterRangeSplitter::kInOverlay);
2930 }
2931 CharacterRangeSplitter callback(included, excluded);
2932 table.ForEach(&callback);
2933}
2934
2935
2936void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
2937 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2938 if (IsSingleton()) {
2939 // If this is a singleton we just expand the one character.
2940 int length = uncanonicalize.get(from(), '\0', chars);
2941 for (int i = 0; i < length; i++) {
2942 uc32 chr = chars[i];
2943 if (chr != from()) {
2944 ranges->Add(CharacterRange::Singleton(chars[i]));
2945 }
2946 }
2947 } else if (from() <= kRangeCanonicalizeMax &&
2948 to() <= kRangeCanonicalizeMax) {
2949 // If this is a range we expand the characters block by block,
2950 // expanding contiguous subranges (blocks) one at a time.
2951 // The approach is as follows. For a given start character we
2952 // look up the block that contains it, for instance 'a' if the
2953 // start character is 'c'. A block is characterized by the property
2954 // that all characters uncanonicalize in the same way as the first
2955 // element, except that each entry in the result is incremented
2956 // by the distance from the first element. So a-z is a block
2957 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
2958 // uncanonicalizes to ['a' + k, 'A' + k].
2959 // Once we've found the start point we look up its uncanonicalization
2960 // and produce a range for each element. For instance for [c-f]
2961 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
2962 // add a range if it is not already contained in the input, so [c-f]
2963 // will be skipped but [C-F] will be added. If this range is not
2964 // completely contained in a block we do this for all the blocks
2965 // covered by the range.
2966 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2967 // First, look up the block that contains the 'from' character.
2968 int length = canonrange.get(from(), '\0', range);
2969 if (length == 0) {
2970 range[0] = from();
2971 } else {
2972 ASSERT_EQ(1, length);
2973 }
2974 int pos = from();
2975 // The start of the current block. Note that except for the first
2976 // iteration 'start' is always equal to 'pos'.
2977 int start;
2978 // If it is not the start point of a block the entry contains the
2979 // offset of the character from the start point.
2980 if ((range[0] & kStartMarker) == 0) {
2981 start = pos - range[0];
2982 } else {
2983 start = pos;
2984 }
2985 // Then we add the ranges on at a time, incrementing the current
2986 // position to be after the last block each time. The position
2987 // always points to the start of a block.
2988 while (pos < to()) {
2989 length = canonrange.get(start, '\0', range);
2990 if (length == 0) {
2991 range[0] = start;
2992 } else {
2993 ASSERT_EQ(1, length);
2994 }
2995 ASSERT((range[0] & kStartMarker) != 0);
2996 // The start point of a block contains the distance to the end
2997 // of the range.
2998 int block_end = start + (range[0] & kPayloadMask) - 1;
2999 int end = (block_end > to()) ? to() : block_end;
3000 length = uncanonicalize.get(start, '\0', range);
3001 for (int i = 0; i < length; i++) {
3002 uc32 c = range[i];
3003 uc16 range_from = c + (pos - start);
3004 uc16 range_to = c + (end - start);
3005 if (!(from() <= range_from && range_to <= to())) {
3006 ranges->Add(CharacterRange(range_from, range_to));
3007 }
3008 }
3009 start = pos = block_end + 1;
3010 }
3011 } else {
3012 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3013 }
3014}
3015
3016
3017// -------------------------------------------------------------------
3018// Interest propagation
3019
3020
3021RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
3022 for (int i = 0; i < siblings_.length(); i++) {
3023 RegExpNode* sibling = siblings_.Get(i);
3024 if (sibling->info()->Matches(info))
3025 return sibling;
3026 }
3027 return NULL;
3028}
3029
3030
3031RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
3032 ASSERT_EQ(false, *cloned);
3033 ASSERT(!info->HasAssertions());
3034 siblings_.Ensure(this);
3035 RegExpNode* result = TryGetSibling(info);
3036 if (result != NULL) return result;
3037 result = this->Clone();
3038 NodeInfo* new_info = result->info();
3039 new_info->ResetCompilationState();
3040 new_info->AddFromPreceding(info);
3041 AddSibling(result);
3042 *cloned = true;
3043 return result;
3044}
3045
3046
3047template <class C>
3048static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
3049 NodeInfo full_info(*node->info());
3050 full_info.AddFromPreceding(info);
3051 bool cloned = false;
3052 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
3053}
3054
3055
3056RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
3057 NodeInfo full_info(*this->info());
3058 full_info.AddFromPreceding(info);
3059 bool cloned = false;
3060 ActionNode* action = EnsureSibling(this, &full_info, &cloned);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003061 action->set_on_success(action->on_success()->PropagateForward(info));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003062 return action;
3063}
3064
3065
3066RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
3067 NodeInfo full_info(*this->info());
3068 full_info.AddFromPreceding(info);
3069 bool cloned = false;
3070 ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
3071 if (cloned) {
3072 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
3073 int count = old_alternatives->length();
3074 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
3075 for (int i = 0; i < count; i++) {
3076 GuardedAlternative alternative = old_alternatives->at(i);
3077 alternative.set_node(alternative.node()->PropagateForward(info));
3078 choice->alternatives()->Add(alternative);
3079 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003080 }
3081 return choice;
3082}
3083
3084
3085RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
3086 return PropagateToEndpoint(this, info);
3087}
3088
3089
3090RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
3091 NodeInfo full_info(*this->info());
3092 full_info.AddFromPreceding(info);
3093 bool cloned = false;
3094 BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
3095 if (cloned) {
3096 // TODO(erikcorry): A back reference has to have two successors (by default
3097 // the same node). The first is used if the back reference matches a non-
3098 // empty back reference, the second if it matches an empty one. This
3099 // doesn't matter for at_end, which is the only one implemented right now,
3100 // but it will matter for other pieces of info.
3101 back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
3102 }
3103 return back_ref;
3104}
3105
3106
3107RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
3108 return PropagateToEndpoint(this, info);
3109}
3110
3111
3112// -------------------------------------------------------------------
3113// Splay tree
3114
3115
3116OutSet* OutSet::Extend(unsigned value) {
3117 if (Get(value))
3118 return this;
3119 if (successors() != NULL) {
3120 for (int i = 0; i < successors()->length(); i++) {
3121 OutSet* successor = successors()->at(i);
3122 if (successor->Get(value))
3123 return successor;
3124 }
3125 } else {
3126 successors_ = new ZoneList<OutSet*>(2);
3127 }
3128 OutSet* result = new OutSet(first_, remaining_);
3129 result->Set(value);
3130 successors()->Add(result);
3131 return result;
3132}
3133
3134
3135void OutSet::Set(unsigned value) {
3136 if (value < kFirstLimit) {
3137 first_ |= (1 << value);
3138 } else {
3139 if (remaining_ == NULL)
3140 remaining_ = new ZoneList<unsigned>(1);
3141 if (remaining_->is_empty() || !remaining_->Contains(value))
3142 remaining_->Add(value);
3143 }
3144}
3145
3146
3147bool OutSet::Get(unsigned value) {
3148 if (value < kFirstLimit) {
3149 return (first_ & (1 << value)) != 0;
3150 } else if (remaining_ == NULL) {
3151 return false;
3152 } else {
3153 return remaining_->Contains(value);
3154 }
3155}
3156
3157
3158const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
3159const DispatchTable::Entry DispatchTable::Config::kNoValue;
3160
3161
3162void DispatchTable::AddRange(CharacterRange full_range, int value) {
3163 CharacterRange current = full_range;
3164 if (tree()->is_empty()) {
3165 // If this is the first range we just insert into the table.
3166 ZoneSplayTree<Config>::Locator loc;
3167 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
3168 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
3169 return;
3170 }
3171 // First see if there is a range to the left of this one that
3172 // overlaps.
3173 ZoneSplayTree<Config>::Locator loc;
3174 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
3175 Entry* entry = &loc.value();
3176 // If we've found a range that overlaps with this one, and it
3177 // starts strictly to the left of this one, we have to fix it
3178 // because the following code only handles ranges that start on
3179 // or after the start point of the range we're adding.
3180 if (entry->from() < current.from() && entry->to() >= current.from()) {
3181 // Snap the overlapping range in half around the start point of
3182 // the range we're adding.
3183 CharacterRange left(entry->from(), current.from() - 1);
3184 CharacterRange right(current.from(), entry->to());
3185 // The left part of the overlapping range doesn't overlap.
3186 // Truncate the whole entry to be just the left part.
3187 entry->set_to(left.to());
3188 // The right part is the one that overlaps. We add this part
3189 // to the map and let the next step deal with merging it with
3190 // the range we're adding.
3191 ZoneSplayTree<Config>::Locator loc;
3192 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
3193 loc.set_value(Entry(right.from(),
3194 right.to(),
3195 entry->out_set()));
3196 }
3197 }
3198 while (current.is_valid()) {
3199 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
3200 (loc.value().from() <= current.to()) &&
3201 (loc.value().to() >= current.from())) {
3202 Entry* entry = &loc.value();
3203 // We have overlap. If there is space between the start point of
3204 // the range we're adding and where the overlapping range starts
3205 // then we have to add a range covering just that space.
3206 if (current.from() < entry->from()) {
3207 ZoneSplayTree<Config>::Locator ins;
3208 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
3209 ins.set_value(Entry(current.from(),
3210 entry->from() - 1,
3211 empty()->Extend(value)));
3212 current.set_from(entry->from());
3213 }
3214 ASSERT_EQ(current.from(), entry->from());
3215 // If the overlapping range extends beyond the one we want to add
3216 // we have to snap the right part off and add it separately.
3217 if (entry->to() > current.to()) {
3218 ZoneSplayTree<Config>::Locator ins;
3219 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
3220 ins.set_value(Entry(current.to() + 1,
3221 entry->to(),
3222 entry->out_set()));
3223 entry->set_to(current.to());
3224 }
3225 ASSERT(entry->to() <= current.to());
3226 // The overlapping range is now completely contained by the range
3227 // we're adding so we can just update it and move the start point
3228 // of the range we're adding just past it.
3229 entry->AddValue(value);
3230 // Bail out if the last interval ended at 0xFFFF since otherwise
3231 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003232 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003233 break;
3234 ASSERT(entry->to() + 1 > current.from());
3235 current.set_from(entry->to() + 1);
3236 } else {
3237 // There is no overlap so we can just add the range
3238 ZoneSplayTree<Config>::Locator ins;
3239 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
3240 ins.set_value(Entry(current.from(),
3241 current.to(),
3242 empty()->Extend(value)));
3243 break;
3244 }
3245 }
3246}
3247
3248
3249OutSet* DispatchTable::Get(uc16 value) {
3250 ZoneSplayTree<Config>::Locator loc;
3251 if (!tree()->FindGreatestLessThan(value, &loc))
3252 return empty();
3253 Entry* entry = &loc.value();
3254 if (value <= entry->to())
3255 return entry->out_set();
3256 else
3257 return empty();
3258}
3259
3260
3261// -------------------------------------------------------------------
3262// Analysis
3263
3264
ager@chromium.org8bb60582008-12-11 12:02:20 +00003265void AssertionPropagation::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003266 if (that->info()->been_analyzed || that->info()->being_analyzed)
3267 return;
3268 that->info()->being_analyzed = true;
3269 that->Accept(this);
3270 that->info()->being_analyzed = false;
3271 that->info()->been_analyzed = true;
3272}
3273
3274
ager@chromium.org8bb60582008-12-11 12:02:20 +00003275void AssertionPropagation::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003276 // nothing to do
3277}
3278
3279
ager@chromium.org8bb60582008-12-11 12:02:20 +00003280void TextNode::CalculateOffsets() {
3281 int element_count = elements()->length();
3282 // Set up the offsets of the elements relative to the start. This is a fixed
3283 // quantity since a TextNode can only contain fixed-width things.
3284 int cp_offset = 0;
3285 for (int i = 0; i < element_count; i++) {
3286 TextElement& elm = elements()->at(i);
3287 elm.cp_offset = cp_offset;
3288 if (elm.type == TextElement::ATOM) {
3289 cp_offset += elm.data.u_atom->data().length();
3290 } else {
3291 cp_offset++;
3292 Vector<const uc16> quarks = elm.data.u_atom->data();
3293 }
3294 }
3295}
3296
3297
3298void AssertionPropagation::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003299 if (ignore_case_) {
3300 that->MakeCaseIndependent();
3301 }
3302 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003303 NodeInfo* info = that->info();
3304 NodeInfo* next_info = that->on_success()->info();
3305 // If the following node is interested in what it follows then this
3306 // node must determine it.
3307 info->determine_newline = next_info->follows_newline_interest;
3308 info->determine_word = next_info->follows_word_interest;
3309 info->determine_start = next_info->follows_start_interest;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003310 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003311}
3312
3313
ager@chromium.org8bb60582008-12-11 12:02:20 +00003314void AssertionPropagation::VisitAction(ActionNode* that) {
3315 RegExpNode* target = that->on_success();
3316 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003317 // If the next node is interested in what it follows then this node
3318 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003319 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003320}
3321
3322
ager@chromium.org8bb60582008-12-11 12:02:20 +00003323void AssertionPropagation::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003324 NodeInfo* info = that->info();
3325 for (int i = 0; i < that->alternatives()->length(); i++) {
3326 RegExpNode* node = that->alternatives()->at(i).node();
3327 EnsureAnalyzed(node);
3328 // Anything the following nodes need to know has to be known by
3329 // this node also, so it can pass it on.
3330 info->AddFromFollowing(node->info());
3331 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003332}
3333
3334
ager@chromium.org8bb60582008-12-11 12:02:20 +00003335void AssertionPropagation::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003336 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003337}
3338
3339
3340// -------------------------------------------------------------------
3341// Assumption expansion
3342
3343
3344RegExpNode* RegExpNode::EnsureExpanded(NodeInfo* info) {
3345 siblings_.Ensure(this);
3346 NodeInfo new_info = *this->info();
3347 if (new_info.follows_word_interest)
3348 new_info.follows_word = info->follows_word;
3349 if (new_info.follows_newline_interest)
3350 new_info.follows_newline = info->follows_newline;
3351 // If the following node should determine something we need to get
3352 // a sibling that determines it.
3353 new_info.does_determine_newline = new_info.determine_newline;
3354 new_info.does_determine_word = new_info.determine_word;
3355 new_info.does_determine_start = new_info.determine_start;
3356 RegExpNode* sibling = TryGetSibling(&new_info);
3357 if (sibling == NULL) {
3358 sibling = ExpandLocal(&new_info);
3359 siblings_.Add(sibling);
3360 sibling->info()->being_expanded = true;
3361 sibling->ExpandChildren();
3362 sibling->info()->being_expanded = false;
3363 sibling->info()->been_expanded = true;
3364 } else {
3365 NodeInfo* sib_info = sibling->info();
3366 if (!sib_info->been_expanded && !sib_info->being_expanded) {
3367 sibling->info()->being_expanded = true;
3368 sibling->ExpandChildren();
3369 sibling->info()->being_expanded = false;
3370 sibling->info()->been_expanded = true;
3371 }
3372 }
3373 return sibling;
3374}
3375
3376
3377RegExpNode* ChoiceNode::ExpandLocal(NodeInfo* info) {
3378 ChoiceNode* clone = this->Clone();
3379 clone->info()->ResetCompilationState();
3380 clone->info()->AddAssumptions(info);
3381 return clone;
3382}
3383
3384
3385void ChoiceNode::ExpandChildren() {
3386 ZoneList<GuardedAlternative>* alts = alternatives();
3387 ZoneList<GuardedAlternative>* new_alts
3388 = new ZoneList<GuardedAlternative>(alts->length());
3389 for (int i = 0; i < alts->length(); i++) {
3390 GuardedAlternative next = alts->at(i);
3391 next.set_node(next.node()->EnsureExpanded(info()));
3392 new_alts->Add(next);
3393 }
3394 alternatives_ = new_alts;
3395}
3396
3397
3398RegExpNode* TextNode::ExpandLocal(NodeInfo* info) {
3399 TextElement last = elements()->last();
3400 if (last.type == TextElement::CHAR_CLASS) {
3401 RegExpCharacterClass* char_class = last.data.u_char_class;
3402 if (info->does_determine_word) {
3403 ZoneList<CharacterRange>* word = NULL;
3404 ZoneList<CharacterRange>* non_word = NULL;
3405 CharacterRange::Split(char_class->ranges(),
3406 CharacterRange::GetWordBounds(),
3407 &word,
3408 &non_word);
3409 if (non_word == NULL) {
3410 // This node contains no non-word characters so it must be
3411 // all word.
3412 this->info()->is_word = NodeInfo::TRUE;
3413 } else if (word == NULL) {
3414 // Vice versa.
3415 this->info()->is_word = NodeInfo::FALSE;
3416 } else {
3417 // If this character class contains both word and non-word
3418 // characters we need to split it into two.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003419 ChoiceNode* result = new ChoiceNode(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003420 // Welcome to the family, son!
3421 result->set_siblings(this->siblings());
3422 *result->info() = *this->info();
3423 result->info()->ResetCompilationState();
3424 result->info()->AddAssumptions(info);
3425 RegExpNode* word_node
3426 = new TextNode(new RegExpCharacterClass(word, false),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003427 on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003428 word_node->info()->determine_word = true;
3429 word_node->info()->does_determine_word = true;
3430 word_node->info()->is_word = NodeInfo::TRUE;
3431 result->alternatives()->Add(GuardedAlternative(word_node));
3432 RegExpNode* non_word_node
3433 = new TextNode(new RegExpCharacterClass(non_word, false),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003434 on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003435 non_word_node->info()->determine_word = true;
3436 non_word_node->info()->does_determine_word = true;
3437 non_word_node->info()->is_word = NodeInfo::FALSE;
3438 result->alternatives()->Add(GuardedAlternative(non_word_node));
3439 return result;
3440 }
3441 }
3442 }
3443 TextNode* clone = this->Clone();
3444 clone->info()->ResetCompilationState();
3445 clone->info()->AddAssumptions(info);
3446 return clone;
3447}
3448
3449
3450void TextNode::ExpandAtomChildren(RegExpAtom* that) {
3451 NodeInfo new_info = *info();
3452 uc16 last = that->data()[that->data().length() - 1];
3453 if (info()->determine_word) {
3454 new_info.follows_word = IsRegExpWord(last)
3455 ? NodeInfo::TRUE : NodeInfo::FALSE;
3456 } else {
3457 new_info.follows_word = NodeInfo::UNKNOWN;
3458 }
3459 if (info()->determine_newline) {
3460 new_info.follows_newline = IsRegExpNewline(last)
3461 ? NodeInfo::TRUE : NodeInfo::FALSE;
3462 } else {
3463 new_info.follows_newline = NodeInfo::UNKNOWN;
3464 }
3465 if (info()->determine_start) {
3466 new_info.follows_start = NodeInfo::FALSE;
3467 } else {
3468 new_info.follows_start = NodeInfo::UNKNOWN;
3469 }
3470 set_on_success(on_success()->EnsureExpanded(&new_info));
3471}
3472
3473
3474void TextNode::ExpandCharClassChildren(RegExpCharacterClass* that) {
3475 if (info()->does_determine_word) {
3476 // ASSERT(info()->is_word != NodeInfo::UNKNOWN);
3477 NodeInfo next_info = *on_success()->info();
3478 next_info.follows_word = info()->is_word;
3479 set_on_success(on_success()->EnsureExpanded(&next_info));
3480 } else {
3481 set_on_success(on_success()->EnsureExpanded(info()));
3482 }
3483}
3484
3485
3486void TextNode::ExpandChildren() {
3487 TextElement last = elements()->last();
3488 switch (last.type) {
3489 case TextElement::ATOM:
3490 ExpandAtomChildren(last.data.u_atom);
3491 break;
3492 case TextElement::CHAR_CLASS:
3493 ExpandCharClassChildren(last.data.u_char_class);
3494 break;
3495 default:
3496 UNREACHABLE();
3497 }
3498}
3499
3500
3501RegExpNode* ActionNode::ExpandLocal(NodeInfo* info) {
3502 ActionNode* clone = this->Clone();
3503 clone->info()->ResetCompilationState();
3504 clone->info()->AddAssumptions(info);
3505 return clone;
3506}
3507
3508
3509void ActionNode::ExpandChildren() {
3510 set_on_success(on_success()->EnsureExpanded(info()));
3511}
3512
3513
3514RegExpNode* BackReferenceNode::ExpandLocal(NodeInfo* info) {
3515 BackReferenceNode* clone = this->Clone();
3516 clone->info()->ResetCompilationState();
3517 clone->info()->AddAssumptions(info);
3518 return clone;
3519}
3520
3521
3522void BackReferenceNode::ExpandChildren() {
3523 set_on_success(on_success()->EnsureExpanded(info()));
3524}
3525
3526
3527RegExpNode* EndNode::ExpandLocal(NodeInfo* info) {
3528 EndNode* clone = this->Clone();
3529 clone->info()->ResetCompilationState();
3530 clone->info()->AddAssumptions(info);
3531 return clone;
3532}
3533
3534
3535void EndNode::ExpandChildren() {
3536 // nothing to do
3537}
3538
3539
3540// -------------------------------------------------------------------
3541// Dispatch table construction
3542
3543
3544void DispatchTableConstructor::VisitEnd(EndNode* that) {
3545 AddRange(CharacterRange::Everything());
3546}
3547
3548
3549void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
3550 node->set_being_calculated(true);
3551 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
3552 for (int i = 0; i < alternatives->length(); i++) {
3553 set_choice_index(i);
3554 alternatives->at(i).node()->Accept(this);
3555 }
3556 node->set_being_calculated(false);
3557}
3558
3559
3560class AddDispatchRange {
3561 public:
3562 explicit AddDispatchRange(DispatchTableConstructor* constructor)
3563 : constructor_(constructor) { }
3564 void Call(uc32 from, DispatchTable::Entry entry);
3565 private:
3566 DispatchTableConstructor* constructor_;
3567};
3568
3569
3570void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
3571 CharacterRange range(from, entry.to());
3572 constructor_->AddRange(range);
3573}
3574
3575
3576void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
3577 if (node->being_calculated())
3578 return;
3579 DispatchTable* table = node->GetTable(ignore_case_);
3580 AddDispatchRange adder(this);
3581 table->ForEach(&adder);
3582}
3583
3584
3585void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
3586 // TODO(160): Find the node that we refer back to and propagate its start
3587 // set back to here. For now we just accept anything.
3588 AddRange(CharacterRange::Everything());
3589}
3590
3591
3592
3593static int CompareRangeByFrom(const CharacterRange* a,
3594 const CharacterRange* b) {
3595 return Compare<uc16>(a->from(), b->from());
3596}
3597
3598
3599void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
3600 ranges->Sort(CompareRangeByFrom);
3601 uc16 last = 0;
3602 for (int i = 0; i < ranges->length(); i++) {
3603 CharacterRange range = ranges->at(i);
3604 if (last < range.from())
3605 AddRange(CharacterRange(last, range.from() - 1));
3606 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003607 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003608 return;
3609 } else {
3610 last = range.to() + 1;
3611 }
3612 }
3613 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003614 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003615}
3616
3617
3618void DispatchTableConstructor::VisitText(TextNode* that) {
3619 TextElement elm = that->elements()->at(0);
3620 switch (elm.type) {
3621 case TextElement::ATOM: {
3622 uc16 c = elm.data.u_atom->data()[0];
3623 AddRange(CharacterRange(c, c));
3624 break;
3625 }
3626 case TextElement::CHAR_CLASS: {
3627 RegExpCharacterClass* tree = elm.data.u_char_class;
3628 ZoneList<CharacterRange>* ranges = tree->ranges();
3629 if (tree->is_negated()) {
3630 AddInverse(ranges);
3631 } else {
3632 for (int i = 0; i < ranges->length(); i++)
3633 AddRange(ranges->at(i));
3634 }
3635 break;
3636 }
3637 default: {
3638 UNIMPLEMENTED();
3639 }
3640 }
3641}
3642
3643
3644void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003645 RegExpNode* target = that->on_success();
3646 target->Accept(this);
3647}
3648
3649
3650#ifdef DEBUG
3651
3652
3653class VisitNodeScope {
3654 public:
3655 explicit VisitNodeScope(RegExpNode* node) : node_(node) {
3656 ASSERT(!node->info()->visited);
3657 node->info()->visited = true;
3658 }
3659 ~VisitNodeScope() {
3660 node_->info()->visited = false;
3661 }
3662 private:
3663 RegExpNode* node_;
3664};
3665
3666
3667class NodeValidator : public NodeVisitor {
3668 public:
3669 virtual void ValidateInfo(NodeInfo* info) = 0;
3670#define DECLARE_VISIT(Type) \
3671 virtual void Visit##Type(Type##Node* that);
3672FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3673#undef DECLARE_VISIT
3674};
3675
3676
3677class PostAnalysisNodeValidator : public NodeValidator {
3678 public:
3679 virtual void ValidateInfo(NodeInfo* info);
3680};
3681
3682
3683class PostExpansionNodeValidator : public NodeValidator {
3684 public:
3685 virtual void ValidateInfo(NodeInfo* info);
3686};
3687
3688
3689void PostAnalysisNodeValidator::ValidateInfo(NodeInfo* info) {
3690 ASSERT(info->been_analyzed);
3691}
3692
3693
3694void PostExpansionNodeValidator::ValidateInfo(NodeInfo* info) {
3695 ASSERT_EQ(info->determine_newline, info->does_determine_newline);
3696 ASSERT_EQ(info->determine_start, info->does_determine_start);
3697 ASSERT_EQ(info->determine_word, info->does_determine_word);
3698 ASSERT_EQ(info->follows_word_interest,
3699 (info->follows_word != NodeInfo::UNKNOWN));
3700 if (false) {
3701 // These are still unimplemented.
3702 ASSERT_EQ(info->follows_start_interest,
3703 (info->follows_start != NodeInfo::UNKNOWN));
3704 ASSERT_EQ(info->follows_newline_interest,
3705 (info->follows_newline != NodeInfo::UNKNOWN));
3706 }
3707}
3708
3709
3710void NodeValidator::VisitAction(ActionNode* that) {
3711 if (that->info()->visited) return;
3712 VisitNodeScope scope(that);
3713 ValidateInfo(that->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003714 that->on_success()->Accept(this);
3715}
3716
3717
ager@chromium.org8bb60582008-12-11 12:02:20 +00003718void NodeValidator::VisitBackReference(BackReferenceNode* that) {
3719 if (that->info()->visited) return;
3720 VisitNodeScope scope(that);
3721 ValidateInfo(that->info());
3722 that->on_success()->Accept(this);
3723}
3724
3725
3726void NodeValidator::VisitChoice(ChoiceNode* that) {
3727 if (that->info()->visited) return;
3728 VisitNodeScope scope(that);
3729 ValidateInfo(that->info());
3730 ZoneList<GuardedAlternative>* alts = that->alternatives();
3731 for (int i = 0; i < alts->length(); i++)
3732 alts->at(i).node()->Accept(this);
3733}
3734
3735
3736void NodeValidator::VisitEnd(EndNode* that) {
3737 if (that->info()->visited) return;
3738 VisitNodeScope scope(that);
3739 ValidateInfo(that->info());
3740}
3741
3742
3743void NodeValidator::VisitText(TextNode* that) {
3744 if (that->info()->visited) return;
3745 VisitNodeScope scope(that);
3746 ValidateInfo(that->info());
3747 that->on_success()->Accept(this);
3748}
3749
3750
3751#endif
3752
3753
3754Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003755 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003756 bool is_multiline,
3757 Handle<String> pattern,
3758 bool is_ascii) {
3759 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003760 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003761 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003762 0,
3763 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003764 compiler.accept());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003765 // Add a .*? at the beginning, outside the body capture.
3766 // Note: We could choose to not add this if the regexp is anchored at
3767 // the start of the input but I'm not sure how best to do that and
3768 // since we don't even handle ^ yet I'm saving that optimization for
3769 // later.
3770 RegExpNode* node = RegExpQuantifier::ToNode(0,
3771 RegExpQuantifier::kInfinity,
3772 false,
3773 new RegExpCharacterClass('*'),
3774 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003775 captured_body);
3776 AssertionPropagation analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003777 analysis.EnsureAnalyzed(node);
3778
3779 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003780 data->has_lookbehind = info.HasLookbehind();
3781 if (data->has_lookbehind) {
3782 // If this node needs information about the preceding text we let
3783 // it start with a character class that consumes a single character
3784 // and proceeds to wherever is appropriate. This means that if
3785 // has_lookbehind is set the code generator must start one character
3786 // before the start position.
3787 node = new TextNode(new RegExpCharacterClass('*'), node);
3788 analysis.EnsureAnalyzed(node);
3789 }
3790
3791#ifdef DEBUG
3792 PostAnalysisNodeValidator post_analysis_validator;
3793 node->Accept(&post_analysis_validator);
3794#endif
3795
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003796 node = node->EnsureExpanded(&info);
3797
ager@chromium.org8bb60582008-12-11 12:02:20 +00003798#ifdef DEBUG
3799 PostExpansionNodeValidator post_expansion_validator;
3800 node->Accept(&post_expansion_validator);
3801#endif
3802
3803 data->node = node;
3804
3805 if (is_multiline && !FLAG_attempt_multiline_irregexp) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003806 return Handle<FixedArray>::null();
3807 }
3808
ager@chromium.org8bb60582008-12-11 12:02:20 +00003809 if (data->has_lookbehind) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003810 return Handle<FixedArray>::null();
3811 }
3812
3813 if (FLAG_irregexp_native) {
3814#ifdef ARM
3815 // Unimplemented, fall-through to bytecode implementation.
3816#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00003817 RegExpMacroAssemblerIA32::Mode mode;
3818 if (is_ascii) {
3819 mode = RegExpMacroAssemblerIA32::ASCII;
3820 } else {
3821 mode = RegExpMacroAssemblerIA32::UC16;
3822 }
3823 RegExpMacroAssemblerIA32 macro_assembler(mode,
3824 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003825 return compiler.Assemble(&macro_assembler,
3826 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003827 data->capture_count,
3828 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003829#endif
3830 }
3831 EmbeddedVector<byte, 1024> codes;
3832 RegExpMacroAssemblerIrregexp macro_assembler(codes);
3833 return compiler.Assemble(&macro_assembler,
3834 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003835 data->capture_count,
3836 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003837}
3838
3839
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00003840}} // namespace v8::internal