blob: dadec3ba6170738d8e41c51af5ab994ded7431df [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000043#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000044
45#ifdef ARM
46#include "regexp-macro-assembler-arm.h"
47#else // IA32
48#include "macro-assembler-ia32.h"
49#include "regexp-macro-assembler-ia32.h"
50#endif
51
52#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000053
54// Including pcre.h undefines DEBUG to avoid getting debug output from
55// the JSCRE implementation. Make sure to redefine it in debug mode
56// after having included the header file.
57#ifdef DEBUG
58#include "third_party/jscre/pcre.h"
59#define DEBUG
60#else
61#include "third_party/jscre/pcre.h"
62#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000063
ager@chromium.orga74f0da2008-12-03 16:05:52 +000064
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000065namespace v8 { namespace internal {
66
67
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000068static Failure* malloc_failure;
69
70static void* JSREMalloc(size_t size) {
71 Object* obj = Heap::AllocateByteArray(size);
72
73 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
74 // will return NULL to the caller, performs GC there.
75 // Also pass failure information to the caller.
76 if (obj->IsFailure()) {
77 malloc_failure = Failure::cast(obj);
78 return NULL;
79 }
80
81 // Note: object is unrooted, the caller of jsRegExpCompile must
82 // create a handle for the return value before doing heap allocation.
83 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
84}
85
86
87static void JSREFree(void* p) {
88 USE(p); // Do nothing, memory is garbage collected.
89}
90
91
92String* RegExpImpl::last_ascii_string_ = NULL;
93String* RegExpImpl::two_byte_cached_string_ = NULL;
94
95
96void RegExpImpl::NewSpaceCollectionPrologue() {
97 // The two byte string is always in the old space. The Ascii string may be
98 // in either place. If it is in the old space we don't need to do anything.
99 if (Heap::InNewSpace(last_ascii_string_)) {
100 // Invalidate the cache.
101 last_ascii_string_ = NULL;
102 two_byte_cached_string_ = NULL;
103 }
104}
105
106
107void RegExpImpl::OldSpaceCollectionPrologue() {
108 last_ascii_string_ = NULL;
109 two_byte_cached_string_ = NULL;
110}
111
112
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000113Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
114 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000115 Handle<String> flags,
116 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000117 // Ensure that the constructor function has been loaded.
118 if (!constructor->IsLoaded()) {
119 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000120 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000121 }
122 // Call the construct code with 2 arguments.
123 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
124 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000125 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000126}
127
128
129// Converts a source string to a 16 bit flat string or a SlicedString containing
130// a 16 bit flat string).
131Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
132 if (*subject == last_ascii_string_) {
133 ASSERT(two_byte_cached_string_ != NULL);
134 return Handle<String>(String::cast(two_byte_cached_string_));
135 }
136 Handle<String> two_byte_string = StringToTwoByte(subject);
137 last_ascii_string_ = *subject;
138 two_byte_cached_string_ = *two_byte_string;
139 return two_byte_string;
140}
141
142
143// Converts a source string to a 16 bit flat string or a SlicedString containing
144// a 16 bit flat string).
145Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000146 StringShape shape(*pattern);
147 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000148 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000149 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000150 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000151 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000152 String::cast(ConsString::cast(*pattern)->first()) :
153 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000154 ASSERT(flat_string->IsString());
155 StringShape flat_shape(*flat_string);
156 ASSERT(!flat_shape.IsCons());
157 ASSERT(flat_shape.IsSequential() ||
158 flat_shape.IsSliced() ||
159 flat_shape.IsExternal());
160 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000161 return flat_string;
162 }
163
ager@chromium.org870a0b62008-11-04 11:43:05 +0000164 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000165 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000166 Factory::NewRawTwoByteString(len, TENURED);
167 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
168 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000169 return two_byte_string;
170}
171
172
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000173static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
174 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000175 StringShape shape(*str);
176 for (int i = 0; i < str->length(shape); i++) {
177 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000178 case 'i':
179 flags |= JSRegExp::IGNORE_CASE;
180 break;
181 case 'g':
182 flags |= JSRegExp::GLOBAL;
183 break;
184 case 'm':
185 flags |= JSRegExp::MULTILINE;
186 break;
187 }
188 }
189 return JSRegExp::Flags(flags);
190}
191
192
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000193static inline void ThrowRegExpException(Handle<JSRegExp> re,
194 Handle<String> pattern,
195 Handle<String> error_text,
196 const char* message) {
197 Handle<JSArray> array = Factory::NewJSArray(2);
198 SetElement(array, 0, pattern);
199 SetElement(array, 1, error_text);
200 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
201 Top::Throw(*regexp_err);
202}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000203
204
ager@chromium.org8bb60582008-12-11 12:02:20 +0000205// Generic RegExp methods. Dispatches to implementation specific methods.
206
207
208class OffsetsVector {
209 public:
210 inline OffsetsVector(int num_registers)
211 : offsets_vector_length_(num_registers) {
212 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
213 vector_ = NewArray<int>(offsets_vector_length_);
214 } else {
215 vector_ = static_offsets_vector_;
216 }
217 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000218 inline ~OffsetsVector() {
219 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
220 DeleteArray(vector_);
221 vector_ = NULL;
222 }
223 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000224 inline int* vector() { return vector_; }
225 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000226
227 private:
228 int* vector_;
229 int offsets_vector_length_;
230 static const int kStaticOffsetsVectorSize = 50;
231 static int static_offsets_vector_[kStaticOffsetsVectorSize];
232};
233
234
235int OffsetsVector::static_offsets_vector_[
236 OffsetsVector::kStaticOffsetsVectorSize];
237
238
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000239Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
240 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000241 Handle<String> flag_str) {
242 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
243 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
244 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000245 LOG(RegExpCompileEvent(re, in_cache));
246
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000247 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000248 if (in_cache) {
249 re->set_data(*cached);
250 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000251 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000252 FlattenString(pattern);
253 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000254 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000255 FlatStringReader reader(pattern);
256 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
257 // Throw an exception if we fail to parse the pattern.
258 ThrowRegExpException(re,
259 pattern,
260 parse_result.error,
261 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000262 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000263 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000264
265 if (parse_result.simple && !flags.is_ignore_case()) {
266 // Parse-tree is a single atom that is equal to the pattern.
267 result = AtomCompile(re, pattern, flags, pattern);
268 } else if (parse_result.tree->IsAtom() &&
269 !flags.is_ignore_case() &&
270 parse_result.capture_count == 0) {
271 RegExpAtom* atom = parse_result.tree->AsAtom();
272 Vector<const uc16> atom_pattern = atom->data();
273 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
274 result = AtomCompile(re, pattern, flags, atom_string);
275 } else if (FLAG_irregexp) {
276 result = IrregexpPrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000277 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000278 result = JscrePrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000279 }
280 Object* data = re->data();
281 if (data->IsFixedArray()) {
282 // If compilation succeeded then the data is set on the regexp
283 // and we can store it in the cache.
284 Handle<FixedArray> data(FixedArray::cast(re->data()));
285 CompilationCache::PutRegExp(pattern, flags, data);
286 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000287 }
288
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000289 return result;
290}
291
292
293Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
294 Handle<String> subject,
295 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000296 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000297 case JSRegExp::ATOM:
298 return AtomExec(regexp, subject, index);
299 case JSRegExp::IRREGEXP: {
300 Handle<Object> result = IrregexpExec(regexp, subject, index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000301 if (result.is_null()) ASSERT(Top::has_pending_exception());
302 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000303 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000304 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000305 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000306 default:
307 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000308 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000309 }
310}
311
312
313Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
314 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000315 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000316 case JSRegExp::ATOM:
317 return AtomExecGlobal(regexp, subject);
318 case JSRegExp::IRREGEXP: {
319 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000320 if (result.is_null()) ASSERT(Top::has_pending_exception());
321 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000322 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000323 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000324 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000325 default:
326 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000327 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000328 }
329}
330
331
ager@chromium.org8bb60582008-12-11 12:02:20 +0000332// RegExp Atom implementation: Simple string search using indexOf.
333
334
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000335Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000336 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000337 JSRegExp::Flags flags,
338 Handle<String> match_pattern) {
339 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000340 return re;
341}
342
343
344Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
345 Handle<String> subject,
346 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000347 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000348
349 uint32_t start_index;
350 if (!Array::IndexFromObject(*index, &start_index)) {
351 return Handle<Smi>(Smi::FromInt(-1));
352 }
353
ager@chromium.org7c537e22008-10-16 08:43:32 +0000354 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000355 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000356
357 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000358 array->set(0, Smi::FromInt(value));
359 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000360 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000361}
362
363
364Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
365 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000366 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000367 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000368 int index = 0;
369 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000370 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000371 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000372 while (true) {
ager@chromium.org7c537e22008-10-16 08:43:32 +0000373 int value = -1;
374 if (index + needle_length <= subject_length) {
375 value = Runtime::StringMatch(subject, needle, index);
376 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000377 if (value == -1) break;
378 HandleScope scope;
379 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000380
381 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000382 array->set(0, Smi::FromInt(value));
383 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000384 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000385 SetElement(result, match_count, pair);
386 match_count++;
387 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000388 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000389 }
390 return result;
391}
392
393
ager@chromium.org8bb60582008-12-11 12:02:20 +0000394// JSCRE implementation.
395
396
397int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
398 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
399 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
400}
401
402
403ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
404 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
405 return ByteArray::cast(value->get(kJscreInternalIndex));
406}
407
408
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000409Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
410 Handle<String> pattern,
411 JSRegExp::Flags flags) {
412 Handle<Object> value(Heap::undefined_value());
413 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
414 return re;
415}
416
417
ager@chromium.org8bb60582008-12-11 12:02:20 +0000418static inline Object* JscreDoCompile(String* pattern,
419 JSRegExp::Flags flags,
420 unsigned* number_of_captures,
421 const char** error_message,
422 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000423 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
424 ? v8::jscre::JSRegExpIgnoreCase
425 : v8::jscre::JSRegExpDoNotIgnoreCase;
426 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
427 ? v8::jscre::JSRegExpMultiline
428 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000429 *error_message = NULL;
430 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000431 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
432 pattern->length(),
433 case_option,
434 multiline_option,
435 number_of_captures,
436 error_message,
437 &JSREMalloc,
438 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000439 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000440 malloc_failure->IsOutOfMemoryFailure())) {
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000441 return malloc_failure;
442 } else {
443 // It doesn't matter which object we return here, we just need to return
444 // a non-failure to indicate to the GC-retry code that there was no
445 // allocation failure.
446 return pattern;
447 }
448}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000449
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000450
ager@chromium.org8bb60582008-12-11 12:02:20 +0000451static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
452 JSRegExp::Flags flags,
453 unsigned* number_of_captures,
454 const char** error_message,
455 v8::jscre::JscreRegExp** code) {
456 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
457 flags,
458 number_of_captures,
459 error_message,
460 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000461}
462
463
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000464Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
465 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
466 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
467
468 Handle<String> pattern(re->Pattern());
469 JSRegExp::Flags flags = re->GetFlags();
470
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000471 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
472
473 unsigned number_of_captures;
474 const char* error_message = NULL;
475
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000476 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000477 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000478
ager@chromium.org8bb60582008-12-11 12:02:20 +0000479 JscreCompileWithRetryAfterGC(two_byte_pattern,
480 flags,
481 &number_of_captures,
482 &error_message,
483 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000484
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000485 if (code == NULL) {
486 // Throw an exception.
487 Handle<JSArray> array = Factory::NewJSArray(2);
488 SetElement(array, 0, pattern);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000489 const char* message =
490 (error_message == NULL) ? "Unknown regexp error" : error_message;
491 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000492 Handle<Object> regexp_err =
493 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000494 Top::Throw(*regexp_err);
495 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000496 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000497
498 // Convert the return address to a ByteArray pointer.
499 Handle<ByteArray> internal(
500 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
501
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000502 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
503 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
504 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000505 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
506
507 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000508}
509
510
ager@chromium.org8bb60582008-12-11 12:02:20 +0000511Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
512 Handle<String> subject,
513 Handle<Object> index) {
514 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
515 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
516 Handle<Object> compile_result = JscreCompile(regexp);
517 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000518 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000519 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000520
ager@chromium.org8bb60582008-12-11 12:02:20 +0000521 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000522
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000524
ager@chromium.org8bb60582008-12-11 12:02:20 +0000525 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000526
ager@chromium.org8bb60582008-12-11 12:02:20 +0000527 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000528
ager@chromium.org8bb60582008-12-11 12:02:20 +0000529 return JscreExecOnce(regexp,
530 num_captures,
531 subject,
532 previous_index,
533 subject16->GetTwoByteData(),
534 offsets.vector(),
535 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000536}
537
538
539Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
540 int num_captures,
541 Handle<String> subject,
542 int previous_index,
543 const uc16* two_byte_subject,
544 int* offsets_vector,
545 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000546 int rc;
547 {
548 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000549 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000550 const v8::jscre::JscreRegExp* js_regexp =
551 reinterpret_cast<v8::jscre::JscreRegExp*>(
552 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000553
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000554 rc = v8::jscre::jsRegExpExecute(js_regexp,
555 two_byte_subject,
556 subject->length(),
557 previous_index,
558 offsets_vector,
559 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000560 }
561
562 // The KJS JavaScript engine returns null (ie, a failed match) when
563 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000564 if (rc == v8::jscre::JSRegExpErrorNoMatch
565 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000566 return Factory::null_value();
567 }
568
569 // Other JSRE errors:
570 if (rc < 0) {
571 // Throw an exception.
572 Handle<Object> code(Smi::FromInt(rc));
573 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
574 Handle<Object> regexp_err(
575 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
576 return Handle<Object>(Top::Throw(*regexp_err));
577 }
578
ager@chromium.org7c537e22008-10-16 08:43:32 +0000579 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000580 // The captures come in (start, end+1) pairs.
581 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000582 array->set(i, Smi::FromInt(offsets_vector[i]));
583 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000584 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000585 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000586}
587
588
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000589Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
590 Handle<String> subject) {
591 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
592 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
593 Handle<Object> compile_result = JscreCompile(regexp);
594 if (compile_result.is_null()) return compile_result;
595 }
596 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
597
598 // Prepare space for the return values.
599 int num_captures = JscreNumberOfCaptures(regexp);
600
601 OffsetsVector offsets((num_captures + 1) * 3);
602
603 int previous_index = 0;
604
605 Handle<JSArray> result = Factory::NewJSArray(0);
606 int i = 0;
607 Handle<Object> matches;
608
609 Handle<String> subject16 = CachedStringToTwoByte(subject);
610
611 do {
612 if (previous_index > subject->length() || previous_index < 0) {
613 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
614 // string length, there is no match.
615 matches = Factory::null_value();
616 } else {
617 matches = JscreExecOnce(regexp,
618 num_captures,
619 subject,
620 previous_index,
621 subject16->GetTwoByteData(),
622 offsets.vector(),
623 offsets.length());
624
625 if (matches->IsJSArray()) {
626 SetElement(result, i, matches);
627 i++;
628 previous_index = offsets.vector()[1];
629 if (offsets.vector()[0] == offsets.vector()[1]) {
630 previous_index++;
631 }
632 }
633 }
634 } while (matches->IsJSArray());
635
636 // If we exited the loop with an exception, throw it.
637 if (matches->IsNull()) {
638 // Exited loop normally.
639 return result;
640 } else {
641 // Exited loop with the exception in matches.
642 return matches;
643 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000644}
645
646
ager@chromium.org8bb60582008-12-11 12:02:20 +0000647// Irregexp implementation.
648
649
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000650// Retrieves a compiled version of the regexp for either ASCII or non-ASCII
651// strings. If the compiled version doesn't already exist, it is compiled
652// from the source pattern.
653// Irregexp is not feature complete yet. If there is something in the
654// regexp that the compiler cannot currently handle, an empty
655// handle is returned, but no exception is thrown.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000656static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
657 bool is_ascii) {
658 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
659 Handle<FixedArray> alternatives(
660 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
661 ASSERT_EQ(2, alternatives->length());
662
663 int index = is_ascii ? 0 : 1;
664 Object* entry = alternatives->get(index);
665 if (!entry->IsNull()) {
666 return Handle<FixedArray>(FixedArray::cast(entry));
667 }
668
669 // Compile the RegExp.
670 ZoneScope zone_scope(DELETE_ON_EXIT);
671
672 JSRegExp::Flags flags = re->GetFlags();
673
674 Handle<String> pattern(re->Pattern());
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000675 if (!pattern->IsFlat(StringShape(*pattern))) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000676 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000677 }
678
679 RegExpCompileData compile_data;
680 FlatStringReader reader(pattern);
681 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
682 // Throw an exception if we fail to parse the pattern.
683 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
684 ThrowRegExpException(re,
685 pattern,
686 compile_data.error,
687 "malformed_regexp");
688 return Handle<FixedArray>::null();
689 }
690 Handle<FixedArray> compiled_entry =
691 RegExpEngine::Compile(&compile_data,
692 flags.is_ignore_case(),
693 flags.is_multiline(),
694 pattern,
695 is_ascii);
696 if (!compiled_entry.is_null()) {
697 alternatives->set(index, *compiled_entry);
698 }
699 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000700}
701
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000702
ager@chromium.org8bb60582008-12-11 12:02:20 +0000703int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
704 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000705}
706
707
ager@chromium.org8bb60582008-12-11 12:02:20 +0000708int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
709 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000710}
711
712
ager@chromium.org8bb60582008-12-11 12:02:20 +0000713Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
714 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
715 == RegExpMacroAssembler::kBytecodeImplementation);
716 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000717}
718
719
ager@chromium.org8bb60582008-12-11 12:02:20 +0000720Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
721 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
722 != RegExpMacroAssembler::kBytecodeImplementation);
723 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
724}
725
726
727Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
728 Handle<String> pattern,
729 JSRegExp::Flags flags) {
730 // Make space for ASCII and UC16 versions.
731 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
732 alternatives->set_null(0);
733 alternatives->set_null(1);
734 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
735 return re;
736}
737
738
739Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
740 Handle<String> subject,
741 Handle<Object> index) {
742 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
743 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
744
745 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
746 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
747 if (irregexp.is_null()) {
748 // We can't handle the RegExp with IRRegExp.
749 return Handle<Object>::null();
750 }
751
752 // Prepare space for the return values.
753 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
754 OffsetsVector offsets(number_of_registers);
755
756 int num_captures = IrregexpNumberOfCaptures(irregexp);
757
758 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
759
760#ifdef DEBUG
761 if (FLAG_trace_regexp_bytecodes) {
762 String* pattern = regexp->Pattern();
763 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
764 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
765 }
766#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000767
768 if (!subject->IsFlat(StringShape(*subject))) {
769 FlattenString(subject);
770 }
771
ager@chromium.org8bb60582008-12-11 12:02:20 +0000772 return IrregexpExecOnce(irregexp,
773 num_captures,
774 subject,
775 previous_index,
776 offsets.vector(),
777 offsets.length());
778}
779
780
781Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
782 Handle<String> subject) {
783 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
784
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000785 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000786 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
787 if (irregexp.is_null()) {
788 return Handle<Object>::null();
789 }
790
791 // Prepare space for the return values.
792 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
793 OffsetsVector offsets(number_of_registers);
794
795 int previous_index = 0;
796
797 Handle<JSArray> result = Factory::NewJSArray(0);
798 int i = 0;
799 Handle<Object> matches;
800
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000801 if (!subject->IsFlat(StringShape(*subject))) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000802 FlattenString(subject);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000803 }
804
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000805 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000806 if (previous_index > subject->length() || previous_index < 0) {
807 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
808 // string length, there is no match.
809 matches = Factory::null_value();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000810 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000811 } else {
812#ifdef DEBUG
813 if (FLAG_trace_regexp_bytecodes) {
814 String* pattern = regexp->Pattern();
815 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
816 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
817 }
818#endif
ager@chromium.org8bb60582008-12-11 12:02:20 +0000819 matches = IrregexpExecOnce(irregexp,
820 IrregexpNumberOfCaptures(irregexp),
821 subject,
822 previous_index,
823 offsets.vector(),
824 offsets.length());
825
826 if (matches->IsJSArray()) {
827 SetElement(result, i, matches);
828 i++;
829 previous_index = offsets.vector()[1];
830 if (offsets.vector()[0] == offsets.vector()[1]) {
831 previous_index++;
832 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000833 } else if (matches->IsNull()) {
834 return result;
835 } else {
836 return matches;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000837 }
838 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000839 }
840}
841
842
843Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
844 int num_captures,
845 Handle<String> subject,
846 int previous_index,
847 int* offsets_vector,
848 int offsets_vector_length) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000849 ASSERT(subject->IsFlat(StringShape(*subject)));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000850 bool rc;
851
852 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
853
ager@chromium.org8bb60582008-12-11 12:02:20 +0000854 switch (tag) {
855 case RegExpMacroAssembler::kIA32Implementation: {
856#ifndef ARM
857 Handle<Code> code = IrregexpNativeCode(irregexp);
858
859 StringShape shape(*subject);
860
861 // Character offsets into string.
862 int start_offset = previous_index;
863 int end_offset = subject->length(shape);
864
865 if (shape.IsCons()) {
866 subject = Handle<String>(ConsString::cast(*subject)->first());
867 } else if (shape.IsSliced()) {
868 SlicedString* slice = SlicedString::cast(*subject);
869 start_offset += slice->start();
870 end_offset += slice->start();
871 subject = Handle<String>(slice->buffer());
872 }
873
874 // String is now either Sequential or External
875 StringShape flatshape(*subject);
876 bool is_ascii = flatshape.IsAsciiRepresentation();
877 int char_size_shift = is_ascii ? 0 : 1;
878
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000879 RegExpMacroAssemblerIA32::Result res;
880
ager@chromium.org8bb60582008-12-11 12:02:20 +0000881 if (flatshape.IsExternal()) {
882 const byte* address;
883 if (is_ascii) {
884 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
885 address = reinterpret_cast<const byte*>(ext->resource()->data());
886 } else {
887 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
888 address = reinterpret_cast<const byte*>(ext->resource()->data());
889 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000890 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000891 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000892 const_cast<Address*>(&address),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000893 start_offset << char_size_shift,
894 end_offset << char_size_shift,
895 offsets_vector,
896 previous_index == 0);
897 } else { // Sequential string
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000898 ASSERT(StringShape(*subject).IsSequential());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000899 Address char_address =
900 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
901 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
902 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000903 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000904 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000905 reinterpret_cast<Address*>(subject.location()),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000906 byte_offset + (start_offset << char_size_shift),
907 byte_offset + (end_offset << char_size_shift),
908 offsets_vector,
909 previous_index == 0);
910 }
911
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000912 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
913 ASSERT(Top::has_pending_exception());
914 return Handle<Object>::null();
915 }
916 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
917
ager@chromium.org8bb60582008-12-11 12:02:20 +0000918 if (rc) {
919 // Capture values are relative to start_offset only.
920 for (int i = 0; i < offsets_vector_length; i++) {
921 if (offsets_vector[i] >= 0) {
922 offsets_vector[i] += previous_index;
923 }
924 }
925 }
926 break;
927#else
928 UNIMPLEMENTED();
929 rc = false;
930 break;
931#endif
932 }
933 case RegExpMacroAssembler::kBytecodeImplementation: {
934 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
935 offsets_vector[i] = -1;
936 }
937 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
938
939 rc = IrregexpInterpreter::Match(byte_codes,
940 subject,
941 offsets_vector,
942 previous_index);
943 break;
944 }
945 case RegExpMacroAssembler::kARMImplementation:
946 default:
947 UNREACHABLE();
948 rc = false;
949 break;
950 }
951
952 if (!rc) {
953 return Factory::null_value();
954 }
955
956 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
957 // The captures come in (start, end+1) pairs.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000958 for (int i = 0; i < 2 * (num_captures + 1); i += 2) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000959 array->set(i, Smi::FromInt(offsets_vector[i]));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000960 array->set(i + 1, Smi::FromInt(offsets_vector[i + 1]));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000961 }
962 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000963}
964
965
966// -------------------------------------------------------------------
967// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000968//
969// The Irregexp regular expression engine is intended to be a complete
970// implementation of ECMAScript regular expressions. It generates either
971// bytecodes or native code.
972
973// The Irregexp regexp engine is structured in three steps.
974// 1) The parser generates an abstract syntax tree. See ast.cc.
975// 2) From the AST a node network is created. The nodes are all
976// subclasses of RegExpNode. The nodes represent states when
977// executing a regular expression. Several optimizations are
978// performed on the node network.
979// 3) From the nodes we generate either byte codes or native code
980// that can actually execute the regular expression (perform
981// the search). The code generation step is described in more
982// detail below.
983
984// Code generation.
985//
986// The nodes are divided into four main categories.
987// * Choice nodes
988// These represent places where the regular expression can
989// match in more than one way. For example on entry to an
990// alternation (foo|bar) or a repetition (*, +, ? or {}).
991// * Action nodes
992// These represent places where some action should be
993// performed. Examples include recording the current position
994// in the input string to a register (in order to implement
995// captures) or other actions on register for example in order
996// to implement the counters needed for {} repetitions.
997// * Matching nodes
998// These attempt to match some element part of the input string.
999// Examples of elements include character classes, plain strings
1000// or back references.
1001// * End nodes
1002// These are used to implement the actions required on finding
1003// a successful match or failing to find a match.
1004//
1005// The code generated (whether as byte codes or native code) maintains
1006// some state as it runs. This consists of the following elements:
1007//
1008// * The capture registers. Used for string captures.
1009// * Other registers. Used for counters etc.
1010// * The current position.
1011// * The stack of backtracking information. Used when a matching node
1012// fails to find a match and needs to try an alternative.
1013//
1014// Conceptual regular expression execution model:
1015//
1016// There is a simple conceptual model of regular expression execution
1017// which will be presented first. The actual code generated is a more
1018// efficient simulation of the simple conceptual model:
1019//
1020// * Choice nodes are implemented as follows:
1021// For each choice except the last {
1022// push current position
1023// push backtrack code location
1024// <generate code to test for choice>
1025// backtrack code location:
1026// pop current position
1027// }
1028// <generate code to test for last choice>
1029//
1030// * Actions nodes are generated as follows
1031// <push affected registers on backtrack stack>
1032// <generate code to perform action>
1033// push backtrack code location
1034// <generate code to test for following nodes>
1035// backtrack code location:
1036// <pop affected registers to restore their state>
1037// <pop backtrack location from stack and go to it>
1038//
1039// * Matching nodes are generated as follows:
1040// if input string matches at current position
1041// update current position
1042// <generate code to test for following nodes>
1043// else
1044// <pop backtrack location from stack and go to it>
1045//
1046// Thus it can be seen that the current position is saved and restored
1047// by the choice nodes, whereas the registers are saved and restored by
1048// by the action nodes that manipulate them.
1049//
1050// The other interesting aspect of this model is that nodes are generated
1051// at the point where they are needed by a recursive call to Emit(). If
1052// the node has already been code generated then the Emit() call will
1053// generate a jump to the previously generated code instead. In order to
1054// limit recursion it is possible for the Emit() function to put the node
1055// on a work list for later generation and instead generate a jump. The
1056// destination of the jump is resolved later when the code is generated.
1057//
1058// Actual regular expression code generation.
1059//
1060// Code generation is actually more complicated than the above. In order
1061// to improve the efficiency of the generated code some optimizations are
1062// performed
1063//
1064// * Choice nodes have 1-character lookahead.
1065// A choice node looks at the following character and eliminates some of
1066// the choices immediately based on that character. This is not yet
1067// implemented.
1068// * Simple greedy loops store reduced backtracking information.
1069// A quantifier like /.*foo/m will greedily match the whole input. It will
1070// then need to backtrack to a point where it can match "foo". The naive
1071// implementation of this would push each character position onto the
1072// backtracking stack, then pop them off one by one. This would use space
1073// proportional to the length of the input string. However since the "."
1074// can only match in one way and always has a constant length (in this case
1075// of 1) it suffices to store the current position on the top of the stack
1076// once. Matching now becomes merely incrementing the current position and
1077// backtracking becomes decrementing the current position and checking the
1078// result against the stored current position. This is faster and saves
1079// space.
1080// * The current state is virtualized.
1081// This is used to defer expensive operations until it is clear that they
1082// are needed and to generate code for a node more than once, allowing
1083// specialized an efficient versions of the code to be created. This is
1084// explained in the section below.
1085//
1086// Execution state virtualization.
1087//
1088// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +00001089// manipulation in an object called the Trace. The Trace object can record a
1090// current position offset, an optional backtrack code location on the top of
1091// the virtualized backtrack stack and some register changes. When a node is
1092// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +00001093// will emit code to bring the actual state into line with the virtual state.
1094// Avoiding flushing the state can postpone some work (eg updates of capture
1095// registers). Postponing work can save time when executing the regular
1096// expression since it may be found that the work never has to be done as a
1097// failure to match can occur. In addition it is much faster to jump to a
1098// known backtrack code location than it is to pop an unknown backtrack
1099// location from the stack and jump there.
1100//
ager@chromium.org32912102009-01-16 10:38:43 +00001101// The virtual state found in the Trace affects code generation. For example
1102// the virtual state contains the difference between the actual current
1103// position and the virtual current position, and matching code needs to use
1104// this offset to attempt a match in the correct location of the input
1105// string. Therefore code generated for a non-trivial trace is specialized
1106// to that trace. The code generator therefore has the ability to generate
1107// code for each node several times. In order to limit the size of the
1108// generated code there is an arbitrary limit on how many specialized sets of
1109// code may be generated for a given node. If the limit is reached, the
1110// trace is flushed and a generic version of the code for a node is emitted.
1111// This is subsequently used for that node. The code emitted for non-generic
1112// trace is not recorded in the node and so it cannot currently be reused in
1113// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001114
1115
1116void RegExpTree::AppendToText(RegExpText* text) {
1117 UNREACHABLE();
1118}
1119
1120
1121void RegExpAtom::AppendToText(RegExpText* text) {
1122 text->AddElement(TextElement::Atom(this));
1123}
1124
1125
1126void RegExpCharacterClass::AppendToText(RegExpText* text) {
1127 text->AddElement(TextElement::CharClass(this));
1128}
1129
1130
1131void RegExpText::AppendToText(RegExpText* text) {
1132 for (int i = 0; i < elements()->length(); i++)
1133 text->AddElement(elements()->at(i));
1134}
1135
1136
1137TextElement TextElement::Atom(RegExpAtom* atom) {
1138 TextElement result = TextElement(ATOM);
1139 result.data.u_atom = atom;
1140 return result;
1141}
1142
1143
1144TextElement TextElement::CharClass(
1145 RegExpCharacterClass* char_class) {
1146 TextElement result = TextElement(CHAR_CLASS);
1147 result.data.u_char_class = char_class;
1148 return result;
1149}
1150
1151
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001152int TextElement::length() {
1153 if (type == ATOM) {
1154 return data.u_atom->length();
1155 } else {
1156 ASSERT(type == CHAR_CLASS);
1157 return 1;
1158 }
1159}
1160
1161
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001162DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1163 if (table_ == NULL) {
1164 table_ = new DispatchTable();
1165 DispatchTableConstructor cons(table_, ignore_case);
1166 cons.BuildTable(this);
1167 }
1168 return table_;
1169}
1170
1171
1172class RegExpCompiler {
1173 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001174 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001175
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001176 int AllocateRegister() {
1177 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1178 reg_exp_too_big_ = true;
1179 return next_register_;
1180 }
1181 return next_register_++;
1182 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001183
1184 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1185 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001186 int capture_count,
1187 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001188
1189 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1190
1191 static const int kImplementationOffset = 0;
1192 static const int kNumberOfRegistersOffset = 0;
1193 static const int kCodeOffset = 1;
1194
1195 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1196 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001197
1198 static const int kMaxRecursion = 100;
1199 inline int recursion_depth() { return recursion_depth_; }
1200 inline void IncrementRecursionDepth() { recursion_depth_++; }
1201 inline void DecrementRecursionDepth() { recursion_depth_--; }
1202
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001203 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1204
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001205 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001206 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001207
ager@chromium.org32912102009-01-16 10:38:43 +00001208 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001209 private:
1210 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001211 int next_register_;
1212 List<RegExpNode*>* work_list_;
1213 int recursion_depth_;
1214 RegExpMacroAssembler* macro_assembler_;
1215 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001216 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001217 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001218};
1219
1220
1221class RecursionCheck {
1222 public:
1223 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1224 compiler->IncrementRecursionDepth();
1225 }
1226 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1227 private:
1228 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001229};
1230
1231
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001232static Handle<FixedArray> IrregexpRegExpTooBig(Handle<String> pattern) {
1233 Handle<JSArray> array = Factory::NewJSArray(2);
1234 SetElement(array, 0, pattern);
1235 const char* message = "RegExp too big";
1236 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
1237 Handle<Object> regexp_err =
1238 Factory::NewSyntaxError("malformed_regexp", array);
1239 Top::Throw(*regexp_err);
1240 return Handle<FixedArray>();
1241}
1242
1243
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001244// Attempts to compile the regexp using an Irregexp code generator. Returns
1245// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001246RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001247 : next_register_(2 * (capture_count + 1)),
1248 work_list_(NULL),
1249 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001250 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001251 ascii_(ascii),
1252 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001253 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001254 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001255}
1256
1257
1258Handle<FixedArray> RegExpCompiler::Assemble(
1259 RegExpMacroAssembler* macro_assembler,
1260 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001261 int capture_count,
1262 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001263#ifdef DEBUG
1264 if (FLAG_trace_regexp_assembler)
1265 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1266 else
1267#endif
1268 macro_assembler_ = macro_assembler;
1269 List <RegExpNode*> work_list(0);
1270 work_list_ = &work_list;
1271 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001272 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001273 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001274 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001275 macro_assembler_->Bind(&fail);
1276 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001277 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001278 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001279 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001280 if (reg_exp_too_big_) return IrregexpRegExpTooBig(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001281 Handle<FixedArray> array =
1282 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1283 array->set(RegExpImpl::kIrregexpImplementationIndex,
1284 Smi::FromInt(macro_assembler_->Implementation()));
1285 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1286 Smi::FromInt(next_register_));
1287 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1288 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001289 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001290 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1291 work_list_ = NULL;
1292#ifdef DEBUG
1293 if (FLAG_trace_regexp_assembler) {
1294 delete macro_assembler_;
1295 }
1296#endif
1297 return array;
1298}
1299
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001300
ager@chromium.org32912102009-01-16 10:38:43 +00001301bool Trace::DeferredAction::Mentions(int that) {
1302 if (type() == ActionNode::CLEAR_CAPTURES) {
1303 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1304 return range.Contains(that);
1305 } else {
1306 return reg() == that;
1307 }
1308}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001309
ager@chromium.org32912102009-01-16 10:38:43 +00001310
1311bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001312 for (DeferredAction* action = actions_;
1313 action != NULL;
1314 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001315 if (action->Mentions(reg))
1316 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001317 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001318 return false;
1319}
1320
1321
ager@chromium.org32912102009-01-16 10:38:43 +00001322bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1323 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001324 for (DeferredAction* action = actions_;
1325 action != NULL;
1326 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001327 if (action->Mentions(reg)) {
1328 if (action->type() == ActionNode::STORE_POSITION) {
1329 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1330 return true;
1331 } else {
1332 return false;
1333 }
1334 }
1335 }
1336 return false;
1337}
1338
1339
1340int Trace::FindAffectedRegisters(OutSet* affected_registers) {
1341 int max_register = RegExpCompiler::kNoRegister;
1342 for (DeferredAction* action = actions_;
1343 action != NULL;
1344 action = action->next()) {
1345 if (action->type() == ActionNode::CLEAR_CAPTURES) {
1346 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1347 for (int i = range.from(); i <= range.to(); i++)
1348 affected_registers->Set(i);
1349 if (range.to() > max_register) max_register = range.to();
1350 } else {
1351 affected_registers->Set(action->reg());
1352 if (action->reg() > max_register) max_register = action->reg();
1353 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001354 }
1355 return max_register;
1356}
1357
1358
ager@chromium.org32912102009-01-16 10:38:43 +00001359void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1360 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001361 OutSet& registers_to_pop,
1362 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001363 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001364 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1365 else if (registers_to_clear.Get(reg)) {
1366 int clear_to = reg;
1367 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1368 reg--;
1369 }
1370 assembler->ClearRegisters(reg, clear_to);
1371 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001372 }
1373}
1374
1375
ager@chromium.org32912102009-01-16 10:38:43 +00001376void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1377 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001378 OutSet& affected_registers,
1379 OutSet* registers_to_pop,
1380 OutSet* registers_to_clear) {
1381 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1382 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1383
ager@chromium.org8bb60582008-12-11 12:02:20 +00001384 for (int reg = 0; reg <= max_register; reg++) {
1385 if (!affected_registers.Get(reg)) {
1386 continue;
1387 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001388 // Count pushes performed to force a stack limit check occasionally.
1389 int pushes = 0;
1390
1391 // The chronologically first deferred action in the trace
1392 // is used to infer the action needed to restore a register
1393 // to its previous state (or not, if it's safe to ignore it).
1394 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1395 DeferredActionUndoType undo_action = IGNORE;
1396
ager@chromium.org8bb60582008-12-11 12:02:20 +00001397 int value = 0;
1398 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001399 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001400 int store_position = -1;
1401 // This is a little tricky because we are scanning the actions in reverse
1402 // historical order (newest first).
1403 for (DeferredAction* action = actions_;
1404 action != NULL;
1405 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001406 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001407 switch (action->type()) {
1408 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001409 Trace::DeferredSetRegister* psr =
1410 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001411 if (!absolute) {
1412 value += psr->value();
1413 absolute = true;
1414 }
1415 // SET_REGISTER is currently only used for newly introduced loop
1416 // counters. They can have a significant previous value if they
1417 // occour in a loop. TODO(lrn): Propagate this information, so
1418 // we can set undo_action to IGNORE if we know there is no value to
1419 // restore.
1420 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001421 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001422 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001423 break;
1424 }
1425 case ActionNode::INCREMENT_REGISTER:
1426 if (!absolute) {
1427 value++;
1428 }
1429 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001430 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001431 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001432 break;
1433 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001434 Trace::DeferredCapture* pc =
1435 static_cast<Trace::DeferredCapture*>(action);
1436 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001437 store_position = pc->cp_offset();
1438 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001439
1440 // For captures we know that stores and clears alternate.
1441 // Other register, are never cleared, and if the occur
1442 // inside a loop, they might be assigned more than once.
1443 if (reg <= 1) {
1444 // Registers zero and one, aka "capture zero", is
1445 // always set correctly if we succeed. There is no
1446 // need to undo a setting on backtrack, because we
1447 // will set it again or fail.
1448 undo_action = IGNORE;
1449 } else {
1450 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1451 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001452 ASSERT(!absolute);
1453 ASSERT_EQ(value, 0);
1454 break;
1455 }
ager@chromium.org32912102009-01-16 10:38:43 +00001456 case ActionNode::CLEAR_CAPTURES: {
1457 // Since we're scanning in reverse order, if we've already
1458 // set the position we have to ignore historically earlier
1459 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001460 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001461 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001462 }
1463 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001464 ASSERT(!absolute);
1465 ASSERT_EQ(value, 0);
1466 break;
1467 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001468 default:
1469 UNREACHABLE();
1470 break;
1471 }
1472 }
1473 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001474 // Prepare for the undo-action (e.g., push if it's going to be popped).
1475 if (undo_action == RESTORE) {
1476 pushes++;
1477 RegExpMacroAssembler::StackCheckFlag stack_check =
1478 RegExpMacroAssembler::kNoStackLimitCheck;
1479 if (pushes == push_limit) {
1480 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1481 pushes = 0;
1482 }
1483
1484 assembler->PushRegister(reg, stack_check);
1485 registers_to_pop->Set(reg);
1486 } else if (undo_action == CLEAR) {
1487 registers_to_clear->Set(reg);
1488 }
1489 // Perform the chronologically last action (or accumulated increment)
1490 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001491 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001492 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001493 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001494 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001495 } else if (absolute) {
1496 assembler->SetRegister(reg, value);
1497 } else if (value != 0) {
1498 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001499 }
1500 }
1501}
1502
1503
ager@chromium.org8bb60582008-12-11 12:02:20 +00001504// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001505// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001506// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001507void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001508 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001509
iposva@chromium.org245aa852009-02-10 00:49:54 +00001510 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001511
1512 if (actions_ == NULL && backtrack() == NULL) {
1513 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001514 // a normal situation. We may also have to forget some information gained
1515 // through a quick check that was already performed.
1516 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001517 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001518 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001519 successor->Emit(compiler, &new_state);
1520 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001521 }
1522
1523 // Generate deferred actions here along with code to undo them again.
1524 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001525
ager@chromium.org8bb60582008-12-11 12:02:20 +00001526 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001527 OutSet registers_to_pop;
1528 OutSet registers_to_clear;
1529 PerformDeferredActions(assembler,
1530 max_register,
1531 affected_registers,
1532 &registers_to_pop,
1533 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001534 if (backtrack() != NULL) {
1535 // Here we have a concrete backtrack location. These are set up by choice
1536 // nodes and so they indicate that we have a deferred save of the current
1537 // position which we may need to emit here.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001538 assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001539 }
1540 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001541 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001542 }
1543
1544 // Create a new trivial state and generate the node with that.
1545 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001546 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001547 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001548 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001549
1550 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001551 assembler->Bind(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001552 if (backtrack() != NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001553 assembler->PopCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001554 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001555 RestoreAffectedRegisters(assembler,
1556 max_register,
1557 registers_to_pop,
1558 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001559 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001560 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001561 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001562 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001563 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001564}
1565
1566
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001567void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001568 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001569
1570 // Omit flushing the trace. We discard the entire stack frame anyway.
1571
ager@chromium.org8bb60582008-12-11 12:02:20 +00001572 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001573 // We are completely independent of the trace, since we ignore it,
1574 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001575 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001576 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001577
1578 // Throw away everything on the backtrack stack since the start
1579 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001580 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1581 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001582 if (clear_capture_count_ > 0) {
1583 // Clear any captures that might have been performed during the success
1584 // of the body of the negative look-ahead.
1585 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1586 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1587 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001588 // Now that we have unwound the stack we find at the top of the stack the
1589 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001590 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001591}
1592
1593
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001594void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001595 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001596 trace->Flush(compiler, this);
1597 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001598 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001599 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001600 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001601 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001602 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001603 switch (action_) {
1604 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001605 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001606 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001607 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001608 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001609 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001610 case NEGATIVE_SUBMATCH_SUCCESS:
1611 // This case is handled in a different virtual method.
1612 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001613 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001614 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001615}
1616
1617
1618void GuardedAlternative::AddGuard(Guard* guard) {
1619 if (guards_ == NULL)
1620 guards_ = new ZoneList<Guard*>(1);
1621 guards_->Add(guard);
1622}
1623
1624
ager@chromium.org8bb60582008-12-11 12:02:20 +00001625ActionNode* ActionNode::SetRegister(int reg,
1626 int val,
1627 RegExpNode* on_success) {
1628 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001629 result->data_.u_store_register.reg = reg;
1630 result->data_.u_store_register.value = val;
1631 return result;
1632}
1633
1634
1635ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1636 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1637 result->data_.u_increment_register.reg = reg;
1638 return result;
1639}
1640
1641
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001642ActionNode* ActionNode::StorePosition(int reg,
1643 bool is_capture,
1644 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001645 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1646 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001647 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001648 return result;
1649}
1650
1651
ager@chromium.org32912102009-01-16 10:38:43 +00001652ActionNode* ActionNode::ClearCaptures(Interval range,
1653 RegExpNode* on_success) {
1654 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1655 result->data_.u_clear_captures.range_from = range.from();
1656 result->data_.u_clear_captures.range_to = range.to();
1657 return result;
1658}
1659
1660
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001661ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1662 int position_reg,
1663 RegExpNode* on_success) {
1664 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1665 result->data_.u_submatch.stack_pointer_register = stack_reg;
1666 result->data_.u_submatch.current_position_register = position_reg;
1667 return result;
1668}
1669
1670
ager@chromium.org8bb60582008-12-11 12:02:20 +00001671ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1672 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001673 int clear_register_count,
1674 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001675 RegExpNode* on_success) {
1676 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001677 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001678 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001679 result->data_.u_submatch.clear_register_count = clear_register_count;
1680 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001681 return result;
1682}
1683
1684
ager@chromium.org32912102009-01-16 10:38:43 +00001685ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1686 int repetition_register,
1687 int repetition_limit,
1688 RegExpNode* on_success) {
1689 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1690 result->data_.u_empty_match_check.start_register = start_register;
1691 result->data_.u_empty_match_check.repetition_register = repetition_register;
1692 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1693 return result;
1694}
1695
1696
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001697#define DEFINE_ACCEPT(Type) \
1698 void Type##Node::Accept(NodeVisitor* visitor) { \
1699 visitor->Visit##Type(this); \
1700 }
1701FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1702#undef DEFINE_ACCEPT
1703
1704
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001705void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1706 visitor->VisitLoopChoice(this);
1707}
1708
1709
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001710// -------------------------------------------------------------------
1711// Emit code.
1712
1713
1714void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1715 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001716 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001717 switch (guard->op()) {
1718 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001719 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001720 macro_assembler->IfRegisterGE(guard->reg(),
1721 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001722 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001723 break;
1724 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001725 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001726 macro_assembler->IfRegisterLT(guard->reg(),
1727 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001728 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001729 break;
1730 }
1731}
1732
1733
1734static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1735static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1736
1737
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001738// Only emits non-letters (things that don't have case). Only used for case
1739// independent matches.
1740static inline bool EmitAtomNonLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001741 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001742 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001743 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001744 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001745 bool check,
1746 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001748 int length = uncanonicalize.get(c, '\0', chars);
1749 bool checked = false;
1750 if (length <= 1) {
1751 if (!preloaded) {
1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1753 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001754 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001755 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001756 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001757 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001758}
1759
1760
1761static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001762 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001763 uc16 c1,
1764 uc16 c2,
1765 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001766 uc16 char_mask;
1767 if (ascii) {
1768 char_mask = String::kMaxAsciiCharCode;
1769 } else {
1770 char_mask = String::kMaxUC16CharCode;
1771 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001772 uc16 exor = c1 ^ c2;
1773 // Check whether exor has only one bit set.
1774 if (((exor - 1) & exor) == 0) {
1775 // If c1 and c2 differ only by one bit.
1776 // Ecma262UnCanonicalize always gives the highest number last.
1777 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001778 uc16 mask = char_mask ^ exor;
1779 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001780 return true;
1781 }
1782 ASSERT(c2 > c1);
1783 uc16 diff = c2 - c1;
1784 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1785 // If the characters differ by 2^n but don't differ by one bit then
1786 // subtract the difference from the found character, then do the or
1787 // trick. We avoid the theoretical case where negative numbers are
1788 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001789 uc16 mask = char_mask ^ diff;
1790 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1791 diff,
1792 mask,
1793 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001794 return true;
1795 }
1796 return false;
1797}
1798
1799
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001800// Only emits letters (things that have case). Only used for case independent
1801// matches.
1802static inline bool EmitAtomLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001803 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001804 bool ascii,
1805 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001806 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001807 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001808 bool check,
1809 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001810 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001811 int length = uncanonicalize.get(c, '\0', chars);
1812 if (length <= 1) return false;
1813 // We may not need to check against the end of the input string
1814 // if this character lies before a character that matched.
1815 if (!preloaded) {
1816 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001817 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001818 Label ok;
1819 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1820 switch (length) {
1821 case 2: {
1822 if (ShortCutEmitCharacterPair(macro_assembler,
1823 ascii,
1824 chars[0],
1825 chars[1],
1826 on_failure)) {
1827 } else {
1828 macro_assembler->CheckCharacter(chars[0], &ok);
1829 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1830 macro_assembler->Bind(&ok);
1831 }
1832 break;
1833 }
1834 case 4:
1835 macro_assembler->CheckCharacter(chars[3], &ok);
1836 // Fall through!
1837 case 3:
1838 macro_assembler->CheckCharacter(chars[0], &ok);
1839 macro_assembler->CheckCharacter(chars[1], &ok);
1840 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1841 macro_assembler->Bind(&ok);
1842 break;
1843 default:
1844 UNREACHABLE();
1845 break;
1846 }
1847 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001848}
1849
1850
1851static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1852 RegExpCharacterClass* cc,
1853 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001854 Label* on_failure,
1855 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001856 bool ascii,
1857 bool preloaded) {
1858 if (cc->is_standard() &&
1859 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1860 cp_offset,
1861 check_offset,
1862 on_failure)) {
1863 return;
1864 }
1865
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001866 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001867 int max_char;
1868 if (ascii) {
1869 max_char = String::kMaxAsciiCharCode;
1870 } else {
1871 max_char = String::kMaxUC16CharCode;
1872 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001873
1874 Label success;
1875
1876 Label* char_is_in_class =
1877 cc->is_negated() ? on_failure : &success;
1878
1879 int range_count = ranges->length();
1880
ager@chromium.org8bb60582008-12-11 12:02:20 +00001881 int last_valid_range = range_count - 1;
1882 while (last_valid_range >= 0) {
1883 CharacterRange& range = ranges->at(last_valid_range);
1884 if (range.from() <= max_char) {
1885 break;
1886 }
1887 last_valid_range--;
1888 }
1889
1890 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001891 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001892 // TODO(plesner): We can remove this when the node level does our
1893 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001894 macro_assembler->GoTo(on_failure);
1895 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001896 if (check_offset) {
1897 macro_assembler->CheckPosition(cp_offset, on_failure);
1898 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001899 return;
1900 }
1901
ager@chromium.org8bb60582008-12-11 12:02:20 +00001902 if (last_valid_range == 0 &&
1903 !cc->is_negated() &&
1904 ranges->at(0).IsEverything(max_char)) {
1905 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001906 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001907 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001908 }
1909 return;
1910 }
1911
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001912 if (!preloaded) {
1913 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001914 }
1915
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001916 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001917 CharacterRange& range = ranges->at(i);
1918 Label next_range;
1919 uc16 from = range.from();
1920 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001921 if (from > max_char) {
1922 continue;
1923 }
1924 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001925 if (to == from) {
1926 macro_assembler->CheckCharacter(to, char_is_in_class);
1927 } else {
1928 if (from != 0) {
1929 macro_assembler->CheckCharacterLT(from, &next_range);
1930 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001931 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001932 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1933 } else {
1934 macro_assembler->GoTo(char_is_in_class);
1935 }
1936 }
1937 macro_assembler->Bind(&next_range);
1938 }
1939
ager@chromium.org8bb60582008-12-11 12:02:20 +00001940 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001941 uc16 from = range.from();
1942 uc16 to = range.to();
1943
ager@chromium.org8bb60582008-12-11 12:02:20 +00001944 if (to > max_char) to = max_char;
1945 ASSERT(to >= from);
1946
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001947 if (to == from) {
1948 if (cc->is_negated()) {
1949 macro_assembler->CheckCharacter(to, on_failure);
1950 } else {
1951 macro_assembler->CheckNotCharacter(to, on_failure);
1952 }
1953 } else {
1954 if (from != 0) {
1955 if (cc->is_negated()) {
1956 macro_assembler->CheckCharacterLT(from, &success);
1957 } else {
1958 macro_assembler->CheckCharacterLT(from, on_failure);
1959 }
1960 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001961 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001962 if (cc->is_negated()) {
1963 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1964 } else {
1965 macro_assembler->CheckCharacterGT(to, on_failure);
1966 }
1967 } else {
1968 if (cc->is_negated()) {
1969 macro_assembler->GoTo(on_failure);
1970 }
1971 }
1972 }
1973 macro_assembler->Bind(&success);
1974}
1975
1976
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001977RegExpNode::~RegExpNode() {
1978}
1979
1980
ager@chromium.org8bb60582008-12-11 12:02:20 +00001981RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001982 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001983 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001984 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001985 return CONTINUE;
1986 }
1987
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001988 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001989 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001990 if (label_.is_bound()) {
1991 // We are being asked to generate a generic version, but that's already
1992 // been done so just go to it.
1993 macro_assembler->GoTo(&label_);
1994 return DONE;
1995 }
1996 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1997 // To avoid too deep recursion we push the node to the work queue and just
1998 // generate a goto here.
1999 compiler->AddWork(this);
2000 macro_assembler->GoTo(&label_);
2001 return DONE;
2002 }
2003 // Generate generic version of the node and bind the label for later use.
2004 macro_assembler->Bind(&label_);
2005 return CONTINUE;
2006 }
2007
2008 // We are being asked to make a non-generic version. Keep track of how many
2009 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00002010 trace_count_++;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002011 if (FLAG_irregexp_optimization &&
2012 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002013 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2014 return CONTINUE;
2015 }
2016
ager@chromium.org32912102009-01-16 10:38:43 +00002017 // If we get here code has been generated for this node too many times or
2018 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00002019 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002020 trace->Flush(compiler, this);
2021 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002022}
2023
2024
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002025int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002026 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2027 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002028 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002029}
2030
2031
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002032int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2033 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2034 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2035}
2036
2037
2038int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2039 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2040 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2041}
2042
2043
2044int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002045 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002046 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002047 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002048 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2049 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002050}
2051
2052
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002053int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
2054 int recursion_depth) {
2055 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2056 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2057 // afterwards.
2058 RegExpNode* node = alternatives_->at(1).node();
2059 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
2060}
2061
2062
2063void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2064 QuickCheckDetails* details,
2065 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002066 int filled_in,
2067 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002068 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2069 // afterwards.
2070 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002071 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002072}
2073
2074
2075int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2076 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002077 RegExpNode* ignore_this_node) {
2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2079 int min = 100;
2080 int choice_count = alternatives_->length();
2081 for (int i = 0; i < choice_count; i++) {
2082 RegExpNode* node = alternatives_->at(i).node();
2083 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002084 int node_eats_at_least = node->EatsAtLeast(still_to_find,
2085 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002086 if (node_eats_at_least < min) min = node_eats_at_least;
2087 }
2088 return min;
2089}
2090
2091
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002092int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2093 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002094}
2095
2096
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002097int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2098 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002099}
2100
2101
2102// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2103static inline uint32_t SmearBitsRight(uint32_t v) {
2104 v |= v >> 1;
2105 v |= v >> 2;
2106 v |= v >> 4;
2107 v |= v >> 8;
2108 v |= v >> 16;
2109 return v;
2110}
2111
2112
2113bool QuickCheckDetails::Rationalize(bool asc) {
2114 bool found_useful_op = false;
2115 uint32_t char_mask;
2116 if (asc) {
2117 char_mask = String::kMaxAsciiCharCode;
2118 } else {
2119 char_mask = String::kMaxUC16CharCode;
2120 }
2121 mask_ = 0;
2122 value_ = 0;
2123 int char_shift = 0;
2124 for (int i = 0; i < characters_; i++) {
2125 Position* pos = &positions_[i];
2126 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
2127 found_useful_op = true;
2128 }
2129 mask_ |= (pos->mask & char_mask) << char_shift;
2130 value_ |= (pos->value & char_mask) << char_shift;
2131 char_shift += asc ? 8 : 16;
2132 }
2133 return found_useful_op;
2134}
2135
2136
2137bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002138 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002139 bool preload_has_checked_bounds,
2140 Label* on_possible_success,
2141 QuickCheckDetails* details,
2142 bool fall_through_on_failure) {
2143 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002144 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
2145 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002146 if (!details->Rationalize(compiler->ascii())) return false;
2147 uint32_t mask = details->mask();
2148 uint32_t value = details->value();
2149
2150 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2151
ager@chromium.org32912102009-01-16 10:38:43 +00002152 if (trace->characters_preloaded() != details->characters()) {
2153 assembler->LoadCurrentCharacter(trace->cp_offset(),
2154 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002155 !preload_has_checked_bounds,
2156 details->characters());
2157 }
2158
2159
2160 bool need_mask = true;
2161
2162 if (details->characters() == 1) {
2163 // If number of characters preloaded is 1 then we used a byte or 16 bit
2164 // load so the value is already masked down.
2165 uint32_t char_mask;
2166 if (compiler->ascii()) {
2167 char_mask = String::kMaxAsciiCharCode;
2168 } else {
2169 char_mask = String::kMaxUC16CharCode;
2170 }
2171 if ((mask & char_mask) == char_mask) need_mask = false;
2172 mask &= char_mask;
2173 } else {
2174 // For 2-character preloads in ASCII mode we also use a 16 bit load with
2175 // zero extend.
2176 if (details->characters() == 2 && compiler->ascii()) {
2177 if ((mask & 0xffff) == 0xffff) need_mask = false;
2178 } else {
2179 if (mask == 0xffffffff) need_mask = false;
2180 }
2181 }
2182
2183 if (fall_through_on_failure) {
2184 if (need_mask) {
2185 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2186 } else {
2187 assembler->CheckCharacter(value, on_possible_success);
2188 }
2189 } else {
2190 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002191 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002192 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002193 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002194 }
2195 }
2196 return true;
2197}
2198
2199
2200// Here is the meat of GetQuickCheckDetails (see also the comment on the
2201// super-class in the .h file).
2202//
2203// We iterate along the text object, building up for each character a
2204// mask and value that can be used to test for a quick failure to match.
2205// The masks and values for the positions will be combined into a single
2206// machine word for the current character width in order to be used in
2207// generating a quick check.
2208void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2209 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002210 int characters_filled_in,
2211 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002212 ASSERT(characters_filled_in < details->characters());
2213 int characters = details->characters();
2214 int char_mask;
2215 int char_shift;
2216 if (compiler->ascii()) {
2217 char_mask = String::kMaxAsciiCharCode;
2218 char_shift = 8;
2219 } else {
2220 char_mask = String::kMaxUC16CharCode;
2221 char_shift = 16;
2222 }
2223 for (int k = 0; k < elms_->length(); k++) {
2224 TextElement elm = elms_->at(k);
2225 if (elm.type == TextElement::ATOM) {
2226 Vector<const uc16> quarks = elm.data.u_atom->data();
2227 for (int i = 0; i < characters && i < quarks.length(); i++) {
2228 QuickCheckDetails::Position* pos =
2229 details->positions(characters_filled_in);
2230 if (compiler->ignore_case()) {
2231 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2232 uc16 c = quarks[i];
2233 int length = uncanonicalize.get(c, '\0', chars);
2234 if (length < 2) {
2235 // This letter has no case equivalents, so it's nice and simple
2236 // and the mask-compare will determine definitely whether we have
2237 // a match at this character position.
2238 pos->mask = char_mask;
2239 pos->value = c;
2240 pos->determines_perfectly = true;
2241 } else {
2242 uint32_t common_bits = char_mask;
2243 uint32_t bits = chars[0];
2244 for (int j = 1; j < length; j++) {
2245 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2246 common_bits ^= differing_bits;
2247 bits &= common_bits;
2248 }
2249 // If length is 2 and common bits has only one zero in it then
2250 // our mask and compare instruction will determine definitely
2251 // whether we have a match at this character position. Otherwise
2252 // it can only be an approximate check.
2253 uint32_t one_zero = (common_bits | ~char_mask);
2254 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2255 pos->determines_perfectly = true;
2256 }
2257 pos->mask = common_bits;
2258 pos->value = bits;
2259 }
2260 } else {
2261 // Don't ignore case. Nice simple case where the mask-compare will
2262 // determine definitely whether we have a match at this character
2263 // position.
2264 pos->mask = char_mask;
2265 pos->value = quarks[i];
2266 pos->determines_perfectly = true;
2267 }
2268 characters_filled_in++;
2269 ASSERT(characters_filled_in <= details->characters());
2270 if (characters_filled_in == details->characters()) {
2271 return;
2272 }
2273 }
2274 } else {
2275 QuickCheckDetails::Position* pos =
2276 details->positions(characters_filled_in);
2277 RegExpCharacterClass* tree = elm.data.u_char_class;
2278 ZoneList<CharacterRange>* ranges = tree->ranges();
2279 CharacterRange range = ranges->at(0);
2280 if (tree->is_negated()) {
2281 // A quick check uses multi-character mask and compare. There is no
2282 // useful way to incorporate a negative char class into this scheme
2283 // so we just conservatively create a mask and value that will always
2284 // succeed.
2285 pos->mask = 0;
2286 pos->value = 0;
2287 } else {
2288 uint32_t differing_bits = (range.from() ^ range.to());
2289 // A mask and compare is only perfect if the differing bits form a
2290 // number like 00011111 with one single block of trailing 1s.
2291 if ((differing_bits & (differing_bits + 1)) == 0) {
2292 pos->determines_perfectly = true;
2293 }
2294 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2295 uint32_t bits = (range.from() & common_bits);
2296 for (int i = 1; i < ranges->length(); i++) {
2297 // Here we are combining more ranges into the mask and compare
2298 // value. With each new range the mask becomes more sparse and
2299 // so the chances of a false positive rise. A character class
2300 // with multiple ranges is assumed never to be equivalent to a
2301 // mask and compare operation.
2302 pos->determines_perfectly = false;
2303 CharacterRange range = ranges->at(i);
2304 uint32_t new_common_bits = (range.from() ^ range.to());
2305 new_common_bits = ~SmearBitsRight(new_common_bits);
2306 common_bits &= new_common_bits;
2307 bits &= new_common_bits;
2308 uint32_t differing_bits = (range.from() & common_bits) ^ bits;
2309 common_bits ^= differing_bits;
2310 bits &= common_bits;
2311 }
2312 pos->mask = common_bits;
2313 pos->value = bits;
2314 }
2315 characters_filled_in++;
2316 ASSERT(characters_filled_in <= details->characters());
2317 if (characters_filled_in == details->characters()) {
2318 return;
2319 }
2320 }
2321 }
2322 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00002323 on_success()-> GetQuickCheckDetails(details,
2324 compiler,
2325 characters_filled_in,
2326 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002327}
2328
2329
2330void QuickCheckDetails::Clear() {
2331 for (int i = 0; i < characters_; i++) {
2332 positions_[i].mask = 0;
2333 positions_[i].value = 0;
2334 positions_[i].determines_perfectly = false;
2335 }
2336 characters_ = 0;
2337}
2338
2339
2340void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002341 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002342 if (by >= characters_) {
2343 Clear();
2344 return;
2345 }
2346 for (int i = 0; i < characters_ - by; i++) {
2347 positions_[i] = positions_[by + i];
2348 }
2349 for (int i = characters_ - by; i < characters_; i++) {
2350 positions_[i].mask = 0;
2351 positions_[i].value = 0;
2352 positions_[i].determines_perfectly = false;
2353 }
2354 characters_ -= by;
2355 // We could change mask_ and value_ here but we would never advance unless
2356 // they had already been used in a check and they won't be used again because
2357 // it would gain us nothing. So there's no point.
2358}
2359
2360
2361void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2362 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002363 if (other->cannot_match_) {
2364 return;
2365 }
2366 if (cannot_match_) {
2367 *this = *other;
2368 return;
2369 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002370 for (int i = from_index; i < characters_; i++) {
2371 QuickCheckDetails::Position* pos = positions(i);
2372 QuickCheckDetails::Position* other_pos = other->positions(i);
2373 if (pos->mask != other_pos->mask ||
2374 pos->value != other_pos->value ||
2375 !other_pos->determines_perfectly) {
2376 // Our mask-compare operation will be approximate unless we have the
2377 // exact same operation on both sides of the alternation.
2378 pos->determines_perfectly = false;
2379 }
2380 pos->mask &= other_pos->mask;
2381 pos->value &= pos->mask;
2382 other_pos->value &= pos->mask;
2383 uc16 differing_bits = (pos->value ^ other_pos->value);
2384 pos->mask &= ~differing_bits;
2385 pos->value &= pos->mask;
2386 }
2387}
2388
2389
ager@chromium.org32912102009-01-16 10:38:43 +00002390class VisitMarker {
2391 public:
2392 explicit VisitMarker(NodeInfo* info) : info_(info) {
2393 ASSERT(!info->visited);
2394 info->visited = true;
2395 }
2396 ~VisitMarker() {
2397 info_->visited = false;
2398 }
2399 private:
2400 NodeInfo* info_;
2401};
2402
2403
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002404void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2405 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002406 int characters_filled_in,
2407 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002408 if (body_can_be_zero_length_ || info()->visited) return;
2409 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002410 return ChoiceNode::GetQuickCheckDetails(details,
2411 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002412 characters_filled_in,
2413 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002414}
2415
2416
2417void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2418 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002419 int characters_filled_in,
2420 bool not_at_start) {
2421 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002422 int choice_count = alternatives_->length();
2423 ASSERT(choice_count > 0);
2424 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2425 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002426 characters_filled_in,
2427 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002428 for (int i = 1; i < choice_count; i++) {
2429 QuickCheckDetails new_details(details->characters());
2430 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002431 node->GetQuickCheckDetails(&new_details, compiler,
2432 characters_filled_in,
2433 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002434 // Here we merge the quick match details of the two branches.
2435 details->Merge(&new_details, characters_filled_in);
2436 }
2437}
2438
2439
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002440// Check for [0-9A-Z_a-z].
2441static void EmitWordCheck(RegExpMacroAssembler* assembler,
2442 Label* word,
2443 Label* non_word,
2444 bool fall_through_on_word) {
2445 assembler->CheckCharacterGT('z', non_word);
2446 assembler->CheckCharacterLT('0', non_word);
2447 assembler->CheckCharacterGT('a' - 1, word);
2448 assembler->CheckCharacterLT('9' + 1, word);
2449 assembler->CheckCharacterLT('A', non_word);
2450 assembler->CheckCharacterLT('Z' + 1, word);
2451 if (fall_through_on_word) {
2452 assembler->CheckNotCharacter('_', non_word);
2453 } else {
2454 assembler->CheckCharacter('_', word);
2455 }
2456}
2457
2458
2459// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2460// that matches newline or the start of input).
2461static void EmitHat(RegExpCompiler* compiler,
2462 RegExpNode* on_success,
2463 Trace* trace) {
2464 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2465 // We will be loading the previous character into the current character
2466 // register.
2467 Trace new_trace(*trace);
2468 new_trace.InvalidateCurrentCharacter();
2469
2470 Label ok;
2471 if (new_trace.cp_offset() == 0) {
2472 // The start of input counts as a newline in this context, so skip to
2473 // ok if we are at the start.
2474 assembler->CheckAtStart(&ok);
2475 }
2476 // We already checked that we are not at the start of input so it must be
2477 // OK to load the previous character.
2478 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2479 new_trace.backtrack(),
2480 false);
2481 // Newline means \n, \r, 0x2028 or 0x2029.
2482 if (!compiler->ascii()) {
2483 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2484 }
2485 assembler->CheckCharacter('\n', &ok);
2486 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2487 assembler->Bind(&ok);
2488 on_success->Emit(compiler, &new_trace);
2489}
2490
2491
2492// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2493static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2494 RegExpCompiler* compiler,
2495 RegExpNode* on_success,
2496 Trace* trace) {
2497 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2498 Label before_non_word;
2499 Label before_word;
2500 if (trace->characters_preloaded() != 1) {
2501 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2502 }
2503 // Fall through on non-word.
2504 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2505
2506 // We will be loading the previous character into the current character
2507 // register.
2508 Trace new_trace(*trace);
2509 new_trace.InvalidateCurrentCharacter();
2510
2511 Label ok;
2512 Label* boundary;
2513 Label* not_boundary;
2514 if (type == AssertionNode::AT_BOUNDARY) {
2515 boundary = &ok;
2516 not_boundary = new_trace.backtrack();
2517 } else {
2518 not_boundary = &ok;
2519 boundary = new_trace.backtrack();
2520 }
2521
2522 // Next character is not a word character.
2523 assembler->Bind(&before_non_word);
2524 if (new_trace.cp_offset() == 0) {
2525 // The start of input counts as a non-word character, so the question is
2526 // decided if we are at the start.
2527 assembler->CheckAtStart(not_boundary);
2528 }
2529 // We already checked that we are not at the start of input so it must be
2530 // OK to load the previous character.
2531 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2532 &ok, // Unused dummy label in this call.
2533 false);
2534 // Fall through on non-word.
2535 EmitWordCheck(assembler, boundary, not_boundary, false);
2536 assembler->GoTo(not_boundary);
2537
2538 // Next character is a word character.
2539 assembler->Bind(&before_word);
2540 if (new_trace.cp_offset() == 0) {
2541 // The start of input counts as a non-word character, so the question is
2542 // decided if we are at the start.
2543 assembler->CheckAtStart(boundary);
2544 }
2545 // We already checked that we are not at the start of input so it must be
2546 // OK to load the previous character.
2547 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2548 &ok, // Unused dummy label in this call.
2549 false);
2550 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2551 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2552
2553 assembler->Bind(&ok);
2554
2555 on_success->Emit(compiler, &new_trace);
2556}
2557
2558
iposva@chromium.org245aa852009-02-10 00:49:54 +00002559void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2560 RegExpCompiler* compiler,
2561 int filled_in,
2562 bool not_at_start) {
2563 if (type_ == AT_START && not_at_start) {
2564 details->set_cannot_match();
2565 return;
2566 }
2567 return on_success()->GetQuickCheckDetails(details,
2568 compiler,
2569 filled_in,
2570 not_at_start);
2571}
2572
2573
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002574void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2575 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2576 switch (type_) {
2577 case AT_END: {
2578 Label ok;
2579 assembler->CheckPosition(trace->cp_offset(), &ok);
2580 assembler->GoTo(trace->backtrack());
2581 assembler->Bind(&ok);
2582 break;
2583 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002584 case AT_START: {
2585 if (trace->at_start() == Trace::FALSE) {
2586 assembler->GoTo(trace->backtrack());
2587 return;
2588 }
2589 if (trace->at_start() == Trace::UNKNOWN) {
2590 assembler->CheckNotAtStart(trace->backtrack());
2591 Trace at_start_trace = *trace;
2592 at_start_trace.set_at_start(true);
2593 on_success()->Emit(compiler, &at_start_trace);
2594 return;
2595 }
2596 }
2597 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002598 case AFTER_NEWLINE:
2599 EmitHat(compiler, on_success(), trace);
2600 return;
2601 case AT_NON_BOUNDARY:
2602 case AT_BOUNDARY:
2603 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2604 return;
2605 }
2606 on_success()->Emit(compiler, trace);
2607}
2608
2609
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002610// We call this repeatedly to generate code for each pass over the text node.
2611// The passes are in increasing order of difficulty because we hope one
2612// of the first passes will fail in which case we are saved the work of the
2613// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2614// we will check the '%' in the first pass, the case independent 'a' in the
2615// second pass and the character class in the last pass.
2616//
2617// The passes are done from right to left, so for example to test for /bar/
2618// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2619// and then a 'b' with offset 0. This means we can avoid the end-of-input
2620// bounds check most of the time. In the example we only need to check for
2621// end-of-input when loading the putative 'r'.
2622//
2623// A slight complication involves the fact that the first character may already
2624// be fetched into a register by the previous node. In this case we want to
2625// do the test for that character first. We do this in separate passes. The
2626// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2627// pass has been performed then subsequent passes will have true in
2628// first_element_checked to indicate that that character does not need to be
2629// checked again.
2630//
ager@chromium.org32912102009-01-16 10:38:43 +00002631// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002632// contain an AlternativeGeneration object. In this AlternativeGeneration
2633// object we can see details of any quick check that was already passed in
2634// order to get to the code we are now generating. The quick check can involve
2635// loading characters, which means we do not need to recheck the bounds
2636// up to the limit the quick check already checked. In addition the quick
2637// check can have involved a mask and compare operation which may simplify
2638// or obviate the need for further checks at some character positions.
2639void TextNode::TextEmitPass(RegExpCompiler* compiler,
2640 TextEmitPassType pass,
2641 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002642 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002643 bool first_element_checked,
2644 int* checked_up_to) {
2645 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2646 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002647 Label* backtrack = trace->backtrack();
2648 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002649 int element_count = elms_->length();
2650 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2651 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002652 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002653 if (elm.type == TextElement::ATOM) {
2654 if (pass == NON_ASCII_MATCH ||
2655 pass == CHARACTER_MATCH ||
2656 pass == CASE_CHARACTER_MATCH) {
2657 Vector<const uc16> quarks = elm.data.u_atom->data();
2658 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2659 bool bound_checked = true; // Most ops will check their bounds.
2660 if (first_element_checked && i == 0 && j == 0) continue;
2661 if (quick_check != NULL &&
2662 elm.cp_offset + j < quick_check->characters() &&
2663 quick_check->positions(elm.cp_offset + j)->determines_perfectly) {
2664 continue;
2665 }
2666 if (pass == NON_ASCII_MATCH) {
2667 ASSERT(ascii);
2668 if (quarks[j] > String::kMaxAsciiCharCode) {
2669 assembler->GoTo(backtrack);
2670 return;
2671 }
2672 } else if (pass == CHARACTER_MATCH) {
2673 if (compiler->ignore_case()) {
2674 bound_checked = EmitAtomNonLetter(assembler,
2675 quarks[j],
2676 backtrack,
2677 cp_offset + j,
2678 *checked_up_to < cp_offset + j,
2679 preloaded);
2680 } else {
2681 if (!preloaded) {
2682 assembler->LoadCurrentCharacter(cp_offset + j,
2683 backtrack,
2684 *checked_up_to < cp_offset + j);
2685 }
2686 assembler->CheckNotCharacter(quarks[j], backtrack);
2687 }
2688 } else {
2689 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
2690 ASSERT(compiler->ignore_case());
2691 bound_checked = EmitAtomLetter(assembler,
2692 compiler->ascii(),
2693 quarks[j],
2694 backtrack,
2695 cp_offset + j,
2696 *checked_up_to < cp_offset + j,
2697 preloaded);
2698 }
2699 if (pass != NON_ASCII_MATCH && bound_checked) {
2700 if (cp_offset + j > *checked_up_to) {
2701 *checked_up_to = cp_offset + j;
2702 }
2703 }
2704 }
2705 }
2706 } else {
2707 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2708 if (first_element_checked && i == 0) continue;
2709 if (quick_check != NULL &&
2710 elm.cp_offset < quick_check->characters() &&
2711 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2712 continue;
2713 }
2714 if (pass == CHARACTER_CLASS_MATCH) {
2715 RegExpCharacterClass* cc = elm.data.u_char_class;
2716 EmitCharClass(assembler,
2717 cc,
2718 cp_offset,
2719 backtrack,
2720 *checked_up_to < cp_offset,
2721 ascii,
2722 preloaded);
2723 if (cp_offset > *checked_up_to) {
2724 *checked_up_to = cp_offset;
2725 }
2726 }
2727 }
2728 }
2729}
2730
2731
2732int TextNode::Length() {
2733 TextElement elm = elms_->last();
2734 ASSERT(elm.cp_offset >= 0);
2735 if (elm.type == TextElement::ATOM) {
2736 return elm.cp_offset + elm.data.u_atom->data().length();
2737 } else {
2738 return elm.cp_offset + 1;
2739 }
2740}
2741
2742
ager@chromium.org8bb60582008-12-11 12:02:20 +00002743// This generates the code to match a text node. A text node can contain
2744// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002745// way) and character classes. For efficiency we do not do this in a single
2746// pass from left to right. Instead we pass over the text node several times,
2747// emitting code for some character positions every time. See the comment on
2748// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002749void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002750 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002751 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002752 ASSERT(limit_result == CONTINUE);
2753
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002754 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2755 compiler->SetRegExpTooBig();
2756 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002757 }
2758
2759 if (compiler->ascii()) {
2760 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002761 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002762 }
2763
2764 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002765 int bound_checked_to = trace->cp_offset() - 1;
2766 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002767
2768 // If a character is preloaded into the current character register then
2769 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002770 if (trace->characters_preloaded() == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002771 TextEmitPass(compiler,
2772 CHARACTER_MATCH,
2773 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002774 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002775 false,
2776 &bound_checked_to);
2777 if (compiler->ignore_case()) {
2778 TextEmitPass(compiler,
2779 CASE_CHARACTER_MATCH,
2780 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002781 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002782 false,
2783 &bound_checked_to);
2784 }
2785 TextEmitPass(compiler,
2786 CHARACTER_CLASS_MATCH,
2787 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002788 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002789 false,
2790 &bound_checked_to);
2791 first_elt_done = true;
2792 }
2793
2794 TextEmitPass(compiler,
2795 CHARACTER_MATCH,
2796 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002797 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002798 first_elt_done,
2799 &bound_checked_to);
2800 if (compiler->ignore_case()) {
2801 TextEmitPass(compiler,
2802 CASE_CHARACTER_MATCH,
2803 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002804 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002805 first_elt_done,
2806 &bound_checked_to);
2807 }
2808 TextEmitPass(compiler,
2809 CHARACTER_CLASS_MATCH,
2810 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002811 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002812 first_elt_done,
2813 &bound_checked_to);
2814
ager@chromium.org32912102009-01-16 10:38:43 +00002815 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002816 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002817 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002818 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002819 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002820}
2821
2822
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002823void Trace::InvalidateCurrentCharacter() {
2824 characters_preloaded_ = 0;
2825}
2826
2827
2828void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002829 ASSERT(by > 0);
2830 // We don't have an instruction for shifting the current character register
2831 // down or for using a shifted value for anything so lets just forget that
2832 // we preloaded any characters into it.
2833 characters_preloaded_ = 0;
2834 // Adjust the offsets of the quick check performed information. This
2835 // information is used to find out what we already determined about the
2836 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002837 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002838 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002839 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2840 compiler->SetRegExpTooBig();
2841 cp_offset_ = 0;
2842 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002843 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002844}
2845
2846
2847void TextNode::MakeCaseIndependent() {
2848 int element_count = elms_->length();
2849 for (int i = 0; i < element_count; i++) {
2850 TextElement elm = elms_->at(i);
2851 if (elm.type == TextElement::CHAR_CLASS) {
2852 RegExpCharacterClass* cc = elm.data.u_char_class;
2853 ZoneList<CharacterRange>* ranges = cc->ranges();
2854 int range_count = ranges->length();
2855 for (int i = 0; i < range_count; i++) {
2856 ranges->at(i).AddCaseEquivalents(ranges);
2857 }
2858 }
2859 }
2860}
2861
2862
ager@chromium.org8bb60582008-12-11 12:02:20 +00002863int TextNode::GreedyLoopTextLength() {
2864 TextElement elm = elms_->at(elms_->length() - 1);
2865 if (elm.type == TextElement::CHAR_CLASS) {
2866 return elm.cp_offset + 1;
2867 } else {
2868 return elm.cp_offset + elm.data.u_atom->data().length();
2869 }
2870}
2871
2872
2873// Finds the fixed match length of a sequence of nodes that goes from
2874// this alternative and back to this choice node. If there are variable
2875// length nodes or other complications in the way then return a sentinel
2876// value indicating that a greedy loop cannot be constructed.
2877int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2878 int length = 0;
2879 RegExpNode* node = alternative->node();
2880 // Later we will generate code for all these text nodes using recursion
2881 // so we have to limit the max number.
2882 int recursion_depth = 0;
2883 while (node != this) {
2884 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2885 return kNodeIsTooComplexForGreedyLoops;
2886 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002887 int node_length = node->GreedyLoopTextLength();
2888 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2889 return kNodeIsTooComplexForGreedyLoops;
2890 }
2891 length += node_length;
2892 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2893 node = seq_node->on_success();
2894 }
2895 return length;
2896}
2897
2898
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002899void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2900 ASSERT_EQ(loop_node_, NULL);
2901 AddAlternative(alt);
2902 loop_node_ = alt.node();
2903}
2904
2905
2906void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2907 ASSERT_EQ(continue_node_, NULL);
2908 AddAlternative(alt);
2909 continue_node_ = alt.node();
2910}
2911
2912
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002913void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002914 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002915 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002916 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2917 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2918 // Update the counter-based backtracking info on the stack. This is an
2919 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002920 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002921 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002922 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002923 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002924 }
ager@chromium.org32912102009-01-16 10:38:43 +00002925 ASSERT(trace->stop_node() == NULL);
2926 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002927 trace->Flush(compiler, this);
2928 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002929 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002930 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002931}
2932
2933
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002934int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002935 int preload_characters = EatsAtLeast(4, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002936#ifdef CAN_READ_UNALIGNED
2937 bool ascii = compiler->ascii();
2938 if (ascii) {
2939 if (preload_characters > 4) preload_characters = 4;
2940 // We can't preload 3 characters because there is no machine instruction
2941 // to do that. We can't just load 4 because we could be reading
2942 // beyond the end of the string, which could cause a memory fault.
2943 if (preload_characters == 3) preload_characters = 2;
2944 } else {
2945 if (preload_characters > 2) preload_characters = 2;
2946 }
2947#else
2948 if (preload_characters > 1) preload_characters = 1;
2949#endif
2950 return preload_characters;
2951}
2952
2953
2954// This class is used when generating the alternatives in a choice node. It
2955// records the way the alternative is being code generated.
2956class AlternativeGeneration: public Malloced {
2957 public:
2958 AlternativeGeneration()
2959 : possible_success(),
2960 expects_preload(false),
2961 after(),
2962 quick_check_details() { }
2963 Label possible_success;
2964 bool expects_preload;
2965 Label after;
2966 QuickCheckDetails quick_check_details;
2967};
2968
2969
2970// Creates a list of AlternativeGenerations. If the list has a reasonable
2971// size then it is on the stack, otherwise the excess is on the heap.
2972class AlternativeGenerationList {
2973 public:
2974 explicit AlternativeGenerationList(int count)
2975 : alt_gens_(count) {
2976 for (int i = 0; i < count && i < kAFew; i++) {
2977 alt_gens_.Add(a_few_alt_gens_ + i);
2978 }
2979 for (int i = kAFew; i < count; i++) {
2980 alt_gens_.Add(new AlternativeGeneration());
2981 }
2982 }
2983 ~AlternativeGenerationList() {
2984 for (int i = 0; i < alt_gens_.length(); i++) {
2985 alt_gens_[i]->possible_success.Unuse();
2986 alt_gens_[i]->after.Unuse();
2987 }
2988 for (int i = kAFew; i < alt_gens_.length(); i++) {
2989 delete alt_gens_[i];
2990 alt_gens_[i] = NULL;
2991 }
2992 }
2993
2994 AlternativeGeneration* at(int i) {
2995 return alt_gens_[i];
2996 }
2997 private:
2998 static const int kAFew = 10;
2999 ZoneList<AlternativeGeneration*> alt_gens_;
3000 AlternativeGeneration a_few_alt_gens_[kAFew];
3001};
3002
3003
3004/* Code generation for choice nodes.
3005 *
3006 * We generate quick checks that do a mask and compare to eliminate a
3007 * choice. If the quick check succeeds then it jumps to the continuation to
3008 * do slow checks and check subsequent nodes. If it fails (the common case)
3009 * it falls through to the next choice.
3010 *
3011 * Here is the desired flow graph. Nodes directly below each other imply
3012 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3013 * 3 doesn't have a quick check so we have to call the slow check.
3014 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3015 * regexp continuation is generated directly after the Sn node, up to the
3016 * next GoTo if we decide to reuse some already generated code. Some
3017 * nodes expect preload_characters to be preloaded into the current
3018 * character register. R nodes do this preloading. Vertices are marked
3019 * F for failures and S for success (possible success in the case of quick
3020 * nodes). L, V, < and > are used as arrow heads.
3021 *
3022 * ----------> R
3023 * |
3024 * V
3025 * Q1 -----> S1
3026 * | S /
3027 * F| /
3028 * | F/
3029 * | /
3030 * | R
3031 * | /
3032 * V L
3033 * Q2 -----> S2
3034 * | S /
3035 * F| /
3036 * | F/
3037 * | /
3038 * | R
3039 * | /
3040 * V L
3041 * S3
3042 * |
3043 * F|
3044 * |
3045 * R
3046 * |
3047 * backtrack V
3048 * <----------Q4
3049 * \ F |
3050 * \ |S
3051 * \ F V
3052 * \-----S4
3053 *
3054 * For greedy loops we reverse our expectation and expect to match rather
3055 * than fail. Therefore we want the loop code to look like this (U is the
3056 * unwind code that steps back in the greedy loop). The following alternatives
3057 * look the same as above.
3058 * _____
3059 * / \
3060 * V |
3061 * ----------> S1 |
3062 * /| |
3063 * / |S |
3064 * F/ \_____/
3065 * /
3066 * |<-----------
3067 * | \
3068 * V \
3069 * Q2 ---> S2 \
3070 * | S / |
3071 * F| / |
3072 * | F/ |
3073 * | / |
3074 * | R |
3075 * | / |
3076 * F VL |
3077 * <------U |
3078 * back |S |
3079 * \______________/
3080 */
3081
3082
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003083void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003084 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3085 int choice_count = alternatives_->length();
3086#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003087 for (int i = 0; i < choice_count - 1; i++) {
3088 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003089 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003090 int guard_count = (guards == NULL) ? 0 : guards->length();
3091 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003092 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003093 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003094 }
3095#endif
3096
ager@chromium.org32912102009-01-16 10:38:43 +00003097 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003098 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003099 ASSERT(limit_result == CONTINUE);
3100
3101 RecursionCheck rc(compiler);
3102
ager@chromium.org32912102009-01-16 10:38:43 +00003103 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003104
3105 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
3106 bool greedy_loop = false;
3107 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00003108 Trace counter_backtrack_trace;
3109 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003110 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3111
ager@chromium.org8bb60582008-12-11 12:02:20 +00003112 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3113 // Here we have special handling for greedy loops containing only text nodes
3114 // and other simple nodes. These are handled by pushing the current
3115 // position on the stack and then incrementing the current position each
3116 // time around the switch. On backtrack we decrement the current position
3117 // and check it against the pushed value. This avoids pushing backtrack
3118 // information for each iteration of the loop, which could take up a lot of
3119 // space.
3120 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00003121 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003122 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00003123 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003124 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00003125 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003126 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00003127 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003128 Label loop_label;
3129 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00003130 greedy_match_trace.set_stop_node(this);
3131 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003132 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003133 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003134 }
3135
3136 Label second_choice; // For use in greedy matches.
3137 macro_assembler->Bind(&second_choice);
3138
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003139 int first_normal_choice = greedy_loop ? 1 : 0;
3140
3141 int preload_characters = CalculatePreloadCharacters(compiler);
3142 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00003143 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003144 bool preload_has_checked_bounds = preload_is_current;
3145
3146 AlternativeGenerationList alt_gens(choice_count);
3147
ager@chromium.org8bb60582008-12-11 12:02:20 +00003148 // For now we just call all choices one after the other. The idea ultimately
3149 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003150 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003151 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00003152 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003153 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003154 ZoneList<Guard*>* guards = alternative.guards();
3155 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00003156 Trace new_trace(*current_trace);
3157 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003158 preload_characters :
3159 0);
3160 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00003161 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003162 }
ager@chromium.org32912102009-01-16 10:38:43 +00003163 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00003164 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003165 alt_gen->expects_preload = preload_is_current;
3166 bool generate_full_check_inline = false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003167 if (FLAG_irregexp_optimization &&
3168 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003169 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003170 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003171 preload_has_checked_bounds,
3172 &alt_gen->possible_success,
3173 &alt_gen->quick_check_details,
3174 i < choice_count - 1)) {
3175 // Quick check was generated for this choice.
3176 preload_is_current = true;
3177 preload_has_checked_bounds = true;
3178 // On the last choice in the ChoiceNode we generated the quick
3179 // check to fall through on possible success. So now we need to
3180 // generate the full check inline.
3181 if (i == choice_count - 1) {
3182 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003183 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3184 new_trace.set_characters_preloaded(preload_characters);
3185 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003186 generate_full_check_inline = true;
3187 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00003188 } else if (alt_gen->quick_check_details.cannot_match()) {
3189 if (i == choice_count - 1 && !greedy_loop) {
3190 macro_assembler->GoTo(trace->backtrack());
3191 }
3192 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003193 } else {
3194 // No quick check was generated. Put the full code here.
3195 // If this is not the first choice then there could be slow checks from
3196 // previous cases that go here when they fail. There's no reason to
3197 // insist that they preload characters since the slow check we are about
3198 // to generate probably can't use it.
3199 if (i != first_normal_choice) {
3200 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00003201 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003202 }
3203 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00003204 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003205 }
3206 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003207 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003208 if (generate_full_check_inline) {
3209 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003210 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003211 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003212 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003213 preload_is_current = false;
3214 }
3215 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003216 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003217 if (greedy_loop) {
3218 macro_assembler->Bind(&greedy_loop_label);
3219 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00003220 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00003221 // Otherwise try the second priority at an earlier position.
3222 macro_assembler->AdvanceCurrentPosition(-text_length);
3223 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003224 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003225 // At this point we need to generate slow checks for the alternatives where
3226 // the quick check was inlined. We can recognize these because the associated
3227 // label was bound.
3228 for (int i = first_normal_choice; i < choice_count - 1; i++) {
3229 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003230 EmitOutOfLineContinuation(compiler,
3231 current_trace,
3232 alternatives_->at(i),
3233 alt_gen,
3234 preload_characters,
3235 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003236 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003237}
3238
3239
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003240void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003241 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003242 GuardedAlternative alternative,
3243 AlternativeGeneration* alt_gen,
3244 int preload_characters,
3245 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003246 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003247
3248 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3249 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003250 Trace out_of_line_trace(*trace);
3251 out_of_line_trace.set_characters_preloaded(preload_characters);
3252 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003253 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003254 ZoneList<Guard*>* guards = alternative.guards();
3255 int guard_count = (guards == NULL) ? 0 : guards->length();
3256 if (next_expects_preload) {
3257 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00003258 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003259 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003260 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003261 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003262 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003263 macro_assembler->Bind(&reload_current_char);
3264 // Reload the current character, since the next quick check expects that.
3265 // We don't need to check bounds here because we only get into this
3266 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00003267 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003268 NULL,
3269 false,
3270 preload_characters);
3271 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003272 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003273 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003274 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003275 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003276 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003277 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003278 }
3279}
3280
3281
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003282void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003283 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003284 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003285 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003286 ASSERT(limit_result == CONTINUE);
3287
3288 RecursionCheck rc(compiler);
3289
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003290 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003291 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00003292 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003293 new_capture(data_.u_position_register.reg,
3294 data_.u_position_register.is_capture,
3295 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003296 Trace new_trace = *trace;
3297 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003298 on_success()->Emit(compiler, &new_trace);
3299 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003300 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003301 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003302 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003303 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00003304 Trace new_trace = *trace;
3305 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003306 on_success()->Emit(compiler, &new_trace);
3307 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003308 }
3309 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003310 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003311 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00003312 Trace new_trace = *trace;
3313 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003314 on_success()->Emit(compiler, &new_trace);
3315 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003316 }
3317 case CLEAR_CAPTURES: {
3318 Trace::DeferredClearCaptures
3319 new_capture(Interval(data_.u_clear_captures.range_from,
3320 data_.u_clear_captures.range_to));
3321 Trace new_trace = *trace;
3322 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003323 on_success()->Emit(compiler, &new_trace);
3324 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003325 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003326 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003327 if (!trace->is_trivial()) {
3328 trace->Flush(compiler, this);
3329 } else {
3330 assembler->WriteCurrentPositionToRegister(
3331 data_.u_submatch.current_position_register, 0);
3332 assembler->WriteStackPointerToRegister(
3333 data_.u_submatch.stack_pointer_register);
3334 on_success()->Emit(compiler, trace);
3335 }
3336 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003337 case EMPTY_MATCH_CHECK: {
3338 int start_pos_reg = data_.u_empty_match_check.start_register;
3339 int stored_pos = 0;
3340 int rep_reg = data_.u_empty_match_check.repetition_register;
3341 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3342 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3343 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3344 // If we know we haven't advanced and there is no minimum we
3345 // can just backtrack immediately.
3346 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00003347 } else if (know_dist && stored_pos < trace->cp_offset()) {
3348 // If we know we've advanced we can generate the continuation
3349 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003350 on_success()->Emit(compiler, trace);
3351 } else if (!trace->is_trivial()) {
3352 trace->Flush(compiler, this);
3353 } else {
3354 Label skip_empty_check;
3355 // If we have a minimum number of repetitions we check the current
3356 // number first and skip the empty check if it's not enough.
3357 if (has_minimum) {
3358 int limit = data_.u_empty_match_check.repetition_limit;
3359 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3360 }
3361 // If the match is empty we bail out, otherwise we fall through
3362 // to the on-success continuation.
3363 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3364 trace->backtrack());
3365 assembler->Bind(&skip_empty_check);
3366 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003367 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003368 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003369 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003370 case POSITIVE_SUBMATCH_SUCCESS: {
3371 if (!trace->is_trivial()) {
3372 trace->Flush(compiler, this);
3373 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003374 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003375 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003376 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003377 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003378 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003379 int clear_register_count = data_.u_submatch.clear_register_count;
3380 if (clear_register_count == 0) {
3381 on_success()->Emit(compiler, trace);
3382 return;
3383 }
3384 int clear_registers_from = data_.u_submatch.clear_register_from;
3385 Label clear_registers_backtrack;
3386 Trace new_trace = *trace;
3387 new_trace.set_backtrack(&clear_registers_backtrack);
3388 on_success()->Emit(compiler, &new_trace);
3389
3390 assembler->Bind(&clear_registers_backtrack);
3391 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3392 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3393
3394 ASSERT(trace->backtrack() == NULL);
3395 assembler->Backtrack();
3396 return;
3397 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003398 default:
3399 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003400 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003401}
3402
3403
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003404void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003405 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003406 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003407 trace->Flush(compiler, this);
3408 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003409 }
3410
ager@chromium.org32912102009-01-16 10:38:43 +00003411 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003412 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003413 ASSERT(limit_result == CONTINUE);
3414
3415 RecursionCheck rc(compiler);
3416
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003417 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003418 if (compiler->ignore_case()) {
3419 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3420 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003421 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003422 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003423 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003424 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003425}
3426
3427
3428// -------------------------------------------------------------------
3429// Dot/dotty output
3430
3431
3432#ifdef DEBUG
3433
3434
3435class DotPrinter: public NodeVisitor {
3436 public:
3437 explicit DotPrinter(bool ignore_case)
3438 : ignore_case_(ignore_case),
3439 stream_(&alloc_) { }
3440 void PrintNode(const char* label, RegExpNode* node);
3441 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003442 void PrintAttributes(RegExpNode* from);
3443 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003444 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003445#define DECLARE_VISIT(Type) \
3446 virtual void Visit##Type(Type##Node* that);
3447FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3448#undef DECLARE_VISIT
3449 private:
3450 bool ignore_case_;
3451 HeapStringAllocator alloc_;
3452 StringStream stream_;
3453};
3454
3455
3456void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3457 stream()->Add("digraph G {\n graph [label=\"");
3458 for (int i = 0; label[i]; i++) {
3459 switch (label[i]) {
3460 case '\\':
3461 stream()->Add("\\\\");
3462 break;
3463 case '"':
3464 stream()->Add("\"");
3465 break;
3466 default:
3467 stream()->Put(label[i]);
3468 break;
3469 }
3470 }
3471 stream()->Add("\"];\n");
3472 Visit(node);
3473 stream()->Add("}\n");
3474 printf("%s", *(stream()->ToCString()));
3475}
3476
3477
3478void DotPrinter::Visit(RegExpNode* node) {
3479 if (node->info()->visited) return;
3480 node->info()->visited = true;
3481 node->Accept(this);
3482}
3483
3484
3485void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003486 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3487 Visit(on_failure);
3488}
3489
3490
3491class TableEntryBodyPrinter {
3492 public:
3493 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3494 : stream_(stream), choice_(choice) { }
3495 void Call(uc16 from, DispatchTable::Entry entry) {
3496 OutSet* out_set = entry.out_set();
3497 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3498 if (out_set->Get(i)) {
3499 stream()->Add(" n%p:s%io%i -> n%p;\n",
3500 choice(),
3501 from,
3502 i,
3503 choice()->alternatives()->at(i).node());
3504 }
3505 }
3506 }
3507 private:
3508 StringStream* stream() { return stream_; }
3509 ChoiceNode* choice() { return choice_; }
3510 StringStream* stream_;
3511 ChoiceNode* choice_;
3512};
3513
3514
3515class TableEntryHeaderPrinter {
3516 public:
3517 explicit TableEntryHeaderPrinter(StringStream* stream)
3518 : first_(true), stream_(stream) { }
3519 void Call(uc16 from, DispatchTable::Entry entry) {
3520 if (first_) {
3521 first_ = false;
3522 } else {
3523 stream()->Add("|");
3524 }
3525 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3526 OutSet* out_set = entry.out_set();
3527 int priority = 0;
3528 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3529 if (out_set->Get(i)) {
3530 if (priority > 0) stream()->Add("|");
3531 stream()->Add("<s%io%i> %i", from, i, priority);
3532 priority++;
3533 }
3534 }
3535 stream()->Add("}}");
3536 }
3537 private:
3538 bool first_;
3539 StringStream* stream() { return stream_; }
3540 StringStream* stream_;
3541};
3542
3543
3544class AttributePrinter {
3545 public:
3546 explicit AttributePrinter(DotPrinter* out)
3547 : out_(out), first_(true) { }
3548 void PrintSeparator() {
3549 if (first_) {
3550 first_ = false;
3551 } else {
3552 out_->stream()->Add("|");
3553 }
3554 }
3555 void PrintBit(const char* name, bool value) {
3556 if (!value) return;
3557 PrintSeparator();
3558 out_->stream()->Add("{%s}", name);
3559 }
3560 void PrintPositive(const char* name, int value) {
3561 if (value < 0) return;
3562 PrintSeparator();
3563 out_->stream()->Add("{%s|%x}", name, value);
3564 }
3565 private:
3566 DotPrinter* out_;
3567 bool first_;
3568};
3569
3570
3571void DotPrinter::PrintAttributes(RegExpNode* that) {
3572 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3573 "margin=0.1, fontsize=10, label=\"{",
3574 that);
3575 AttributePrinter printer(this);
3576 NodeInfo* info = that->info();
3577 printer.PrintBit("NI", info->follows_newline_interest);
3578 printer.PrintBit("WI", info->follows_word_interest);
3579 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003580 Label* label = that->label();
3581 if (label->is_bound())
3582 printer.PrintPositive("@", label->pos());
3583 stream()->Add("}\"];\n");
3584 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3585 "arrowhead=none];\n", that, that);
3586}
3587
3588
3589static const bool kPrintDispatchTable = false;
3590void DotPrinter::VisitChoice(ChoiceNode* that) {
3591 if (kPrintDispatchTable) {
3592 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3593 TableEntryHeaderPrinter header_printer(stream());
3594 that->GetTable(ignore_case_)->ForEach(&header_printer);
3595 stream()->Add("\"]\n", that);
3596 PrintAttributes(that);
3597 TableEntryBodyPrinter body_printer(stream(), that);
3598 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003599 } else {
3600 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3601 for (int i = 0; i < that->alternatives()->length(); i++) {
3602 GuardedAlternative alt = that->alternatives()->at(i);
3603 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3604 }
3605 }
3606 for (int i = 0; i < that->alternatives()->length(); i++) {
3607 GuardedAlternative alt = that->alternatives()->at(i);
3608 alt.node()->Accept(this);
3609 }
3610}
3611
3612
3613void DotPrinter::VisitText(TextNode* that) {
3614 stream()->Add(" n%p [label=\"", that);
3615 for (int i = 0; i < that->elements()->length(); i++) {
3616 if (i > 0) stream()->Add(" ");
3617 TextElement elm = that->elements()->at(i);
3618 switch (elm.type) {
3619 case TextElement::ATOM: {
3620 stream()->Add("'%w'", elm.data.u_atom->data());
3621 break;
3622 }
3623 case TextElement::CHAR_CLASS: {
3624 RegExpCharacterClass* node = elm.data.u_char_class;
3625 stream()->Add("[");
3626 if (node->is_negated())
3627 stream()->Add("^");
3628 for (int j = 0; j < node->ranges()->length(); j++) {
3629 CharacterRange range = node->ranges()->at(j);
3630 stream()->Add("%k-%k", range.from(), range.to());
3631 }
3632 stream()->Add("]");
3633 break;
3634 }
3635 default:
3636 UNREACHABLE();
3637 }
3638 }
3639 stream()->Add("\", shape=box, peripheries=2];\n");
3640 PrintAttributes(that);
3641 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3642 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003643}
3644
3645
3646void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3647 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3648 that,
3649 that->start_register(),
3650 that->end_register());
3651 PrintAttributes(that);
3652 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3653 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003654}
3655
3656
3657void DotPrinter::VisitEnd(EndNode* that) {
3658 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3659 PrintAttributes(that);
3660}
3661
3662
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003663void DotPrinter::VisitAssertion(AssertionNode* that) {
3664 stream()->Add(" n%p [", that);
3665 switch (that->type()) {
3666 case AssertionNode::AT_END:
3667 stream()->Add("label=\"$\", shape=septagon");
3668 break;
3669 case AssertionNode::AT_START:
3670 stream()->Add("label=\"^\", shape=septagon");
3671 break;
3672 case AssertionNode::AT_BOUNDARY:
3673 stream()->Add("label=\"\\b\", shape=septagon");
3674 break;
3675 case AssertionNode::AT_NON_BOUNDARY:
3676 stream()->Add("label=\"\\B\", shape=septagon");
3677 break;
3678 case AssertionNode::AFTER_NEWLINE:
3679 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3680 break;
3681 }
3682 stream()->Add("];\n");
3683 PrintAttributes(that);
3684 RegExpNode* successor = that->on_success();
3685 stream()->Add(" n%p -> n%p;\n", that, successor);
3686 Visit(successor);
3687}
3688
3689
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003690void DotPrinter::VisitAction(ActionNode* that) {
3691 stream()->Add(" n%p [", that);
3692 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003693 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003694 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3695 that->data_.u_store_register.reg,
3696 that->data_.u_store_register.value);
3697 break;
3698 case ActionNode::INCREMENT_REGISTER:
3699 stream()->Add("label=\"$%i++\", shape=octagon",
3700 that->data_.u_increment_register.reg);
3701 break;
3702 case ActionNode::STORE_POSITION:
3703 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3704 that->data_.u_position_register.reg);
3705 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003706 case ActionNode::BEGIN_SUBMATCH:
3707 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3708 that->data_.u_submatch.current_position_register);
3709 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003710 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003711 stream()->Add("label=\"escape\", shape=septagon");
3712 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003713 case ActionNode::EMPTY_MATCH_CHECK:
3714 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3715 that->data_.u_empty_match_check.start_register,
3716 that->data_.u_empty_match_check.repetition_register,
3717 that->data_.u_empty_match_check.repetition_limit);
3718 break;
3719 case ActionNode::CLEAR_CAPTURES: {
3720 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3721 that->data_.u_clear_captures.range_from,
3722 that->data_.u_clear_captures.range_to);
3723 break;
3724 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003725 }
3726 stream()->Add("];\n");
3727 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003728 RegExpNode* successor = that->on_success();
3729 stream()->Add(" n%p -> n%p;\n", that, successor);
3730 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003731}
3732
3733
3734class DispatchTableDumper {
3735 public:
3736 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3737 void Call(uc16 key, DispatchTable::Entry entry);
3738 StringStream* stream() { return stream_; }
3739 private:
3740 StringStream* stream_;
3741};
3742
3743
3744void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3745 stream()->Add("[%k-%k]: {", key, entry.to());
3746 OutSet* set = entry.out_set();
3747 bool first = true;
3748 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3749 if (set->Get(i)) {
3750 if (first) {
3751 first = false;
3752 } else {
3753 stream()->Add(", ");
3754 }
3755 stream()->Add("%i", i);
3756 }
3757 }
3758 stream()->Add("}\n");
3759}
3760
3761
3762void DispatchTable::Dump() {
3763 HeapStringAllocator alloc;
3764 StringStream stream(&alloc);
3765 DispatchTableDumper dumper(&stream);
3766 tree()->ForEach(&dumper);
3767 OS::PrintError("%s", *stream.ToCString());
3768}
3769
3770
3771void RegExpEngine::DotPrint(const char* label,
3772 RegExpNode* node,
3773 bool ignore_case) {
3774 DotPrinter printer(ignore_case);
3775 printer.PrintNode(label, node);
3776}
3777
3778
3779#endif // DEBUG
3780
3781
3782// -------------------------------------------------------------------
3783// Tree to graph conversion
3784
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003785static const int kSpaceRangeCount = 20;
3786static const int kSpaceRangeAsciiCount = 4;
3787static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3788 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3789 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3790
3791static const int kWordRangeCount = 8;
3792static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3793 '_', 'a', 'z' };
3794
3795static const int kDigitRangeCount = 2;
3796static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3797
3798static const int kLineTerminatorRangeCount = 6;
3799static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3800 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003801
3802RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003803 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003804 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3805 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003806 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003807}
3808
3809
3810RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003811 RegExpNode* on_success) {
3812 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003813}
3814
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003815static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3816 const uc16* special_class,
3817 int length) {
3818 ASSERT(ranges->length() != 0);
3819 ASSERT(length != 0);
3820 ASSERT(special_class[0] != 0);
3821 if (ranges->length() != (length >> 1) + 1) {
3822 return false;
3823 }
3824 CharacterRange range = ranges->at(0);
3825 if (range.from() != 0) {
3826 return false;
3827 }
3828 for (int i = 0; i < length; i += 2) {
3829 if (special_class[i] != (range.to() + 1)) {
3830 return false;
3831 }
3832 range = ranges->at((i >> 1) + 1);
3833 if (special_class[i+1] != range.from() - 1) {
3834 return false;
3835 }
3836 }
3837 if (range.to() != 0xffff) {
3838 return false;
3839 }
3840 return true;
3841}
3842
3843
3844static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3845 const uc16* special_class,
3846 int length) {
3847 if (ranges->length() * 2 != length) {
3848 return false;
3849 }
3850 for (int i = 0; i < length; i += 2) {
3851 CharacterRange range = ranges->at(i >> 1);
3852 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3853 return false;
3854 }
3855 }
3856 return true;
3857}
3858
3859
3860bool RegExpCharacterClass::is_standard() {
3861 // TODO(lrn): Remove need for this function, by not throwing away information
3862 // along the way.
3863 if (is_negated_) {
3864 return false;
3865 }
3866 if (set_.is_standard()) {
3867 return true;
3868 }
3869 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3870 set_.set_standard_set_type('s');
3871 return true;
3872 }
3873 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3874 set_.set_standard_set_type('S');
3875 return true;
3876 }
3877 if (CompareInverseRanges(set_.ranges(),
3878 kLineTerminatorRanges,
3879 kLineTerminatorRangeCount)) {
3880 set_.set_standard_set_type('.');
3881 return true;
3882 }
3883 return false;
3884}
3885
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003886
3887RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003888 RegExpNode* on_success) {
3889 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003890}
3891
3892
3893RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003894 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003895 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3896 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003897 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003898 for (int i = 0; i < length; i++) {
3899 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003900 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003901 result->AddAlternative(alternative);
3902 }
3903 return result;
3904}
3905
3906
3907RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003908 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003909 return ToNode(min(),
3910 max(),
3911 is_greedy(),
3912 body(),
3913 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003914 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003915}
3916
3917
3918RegExpNode* RegExpQuantifier::ToNode(int min,
3919 int max,
3920 bool is_greedy,
3921 RegExpTree* body,
3922 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003923 RegExpNode* on_success,
3924 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003925 // x{f, t} becomes this:
3926 //
3927 // (r++)<-.
3928 // | `
3929 // | (x)
3930 // v ^
3931 // (r=0)-->(?)---/ [if r < t]
3932 // |
3933 // [if r >= f] \----> ...
3934 //
3935 //
3936 // TODO(someone): clear captures on repetition and handle empty
3937 // matches.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003938
3939 // 15.10.2.5 RepeatMatcher algorithm.
3940 // The parser has already eliminated the case where max is 0. In the case
3941 // where max_match is zero the parser has removed the quantifier if min was
3942 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3943
3944 // If we know that we cannot match zero length then things are a little
3945 // simpler since we don't need to make the special zero length match check
3946 // from step 2.1. If the min and max are small we can unroll a little in
3947 // this case.
3948 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3949 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3950 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003951 bool body_can_be_empty = (body->min_match() == 0);
3952 int body_start_reg = RegExpCompiler::kNoRegister;
3953 Interval capture_registers = body->CaptureRegisters();
3954 bool needs_capture_clearing = !capture_registers.is_empty();
3955 if (body_can_be_empty) {
3956 body_start_reg = compiler->AllocateRegister();
iposva@chromium.org245aa852009-02-10 00:49:54 +00003957 } else if (FLAG_irregexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003958 // Only unroll if there are no captures and the body can't be
3959 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003960 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3961 int new_max = (max == kInfinity) ? max : max - min;
3962 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003963 RegExpNode* answer = ToNode(
3964 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003965 // Unroll the forced matches from 0 to min. This can cause chains of
3966 // TextNodes (which the parser does not generate). These should be
3967 // combined if it turns out they hinder good code generation.
3968 for (int i = 0; i < min; i++) {
3969 answer = body->ToNode(compiler, answer);
3970 }
3971 return answer;
3972 }
3973 if (max <= kMaxUnrolledMaxMatches) {
3974 ASSERT(min == 0);
3975 // Unroll the optional matches up to max.
3976 RegExpNode* answer = on_success;
3977 for (int i = 0; i < max; i++) {
3978 ChoiceNode* alternation = new ChoiceNode(2);
3979 if (is_greedy) {
3980 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3981 answer)));
3982 alternation->AddAlternative(GuardedAlternative(on_success));
3983 } else {
3984 alternation->AddAlternative(GuardedAlternative(on_success));
3985 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3986 answer)));
3987 }
3988 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003989 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003990 }
3991 return answer;
3992 }
3993 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003994 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003995 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003996 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003997 int reg_ctr = needs_counter
3998 ? compiler->AllocateRegister()
3999 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004000 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00004001 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004002 RegExpNode* loop_return = needs_counter
4003 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4004 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00004005 if (body_can_be_empty) {
4006 // If the body can be empty we need to check if it was and then
4007 // backtrack.
4008 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4009 reg_ctr,
4010 min,
4011 loop_return);
4012 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004013 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00004014 if (body_can_be_empty) {
4015 // If the body can be empty we need to store the start position
4016 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004017 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00004018 }
4019 if (needs_capture_clearing) {
4020 // Before entering the body of this loop we need to clear captures.
4021 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4022 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004023 GuardedAlternative body_alt(body_node);
4024 if (has_max) {
4025 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
4026 body_alt.AddGuard(body_guard);
4027 }
4028 GuardedAlternative rest_alt(on_success);
4029 if (has_min) {
4030 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
4031 rest_alt.AddGuard(rest_guard);
4032 }
4033 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004034 center->AddLoopAlternative(body_alt);
4035 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004036 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004037 center->AddContinueAlternative(rest_alt);
4038 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004039 }
4040 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004041 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004042 } else {
4043 return center;
4044 }
4045}
4046
4047
4048RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004049 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004050 NodeInfo info;
4051 switch (type()) {
4052 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004053 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004054 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004055 return AssertionNode::AtStart(on_success);
4056 case BOUNDARY:
4057 return AssertionNode::AtBoundary(on_success);
4058 case NON_BOUNDARY:
4059 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004060 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004061 return AssertionNode::AtEnd(on_success);
4062 case END_OF_LINE: {
4063 // Compile $ in multiline regexps as an alternation with a positive
4064 // lookahead in one side and an end-of-input on the other side.
4065 // We need two registers for the lookahead.
4066 int stack_pointer_register = compiler->AllocateRegister();
4067 int position_register = compiler->AllocateRegister();
4068 // The ChoiceNode to distinguish between a newline and end-of-input.
4069 ChoiceNode* result = new ChoiceNode(2);
4070 // Create a newline atom.
4071 ZoneList<CharacterRange>* newline_ranges =
4072 new ZoneList<CharacterRange>(3);
4073 CharacterRange::AddClassEscape('n', newline_ranges);
4074 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
4075 TextNode* newline_matcher = new TextNode(
4076 newline_atom,
4077 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4078 position_register,
4079 0, // No captures inside.
4080 -1, // Ignored if no captures.
4081 on_success));
4082 // Create an end-of-input matcher.
4083 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4084 stack_pointer_register,
4085 position_register,
4086 newline_matcher);
4087 // Add the two alternatives to the ChoiceNode.
4088 GuardedAlternative eol_alternative(end_of_line);
4089 result->AddAlternative(eol_alternative);
4090 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4091 result->AddAlternative(end_alternative);
4092 return result;
4093 }
4094 default:
4095 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004096 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004097 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004098}
4099
4100
4101RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004102 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004103 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
4104 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00004105 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004106}
4107
4108
4109RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004110 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004111 return on_success;
4112}
4113
4114
4115RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004116 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004117 int stack_pointer_register = compiler->AllocateRegister();
4118 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004119
4120 const int registers_per_capture = 2;
4121 const int register_of_first_capture = 2;
4122 int register_count = capture_count_ * registers_per_capture;
4123 int register_start =
4124 register_of_first_capture + capture_from_ * registers_per_capture;
4125
ager@chromium.org8bb60582008-12-11 12:02:20 +00004126 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004127 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004128 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004129 stack_pointer_register,
4130 position_register,
4131 body()->ToNode(
4132 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004133 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4134 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004135 register_count,
4136 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004137 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004138 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004139 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004140 // We use a ChoiceNode for a negative lookahead because it has most of
4141 // the characteristics we need. It has the body of the lookahead as its
4142 // first alternative and the expression after the lookahead of the second
4143 // alternative. If the first alternative succeeds then the
4144 // NegativeSubmatchSuccess will unwind the stack including everything the
4145 // choice node set up and backtrack. If the first alternative fails then
4146 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004147 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
4148 // ChoiceNode that knows to ignore the first exit when calculating quick
4149 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004150 GuardedAlternative body_alt(
4151 body()->ToNode(
4152 compiler,
4153 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004154 position_register,
4155 register_count,
4156 register_start)));
4157 ChoiceNode* choice_node =
4158 new NegativeLookaheadChoiceNode(body_alt,
4159 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004160 return ActionNode::BeginSubmatch(stack_pointer_register,
4161 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004162 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004163 }
4164}
4165
4166
4167RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004168 RegExpNode* on_success) {
4169 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004170}
4171
4172
4173RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4174 int index,
4175 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004176 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004177 int start_reg = RegExpCapture::StartRegister(index);
4178 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004179 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004180 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004181 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004182}
4183
4184
4185RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004186 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004187 ZoneList<RegExpTree*>* children = nodes();
4188 RegExpNode* current = on_success;
4189 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004190 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004191 }
4192 return current;
4193}
4194
4195
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004196static void AddClass(const uc16* elmv,
4197 int elmc,
4198 ZoneList<CharacterRange>* ranges) {
4199 for (int i = 0; i < elmc; i += 2) {
4200 ASSERT(elmv[i] <= elmv[i + 1]);
4201 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4202 }
4203}
4204
4205
4206static void AddClassNegated(const uc16 *elmv,
4207 int elmc,
4208 ZoneList<CharacterRange>* ranges) {
4209 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004210 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004211 uc16 last = 0x0000;
4212 for (int i = 0; i < elmc; i += 2) {
4213 ASSERT(last <= elmv[i] - 1);
4214 ASSERT(elmv[i] <= elmv[i + 1]);
4215 ranges->Add(CharacterRange(last, elmv[i] - 1));
4216 last = elmv[i + 1] + 1;
4217 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004218 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004219}
4220
4221
4222void CharacterRange::AddClassEscape(uc16 type,
4223 ZoneList<CharacterRange>* ranges) {
4224 switch (type) {
4225 case 's':
4226 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4227 break;
4228 case 'S':
4229 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4230 break;
4231 case 'w':
4232 AddClass(kWordRanges, kWordRangeCount, ranges);
4233 break;
4234 case 'W':
4235 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4236 break;
4237 case 'd':
4238 AddClass(kDigitRanges, kDigitRangeCount, ranges);
4239 break;
4240 case 'D':
4241 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4242 break;
4243 case '.':
4244 AddClassNegated(kLineTerminatorRanges,
4245 kLineTerminatorRangeCount,
4246 ranges);
4247 break;
4248 // This is not a character range as defined by the spec but a
4249 // convenient shorthand for a character class that matches any
4250 // character.
4251 case '*':
4252 ranges->Add(CharacterRange::Everything());
4253 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004254 // This is the set of characters matched by the $ and ^ symbols
4255 // in multiline mode.
4256 case 'n':
4257 AddClass(kLineTerminatorRanges,
4258 kLineTerminatorRangeCount,
4259 ranges);
4260 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004261 default:
4262 UNREACHABLE();
4263 }
4264}
4265
4266
4267Vector<const uc16> CharacterRange::GetWordBounds() {
4268 return Vector<const uc16>(kWordRanges, kWordRangeCount);
4269}
4270
4271
4272class CharacterRangeSplitter {
4273 public:
4274 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4275 ZoneList<CharacterRange>** excluded)
4276 : included_(included),
4277 excluded_(excluded) { }
4278 void Call(uc16 from, DispatchTable::Entry entry);
4279
4280 static const int kInBase = 0;
4281 static const int kInOverlay = 1;
4282
4283 private:
4284 ZoneList<CharacterRange>** included_;
4285 ZoneList<CharacterRange>** excluded_;
4286};
4287
4288
4289void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4290 if (!entry.out_set()->Get(kInBase)) return;
4291 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4292 ? included_
4293 : excluded_;
4294 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4295 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4296}
4297
4298
4299void CharacterRange::Split(ZoneList<CharacterRange>* base,
4300 Vector<const uc16> overlay,
4301 ZoneList<CharacterRange>** included,
4302 ZoneList<CharacterRange>** excluded) {
4303 ASSERT_EQ(NULL, *included);
4304 ASSERT_EQ(NULL, *excluded);
4305 DispatchTable table;
4306 for (int i = 0; i < base->length(); i++)
4307 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4308 for (int i = 0; i < overlay.length(); i += 2) {
4309 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4310 CharacterRangeSplitter::kInOverlay);
4311 }
4312 CharacterRangeSplitter callback(included, excluded);
4313 table.ForEach(&callback);
4314}
4315
4316
4317void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
4318 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4319 if (IsSingleton()) {
4320 // If this is a singleton we just expand the one character.
4321 int length = uncanonicalize.get(from(), '\0', chars);
4322 for (int i = 0; i < length; i++) {
4323 uc32 chr = chars[i];
4324 if (chr != from()) {
4325 ranges->Add(CharacterRange::Singleton(chars[i]));
4326 }
4327 }
4328 } else if (from() <= kRangeCanonicalizeMax &&
4329 to() <= kRangeCanonicalizeMax) {
4330 // If this is a range we expand the characters block by block,
4331 // expanding contiguous subranges (blocks) one at a time.
4332 // The approach is as follows. For a given start character we
4333 // look up the block that contains it, for instance 'a' if the
4334 // start character is 'c'. A block is characterized by the property
4335 // that all characters uncanonicalize in the same way as the first
4336 // element, except that each entry in the result is incremented
4337 // by the distance from the first element. So a-z is a block
4338 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
4339 // uncanonicalizes to ['a' + k, 'A' + k].
4340 // Once we've found the start point we look up its uncanonicalization
4341 // and produce a range for each element. For instance for [c-f]
4342 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
4343 // add a range if it is not already contained in the input, so [c-f]
4344 // will be skipped but [C-F] will be added. If this range is not
4345 // completely contained in a block we do this for all the blocks
4346 // covered by the range.
4347 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4348 // First, look up the block that contains the 'from' character.
4349 int length = canonrange.get(from(), '\0', range);
4350 if (length == 0) {
4351 range[0] = from();
4352 } else {
4353 ASSERT_EQ(1, length);
4354 }
4355 int pos = from();
4356 // The start of the current block. Note that except for the first
4357 // iteration 'start' is always equal to 'pos'.
4358 int start;
4359 // If it is not the start point of a block the entry contains the
4360 // offset of the character from the start point.
4361 if ((range[0] & kStartMarker) == 0) {
4362 start = pos - range[0];
4363 } else {
4364 start = pos;
4365 }
4366 // Then we add the ranges on at a time, incrementing the current
4367 // position to be after the last block each time. The position
4368 // always points to the start of a block.
4369 while (pos < to()) {
4370 length = canonrange.get(start, '\0', range);
4371 if (length == 0) {
4372 range[0] = start;
4373 } else {
4374 ASSERT_EQ(1, length);
4375 }
4376 ASSERT((range[0] & kStartMarker) != 0);
4377 // The start point of a block contains the distance to the end
4378 // of the range.
4379 int block_end = start + (range[0] & kPayloadMask) - 1;
4380 int end = (block_end > to()) ? to() : block_end;
4381 length = uncanonicalize.get(start, '\0', range);
4382 for (int i = 0; i < length; i++) {
4383 uc32 c = range[i];
4384 uc16 range_from = c + (pos - start);
4385 uc16 range_to = c + (end - start);
4386 if (!(from() <= range_from && range_to <= to())) {
4387 ranges->Add(CharacterRange(range_from, range_to));
4388 }
4389 }
4390 start = pos = block_end + 1;
4391 }
4392 } else {
4393 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
4394 }
4395}
4396
4397
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004398ZoneList<CharacterRange>* CharacterSet::ranges() {
4399 if (ranges_ == NULL) {
4400 ranges_ = new ZoneList<CharacterRange>(2);
4401 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4402 }
4403 return ranges_;
4404}
4405
4406
4407
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004408// -------------------------------------------------------------------
4409// Interest propagation
4410
4411
4412RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4413 for (int i = 0; i < siblings_.length(); i++) {
4414 RegExpNode* sibling = siblings_.Get(i);
4415 if (sibling->info()->Matches(info))
4416 return sibling;
4417 }
4418 return NULL;
4419}
4420
4421
4422RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4423 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004424 siblings_.Ensure(this);
4425 RegExpNode* result = TryGetSibling(info);
4426 if (result != NULL) return result;
4427 result = this->Clone();
4428 NodeInfo* new_info = result->info();
4429 new_info->ResetCompilationState();
4430 new_info->AddFromPreceding(info);
4431 AddSibling(result);
4432 *cloned = true;
4433 return result;
4434}
4435
4436
4437template <class C>
4438static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4439 NodeInfo full_info(*node->info());
4440 full_info.AddFromPreceding(info);
4441 bool cloned = false;
4442 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4443}
4444
4445
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004446// -------------------------------------------------------------------
4447// Splay tree
4448
4449
4450OutSet* OutSet::Extend(unsigned value) {
4451 if (Get(value))
4452 return this;
4453 if (successors() != NULL) {
4454 for (int i = 0; i < successors()->length(); i++) {
4455 OutSet* successor = successors()->at(i);
4456 if (successor->Get(value))
4457 return successor;
4458 }
4459 } else {
4460 successors_ = new ZoneList<OutSet*>(2);
4461 }
4462 OutSet* result = new OutSet(first_, remaining_);
4463 result->Set(value);
4464 successors()->Add(result);
4465 return result;
4466}
4467
4468
4469void OutSet::Set(unsigned value) {
4470 if (value < kFirstLimit) {
4471 first_ |= (1 << value);
4472 } else {
4473 if (remaining_ == NULL)
4474 remaining_ = new ZoneList<unsigned>(1);
4475 if (remaining_->is_empty() || !remaining_->Contains(value))
4476 remaining_->Add(value);
4477 }
4478}
4479
4480
4481bool OutSet::Get(unsigned value) {
4482 if (value < kFirstLimit) {
4483 return (first_ & (1 << value)) != 0;
4484 } else if (remaining_ == NULL) {
4485 return false;
4486 } else {
4487 return remaining_->Contains(value);
4488 }
4489}
4490
4491
4492const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4493const DispatchTable::Entry DispatchTable::Config::kNoValue;
4494
4495
4496void DispatchTable::AddRange(CharacterRange full_range, int value) {
4497 CharacterRange current = full_range;
4498 if (tree()->is_empty()) {
4499 // If this is the first range we just insert into the table.
4500 ZoneSplayTree<Config>::Locator loc;
4501 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4502 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4503 return;
4504 }
4505 // First see if there is a range to the left of this one that
4506 // overlaps.
4507 ZoneSplayTree<Config>::Locator loc;
4508 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4509 Entry* entry = &loc.value();
4510 // If we've found a range that overlaps with this one, and it
4511 // starts strictly to the left of this one, we have to fix it
4512 // because the following code only handles ranges that start on
4513 // or after the start point of the range we're adding.
4514 if (entry->from() < current.from() && entry->to() >= current.from()) {
4515 // Snap the overlapping range in half around the start point of
4516 // the range we're adding.
4517 CharacterRange left(entry->from(), current.from() - 1);
4518 CharacterRange right(current.from(), entry->to());
4519 // The left part of the overlapping range doesn't overlap.
4520 // Truncate the whole entry to be just the left part.
4521 entry->set_to(left.to());
4522 // The right part is the one that overlaps. We add this part
4523 // to the map and let the next step deal with merging it with
4524 // the range we're adding.
4525 ZoneSplayTree<Config>::Locator loc;
4526 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4527 loc.set_value(Entry(right.from(),
4528 right.to(),
4529 entry->out_set()));
4530 }
4531 }
4532 while (current.is_valid()) {
4533 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4534 (loc.value().from() <= current.to()) &&
4535 (loc.value().to() >= current.from())) {
4536 Entry* entry = &loc.value();
4537 // We have overlap. If there is space between the start point of
4538 // the range we're adding and where the overlapping range starts
4539 // then we have to add a range covering just that space.
4540 if (current.from() < entry->from()) {
4541 ZoneSplayTree<Config>::Locator ins;
4542 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4543 ins.set_value(Entry(current.from(),
4544 entry->from() - 1,
4545 empty()->Extend(value)));
4546 current.set_from(entry->from());
4547 }
4548 ASSERT_EQ(current.from(), entry->from());
4549 // If the overlapping range extends beyond the one we want to add
4550 // we have to snap the right part off and add it separately.
4551 if (entry->to() > current.to()) {
4552 ZoneSplayTree<Config>::Locator ins;
4553 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4554 ins.set_value(Entry(current.to() + 1,
4555 entry->to(),
4556 entry->out_set()));
4557 entry->set_to(current.to());
4558 }
4559 ASSERT(entry->to() <= current.to());
4560 // The overlapping range is now completely contained by the range
4561 // we're adding so we can just update it and move the start point
4562 // of the range we're adding just past it.
4563 entry->AddValue(value);
4564 // Bail out if the last interval ended at 0xFFFF since otherwise
4565 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004566 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004567 break;
4568 ASSERT(entry->to() + 1 > current.from());
4569 current.set_from(entry->to() + 1);
4570 } else {
4571 // There is no overlap so we can just add the range
4572 ZoneSplayTree<Config>::Locator ins;
4573 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4574 ins.set_value(Entry(current.from(),
4575 current.to(),
4576 empty()->Extend(value)));
4577 break;
4578 }
4579 }
4580}
4581
4582
4583OutSet* DispatchTable::Get(uc16 value) {
4584 ZoneSplayTree<Config>::Locator loc;
4585 if (!tree()->FindGreatestLessThan(value, &loc))
4586 return empty();
4587 Entry* entry = &loc.value();
4588 if (value <= entry->to())
4589 return entry->out_set();
4590 else
4591 return empty();
4592}
4593
4594
4595// -------------------------------------------------------------------
4596// Analysis
4597
4598
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004599void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004600 if (that->info()->been_analyzed || that->info()->being_analyzed)
4601 return;
4602 that->info()->being_analyzed = true;
4603 that->Accept(this);
4604 that->info()->being_analyzed = false;
4605 that->info()->been_analyzed = true;
4606}
4607
4608
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004609void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004610 // nothing to do
4611}
4612
4613
ager@chromium.org8bb60582008-12-11 12:02:20 +00004614void TextNode::CalculateOffsets() {
4615 int element_count = elements()->length();
4616 // Set up the offsets of the elements relative to the start. This is a fixed
4617 // quantity since a TextNode can only contain fixed-width things.
4618 int cp_offset = 0;
4619 for (int i = 0; i < element_count; i++) {
4620 TextElement& elm = elements()->at(i);
4621 elm.cp_offset = cp_offset;
4622 if (elm.type == TextElement::ATOM) {
4623 cp_offset += elm.data.u_atom->data().length();
4624 } else {
4625 cp_offset++;
4626 Vector<const uc16> quarks = elm.data.u_atom->data();
4627 }
4628 }
4629}
4630
4631
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004632void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004633 if (ignore_case_) {
4634 that->MakeCaseIndependent();
4635 }
4636 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004637 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004638}
4639
4640
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004641void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004642 RegExpNode* target = that->on_success();
4643 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004644 // If the next node is interested in what it follows then this node
4645 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004646 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004647}
4648
4649
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004650void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004651 NodeInfo* info = that->info();
4652 for (int i = 0; i < that->alternatives()->length(); i++) {
4653 RegExpNode* node = that->alternatives()->at(i).node();
4654 EnsureAnalyzed(node);
4655 // Anything the following nodes need to know has to be known by
4656 // this node also, so it can pass it on.
4657 info->AddFromFollowing(node->info());
4658 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004659}
4660
4661
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004662void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4663 NodeInfo* info = that->info();
4664 for (int i = 0; i < that->alternatives()->length(); i++) {
4665 RegExpNode* node = that->alternatives()->at(i).node();
4666 if (node != that->loop_node()) {
4667 EnsureAnalyzed(node);
4668 info->AddFromFollowing(node->info());
4669 }
4670 }
4671 // Check the loop last since it may need the value of this node
4672 // to get a correct result.
4673 EnsureAnalyzed(that->loop_node());
4674 info->AddFromFollowing(that->loop_node()->info());
4675}
4676
4677
4678void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004679 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004680}
4681
4682
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004683void Analysis::VisitAssertion(AssertionNode* that) {
4684 EnsureAnalyzed(that->on_success());
4685}
4686
4687
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004688// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004689// Dispatch table construction
4690
4691
4692void DispatchTableConstructor::VisitEnd(EndNode* that) {
4693 AddRange(CharacterRange::Everything());
4694}
4695
4696
4697void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4698 node->set_being_calculated(true);
4699 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4700 for (int i = 0; i < alternatives->length(); i++) {
4701 set_choice_index(i);
4702 alternatives->at(i).node()->Accept(this);
4703 }
4704 node->set_being_calculated(false);
4705}
4706
4707
4708class AddDispatchRange {
4709 public:
4710 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4711 : constructor_(constructor) { }
4712 void Call(uc32 from, DispatchTable::Entry entry);
4713 private:
4714 DispatchTableConstructor* constructor_;
4715};
4716
4717
4718void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4719 CharacterRange range(from, entry.to());
4720 constructor_->AddRange(range);
4721}
4722
4723
4724void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4725 if (node->being_calculated())
4726 return;
4727 DispatchTable* table = node->GetTable(ignore_case_);
4728 AddDispatchRange adder(this);
4729 table->ForEach(&adder);
4730}
4731
4732
4733void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4734 // TODO(160): Find the node that we refer back to and propagate its start
4735 // set back to here. For now we just accept anything.
4736 AddRange(CharacterRange::Everything());
4737}
4738
4739
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004740void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4741 RegExpNode* target = that->on_success();
4742 target->Accept(this);
4743}
4744
4745
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004746
4747static int CompareRangeByFrom(const CharacterRange* a,
4748 const CharacterRange* b) {
4749 return Compare<uc16>(a->from(), b->from());
4750}
4751
4752
4753void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4754 ranges->Sort(CompareRangeByFrom);
4755 uc16 last = 0;
4756 for (int i = 0; i < ranges->length(); i++) {
4757 CharacterRange range = ranges->at(i);
4758 if (last < range.from())
4759 AddRange(CharacterRange(last, range.from() - 1));
4760 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004761 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004762 return;
4763 } else {
4764 last = range.to() + 1;
4765 }
4766 }
4767 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004768 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004769}
4770
4771
4772void DispatchTableConstructor::VisitText(TextNode* that) {
4773 TextElement elm = that->elements()->at(0);
4774 switch (elm.type) {
4775 case TextElement::ATOM: {
4776 uc16 c = elm.data.u_atom->data()[0];
4777 AddRange(CharacterRange(c, c));
4778 break;
4779 }
4780 case TextElement::CHAR_CLASS: {
4781 RegExpCharacterClass* tree = elm.data.u_char_class;
4782 ZoneList<CharacterRange>* ranges = tree->ranges();
4783 if (tree->is_negated()) {
4784 AddInverse(ranges);
4785 } else {
4786 for (int i = 0; i < ranges->length(); i++)
4787 AddRange(ranges->at(i));
4788 }
4789 break;
4790 }
4791 default: {
4792 UNIMPLEMENTED();
4793 }
4794 }
4795}
4796
4797
4798void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004799 RegExpNode* target = that->on_success();
4800 target->Accept(this);
4801}
4802
4803
ager@chromium.org8bb60582008-12-11 12:02:20 +00004804Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004805 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004806 bool is_multiline,
4807 Handle<String> pattern,
4808 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004809 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
4810 return IrregexpRegExpTooBig(pattern);
4811 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004812 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004813 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004814 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004815 0,
4816 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004817 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004818 RegExpNode* node = captured_body;
4819 if (!data->tree->IsAnchored()) {
4820 // Add a .*? at the beginning, outside the body capture, unless
4821 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004822 RegExpNode* loop_node =
4823 RegExpQuantifier::ToNode(0,
4824 RegExpTree::kInfinity,
4825 false,
4826 new RegExpCharacterClass('*'),
4827 &compiler,
4828 captured_body,
4829 data->contains_anchor);
4830
4831 if (data->contains_anchor) {
4832 // Unroll loop once, to take care of the case that might start
4833 // at the start of input.
4834 ChoiceNode* first_step_node = new ChoiceNode(2);
4835 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4836 first_step_node->AddAlternative(GuardedAlternative(
4837 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4838 node = first_step_node;
4839 } else {
4840 node = loop_node;
4841 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004842 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004843 data->node = node;
4844 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004845 analysis.EnsureAnalyzed(node);
4846
4847 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004848
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004849 if (FLAG_irregexp_native) {
4850#ifdef ARM
4851 // Unimplemented, fall-through to bytecode implementation.
4852#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004853 RegExpMacroAssemblerIA32::Mode mode;
4854 if (is_ascii) {
4855 mode = RegExpMacroAssemblerIA32::ASCII;
4856 } else {
4857 mode = RegExpMacroAssemblerIA32::UC16;
4858 }
4859 RegExpMacroAssemblerIA32 macro_assembler(mode,
4860 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004861 return compiler.Assemble(&macro_assembler,
4862 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004863 data->capture_count,
4864 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004865#endif
4866 }
4867 EmbeddedVector<byte, 1024> codes;
4868 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4869 return compiler.Assemble(&macro_assembler,
4870 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004871 data->capture_count,
4872 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004873}
4874
4875
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004876}} // namespace v8::internal