blob: 6cca7fc35c35a91e7f1fdd209a8297402d86b8e0 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
43
44#ifdef ARM
45#include "regexp-macro-assembler-arm.h"
46#else // IA32
47#include "macro-assembler-ia32.h"
48#include "regexp-macro-assembler-ia32.h"
49#endif
50
51#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000052
53// Including pcre.h undefines DEBUG to avoid getting debug output from
54// the JSCRE implementation. Make sure to redefine it in debug mode
55// after having included the header file.
56#ifdef DEBUG
57#include "third_party/jscre/pcre.h"
58#define DEBUG
59#else
60#include "third_party/jscre/pcre.h"
61#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000062
ager@chromium.orga74f0da2008-12-03 16:05:52 +000063
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000064namespace v8 { namespace internal {
65
66
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000067static Failure* malloc_failure;
68
69static void* JSREMalloc(size_t size) {
70 Object* obj = Heap::AllocateByteArray(size);
71
72 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
73 // will return NULL to the caller, performs GC there.
74 // Also pass failure information to the caller.
75 if (obj->IsFailure()) {
76 malloc_failure = Failure::cast(obj);
77 return NULL;
78 }
79
80 // Note: object is unrooted, the caller of jsRegExpCompile must
81 // create a handle for the return value before doing heap allocation.
82 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
83}
84
85
86static void JSREFree(void* p) {
87 USE(p); // Do nothing, memory is garbage collected.
88}
89
90
91String* RegExpImpl::last_ascii_string_ = NULL;
92String* RegExpImpl::two_byte_cached_string_ = NULL;
93
94
95void RegExpImpl::NewSpaceCollectionPrologue() {
96 // The two byte string is always in the old space. The Ascii string may be
97 // in either place. If it is in the old space we don't need to do anything.
98 if (Heap::InNewSpace(last_ascii_string_)) {
99 // Invalidate the cache.
100 last_ascii_string_ = NULL;
101 two_byte_cached_string_ = NULL;
102 }
103}
104
105
106void RegExpImpl::OldSpaceCollectionPrologue() {
107 last_ascii_string_ = NULL;
108 two_byte_cached_string_ = NULL;
109}
110
111
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000112Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
113 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000114 Handle<String> flags,
115 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000116 // Ensure that the constructor function has been loaded.
117 if (!constructor->IsLoaded()) {
118 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000119 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000120 }
121 // Call the construct code with 2 arguments.
122 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
123 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000124 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000125}
126
127
128// Converts a source string to a 16 bit flat string or a SlicedString containing
129// a 16 bit flat string).
130Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
131 if (*subject == last_ascii_string_) {
132 ASSERT(two_byte_cached_string_ != NULL);
133 return Handle<String>(String::cast(two_byte_cached_string_));
134 }
135 Handle<String> two_byte_string = StringToTwoByte(subject);
136 last_ascii_string_ = *subject;
137 two_byte_cached_string_ = *two_byte_string;
138 return two_byte_string;
139}
140
141
142// Converts a source string to a 16 bit flat string or a SlicedString containing
143// a 16 bit flat string).
144Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000145 StringShape shape(*pattern);
146 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000147 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000148 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000149 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000150 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000151 String::cast(ConsString::cast(*pattern)->first()) :
152 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000153 ASSERT(flat_string->IsString());
154 StringShape flat_shape(*flat_string);
155 ASSERT(!flat_shape.IsCons());
156 ASSERT(flat_shape.IsSequential() ||
157 flat_shape.IsSliced() ||
158 flat_shape.IsExternal());
159 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000160 return flat_string;
161 }
162
ager@chromium.org870a0b62008-11-04 11:43:05 +0000163 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000164 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000165 Factory::NewRawTwoByteString(len, TENURED);
166 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
167 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000168 return two_byte_string;
169}
170
171
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000172static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
173 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000174 StringShape shape(*str);
175 for (int i = 0; i < str->length(shape); i++) {
176 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000177 case 'i':
178 flags |= JSRegExp::IGNORE_CASE;
179 break;
180 case 'g':
181 flags |= JSRegExp::GLOBAL;
182 break;
183 case 'm':
184 flags |= JSRegExp::MULTILINE;
185 break;
186 }
187 }
188 return JSRegExp::Flags(flags);
189}
190
191
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000192static inline void ThrowRegExpException(Handle<JSRegExp> re,
193 Handle<String> pattern,
194 Handle<String> error_text,
195 const char* message) {
196 Handle<JSArray> array = Factory::NewJSArray(2);
197 SetElement(array, 0, pattern);
198 SetElement(array, 1, error_text);
199 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
200 Top::Throw(*regexp_err);
201}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000202
203
ager@chromium.org8bb60582008-12-11 12:02:20 +0000204// Generic RegExp methods. Dispatches to implementation specific methods.
205
206
207class OffsetsVector {
208 public:
209 inline OffsetsVector(int num_registers)
210 : offsets_vector_length_(num_registers) {
211 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
212 vector_ = NewArray<int>(offsets_vector_length_);
213 } else {
214 vector_ = static_offsets_vector_;
215 }
216 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000217 inline ~OffsetsVector() {
218 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
219 DeleteArray(vector_);
220 vector_ = NULL;
221 }
222 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000223 inline int* vector() { return vector_; }
224 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000225
226 private:
227 int* vector_;
228 int offsets_vector_length_;
229 static const int kStaticOffsetsVectorSize = 50;
230 static int static_offsets_vector_[kStaticOffsetsVectorSize];
231};
232
233
234int OffsetsVector::static_offsets_vector_[
235 OffsetsVector::kStaticOffsetsVectorSize];
236
237
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000238Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
239 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000240 Handle<String> flag_str) {
241 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
242 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
243 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000244 LOG(RegExpCompileEvent(re, in_cache));
245
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000246 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000247 if (in_cache) {
248 re->set_data(*cached);
249 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000250 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000251 FlattenString(pattern);
252 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000253 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000254 FlatStringReader reader(pattern);
255 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
256 // Throw an exception if we fail to parse the pattern.
257 ThrowRegExpException(re,
258 pattern,
259 parse_result.error,
260 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000261 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000262 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000263
264 if (parse_result.simple && !flags.is_ignore_case()) {
265 // Parse-tree is a single atom that is equal to the pattern.
266 result = AtomCompile(re, pattern, flags, pattern);
267 } else if (parse_result.tree->IsAtom() &&
268 !flags.is_ignore_case() &&
269 parse_result.capture_count == 0) {
270 RegExpAtom* atom = parse_result.tree->AsAtom();
271 Vector<const uc16> atom_pattern = atom->data();
272 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
273 result = AtomCompile(re, pattern, flags, atom_string);
274 } else if (FLAG_irregexp) {
275 result = IrregexpPrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000276 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000277 result = JscrePrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000278 }
279 Object* data = re->data();
280 if (data->IsFixedArray()) {
281 // If compilation succeeded then the data is set on the regexp
282 // and we can store it in the cache.
283 Handle<FixedArray> data(FixedArray::cast(re->data()));
284 CompilationCache::PutRegExp(pattern, flags, data);
285 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000286 }
287
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000288 return result;
289}
290
291
292Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
293 Handle<String> subject,
294 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000295 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000296 case JSRegExp::ATOM:
297 return AtomExec(regexp, subject, index);
298 case JSRegExp::IRREGEXP: {
299 Handle<Object> result = IrregexpExec(regexp, subject, index);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000300 if (!result.is_null() || Top::has_pending_exception()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000301 return result;
302 }
303 // We couldn't handle the regexp using Irregexp, so fall back
304 // on JSCRE.
305 // Reset the JSRegExp to use JSCRE.
306 JscrePrepare(regexp,
307 Handle<String>(regexp->Pattern()),
308 regexp->GetFlags());
309 // Fall-through to JSCRE.
310 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000311 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000312 if (FLAG_disable_jscre) {
313 UNIMPLEMENTED();
314 }
315 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000316 default:
317 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000318 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000319 }
320}
321
322
323Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
324 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000325 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000326 case JSRegExp::ATOM:
327 return AtomExecGlobal(regexp, subject);
328 case JSRegExp::IRREGEXP: {
329 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000330 if (!result.is_null() || Top::has_pending_exception()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000331 return result;
332 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000333 // Empty handle as result but no exception thrown means that
334 // the regexp contains features not yet handled by the irregexp
335 // compiler.
336 // We have to fall back on JSCRE. Reset the JSRegExp to use JSCRE.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000337 JscrePrepare(regexp,
338 Handle<String>(regexp->Pattern()),
339 regexp->GetFlags());
340 // Fall-through to JSCRE.
341 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000342 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000343 if (FLAG_disable_jscre) {
344 UNIMPLEMENTED();
345 }
346 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000347 default:
348 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000349 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000350 }
351}
352
353
ager@chromium.org8bb60582008-12-11 12:02:20 +0000354// RegExp Atom implementation: Simple string search using indexOf.
355
356
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000357Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000358 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000359 JSRegExp::Flags flags,
360 Handle<String> match_pattern) {
361 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000362 return re;
363}
364
365
366Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
367 Handle<String> subject,
368 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000369 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000370
371 uint32_t start_index;
372 if (!Array::IndexFromObject(*index, &start_index)) {
373 return Handle<Smi>(Smi::FromInt(-1));
374 }
375
ager@chromium.org7c537e22008-10-16 08:43:32 +0000376 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000377 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000378
379 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000380 array->set(0, Smi::FromInt(value));
381 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000382 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000383}
384
385
386Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
387 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000388 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000389 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000390 int index = 0;
391 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000392 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000393 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000394 while (true) {
ager@chromium.org7c537e22008-10-16 08:43:32 +0000395 int value = -1;
396 if (index + needle_length <= subject_length) {
397 value = Runtime::StringMatch(subject, needle, index);
398 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000399 if (value == -1) break;
400 HandleScope scope;
401 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000402
403 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000404 array->set(0, Smi::FromInt(value));
405 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000406 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000407 SetElement(result, match_count, pair);
408 match_count++;
409 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000410 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000411 }
412 return result;
413}
414
415
ager@chromium.org8bb60582008-12-11 12:02:20 +0000416// JSCRE implementation.
417
418
419int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
420 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
421 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
422}
423
424
425ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
426 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
427 return ByteArray::cast(value->get(kJscreInternalIndex));
428}
429
430
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000431Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
432 Handle<String> pattern,
433 JSRegExp::Flags flags) {
434 Handle<Object> value(Heap::undefined_value());
435 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
436 return re;
437}
438
439
ager@chromium.org8bb60582008-12-11 12:02:20 +0000440static inline Object* JscreDoCompile(String* pattern,
441 JSRegExp::Flags flags,
442 unsigned* number_of_captures,
443 const char** error_message,
444 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000445 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
446 ? v8::jscre::JSRegExpIgnoreCase
447 : v8::jscre::JSRegExpDoNotIgnoreCase;
448 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
449 ? v8::jscre::JSRegExpMultiline
450 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000451 *error_message = NULL;
452 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000453 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
454 pattern->length(),
455 case_option,
456 multiline_option,
457 number_of_captures,
458 error_message,
459 &JSREMalloc,
460 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000461 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
462 malloc_failure->IsOutOfMemoryFailure())) {
463 return malloc_failure;
464 } else {
465 // It doesn't matter which object we return here, we just need to return
466 // a non-failure to indicate to the GC-retry code that there was no
467 // allocation failure.
468 return pattern;
469 }
470}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000471
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000472
ager@chromium.org8bb60582008-12-11 12:02:20 +0000473static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
474 JSRegExp::Flags flags,
475 unsigned* number_of_captures,
476 const char** error_message,
477 v8::jscre::JscreRegExp** code) {
478 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
479 flags,
480 number_of_captures,
481 error_message,
482 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000483}
484
485
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000486Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
487 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
488 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
489
490 Handle<String> pattern(re->Pattern());
491 JSRegExp::Flags flags = re->GetFlags();
492
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000493 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
494
495 unsigned number_of_captures;
496 const char* error_message = NULL;
497
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000498 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000499 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000500
ager@chromium.org8bb60582008-12-11 12:02:20 +0000501 JscreCompileWithRetryAfterGC(two_byte_pattern,
502 flags,
503 &number_of_captures,
504 &error_message,
505 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000506
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000507 if (code == NULL) {
508 // Throw an exception.
509 Handle<JSArray> array = Factory::NewJSArray(2);
510 SetElement(array, 0, pattern);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000511 const char* message =
512 (error_message == NULL) ? "Unknown regexp error" : error_message;
513 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000514 Handle<Object> regexp_err =
515 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000516 Top::Throw(*regexp_err);
517 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000518 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000519
520 // Convert the return address to a ByteArray pointer.
521 Handle<ByteArray> internal(
522 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
523
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000524 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
525 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
526 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000527 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
528
529 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000530}
531
532
ager@chromium.org8bb60582008-12-11 12:02:20 +0000533Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
534 Handle<String> subject,
535 Handle<Object> index) {
536 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
537 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
538 Handle<Object> compile_result = JscreCompile(regexp);
539 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000540 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000541 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000542
ager@chromium.org8bb60582008-12-11 12:02:20 +0000543 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000544
ager@chromium.org8bb60582008-12-11 12:02:20 +0000545 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000546
ager@chromium.org8bb60582008-12-11 12:02:20 +0000547 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000548
ager@chromium.org8bb60582008-12-11 12:02:20 +0000549 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000550
ager@chromium.org8bb60582008-12-11 12:02:20 +0000551 return JscreExecOnce(regexp,
552 num_captures,
553 subject,
554 previous_index,
555 subject16->GetTwoByteData(),
556 offsets.vector(),
557 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000558}
559
560
561Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
562 int num_captures,
563 Handle<String> subject,
564 int previous_index,
565 const uc16* two_byte_subject,
566 int* offsets_vector,
567 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000568 int rc;
569 {
570 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000571 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000572 const v8::jscre::JscreRegExp* js_regexp =
573 reinterpret_cast<v8::jscre::JscreRegExp*>(
574 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000575
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000576 rc = v8::jscre::jsRegExpExecute(js_regexp,
577 two_byte_subject,
578 subject->length(),
579 previous_index,
580 offsets_vector,
581 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000582 }
583
584 // The KJS JavaScript engine returns null (ie, a failed match) when
585 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000586 if (rc == v8::jscre::JSRegExpErrorNoMatch
587 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000588 return Factory::null_value();
589 }
590
591 // Other JSRE errors:
592 if (rc < 0) {
593 // Throw an exception.
594 Handle<Object> code(Smi::FromInt(rc));
595 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
596 Handle<Object> regexp_err(
597 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
598 return Handle<Object>(Top::Throw(*regexp_err));
599 }
600
ager@chromium.org7c537e22008-10-16 08:43:32 +0000601 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000602 // The captures come in (start, end+1) pairs.
603 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000604 array->set(i, Smi::FromInt(offsets_vector[i]));
605 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000606 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000607 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000608}
609
610
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000611Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
612 Handle<String> subject) {
613 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
614 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
615 Handle<Object> compile_result = JscreCompile(regexp);
616 if (compile_result.is_null()) return compile_result;
617 }
618 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
619
620 // Prepare space for the return values.
621 int num_captures = JscreNumberOfCaptures(regexp);
622
623 OffsetsVector offsets((num_captures + 1) * 3);
624
625 int previous_index = 0;
626
627 Handle<JSArray> result = Factory::NewJSArray(0);
628 int i = 0;
629 Handle<Object> matches;
630
631 Handle<String> subject16 = CachedStringToTwoByte(subject);
632
633 do {
634 if (previous_index > subject->length() || previous_index < 0) {
635 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
636 // string length, there is no match.
637 matches = Factory::null_value();
638 } else {
639 matches = JscreExecOnce(regexp,
640 num_captures,
641 subject,
642 previous_index,
643 subject16->GetTwoByteData(),
644 offsets.vector(),
645 offsets.length());
646
647 if (matches->IsJSArray()) {
648 SetElement(result, i, matches);
649 i++;
650 previous_index = offsets.vector()[1];
651 if (offsets.vector()[0] == offsets.vector()[1]) {
652 previous_index++;
653 }
654 }
655 }
656 } while (matches->IsJSArray());
657
658 // If we exited the loop with an exception, throw it.
659 if (matches->IsNull()) {
660 // Exited loop normally.
661 return result;
662 } else {
663 // Exited loop with the exception in matches.
664 return matches;
665 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000666}
667
668
ager@chromium.org8bb60582008-12-11 12:02:20 +0000669// Irregexp implementation.
670
671
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000672// Retrieves a compiled version of the regexp for either ASCII or non-ASCII
673// strings. If the compiled version doesn't already exist, it is compiled
674// from the source pattern.
675// Irregexp is not feature complete yet. If there is something in the
676// regexp that the compiler cannot currently handle, an empty
677// handle is returned, but no exception is thrown.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000678static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
679 bool is_ascii) {
680 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
681 Handle<FixedArray> alternatives(
682 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
683 ASSERT_EQ(2, alternatives->length());
684
685 int index = is_ascii ? 0 : 1;
686 Object* entry = alternatives->get(index);
687 if (!entry->IsNull()) {
688 return Handle<FixedArray>(FixedArray::cast(entry));
689 }
690
691 // Compile the RegExp.
692 ZoneScope zone_scope(DELETE_ON_EXIT);
693
694 JSRegExp::Flags flags = re->GetFlags();
695
696 Handle<String> pattern(re->Pattern());
697 StringShape shape(*pattern);
698 if (!pattern->IsFlat(shape)) {
699 pattern->Flatten(shape);
700 }
701
702 RegExpCompileData compile_data;
703 FlatStringReader reader(pattern);
704 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
705 // Throw an exception if we fail to parse the pattern.
706 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
707 ThrowRegExpException(re,
708 pattern,
709 compile_data.error,
710 "malformed_regexp");
711 return Handle<FixedArray>::null();
712 }
713 Handle<FixedArray> compiled_entry =
714 RegExpEngine::Compile(&compile_data,
715 flags.is_ignore_case(),
716 flags.is_multiline(),
717 pattern,
718 is_ascii);
719 if (!compiled_entry.is_null()) {
720 alternatives->set(index, *compiled_entry);
721 }
722 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000723}
724
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000725
ager@chromium.org8bb60582008-12-11 12:02:20 +0000726int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
727 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000728}
729
730
ager@chromium.org8bb60582008-12-11 12:02:20 +0000731int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
732 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000733}
734
735
ager@chromium.org8bb60582008-12-11 12:02:20 +0000736Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
737 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
738 == RegExpMacroAssembler::kBytecodeImplementation);
739 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000740}
741
742
ager@chromium.org8bb60582008-12-11 12:02:20 +0000743Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
744 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
745 != RegExpMacroAssembler::kBytecodeImplementation);
746 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
747}
748
749
750Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
751 Handle<String> pattern,
752 JSRegExp::Flags flags) {
753 // Make space for ASCII and UC16 versions.
754 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
755 alternatives->set_null(0);
756 alternatives->set_null(1);
757 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
758 return re;
759}
760
761
762Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
763 Handle<String> subject,
764 Handle<Object> index) {
765 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
766 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
767
768 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
769 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
770 if (irregexp.is_null()) {
771 // We can't handle the RegExp with IRRegExp.
772 return Handle<Object>::null();
773 }
774
775 // Prepare space for the return values.
776 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
777 OffsetsVector offsets(number_of_registers);
778
779 int num_captures = IrregexpNumberOfCaptures(irregexp);
780
781 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
782
783#ifdef DEBUG
784 if (FLAG_trace_regexp_bytecodes) {
785 String* pattern = regexp->Pattern();
786 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
787 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
788 }
789#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000790
791 if (!subject->IsFlat(StringShape(*subject))) {
792 FlattenString(subject);
793 }
794
ager@chromium.org8bb60582008-12-11 12:02:20 +0000795 return IrregexpExecOnce(irregexp,
796 num_captures,
797 subject,
798 previous_index,
799 offsets.vector(),
800 offsets.length());
801}
802
803
804Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
805 Handle<String> subject) {
806 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
807
808 StringShape shape(*subject);
809 bool is_ascii = shape.IsAsciiRepresentation();
810 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
811 if (irregexp.is_null()) {
812 return Handle<Object>::null();
813 }
814
815 // Prepare space for the return values.
816 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
817 OffsetsVector offsets(number_of_registers);
818
819 int previous_index = 0;
820
821 Handle<JSArray> result = Factory::NewJSArray(0);
822 int i = 0;
823 Handle<Object> matches;
824
825 if (!subject->IsFlat(shape)) {
826 subject->Flatten(shape);
827 }
828
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000829 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000830 if (previous_index > subject->length() || previous_index < 0) {
831 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
832 // string length, there is no match.
833 matches = Factory::null_value();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000834 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000835 } else {
836#ifdef DEBUG
837 if (FLAG_trace_regexp_bytecodes) {
838 String* pattern = regexp->Pattern();
839 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
840 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
841 }
842#endif
ager@chromium.org8bb60582008-12-11 12:02:20 +0000843 matches = IrregexpExecOnce(irregexp,
844 IrregexpNumberOfCaptures(irregexp),
845 subject,
846 previous_index,
847 offsets.vector(),
848 offsets.length());
849
850 if (matches->IsJSArray()) {
851 SetElement(result, i, matches);
852 i++;
853 previous_index = offsets.vector()[1];
854 if (offsets.vector()[0] == offsets.vector()[1]) {
855 previous_index++;
856 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000857 } else if (matches->IsNull()) {
858 return result;
859 } else {
860 return matches;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000861 }
862 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000863 }
864}
865
866
867Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
868 int num_captures,
869 Handle<String> subject,
870 int previous_index,
871 int* offsets_vector,
872 int offsets_vector_length) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000873 ASSERT(subject->IsFlat(StringShape(*subject)));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000874 bool rc;
875
876 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
877
ager@chromium.org8bb60582008-12-11 12:02:20 +0000878 switch (tag) {
879 case RegExpMacroAssembler::kIA32Implementation: {
880#ifndef ARM
881 Handle<Code> code = IrregexpNativeCode(irregexp);
882
883 StringShape shape(*subject);
884
885 // Character offsets into string.
886 int start_offset = previous_index;
887 int end_offset = subject->length(shape);
888
889 if (shape.IsCons()) {
890 subject = Handle<String>(ConsString::cast(*subject)->first());
891 } else if (shape.IsSliced()) {
892 SlicedString* slice = SlicedString::cast(*subject);
893 start_offset += slice->start();
894 end_offset += slice->start();
895 subject = Handle<String>(slice->buffer());
896 }
897
898 // String is now either Sequential or External
899 StringShape flatshape(*subject);
900 bool is_ascii = flatshape.IsAsciiRepresentation();
901 int char_size_shift = is_ascii ? 0 : 1;
902
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000903 RegExpMacroAssemblerIA32::Result res;
904
ager@chromium.org8bb60582008-12-11 12:02:20 +0000905 if (flatshape.IsExternal()) {
906 const byte* address;
907 if (is_ascii) {
908 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
909 address = reinterpret_cast<const byte*>(ext->resource()->data());
910 } else {
911 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
912 address = reinterpret_cast<const byte*>(ext->resource()->data());
913 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000914 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000915 *code,
916 &address,
917 start_offset << char_size_shift,
918 end_offset << char_size_shift,
919 offsets_vector,
920 previous_index == 0);
921 } else { // Sequential string
922 Address char_address =
923 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
924 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
925 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000926 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000927 *code,
928 subject.location(),
929 byte_offset + (start_offset << char_size_shift),
930 byte_offset + (end_offset << char_size_shift),
931 offsets_vector,
932 previous_index == 0);
933 }
934
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000935 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
936 ASSERT(Top::has_pending_exception());
937 return Handle<Object>::null();
938 }
939 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
940
ager@chromium.org8bb60582008-12-11 12:02:20 +0000941 if (rc) {
942 // Capture values are relative to start_offset only.
943 for (int i = 0; i < offsets_vector_length; i++) {
944 if (offsets_vector[i] >= 0) {
945 offsets_vector[i] += previous_index;
946 }
947 }
948 }
949 break;
950#else
951 UNIMPLEMENTED();
952 rc = false;
953 break;
954#endif
955 }
956 case RegExpMacroAssembler::kBytecodeImplementation: {
957 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
958 offsets_vector[i] = -1;
959 }
960 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
961
962 rc = IrregexpInterpreter::Match(byte_codes,
963 subject,
964 offsets_vector,
965 previous_index);
966 break;
967 }
968 case RegExpMacroAssembler::kARMImplementation:
969 default:
970 UNREACHABLE();
971 rc = false;
972 break;
973 }
974
975 if (!rc) {
976 return Factory::null_value();
977 }
978
979 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
980 // The captures come in (start, end+1) pairs.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000981 for (int i = 0; i < 2 * (num_captures + 1); i += 2) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000982 array->set(i, Smi::FromInt(offsets_vector[i]));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000983 array->set(i + 1, Smi::FromInt(offsets_vector[i + 1]));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000984 }
985 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000986}
987
988
989// -------------------------------------------------------------------
990// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000991//
992// The Irregexp regular expression engine is intended to be a complete
993// implementation of ECMAScript regular expressions. It generates either
994// bytecodes or native code.
995
996// The Irregexp regexp engine is structured in three steps.
997// 1) The parser generates an abstract syntax tree. See ast.cc.
998// 2) From the AST a node network is created. The nodes are all
999// subclasses of RegExpNode. The nodes represent states when
1000// executing a regular expression. Several optimizations are
1001// performed on the node network.
1002// 3) From the nodes we generate either byte codes or native code
1003// that can actually execute the regular expression (perform
1004// the search). The code generation step is described in more
1005// detail below.
1006
1007// Code generation.
1008//
1009// The nodes are divided into four main categories.
1010// * Choice nodes
1011// These represent places where the regular expression can
1012// match in more than one way. For example on entry to an
1013// alternation (foo|bar) or a repetition (*, +, ? or {}).
1014// * Action nodes
1015// These represent places where some action should be
1016// performed. Examples include recording the current position
1017// in the input string to a register (in order to implement
1018// captures) or other actions on register for example in order
1019// to implement the counters needed for {} repetitions.
1020// * Matching nodes
1021// These attempt to match some element part of the input string.
1022// Examples of elements include character classes, plain strings
1023// or back references.
1024// * End nodes
1025// These are used to implement the actions required on finding
1026// a successful match or failing to find a match.
1027//
1028// The code generated (whether as byte codes or native code) maintains
1029// some state as it runs. This consists of the following elements:
1030//
1031// * The capture registers. Used for string captures.
1032// * Other registers. Used for counters etc.
1033// * The current position.
1034// * The stack of backtracking information. Used when a matching node
1035// fails to find a match and needs to try an alternative.
1036//
1037// Conceptual regular expression execution model:
1038//
1039// There is a simple conceptual model of regular expression execution
1040// which will be presented first. The actual code generated is a more
1041// efficient simulation of the simple conceptual model:
1042//
1043// * Choice nodes are implemented as follows:
1044// For each choice except the last {
1045// push current position
1046// push backtrack code location
1047// <generate code to test for choice>
1048// backtrack code location:
1049// pop current position
1050// }
1051// <generate code to test for last choice>
1052//
1053// * Actions nodes are generated as follows
1054// <push affected registers on backtrack stack>
1055// <generate code to perform action>
1056// push backtrack code location
1057// <generate code to test for following nodes>
1058// backtrack code location:
1059// <pop affected registers to restore their state>
1060// <pop backtrack location from stack and go to it>
1061//
1062// * Matching nodes are generated as follows:
1063// if input string matches at current position
1064// update current position
1065// <generate code to test for following nodes>
1066// else
1067// <pop backtrack location from stack and go to it>
1068//
1069// Thus it can be seen that the current position is saved and restored
1070// by the choice nodes, whereas the registers are saved and restored by
1071// by the action nodes that manipulate them.
1072//
1073// The other interesting aspect of this model is that nodes are generated
1074// at the point where they are needed by a recursive call to Emit(). If
1075// the node has already been code generated then the Emit() call will
1076// generate a jump to the previously generated code instead. In order to
1077// limit recursion it is possible for the Emit() function to put the node
1078// on a work list for later generation and instead generate a jump. The
1079// destination of the jump is resolved later when the code is generated.
1080//
1081// Actual regular expression code generation.
1082//
1083// Code generation is actually more complicated than the above. In order
1084// to improve the efficiency of the generated code some optimizations are
1085// performed
1086//
1087// * Choice nodes have 1-character lookahead.
1088// A choice node looks at the following character and eliminates some of
1089// the choices immediately based on that character. This is not yet
1090// implemented.
1091// * Simple greedy loops store reduced backtracking information.
1092// A quantifier like /.*foo/m will greedily match the whole input. It will
1093// then need to backtrack to a point where it can match "foo". The naive
1094// implementation of this would push each character position onto the
1095// backtracking stack, then pop them off one by one. This would use space
1096// proportional to the length of the input string. However since the "."
1097// can only match in one way and always has a constant length (in this case
1098// of 1) it suffices to store the current position on the top of the stack
1099// once. Matching now becomes merely incrementing the current position and
1100// backtracking becomes decrementing the current position and checking the
1101// result against the stored current position. This is faster and saves
1102// space.
1103// * The current state is virtualized.
1104// This is used to defer expensive operations until it is clear that they
1105// are needed and to generate code for a node more than once, allowing
1106// specialized an efficient versions of the code to be created. This is
1107// explained in the section below.
1108//
1109// Execution state virtualization.
1110//
1111// Instead of emitting code, nodes that manipulate the state can record their
1112// manipulation in an object called the GenerationVariant. The
1113// GenerationVariant object can record a current position offset, an
1114// optional backtrack code location on the top of the virtualized backtrack
1115// stack and some register changes. When a node is to be emitted it can flush
1116// the GenerationVariant or update it. Flushing the GenerationVariant
1117// will emit code to bring the actual state into line with the virtual state.
1118// Avoiding flushing the state can postpone some work (eg updates of capture
1119// registers). Postponing work can save time when executing the regular
1120// expression since it may be found that the work never has to be done as a
1121// failure to match can occur. In addition it is much faster to jump to a
1122// known backtrack code location than it is to pop an unknown backtrack
1123// location from the stack and jump there.
1124//
1125// The virtual state found in the GenerationVariant affects code generation.
1126// For example the virtual state contains the difference between the actual
1127// current position and the virtual current position, and matching code needs
1128// to use this offset to attempt a match in the correct location of the input
1129// string. Therefore code generated for a non-trivial GenerationVariant is
1130// specialized to that GenerationVariant. The code generator therefore
1131// has the ability to generate code for each node several times. In order to
1132// limit the size of the generated code there is an arbitrary limit on how
1133// many specialized sets of code may be generated for a given node. If the
1134// limit is reached, the GenerationVariant is flushed and a generic version of
1135// the code for a node is emitted. This is subsequently used for that node.
1136// The code emitted for non-generic GenerationVariants is not recorded in the
1137// node and so it cannot currently be reused in the event that code generation
1138// is requested for an identical GenerationVariant.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001139
1140
1141void RegExpTree::AppendToText(RegExpText* text) {
1142 UNREACHABLE();
1143}
1144
1145
1146void RegExpAtom::AppendToText(RegExpText* text) {
1147 text->AddElement(TextElement::Atom(this));
1148}
1149
1150
1151void RegExpCharacterClass::AppendToText(RegExpText* text) {
1152 text->AddElement(TextElement::CharClass(this));
1153}
1154
1155
1156void RegExpText::AppendToText(RegExpText* text) {
1157 for (int i = 0; i < elements()->length(); i++)
1158 text->AddElement(elements()->at(i));
1159}
1160
1161
1162TextElement TextElement::Atom(RegExpAtom* atom) {
1163 TextElement result = TextElement(ATOM);
1164 result.data.u_atom = atom;
1165 return result;
1166}
1167
1168
1169TextElement TextElement::CharClass(
1170 RegExpCharacterClass* char_class) {
1171 TextElement result = TextElement(CHAR_CLASS);
1172 result.data.u_char_class = char_class;
1173 return result;
1174}
1175
1176
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001177int TextElement::length() {
1178 if (type == ATOM) {
1179 return data.u_atom->length();
1180 } else {
1181 ASSERT(type == CHAR_CLASS);
1182 return 1;
1183 }
1184}
1185
1186
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001187DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1188 if (table_ == NULL) {
1189 table_ = new DispatchTable();
1190 DispatchTableConstructor cons(table_, ignore_case);
1191 cons.BuildTable(this);
1192 }
1193 return table_;
1194}
1195
1196
1197class RegExpCompiler {
1198 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001199 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001200
1201 int AllocateRegister() { return next_register_++; }
1202
1203 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1204 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001205 int capture_count,
1206 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001207
1208 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1209
1210 static const int kImplementationOffset = 0;
1211 static const int kNumberOfRegistersOffset = 0;
1212 static const int kCodeOffset = 1;
1213
1214 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1215 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001216
1217 static const int kMaxRecursion = 100;
1218 inline int recursion_depth() { return recursion_depth_; }
1219 inline void IncrementRecursionDepth() { recursion_depth_++; }
1220 inline void DecrementRecursionDepth() { recursion_depth_--; }
1221
1222 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001223 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001224
1225 private:
1226 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001227 int next_register_;
1228 List<RegExpNode*>* work_list_;
1229 int recursion_depth_;
1230 RegExpMacroAssembler* macro_assembler_;
1231 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001232 bool ascii_;
1233};
1234
1235
1236class RecursionCheck {
1237 public:
1238 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1239 compiler->IncrementRecursionDepth();
1240 }
1241 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1242 private:
1243 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001244};
1245
1246
1247// Attempts to compile the regexp using an Irregexp code generator. Returns
1248// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001249RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001250 : next_register_(2 * (capture_count + 1)),
1251 work_list_(NULL),
1252 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001253 ignore_case_(ignore_case),
1254 ascii_(ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001255 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001256}
1257
1258
1259Handle<FixedArray> RegExpCompiler::Assemble(
1260 RegExpMacroAssembler* macro_assembler,
1261 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001262 int capture_count,
1263 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001264#ifdef DEBUG
1265 if (FLAG_trace_regexp_assembler)
1266 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1267 else
1268#endif
1269 macro_assembler_ = macro_assembler;
1270 List <RegExpNode*> work_list(0);
1271 work_list_ = &work_list;
1272 Label fail;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001273 macro_assembler->PushBacktrack(&fail);
1274 GenerationVariant generic_variant;
1275 if (!start->Emit(this, &generic_variant)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001276 fail.Unuse();
1277 return Handle<FixedArray>::null();
1278 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001279 macro_assembler_->Bind(&fail);
1280 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001281 while (!work_list.is_empty()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001282 if (!work_list.RemoveLast()->Emit(this, &generic_variant)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001283 return Handle<FixedArray>::null();
1284 }
1285 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001286 Handle<FixedArray> array =
1287 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1288 array->set(RegExpImpl::kIrregexpImplementationIndex,
1289 Smi::FromInt(macro_assembler_->Implementation()));
1290 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1291 Smi::FromInt(next_register_));
1292 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1293 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001294 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001295 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1296 work_list_ = NULL;
1297#ifdef DEBUG
1298 if (FLAG_trace_regexp_assembler) {
1299 delete macro_assembler_;
1300 }
1301#endif
1302 return array;
1303}
1304
1305
ager@chromium.org8bb60582008-12-11 12:02:20 +00001306bool GenerationVariant::mentions_reg(int reg) {
1307 for (DeferredAction* action = actions_;
1308 action != NULL;
1309 action = action->next()) {
1310 if (reg == action->reg()) return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001311 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001312 return false;
1313}
1314
1315
1316int GenerationVariant::FindAffectedRegisters(OutSet* affected_registers) {
1317 int max_register = -1;
1318 for (DeferredAction* action = actions_;
1319 action != NULL;
1320 action = action->next()) {
1321 affected_registers->Set(action->reg());
1322 if (action->reg() > max_register) max_register = action->reg();
1323 }
1324 return max_register;
1325}
1326
1327
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001328void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* assembler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001329 int max_register,
1330 OutSet& affected_registers) {
1331 for (int reg = 0; reg <= max_register; reg++) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001332 if (affected_registers.Get(reg)) assembler->PushRegister(reg);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001333 }
1334}
1335
1336
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001337void GenerationVariant::RestoreAffectedRegisters(
1338 RegExpMacroAssembler* assembler,
1339 int max_register,
1340 OutSet& affected_registers) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001341 for (int reg = max_register; reg >= 0; reg--) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001342 if (affected_registers.Get(reg)) assembler->PopRegister(reg);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001343 }
1344}
1345
1346
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001347void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* assembler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001348 int max_register,
1349 OutSet& affected_registers) {
1350 for (int reg = 0; reg <= max_register; reg++) {
1351 if (!affected_registers.Get(reg)) {
1352 continue;
1353 }
1354 int value = 0;
1355 bool absolute = false;
1356 int store_position = -1;
1357 // This is a little tricky because we are scanning the actions in reverse
1358 // historical order (newest first).
1359 for (DeferredAction* action = actions_;
1360 action != NULL;
1361 action = action->next()) {
1362 if (action->reg() == reg) {
1363 switch (action->type()) {
1364 case ActionNode::SET_REGISTER: {
1365 GenerationVariant::DeferredSetRegister* psr =
1366 static_cast<GenerationVariant::DeferredSetRegister*>(action);
1367 value += psr->value();
1368 absolute = true;
1369 ASSERT_EQ(store_position, -1);
1370 break;
1371 }
1372 case ActionNode::INCREMENT_REGISTER:
1373 if (!absolute) {
1374 value++;
1375 }
1376 ASSERT_EQ(store_position, -1);
1377 break;
1378 case ActionNode::STORE_POSITION: {
1379 GenerationVariant::DeferredCapture* pc =
1380 static_cast<GenerationVariant::DeferredCapture*>(action);
1381 if (store_position == -1) {
1382 store_position = pc->cp_offset();
1383 }
1384 ASSERT(!absolute);
1385 ASSERT_EQ(value, 0);
1386 break;
1387 }
1388 default:
1389 UNREACHABLE();
1390 break;
1391 }
1392 }
1393 }
1394 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001395 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001396 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001397 if (absolute) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001398 assembler->SetRegister(reg, value);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001399 } else {
1400 if (value != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001401 assembler->AdvanceRegister(reg, value);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001402 }
1403 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001404 }
1405 }
1406}
1407
1408
ager@chromium.org8bb60582008-12-11 12:02:20 +00001409// This is called as we come into a loop choice node and some other tricky
1410// nodes. It normalises the state of the code generator to ensure we can
1411// generate generic code.
1412bool GenerationVariant::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001413 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001414
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001415 ASSERT(actions_ != NULL ||
1416 cp_offset_ != 0 ||
1417 backtrack() != NULL ||
1418 characters_preloaded_ != 0 ||
1419 quick_check_performed_.characters() != 0 ||
1420 bound_checked_up_to_ != 0);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001421
1422 if (actions_ == NULL && backtrack() == NULL) {
1423 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001424 // a normal situation. We may also have to forget some information gained
1425 // through a quick check that was already performed.
1426 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001427 // Create a new trivial state and generate the node with that.
1428 GenerationVariant new_state;
1429 return successor->Emit(compiler, &new_state);
1430 }
1431
1432 // Generate deferred actions here along with code to undo them again.
1433 OutSet affected_registers;
1434 int max_register = FindAffectedRegisters(&affected_registers);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001435 PushAffectedRegisters(assembler, max_register, affected_registers);
1436 PerformDeferredActions(assembler, max_register, affected_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001437 if (backtrack() != NULL) {
1438 // Here we have a concrete backtrack location. These are set up by choice
1439 // nodes and so they indicate that we have a deferred save of the current
1440 // position which we may need to emit here.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001441 assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001442 }
1443 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001444 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001445 }
1446
1447 // Create a new trivial state and generate the node with that.
1448 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001449 assembler->PushBacktrack(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001450 GenerationVariant new_state;
1451 bool ok = successor->Emit(compiler, &new_state);
1452
1453 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001454 assembler->Bind(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001455 if (!ok) return false;
1456 if (backtrack() != NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001457 assembler->PopCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001458 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001459 RestoreAffectedRegisters(assembler, max_register, affected_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001460 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001461 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001462 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001463 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001464 }
1465
1466 return true;
1467}
1468
1469
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001470void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001471 GenerationVariant* variant) {
1472 if (info()->at_end) {
1473 Label succeed;
1474 // LoadCurrentCharacter will go to the label if we are at the end of the
1475 // input string.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001476 assembler->LoadCurrentCharacter(0, &succeed);
1477 assembler->GoTo(variant->backtrack());
1478 assembler->Bind(&succeed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001479 }
1480}
1481
1482
1483bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler,
1484 GenerationVariant* variant) {
1485 if (!variant->is_trivial()) {
1486 return variant->Flush(compiler, this);
1487 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001488 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001489 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001490 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001491 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001492 EmitInfoChecks(assembler, variant);
1493 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1494 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001495 // Now that we have unwound the stack we find at the top of the stack the
1496 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001497 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001498 return true;
1499}
1500
1501
1502bool EndNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
1503 if (!variant->is_trivial()) {
1504 return variant->Flush(compiler, this);
1505 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001506 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001507 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001508 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001509 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001510 switch (action_) {
1511 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001512 EmitInfoChecks(assembler, variant);
1513 assembler->Succeed();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001514 return true;
1515 case BACKTRACK:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001516 ASSERT(!info()->at_end);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001517 assembler->GoTo(variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001518 return true;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001519 case NEGATIVE_SUBMATCH_SUCCESS:
1520 // This case is handled in a different virtual method.
1521 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001522 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001523 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001524 return false;
1525}
1526
1527
1528void GuardedAlternative::AddGuard(Guard* guard) {
1529 if (guards_ == NULL)
1530 guards_ = new ZoneList<Guard*>(1);
1531 guards_->Add(guard);
1532}
1533
1534
ager@chromium.org8bb60582008-12-11 12:02:20 +00001535ActionNode* ActionNode::SetRegister(int reg,
1536 int val,
1537 RegExpNode* on_success) {
1538 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001539 result->data_.u_store_register.reg = reg;
1540 result->data_.u_store_register.value = val;
1541 return result;
1542}
1543
1544
1545ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1546 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1547 result->data_.u_increment_register.reg = reg;
1548 return result;
1549}
1550
1551
1552ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1553 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1554 result->data_.u_position_register.reg = reg;
1555 return result;
1556}
1557
1558
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001559ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1560 int position_reg,
1561 RegExpNode* on_success) {
1562 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1563 result->data_.u_submatch.stack_pointer_register = stack_reg;
1564 result->data_.u_submatch.current_position_register = position_reg;
1565 return result;
1566}
1567
1568
ager@chromium.org8bb60582008-12-11 12:02:20 +00001569ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1570 int position_reg,
1571 RegExpNode* on_success) {
1572 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001573 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001574 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001575 return result;
1576}
1577
1578
1579#define DEFINE_ACCEPT(Type) \
1580 void Type##Node::Accept(NodeVisitor* visitor) { \
1581 visitor->Visit##Type(this); \
1582 }
1583FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1584#undef DEFINE_ACCEPT
1585
1586
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001587void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1588 visitor->VisitLoopChoice(this);
1589}
1590
1591
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001592// -------------------------------------------------------------------
1593// Emit code.
1594
1595
1596void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1597 Guard* guard,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001598 GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001599 switch (guard->op()) {
1600 case Guard::LT:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001601 ASSERT(!variant->mentions_reg(guard->reg()));
1602 macro_assembler->IfRegisterGE(guard->reg(),
1603 guard->value(),
1604 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001605 break;
1606 case Guard::GEQ:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001607 ASSERT(!variant->mentions_reg(guard->reg()));
1608 macro_assembler->IfRegisterLT(guard->reg(),
1609 guard->value(),
1610 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001611 break;
1612 }
1613}
1614
1615
1616static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1617static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1618
1619
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001620// Only emits non-letters (things that don't have case). Only used for case
1621// independent matches.
1622static inline bool EmitAtomNonLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001623 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001624 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001625 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001626 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001627 bool check,
1628 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001629 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001630 int length = uncanonicalize.get(c, '\0', chars);
1631 bool checked = false;
1632 if (length <= 1) {
1633 if (!preloaded) {
1634 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1635 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001636 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001637 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001638 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001639 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001640}
1641
1642
1643static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001644 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001645 uc16 c1,
1646 uc16 c2,
1647 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001648 uc16 char_mask;
1649 if (ascii) {
1650 char_mask = String::kMaxAsciiCharCode;
1651 } else {
1652 char_mask = String::kMaxUC16CharCode;
1653 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001654 uc16 exor = c1 ^ c2;
1655 // Check whether exor has only one bit set.
1656 if (((exor - 1) & exor) == 0) {
1657 // If c1 and c2 differ only by one bit.
1658 // Ecma262UnCanonicalize always gives the highest number last.
1659 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001660 uc16 mask = char_mask ^ exor;
1661 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001662 return true;
1663 }
1664 ASSERT(c2 > c1);
1665 uc16 diff = c2 - c1;
1666 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1667 // If the characters differ by 2^n but don't differ by one bit then
1668 // subtract the difference from the found character, then do the or
1669 // trick. We avoid the theoretical case where negative numbers are
1670 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001671 uc16 mask = char_mask ^ diff;
1672 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1673 diff,
1674 mask,
1675 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001676 return true;
1677 }
1678 return false;
1679}
1680
1681
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001682// Only emits letters (things that have case). Only used for case independent
1683// matches.
1684static inline bool EmitAtomLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001685 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001686 bool ascii,
1687 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001688 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001689 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001690 bool check,
1691 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001692 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001693 int length = uncanonicalize.get(c, '\0', chars);
1694 if (length <= 1) return false;
1695 // We may not need to check against the end of the input string
1696 // if this character lies before a character that matched.
1697 if (!preloaded) {
1698 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001699 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001700 Label ok;
1701 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1702 switch (length) {
1703 case 2: {
1704 if (ShortCutEmitCharacterPair(macro_assembler,
1705 ascii,
1706 chars[0],
1707 chars[1],
1708 on_failure)) {
1709 } else {
1710 macro_assembler->CheckCharacter(chars[0], &ok);
1711 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1712 macro_assembler->Bind(&ok);
1713 }
1714 break;
1715 }
1716 case 4:
1717 macro_assembler->CheckCharacter(chars[3], &ok);
1718 // Fall through!
1719 case 3:
1720 macro_assembler->CheckCharacter(chars[0], &ok);
1721 macro_assembler->CheckCharacter(chars[1], &ok);
1722 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1723 macro_assembler->Bind(&ok);
1724 break;
1725 default:
1726 UNREACHABLE();
1727 break;
1728 }
1729 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001730}
1731
1732
1733static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1734 RegExpCharacterClass* cc,
1735 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001736 Label* on_failure,
1737 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001738 bool ascii,
1739 bool preloaded) {
1740 if (cc->is_standard() &&
1741 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1742 cp_offset,
1743 check_offset,
1744 on_failure)) {
1745 return;
1746 }
1747
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001748 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001749 int max_char;
1750 if (ascii) {
1751 max_char = String::kMaxAsciiCharCode;
1752 } else {
1753 max_char = String::kMaxUC16CharCode;
1754 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001755
1756 Label success;
1757
1758 Label* char_is_in_class =
1759 cc->is_negated() ? on_failure : &success;
1760
1761 int range_count = ranges->length();
1762
ager@chromium.org8bb60582008-12-11 12:02:20 +00001763 int last_valid_range = range_count - 1;
1764 while (last_valid_range >= 0) {
1765 CharacterRange& range = ranges->at(last_valid_range);
1766 if (range.from() <= max_char) {
1767 break;
1768 }
1769 last_valid_range--;
1770 }
1771
1772 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001773 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001774 // TODO(plesner): We can remove this when the node level does our
1775 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001776 macro_assembler->GoTo(on_failure);
1777 }
1778 return;
1779 }
1780
ager@chromium.org8bb60582008-12-11 12:02:20 +00001781 if (last_valid_range == 0 &&
1782 !cc->is_negated() &&
1783 ranges->at(0).IsEverything(max_char)) {
1784 // This is a common case hit by non-anchored expressions.
1785 // TODO(erikcorry): We should have a macro assembler instruction that just
1786 // checks for end of string without loading the character.
1787 if (check_offset) {
1788 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1789 }
1790 return;
1791 }
1792
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001793 if (!preloaded) {
1794 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001795 }
1796
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001797 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001798 CharacterRange& range = ranges->at(i);
1799 Label next_range;
1800 uc16 from = range.from();
1801 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001802 if (from > max_char) {
1803 continue;
1804 }
1805 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001806 if (to == from) {
1807 macro_assembler->CheckCharacter(to, char_is_in_class);
1808 } else {
1809 if (from != 0) {
1810 macro_assembler->CheckCharacterLT(from, &next_range);
1811 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001812 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001813 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1814 } else {
1815 macro_assembler->GoTo(char_is_in_class);
1816 }
1817 }
1818 macro_assembler->Bind(&next_range);
1819 }
1820
ager@chromium.org8bb60582008-12-11 12:02:20 +00001821 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001822 uc16 from = range.from();
1823 uc16 to = range.to();
1824
ager@chromium.org8bb60582008-12-11 12:02:20 +00001825 if (to > max_char) to = max_char;
1826 ASSERT(to >= from);
1827
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001828 if (to == from) {
1829 if (cc->is_negated()) {
1830 macro_assembler->CheckCharacter(to, on_failure);
1831 } else {
1832 macro_assembler->CheckNotCharacter(to, on_failure);
1833 }
1834 } else {
1835 if (from != 0) {
1836 if (cc->is_negated()) {
1837 macro_assembler->CheckCharacterLT(from, &success);
1838 } else {
1839 macro_assembler->CheckCharacterLT(from, on_failure);
1840 }
1841 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001842 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001843 if (cc->is_negated()) {
1844 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1845 } else {
1846 macro_assembler->CheckCharacterGT(to, on_failure);
1847 }
1848 } else {
1849 if (cc->is_negated()) {
1850 macro_assembler->GoTo(on_failure);
1851 }
1852 }
1853 }
1854 macro_assembler->Bind(&success);
1855}
1856
1857
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001858RegExpNode::~RegExpNode() {
1859}
1860
1861
ager@chromium.org8bb60582008-12-11 12:02:20 +00001862RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1863 GenerationVariant* variant) {
1864 // TODO(erikcorry): Implement support.
1865 if (info_.follows_word_interest ||
1866 info_.follows_newline_interest ||
1867 info_.follows_start_interest) {
1868 return FAIL;
1869 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001870
ager@chromium.org8bb60582008-12-11 12:02:20 +00001871 // If we are generating a greedy loop then don't stop and don't reuse code.
1872 if (variant->stop_node() != NULL) {
1873 return CONTINUE;
1874 }
1875
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001876 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001877 if (variant->is_trivial()) {
1878 if (label_.is_bound()) {
1879 // We are being asked to generate a generic version, but that's already
1880 // been done so just go to it.
1881 macro_assembler->GoTo(&label_);
1882 return DONE;
1883 }
1884 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1885 // To avoid too deep recursion we push the node to the work queue and just
1886 // generate a goto here.
1887 compiler->AddWork(this);
1888 macro_assembler->GoTo(&label_);
1889 return DONE;
1890 }
1891 // Generate generic version of the node and bind the label for later use.
1892 macro_assembler->Bind(&label_);
1893 return CONTINUE;
1894 }
1895
1896 // We are being asked to make a non-generic version. Keep track of how many
1897 // non-generic versions we generate so as not to overdo it.
1898 variants_generated_++;
1899 if (variants_generated_ < kMaxVariantsGenerated &&
1900 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1901 return CONTINUE;
1902 }
1903
1904 // If we get here there have been too many variants generated or recursion
1905 // is too deep. Time to switch to a generic version. The code for
1906 // generic versions above can handle deep recursion properly.
1907 bool ok = variant->Flush(compiler, this);
1908 return ok ? DONE : FAIL;
1909}
1910
1911
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001912int ActionNode::EatsAtLeast(int recursion_depth) {
1913 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1914 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1915 return on_success()->EatsAtLeast(recursion_depth + 1);
1916}
1917
1918
1919int TextNode::EatsAtLeast(int recursion_depth) {
1920 int answer = Length();
1921 if (answer >= 4) return answer;
1922 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1923 return answer + on_success()->EatsAtLeast(recursion_depth + 1);
1924}
1925
1926
1927int ChoiceNode::EatsAtLeastHelper(int recursion_depth,
1928 RegExpNode* ignore_this_node) {
1929 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1930 int min = 100;
1931 int choice_count = alternatives_->length();
1932 for (int i = 0; i < choice_count; i++) {
1933 RegExpNode* node = alternatives_->at(i).node();
1934 if (node == ignore_this_node) continue;
1935 int node_eats_at_least = node->EatsAtLeast(recursion_depth + 1);
1936 if (node_eats_at_least < min) min = node_eats_at_least;
1937 }
1938 return min;
1939}
1940
1941
1942int LoopChoiceNode::EatsAtLeast(int recursion_depth) {
1943 return EatsAtLeastHelper(recursion_depth, loop_node_);
1944}
1945
1946
1947int ChoiceNode::EatsAtLeast(int recursion_depth) {
1948 return EatsAtLeastHelper(recursion_depth, NULL);
1949}
1950
1951
1952// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1953static inline uint32_t SmearBitsRight(uint32_t v) {
1954 v |= v >> 1;
1955 v |= v >> 2;
1956 v |= v >> 4;
1957 v |= v >> 8;
1958 v |= v >> 16;
1959 return v;
1960}
1961
1962
1963bool QuickCheckDetails::Rationalize(bool asc) {
1964 bool found_useful_op = false;
1965 uint32_t char_mask;
1966 if (asc) {
1967 char_mask = String::kMaxAsciiCharCode;
1968 } else {
1969 char_mask = String::kMaxUC16CharCode;
1970 }
1971 mask_ = 0;
1972 value_ = 0;
1973 int char_shift = 0;
1974 for (int i = 0; i < characters_; i++) {
1975 Position* pos = &positions_[i];
1976 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1977 found_useful_op = true;
1978 }
1979 mask_ |= (pos->mask & char_mask) << char_shift;
1980 value_ |= (pos->value & char_mask) << char_shift;
1981 char_shift += asc ? 8 : 16;
1982 }
1983 return found_useful_op;
1984}
1985
1986
1987bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1988 GenerationVariant* variant,
1989 bool preload_has_checked_bounds,
1990 Label* on_possible_success,
1991 QuickCheckDetails* details,
1992 bool fall_through_on_failure) {
1993 if (details->characters() == 0) return false;
1994 GetQuickCheckDetails(details, compiler, 0);
1995 if (!details->Rationalize(compiler->ascii())) return false;
1996 uint32_t mask = details->mask();
1997 uint32_t value = details->value();
1998
1999 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2000
2001 if (variant->characters_preloaded() != details->characters()) {
2002 assembler->LoadCurrentCharacter(variant->cp_offset(),
2003 variant->backtrack(),
2004 !preload_has_checked_bounds,
2005 details->characters());
2006 }
2007
2008
2009 bool need_mask = true;
2010
2011 if (details->characters() == 1) {
2012 // If number of characters preloaded is 1 then we used a byte or 16 bit
2013 // load so the value is already masked down.
2014 uint32_t char_mask;
2015 if (compiler->ascii()) {
2016 char_mask = String::kMaxAsciiCharCode;
2017 } else {
2018 char_mask = String::kMaxUC16CharCode;
2019 }
2020 if ((mask & char_mask) == char_mask) need_mask = false;
2021 mask &= char_mask;
2022 } else {
2023 // For 2-character preloads in ASCII mode we also use a 16 bit load with
2024 // zero extend.
2025 if (details->characters() == 2 && compiler->ascii()) {
2026 if ((mask & 0xffff) == 0xffff) need_mask = false;
2027 } else {
2028 if (mask == 0xffffffff) need_mask = false;
2029 }
2030 }
2031
2032 if (fall_through_on_failure) {
2033 if (need_mask) {
2034 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2035 } else {
2036 assembler->CheckCharacter(value, on_possible_success);
2037 }
2038 } else {
2039 if (need_mask) {
2040 assembler->CheckNotCharacterAfterAnd(value, mask, variant->backtrack());
2041 } else {
2042 assembler->CheckNotCharacter(value, variant->backtrack());
2043 }
2044 }
2045 return true;
2046}
2047
2048
2049// Here is the meat of GetQuickCheckDetails (see also the comment on the
2050// super-class in the .h file).
2051//
2052// We iterate along the text object, building up for each character a
2053// mask and value that can be used to test for a quick failure to match.
2054// The masks and values for the positions will be combined into a single
2055// machine word for the current character width in order to be used in
2056// generating a quick check.
2057void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2058 RegExpCompiler* compiler,
2059 int characters_filled_in) {
2060 ASSERT(characters_filled_in < details->characters());
2061 int characters = details->characters();
2062 int char_mask;
2063 int char_shift;
2064 if (compiler->ascii()) {
2065 char_mask = String::kMaxAsciiCharCode;
2066 char_shift = 8;
2067 } else {
2068 char_mask = String::kMaxUC16CharCode;
2069 char_shift = 16;
2070 }
2071 for (int k = 0; k < elms_->length(); k++) {
2072 TextElement elm = elms_->at(k);
2073 if (elm.type == TextElement::ATOM) {
2074 Vector<const uc16> quarks = elm.data.u_atom->data();
2075 for (int i = 0; i < characters && i < quarks.length(); i++) {
2076 QuickCheckDetails::Position* pos =
2077 details->positions(characters_filled_in);
2078 if (compiler->ignore_case()) {
2079 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2080 uc16 c = quarks[i];
2081 int length = uncanonicalize.get(c, '\0', chars);
2082 if (length < 2) {
2083 // This letter has no case equivalents, so it's nice and simple
2084 // and the mask-compare will determine definitely whether we have
2085 // a match at this character position.
2086 pos->mask = char_mask;
2087 pos->value = c;
2088 pos->determines_perfectly = true;
2089 } else {
2090 uint32_t common_bits = char_mask;
2091 uint32_t bits = chars[0];
2092 for (int j = 1; j < length; j++) {
2093 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2094 common_bits ^= differing_bits;
2095 bits &= common_bits;
2096 }
2097 // If length is 2 and common bits has only one zero in it then
2098 // our mask and compare instruction will determine definitely
2099 // whether we have a match at this character position. Otherwise
2100 // it can only be an approximate check.
2101 uint32_t one_zero = (common_bits | ~char_mask);
2102 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2103 pos->determines_perfectly = true;
2104 }
2105 pos->mask = common_bits;
2106 pos->value = bits;
2107 }
2108 } else {
2109 // Don't ignore case. Nice simple case where the mask-compare will
2110 // determine definitely whether we have a match at this character
2111 // position.
2112 pos->mask = char_mask;
2113 pos->value = quarks[i];
2114 pos->determines_perfectly = true;
2115 }
2116 characters_filled_in++;
2117 ASSERT(characters_filled_in <= details->characters());
2118 if (characters_filled_in == details->characters()) {
2119 return;
2120 }
2121 }
2122 } else {
2123 QuickCheckDetails::Position* pos =
2124 details->positions(characters_filled_in);
2125 RegExpCharacterClass* tree = elm.data.u_char_class;
2126 ZoneList<CharacterRange>* ranges = tree->ranges();
2127 CharacterRange range = ranges->at(0);
2128 if (tree->is_negated()) {
2129 // A quick check uses multi-character mask and compare. There is no
2130 // useful way to incorporate a negative char class into this scheme
2131 // so we just conservatively create a mask and value that will always
2132 // succeed.
2133 pos->mask = 0;
2134 pos->value = 0;
2135 } else {
2136 uint32_t differing_bits = (range.from() ^ range.to());
2137 // A mask and compare is only perfect if the differing bits form a
2138 // number like 00011111 with one single block of trailing 1s.
2139 if ((differing_bits & (differing_bits + 1)) == 0) {
2140 pos->determines_perfectly = true;
2141 }
2142 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2143 uint32_t bits = (range.from() & common_bits);
2144 for (int i = 1; i < ranges->length(); i++) {
2145 // Here we are combining more ranges into the mask and compare
2146 // value. With each new range the mask becomes more sparse and
2147 // so the chances of a false positive rise. A character class
2148 // with multiple ranges is assumed never to be equivalent to a
2149 // mask and compare operation.
2150 pos->determines_perfectly = false;
2151 CharacterRange range = ranges->at(i);
2152 uint32_t new_common_bits = (range.from() ^ range.to());
2153 new_common_bits = ~SmearBitsRight(new_common_bits);
2154 common_bits &= new_common_bits;
2155 bits &= new_common_bits;
2156 uint32_t differing_bits = (range.from() & common_bits) ^ bits;
2157 common_bits ^= differing_bits;
2158 bits &= common_bits;
2159 }
2160 pos->mask = common_bits;
2161 pos->value = bits;
2162 }
2163 characters_filled_in++;
2164 ASSERT(characters_filled_in <= details->characters());
2165 if (characters_filled_in == details->characters()) {
2166 return;
2167 }
2168 }
2169 }
2170 ASSERT(characters_filled_in != details->characters());
2171 on_success()-> GetQuickCheckDetails(details, compiler, characters_filled_in);
2172}
2173
2174
2175void QuickCheckDetails::Clear() {
2176 for (int i = 0; i < characters_; i++) {
2177 positions_[i].mask = 0;
2178 positions_[i].value = 0;
2179 positions_[i].determines_perfectly = false;
2180 }
2181 characters_ = 0;
2182}
2183
2184
2185void QuickCheckDetails::Advance(int by, bool ascii) {
2186 ASSERT(by > 0);
2187 if (by >= characters_) {
2188 Clear();
2189 return;
2190 }
2191 for (int i = 0; i < characters_ - by; i++) {
2192 positions_[i] = positions_[by + i];
2193 }
2194 for (int i = characters_ - by; i < characters_; i++) {
2195 positions_[i].mask = 0;
2196 positions_[i].value = 0;
2197 positions_[i].determines_perfectly = false;
2198 }
2199 characters_ -= by;
2200 // We could change mask_ and value_ here but we would never advance unless
2201 // they had already been used in a check and they won't be used again because
2202 // it would gain us nothing. So there's no point.
2203}
2204
2205
2206void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2207 ASSERT(characters_ == other->characters_);
2208 for (int i = from_index; i < characters_; i++) {
2209 QuickCheckDetails::Position* pos = positions(i);
2210 QuickCheckDetails::Position* other_pos = other->positions(i);
2211 if (pos->mask != other_pos->mask ||
2212 pos->value != other_pos->value ||
2213 !other_pos->determines_perfectly) {
2214 // Our mask-compare operation will be approximate unless we have the
2215 // exact same operation on both sides of the alternation.
2216 pos->determines_perfectly = false;
2217 }
2218 pos->mask &= other_pos->mask;
2219 pos->value &= pos->mask;
2220 other_pos->value &= pos->mask;
2221 uc16 differing_bits = (pos->value ^ other_pos->value);
2222 pos->mask &= ~differing_bits;
2223 pos->value &= pos->mask;
2224 }
2225}
2226
2227
2228void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2229 RegExpCompiler* compiler,
2230 int characters_filled_in) {
2231 if (body_can_be_zero_length_) return;
2232 return ChoiceNode::GetQuickCheckDetails(details,
2233 compiler,
2234 characters_filled_in);
2235}
2236
2237
2238void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2239 RegExpCompiler* compiler,
2240 int characters_filled_in) {
2241 int choice_count = alternatives_->length();
2242 ASSERT(choice_count > 0);
2243 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2244 compiler,
2245 characters_filled_in);
2246 for (int i = 1; i < choice_count; i++) {
2247 QuickCheckDetails new_details(details->characters());
2248 RegExpNode* node = alternatives_->at(i).node();
2249 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in);
2250 // Here we merge the quick match details of the two branches.
2251 details->Merge(&new_details, characters_filled_in);
2252 }
2253}
2254
2255
2256// We call this repeatedly to generate code for each pass over the text node.
2257// The passes are in increasing order of difficulty because we hope one
2258// of the first passes will fail in which case we are saved the work of the
2259// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2260// we will check the '%' in the first pass, the case independent 'a' in the
2261// second pass and the character class in the last pass.
2262//
2263// The passes are done from right to left, so for example to test for /bar/
2264// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2265// and then a 'b' with offset 0. This means we can avoid the end-of-input
2266// bounds check most of the time. In the example we only need to check for
2267// end-of-input when loading the putative 'r'.
2268//
2269// A slight complication involves the fact that the first character may already
2270// be fetched into a register by the previous node. In this case we want to
2271// do the test for that character first. We do this in separate passes. The
2272// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2273// pass has been performed then subsequent passes will have true in
2274// first_element_checked to indicate that that character does not need to be
2275// checked again.
2276//
2277// In addition to all this we are passed a GenerationVariant, which can
2278// contain an AlternativeGeneration object. In this AlternativeGeneration
2279// object we can see details of any quick check that was already passed in
2280// order to get to the code we are now generating. The quick check can involve
2281// loading characters, which means we do not need to recheck the bounds
2282// up to the limit the quick check already checked. In addition the quick
2283// check can have involved a mask and compare operation which may simplify
2284// or obviate the need for further checks at some character positions.
2285void TextNode::TextEmitPass(RegExpCompiler* compiler,
2286 TextEmitPassType pass,
2287 bool preloaded,
2288 GenerationVariant* variant,
2289 bool first_element_checked,
2290 int* checked_up_to) {
2291 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2292 bool ascii = compiler->ascii();
2293 Label* backtrack = variant->backtrack();
2294 QuickCheckDetails* quick_check = variant->quick_check_performed();
2295 int element_count = elms_->length();
2296 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2297 TextElement elm = elms_->at(i);
2298 int cp_offset = variant->cp_offset() + elm.cp_offset;
2299 if (elm.type == TextElement::ATOM) {
2300 if (pass == NON_ASCII_MATCH ||
2301 pass == CHARACTER_MATCH ||
2302 pass == CASE_CHARACTER_MATCH) {
2303 Vector<const uc16> quarks = elm.data.u_atom->data();
2304 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2305 bool bound_checked = true; // Most ops will check their bounds.
2306 if (first_element_checked && i == 0 && j == 0) continue;
2307 if (quick_check != NULL &&
2308 elm.cp_offset + j < quick_check->characters() &&
2309 quick_check->positions(elm.cp_offset + j)->determines_perfectly) {
2310 continue;
2311 }
2312 if (pass == NON_ASCII_MATCH) {
2313 ASSERT(ascii);
2314 if (quarks[j] > String::kMaxAsciiCharCode) {
2315 assembler->GoTo(backtrack);
2316 return;
2317 }
2318 } else if (pass == CHARACTER_MATCH) {
2319 if (compiler->ignore_case()) {
2320 bound_checked = EmitAtomNonLetter(assembler,
2321 quarks[j],
2322 backtrack,
2323 cp_offset + j,
2324 *checked_up_to < cp_offset + j,
2325 preloaded);
2326 } else {
2327 if (!preloaded) {
2328 assembler->LoadCurrentCharacter(cp_offset + j,
2329 backtrack,
2330 *checked_up_to < cp_offset + j);
2331 }
2332 assembler->CheckNotCharacter(quarks[j], backtrack);
2333 }
2334 } else {
2335 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
2336 ASSERT(compiler->ignore_case());
2337 bound_checked = EmitAtomLetter(assembler,
2338 compiler->ascii(),
2339 quarks[j],
2340 backtrack,
2341 cp_offset + j,
2342 *checked_up_to < cp_offset + j,
2343 preloaded);
2344 }
2345 if (pass != NON_ASCII_MATCH && bound_checked) {
2346 if (cp_offset + j > *checked_up_to) {
2347 *checked_up_to = cp_offset + j;
2348 }
2349 }
2350 }
2351 }
2352 } else {
2353 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2354 if (first_element_checked && i == 0) continue;
2355 if (quick_check != NULL &&
2356 elm.cp_offset < quick_check->characters() &&
2357 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2358 continue;
2359 }
2360 if (pass == CHARACTER_CLASS_MATCH) {
2361 RegExpCharacterClass* cc = elm.data.u_char_class;
2362 EmitCharClass(assembler,
2363 cc,
2364 cp_offset,
2365 backtrack,
2366 *checked_up_to < cp_offset,
2367 ascii,
2368 preloaded);
2369 if (cp_offset > *checked_up_to) {
2370 *checked_up_to = cp_offset;
2371 }
2372 }
2373 }
2374 }
2375}
2376
2377
2378int TextNode::Length() {
2379 TextElement elm = elms_->last();
2380 ASSERT(elm.cp_offset >= 0);
2381 if (elm.type == TextElement::ATOM) {
2382 return elm.cp_offset + elm.data.u_atom->data().length();
2383 } else {
2384 return elm.cp_offset + 1;
2385 }
2386}
2387
2388
ager@chromium.org8bb60582008-12-11 12:02:20 +00002389// This generates the code to match a text node. A text node can contain
2390// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002391// way) and character classes. For efficiency we do not do this in a single
2392// pass from left to right. Instead we pass over the text node several times,
2393// emitting code for some character positions every time. See the comment on
2394// TextEmitPass for details.
ager@chromium.org8bb60582008-12-11 12:02:20 +00002395bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002396 LimitResult limit_result = LimitVersions(compiler, variant);
2397 if (limit_result == FAIL) return false;
2398 if (limit_result == DONE) return true;
2399 ASSERT(limit_result == CONTINUE);
2400
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002401 if (info()->follows_word_interest ||
2402 info()->follows_newline_interest ||
2403 info()->follows_start_interest) {
2404 return false;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002405 }
2406
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002407 if (info()->at_end) {
2408 compiler->macro_assembler()->GoTo(variant->backtrack());
2409 return true;
2410 }
2411
2412 if (compiler->ascii()) {
2413 int dummy = 0;
2414 TextEmitPass(compiler, NON_ASCII_MATCH, false, variant, false, &dummy);
2415 }
2416
2417 bool first_elt_done = false;
2418 int bound_checked_to = variant->cp_offset() - 1;
2419 bound_checked_to += variant->bound_checked_up_to();
2420
2421 // If a character is preloaded into the current character register then
2422 // check that now.
2423 if (variant->characters_preloaded() == 1) {
2424 TextEmitPass(compiler,
2425 CHARACTER_MATCH,
2426 true,
2427 variant,
2428 false,
2429 &bound_checked_to);
2430 if (compiler->ignore_case()) {
2431 TextEmitPass(compiler,
2432 CASE_CHARACTER_MATCH,
2433 true,
2434 variant,
2435 false,
2436 &bound_checked_to);
2437 }
2438 TextEmitPass(compiler,
2439 CHARACTER_CLASS_MATCH,
2440 true,
2441 variant,
2442 false,
2443 &bound_checked_to);
2444 first_elt_done = true;
2445 }
2446
2447 TextEmitPass(compiler,
2448 CHARACTER_MATCH,
2449 false,
2450 variant,
2451 first_elt_done,
2452 &bound_checked_to);
2453 if (compiler->ignore_case()) {
2454 TextEmitPass(compiler,
2455 CASE_CHARACTER_MATCH,
2456 false,
2457 variant,
2458 first_elt_done,
2459 &bound_checked_to);
2460 }
2461 TextEmitPass(compiler,
2462 CHARACTER_CLASS_MATCH,
2463 false,
2464 variant,
2465 first_elt_done,
2466 &bound_checked_to);
2467
2468 GenerationVariant successor_variant(*variant);
2469 successor_variant.AdvanceVariant(Length(), compiler->ascii());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002470 RecursionCheck rc(compiler);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002471 return on_success()->Emit(compiler, &successor_variant);
2472}
2473
2474
2475void GenerationVariant::AdvanceVariant(int by, bool ascii) {
2476 ASSERT(by > 0);
2477 // We don't have an instruction for shifting the current character register
2478 // down or for using a shifted value for anything so lets just forget that
2479 // we preloaded any characters into it.
2480 characters_preloaded_ = 0;
2481 // Adjust the offsets of the quick check performed information. This
2482 // information is used to find out what we already determined about the
2483 // characters by means of mask and compare.
2484 quick_check_performed_.Advance(by, ascii);
2485 cp_offset_ += by;
2486 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002487}
2488
2489
2490void TextNode::MakeCaseIndependent() {
2491 int element_count = elms_->length();
2492 for (int i = 0; i < element_count; i++) {
2493 TextElement elm = elms_->at(i);
2494 if (elm.type == TextElement::CHAR_CLASS) {
2495 RegExpCharacterClass* cc = elm.data.u_char_class;
2496 ZoneList<CharacterRange>* ranges = cc->ranges();
2497 int range_count = ranges->length();
2498 for (int i = 0; i < range_count; i++) {
2499 ranges->at(i).AddCaseEquivalents(ranges);
2500 }
2501 }
2502 }
2503}
2504
2505
ager@chromium.org8bb60582008-12-11 12:02:20 +00002506int TextNode::GreedyLoopTextLength() {
2507 TextElement elm = elms_->at(elms_->length() - 1);
2508 if (elm.type == TextElement::CHAR_CLASS) {
2509 return elm.cp_offset + 1;
2510 } else {
2511 return elm.cp_offset + elm.data.u_atom->data().length();
2512 }
2513}
2514
2515
2516// Finds the fixed match length of a sequence of nodes that goes from
2517// this alternative and back to this choice node. If there are variable
2518// length nodes or other complications in the way then return a sentinel
2519// value indicating that a greedy loop cannot be constructed.
2520int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2521 int length = 0;
2522 RegExpNode* node = alternative->node();
2523 // Later we will generate code for all these text nodes using recursion
2524 // so we have to limit the max number.
2525 int recursion_depth = 0;
2526 while (node != this) {
2527 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2528 return kNodeIsTooComplexForGreedyLoops;
2529 }
2530 NodeInfo* info = node->info();
2531 if (info->follows_word_interest ||
2532 info->follows_newline_interest ||
2533 info->follows_start_interest) {
2534 return kNodeIsTooComplexForGreedyLoops;
2535 }
2536 int node_length = node->GreedyLoopTextLength();
2537 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2538 return kNodeIsTooComplexForGreedyLoops;
2539 }
2540 length += node_length;
2541 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2542 node = seq_node->on_success();
2543 }
2544 return length;
2545}
2546
2547
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002548void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2549 ASSERT_EQ(loop_node_, NULL);
2550 AddAlternative(alt);
2551 loop_node_ = alt.node();
2552}
2553
2554
2555void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2556 ASSERT_EQ(continue_node_, NULL);
2557 AddAlternative(alt);
2558 continue_node_ = alt.node();
2559}
2560
2561
ager@chromium.org8bb60582008-12-11 12:02:20 +00002562bool LoopChoiceNode::Emit(RegExpCompiler* compiler,
2563 GenerationVariant* variant) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002564 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002565 if (variant->stop_node() == this) {
2566 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2567 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2568 // Update the counter-based backtracking info on the stack. This is an
2569 // optimization for greedy loops (see below).
2570 ASSERT(variant->cp_offset() == text_length);
2571 macro_assembler->AdvanceCurrentPosition(text_length);
2572 macro_assembler->GoTo(variant->loop_label());
2573 return true;
2574 }
2575 ASSERT(variant->stop_node() == NULL);
2576 if (!variant->is_trivial()) {
2577 return variant->Flush(compiler, this);
2578 }
2579 return ChoiceNode::Emit(compiler, variant);
2580}
2581
2582
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002583int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
2584 int preload_characters = EatsAtLeast(0);
2585#ifdef CAN_READ_UNALIGNED
2586 bool ascii = compiler->ascii();
2587 if (ascii) {
2588 if (preload_characters > 4) preload_characters = 4;
2589 // We can't preload 3 characters because there is no machine instruction
2590 // to do that. We can't just load 4 because we could be reading
2591 // beyond the end of the string, which could cause a memory fault.
2592 if (preload_characters == 3) preload_characters = 2;
2593 } else {
2594 if (preload_characters > 2) preload_characters = 2;
2595 }
2596#else
2597 if (preload_characters > 1) preload_characters = 1;
2598#endif
2599 return preload_characters;
2600}
2601
2602
2603// This class is used when generating the alternatives in a choice node. It
2604// records the way the alternative is being code generated.
2605class AlternativeGeneration: public Malloced {
2606 public:
2607 AlternativeGeneration()
2608 : possible_success(),
2609 expects_preload(false),
2610 after(),
2611 quick_check_details() { }
2612 Label possible_success;
2613 bool expects_preload;
2614 Label after;
2615 QuickCheckDetails quick_check_details;
2616};
2617
2618
2619// Creates a list of AlternativeGenerations. If the list has a reasonable
2620// size then it is on the stack, otherwise the excess is on the heap.
2621class AlternativeGenerationList {
2622 public:
2623 explicit AlternativeGenerationList(int count)
2624 : alt_gens_(count) {
2625 for (int i = 0; i < count && i < kAFew; i++) {
2626 alt_gens_.Add(a_few_alt_gens_ + i);
2627 }
2628 for (int i = kAFew; i < count; i++) {
2629 alt_gens_.Add(new AlternativeGeneration());
2630 }
2631 }
2632 ~AlternativeGenerationList() {
2633 for (int i = 0; i < alt_gens_.length(); i++) {
2634 alt_gens_[i]->possible_success.Unuse();
2635 alt_gens_[i]->after.Unuse();
2636 }
2637 for (int i = kAFew; i < alt_gens_.length(); i++) {
2638 delete alt_gens_[i];
2639 alt_gens_[i] = NULL;
2640 }
2641 }
2642
2643 AlternativeGeneration* at(int i) {
2644 return alt_gens_[i];
2645 }
2646 private:
2647 static const int kAFew = 10;
2648 ZoneList<AlternativeGeneration*> alt_gens_;
2649 AlternativeGeneration a_few_alt_gens_[kAFew];
2650};
2651
2652
2653/* Code generation for choice nodes.
2654 *
2655 * We generate quick checks that do a mask and compare to eliminate a
2656 * choice. If the quick check succeeds then it jumps to the continuation to
2657 * do slow checks and check subsequent nodes. If it fails (the common case)
2658 * it falls through to the next choice.
2659 *
2660 * Here is the desired flow graph. Nodes directly below each other imply
2661 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2662 * 3 doesn't have a quick check so we have to call the slow check.
2663 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2664 * regexp continuation is generated directly after the Sn node, up to the
2665 * next GoTo if we decide to reuse some already generated code. Some
2666 * nodes expect preload_characters to be preloaded into the current
2667 * character register. R nodes do this preloading. Vertices are marked
2668 * F for failures and S for success (possible success in the case of quick
2669 * nodes). L, V, < and > are used as arrow heads.
2670 *
2671 * ----------> R
2672 * |
2673 * V
2674 * Q1 -----> S1
2675 * | S /
2676 * F| /
2677 * | F/
2678 * | /
2679 * | R
2680 * | /
2681 * V L
2682 * Q2 -----> S2
2683 * | S /
2684 * F| /
2685 * | F/
2686 * | /
2687 * | R
2688 * | /
2689 * V L
2690 * S3
2691 * |
2692 * F|
2693 * |
2694 * R
2695 * |
2696 * backtrack V
2697 * <----------Q4
2698 * \ F |
2699 * \ |S
2700 * \ F V
2701 * \-----S4
2702 *
2703 * For greedy loops we reverse our expectation and expect to match rather
2704 * than fail. Therefore we want the loop code to look like this (U is the
2705 * unwind code that steps back in the greedy loop). The following alternatives
2706 * look the same as above.
2707 * _____
2708 * / \
2709 * V |
2710 * ----------> S1 |
2711 * /| |
2712 * / |S |
2713 * F/ \_____/
2714 * /
2715 * |<-----------
2716 * | \
2717 * V \
2718 * Q2 ---> S2 \
2719 * | S / |
2720 * F| / |
2721 * | F/ |
2722 * | / |
2723 * | R |
2724 * | / |
2725 * F VL |
2726 * <------U |
2727 * back |S |
2728 * \______________/
2729 */
2730
2731
ager@chromium.org8bb60582008-12-11 12:02:20 +00002732bool ChoiceNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
2733 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2734 int choice_count = alternatives_->length();
2735#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002736 for (int i = 0; i < choice_count - 1; i++) {
2737 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002738 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002739 int guard_count = (guards == NULL) ? 0 : guards->length();
2740 for (int j = 0; j < guard_count; j++) {
2741 ASSERT(!variant->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002742 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002743 }
2744#endif
2745
2746 LimitResult limit_result = LimitVersions(compiler, variant);
2747 if (limit_result == DONE) return true;
2748 if (limit_result == FAIL) return false;
2749 ASSERT(limit_result == CONTINUE);
2750
2751 RecursionCheck rc(compiler);
2752
2753 GenerationVariant* current_variant = variant;
2754
2755 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2756 bool greedy_loop = false;
2757 Label greedy_loop_label;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002758 GenerationVariant counter_backtrack_variant;
2759 counter_backtrack_variant.set_backtrack(&greedy_loop_label);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002760 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2761 // Here we have special handling for greedy loops containing only text nodes
2762 // and other simple nodes. These are handled by pushing the current
2763 // position on the stack and then incrementing the current position each
2764 // time around the switch. On backtrack we decrement the current position
2765 // and check it against the pushed value. This avoids pushing backtrack
2766 // information for each iteration of the loop, which could take up a lot of
2767 // space.
2768 greedy_loop = true;
2769 ASSERT(variant->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002770 macro_assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002771 current_variant = &counter_backtrack_variant;
2772 Label greedy_match_failed;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002773 GenerationVariant greedy_match_variant;
2774 greedy_match_variant.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002775 Label loop_label;
2776 macro_assembler->Bind(&loop_label);
2777 greedy_match_variant.set_stop_node(this);
2778 greedy_match_variant.set_loop_label(&loop_label);
2779 bool ok = alternatives_->at(0).node()->Emit(compiler,
2780 &greedy_match_variant);
2781 macro_assembler->Bind(&greedy_match_failed);
2782 if (!ok) {
2783 greedy_loop_label.Unuse();
2784 return false;
2785 }
2786 }
2787
2788 Label second_choice; // For use in greedy matches.
2789 macro_assembler->Bind(&second_choice);
2790
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002791 int first_normal_choice = greedy_loop ? 1 : 0;
2792
2793 int preload_characters = CalculatePreloadCharacters(compiler);
2794 bool preload_is_current =
2795 (current_variant->characters_preloaded() == preload_characters);
2796 bool preload_has_checked_bounds = preload_is_current;
2797
2798 AlternativeGenerationList alt_gens(choice_count);
2799
ager@chromium.org8bb60582008-12-11 12:02:20 +00002800 // For now we just call all choices one after the other. The idea ultimately
2801 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002802 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002803 GuardedAlternative alternative = alternatives_->at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002804 AlternativeGeneration* alt_gen(alt_gens.at(i));
2805 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002806 ZoneList<Guard*>* guards = alternative.guards();
2807 int guard_count = (guards == NULL) ? 0 : guards->length();
2808 GenerationVariant new_variant(*current_variant);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002809 new_variant.set_characters_preloaded(preload_is_current ?
2810 preload_characters :
2811 0);
2812 if (preload_has_checked_bounds) {
2813 new_variant.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002814 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002815 new_variant.quick_check_performed()->Clear();
2816 alt_gen->expects_preload = preload_is_current;
2817 bool generate_full_check_inline = false;
2818 if (alternative.node()->EmitQuickCheck(compiler,
2819 &new_variant,
2820 preload_has_checked_bounds,
2821 &alt_gen->possible_success,
2822 &alt_gen->quick_check_details,
2823 i < choice_count - 1)) {
2824 // Quick check was generated for this choice.
2825 preload_is_current = true;
2826 preload_has_checked_bounds = true;
2827 // On the last choice in the ChoiceNode we generated the quick
2828 // check to fall through on possible success. So now we need to
2829 // generate the full check inline.
2830 if (i == choice_count - 1) {
2831 macro_assembler->Bind(&alt_gen->possible_success);
2832 new_variant.set_quick_check_performed(&alt_gen->quick_check_details);
2833 new_variant.set_characters_preloaded(preload_characters);
2834 new_variant.set_bound_checked_up_to(preload_characters);
2835 generate_full_check_inline = true;
2836 }
2837 } else {
2838 // No quick check was generated. Put the full code here.
2839 // If this is not the first choice then there could be slow checks from
2840 // previous cases that go here when they fail. There's no reason to
2841 // insist that they preload characters since the slow check we are about
2842 // to generate probably can't use it.
2843 if (i != first_normal_choice) {
2844 alt_gen->expects_preload = false;
2845 new_variant.set_characters_preloaded(0);
2846 }
2847 if (i < choice_count - 1) {
2848 new_variant.set_backtrack(&alt_gen->after);
2849 }
2850 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002851 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002852 if (generate_full_check_inline) {
2853 for (int j = 0; j < guard_count; j++) {
2854 GenerateGuard(macro_assembler, guards->at(j), &new_variant);
2855 }
2856 if (!alternative.node()->Emit(compiler, &new_variant)) {
2857 greedy_loop_label.Unuse();
2858 return false;
2859 }
2860 preload_is_current = false;
2861 }
2862 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002863 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002864 if (greedy_loop) {
2865 macro_assembler->Bind(&greedy_loop_label);
2866 // If we have unwound to the bottom then backtrack.
2867 macro_assembler->CheckGreedyLoop(variant->backtrack());
2868 // Otherwise try the second priority at an earlier position.
2869 macro_assembler->AdvanceCurrentPosition(-text_length);
2870 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002871 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002872 // At this point we need to generate slow checks for the alternatives where
2873 // the quick check was inlined. We can recognize these because the associated
2874 // label was bound.
2875 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2876 AlternativeGeneration* alt_gen = alt_gens.at(i);
2877 if (!EmitOutOfLineContinuation(compiler,
2878 current_variant,
2879 alternatives_->at(i),
2880 alt_gen,
2881 preload_characters,
2882 alt_gens.at(i + 1)->expects_preload)) {
2883 return false;
2884 }
2885 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002886 return true;
2887}
2888
2889
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002890bool ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
2891 GenerationVariant* variant,
2892 GuardedAlternative alternative,
2893 AlternativeGeneration* alt_gen,
2894 int preload_characters,
2895 bool next_expects_preload) {
2896 if (!alt_gen->possible_success.is_linked()) return true;
2897
2898 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2899 macro_assembler->Bind(&alt_gen->possible_success);
2900 GenerationVariant out_of_line_variant(*variant);
2901 out_of_line_variant.set_characters_preloaded(preload_characters);
2902 out_of_line_variant.set_quick_check_performed(&alt_gen->quick_check_details);
2903 ZoneList<Guard*>* guards = alternative.guards();
2904 int guard_count = (guards == NULL) ? 0 : guards->length();
2905 if (next_expects_preload) {
2906 Label reload_current_char;
2907 out_of_line_variant.set_backtrack(&reload_current_char);
2908 for (int j = 0; j < guard_count; j++) {
2909 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_variant);
2910 }
2911 bool ok = alternative.node()->Emit(compiler, &out_of_line_variant);
2912 macro_assembler->Bind(&reload_current_char);
2913 // Reload the current character, since the next quick check expects that.
2914 // We don't need to check bounds here because we only get into this
2915 // code through a quick check which already did the checked load.
2916 macro_assembler->LoadCurrentCharacter(variant->cp_offset(),
2917 NULL,
2918 false,
2919 preload_characters);
2920 macro_assembler->GoTo(&(alt_gen->after));
2921 return ok;
2922 } else {
2923 out_of_line_variant.set_backtrack(&(alt_gen->after));
2924 for (int j = 0; j < guard_count; j++) {
2925 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_variant);
2926 }
2927 return alternative.node()->Emit(compiler, &out_of_line_variant);
2928 }
2929}
2930
2931
ager@chromium.org8bb60582008-12-11 12:02:20 +00002932bool ActionNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002933 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002934 LimitResult limit_result = LimitVersions(compiler, variant);
2935 if (limit_result == DONE) return true;
2936 if (limit_result == FAIL) return false;
2937 ASSERT(limit_result == CONTINUE);
2938
2939 RecursionCheck rc(compiler);
2940
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002941 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002942 case STORE_POSITION: {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002943 GenerationVariant::DeferredCapture
2944 new_capture(data_.u_position_register.reg, variant);
2945 GenerationVariant new_variant = *variant;
2946 new_variant.add_action(&new_capture);
2947 return on_success()->Emit(compiler, &new_variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002948 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002949 case INCREMENT_REGISTER: {
2950 GenerationVariant::DeferredIncrementRegister
2951 new_increment(data_.u_increment_register.reg);
2952 GenerationVariant new_variant = *variant;
2953 new_variant.add_action(&new_increment);
2954 return on_success()->Emit(compiler, &new_variant);
2955 }
2956 case SET_REGISTER: {
2957 GenerationVariant::DeferredSetRegister
2958 new_set(data_.u_store_register.reg, data_.u_store_register.value);
2959 GenerationVariant new_variant = *variant;
2960 new_variant.add_action(&new_set);
2961 return on_success()->Emit(compiler, &new_variant);
2962 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002963 case BEGIN_SUBMATCH:
ager@chromium.org8bb60582008-12-11 12:02:20 +00002964 if (!variant->is_trivial()) return variant->Flush(compiler, this);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002965 assembler->WriteCurrentPositionToRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002966 data_.u_submatch.current_position_register, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002967 assembler->WriteStackPointerToRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002968 data_.u_submatch.stack_pointer_register);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002969 return on_success()->Emit(compiler, variant);
2970 case POSITIVE_SUBMATCH_SUCCESS:
2971 if (!variant->is_trivial()) return variant->Flush(compiler, this);
2972 // TODO(erikcorry): Implement support.
2973 if (info()->follows_word_interest ||
2974 info()->follows_newline_interest ||
2975 info()->follows_start_interest) {
2976 return false;
2977 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002978 if (info()->at_end) {
2979 Label at_end;
2980 // Load current character jumps to the label if we are beyond the string
2981 // end.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002982 assembler->LoadCurrentCharacter(0, &at_end);
2983 assembler->GoTo(variant->backtrack());
2984 assembler->Bind(&at_end);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002985 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002986 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002987 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002988 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002989 data_.u_submatch.stack_pointer_register);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002990 return on_success()->Emit(compiler, variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002991 default:
2992 UNREACHABLE();
2993 return false;
2994 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002995}
2996
2997
ager@chromium.org8bb60582008-12-11 12:02:20 +00002998bool BackReferenceNode::Emit(RegExpCompiler* compiler,
2999 GenerationVariant* variant) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003000 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003001 if (!variant->is_trivial()) {
3002 return variant->Flush(compiler, this);
3003 }
3004
3005 LimitResult limit_result = LimitVersions(compiler, variant);
3006 if (limit_result == DONE) return true;
3007 if (limit_result == FAIL) return false;
3008 ASSERT(limit_result == CONTINUE);
3009
3010 RecursionCheck rc(compiler);
3011
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003012 ASSERT_EQ(start_reg_ + 1, end_reg_);
3013 if (info()->at_end) {
3014 // If we are constrained to match at the end of the input then succeed
3015 // iff the back reference is empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003016 assembler->CheckNotRegistersEqual(start_reg_,
3017 end_reg_,
3018 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003019 } else {
3020 if (compiler->ignore_case()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003021 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3022 variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003023 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003024 assembler->CheckNotBackReference(start_reg_, variant->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003025 }
3026 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003027 return on_success()->Emit(compiler, variant);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003028}
3029
3030
3031// -------------------------------------------------------------------
3032// Dot/dotty output
3033
3034
3035#ifdef DEBUG
3036
3037
3038class DotPrinter: public NodeVisitor {
3039 public:
3040 explicit DotPrinter(bool ignore_case)
3041 : ignore_case_(ignore_case),
3042 stream_(&alloc_) { }
3043 void PrintNode(const char* label, RegExpNode* node);
3044 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003045 void PrintAttributes(RegExpNode* from);
3046 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003047 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003048#define DECLARE_VISIT(Type) \
3049 virtual void Visit##Type(Type##Node* that);
3050FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3051#undef DECLARE_VISIT
3052 private:
3053 bool ignore_case_;
3054 HeapStringAllocator alloc_;
3055 StringStream stream_;
3056};
3057
3058
3059void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3060 stream()->Add("digraph G {\n graph [label=\"");
3061 for (int i = 0; label[i]; i++) {
3062 switch (label[i]) {
3063 case '\\':
3064 stream()->Add("\\\\");
3065 break;
3066 case '"':
3067 stream()->Add("\"");
3068 break;
3069 default:
3070 stream()->Put(label[i]);
3071 break;
3072 }
3073 }
3074 stream()->Add("\"];\n");
3075 Visit(node);
3076 stream()->Add("}\n");
3077 printf("%s", *(stream()->ToCString()));
3078}
3079
3080
3081void DotPrinter::Visit(RegExpNode* node) {
3082 if (node->info()->visited) return;
3083 node->info()->visited = true;
3084 node->Accept(this);
3085}
3086
3087
3088void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003089 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3090 Visit(on_failure);
3091}
3092
3093
3094class TableEntryBodyPrinter {
3095 public:
3096 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3097 : stream_(stream), choice_(choice) { }
3098 void Call(uc16 from, DispatchTable::Entry entry) {
3099 OutSet* out_set = entry.out_set();
3100 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3101 if (out_set->Get(i)) {
3102 stream()->Add(" n%p:s%io%i -> n%p;\n",
3103 choice(),
3104 from,
3105 i,
3106 choice()->alternatives()->at(i).node());
3107 }
3108 }
3109 }
3110 private:
3111 StringStream* stream() { return stream_; }
3112 ChoiceNode* choice() { return choice_; }
3113 StringStream* stream_;
3114 ChoiceNode* choice_;
3115};
3116
3117
3118class TableEntryHeaderPrinter {
3119 public:
3120 explicit TableEntryHeaderPrinter(StringStream* stream)
3121 : first_(true), stream_(stream) { }
3122 void Call(uc16 from, DispatchTable::Entry entry) {
3123 if (first_) {
3124 first_ = false;
3125 } else {
3126 stream()->Add("|");
3127 }
3128 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3129 OutSet* out_set = entry.out_set();
3130 int priority = 0;
3131 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3132 if (out_set->Get(i)) {
3133 if (priority > 0) stream()->Add("|");
3134 stream()->Add("<s%io%i> %i", from, i, priority);
3135 priority++;
3136 }
3137 }
3138 stream()->Add("}}");
3139 }
3140 private:
3141 bool first_;
3142 StringStream* stream() { return stream_; }
3143 StringStream* stream_;
3144};
3145
3146
3147class AttributePrinter {
3148 public:
3149 explicit AttributePrinter(DotPrinter* out)
3150 : out_(out), first_(true) { }
3151 void PrintSeparator() {
3152 if (first_) {
3153 first_ = false;
3154 } else {
3155 out_->stream()->Add("|");
3156 }
3157 }
3158 void PrintBit(const char* name, bool value) {
3159 if (!value) return;
3160 PrintSeparator();
3161 out_->stream()->Add("{%s}", name);
3162 }
3163 void PrintPositive(const char* name, int value) {
3164 if (value < 0) return;
3165 PrintSeparator();
3166 out_->stream()->Add("{%s|%x}", name, value);
3167 }
3168 private:
3169 DotPrinter* out_;
3170 bool first_;
3171};
3172
3173
3174void DotPrinter::PrintAttributes(RegExpNode* that) {
3175 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3176 "margin=0.1, fontsize=10, label=\"{",
3177 that);
3178 AttributePrinter printer(this);
3179 NodeInfo* info = that->info();
3180 printer.PrintBit("NI", info->follows_newline_interest);
3181 printer.PrintBit("WI", info->follows_word_interest);
3182 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003183 Label* label = that->label();
3184 if (label->is_bound())
3185 printer.PrintPositive("@", label->pos());
3186 stream()->Add("}\"];\n");
3187 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3188 "arrowhead=none];\n", that, that);
3189}
3190
3191
3192static const bool kPrintDispatchTable = false;
3193void DotPrinter::VisitChoice(ChoiceNode* that) {
3194 if (kPrintDispatchTable) {
3195 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3196 TableEntryHeaderPrinter header_printer(stream());
3197 that->GetTable(ignore_case_)->ForEach(&header_printer);
3198 stream()->Add("\"]\n", that);
3199 PrintAttributes(that);
3200 TableEntryBodyPrinter body_printer(stream(), that);
3201 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003202 } else {
3203 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3204 for (int i = 0; i < that->alternatives()->length(); i++) {
3205 GuardedAlternative alt = that->alternatives()->at(i);
3206 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3207 }
3208 }
3209 for (int i = 0; i < that->alternatives()->length(); i++) {
3210 GuardedAlternative alt = that->alternatives()->at(i);
3211 alt.node()->Accept(this);
3212 }
3213}
3214
3215
3216void DotPrinter::VisitText(TextNode* that) {
3217 stream()->Add(" n%p [label=\"", that);
3218 for (int i = 0; i < that->elements()->length(); i++) {
3219 if (i > 0) stream()->Add(" ");
3220 TextElement elm = that->elements()->at(i);
3221 switch (elm.type) {
3222 case TextElement::ATOM: {
3223 stream()->Add("'%w'", elm.data.u_atom->data());
3224 break;
3225 }
3226 case TextElement::CHAR_CLASS: {
3227 RegExpCharacterClass* node = elm.data.u_char_class;
3228 stream()->Add("[");
3229 if (node->is_negated())
3230 stream()->Add("^");
3231 for (int j = 0; j < node->ranges()->length(); j++) {
3232 CharacterRange range = node->ranges()->at(j);
3233 stream()->Add("%k-%k", range.from(), range.to());
3234 }
3235 stream()->Add("]");
3236 break;
3237 }
3238 default:
3239 UNREACHABLE();
3240 }
3241 }
3242 stream()->Add("\", shape=box, peripheries=2];\n");
3243 PrintAttributes(that);
3244 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3245 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003246}
3247
3248
3249void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3250 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3251 that,
3252 that->start_register(),
3253 that->end_register());
3254 PrintAttributes(that);
3255 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3256 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003257}
3258
3259
3260void DotPrinter::VisitEnd(EndNode* that) {
3261 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3262 PrintAttributes(that);
3263}
3264
3265
3266void DotPrinter::VisitAction(ActionNode* that) {
3267 stream()->Add(" n%p [", that);
3268 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003269 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003270 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3271 that->data_.u_store_register.reg,
3272 that->data_.u_store_register.value);
3273 break;
3274 case ActionNode::INCREMENT_REGISTER:
3275 stream()->Add("label=\"$%i++\", shape=octagon",
3276 that->data_.u_increment_register.reg);
3277 break;
3278 case ActionNode::STORE_POSITION:
3279 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3280 that->data_.u_position_register.reg);
3281 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003282 case ActionNode::BEGIN_SUBMATCH:
3283 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3284 that->data_.u_submatch.current_position_register);
3285 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003286 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003287 stream()->Add("label=\"escape\", shape=septagon");
3288 break;
3289 }
3290 stream()->Add("];\n");
3291 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003292 RegExpNode* successor = that->on_success();
3293 stream()->Add(" n%p -> n%p;\n", that, successor);
3294 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003295}
3296
3297
3298class DispatchTableDumper {
3299 public:
3300 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3301 void Call(uc16 key, DispatchTable::Entry entry);
3302 StringStream* stream() { return stream_; }
3303 private:
3304 StringStream* stream_;
3305};
3306
3307
3308void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3309 stream()->Add("[%k-%k]: {", key, entry.to());
3310 OutSet* set = entry.out_set();
3311 bool first = true;
3312 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3313 if (set->Get(i)) {
3314 if (first) {
3315 first = false;
3316 } else {
3317 stream()->Add(", ");
3318 }
3319 stream()->Add("%i", i);
3320 }
3321 }
3322 stream()->Add("}\n");
3323}
3324
3325
3326void DispatchTable::Dump() {
3327 HeapStringAllocator alloc;
3328 StringStream stream(&alloc);
3329 DispatchTableDumper dumper(&stream);
3330 tree()->ForEach(&dumper);
3331 OS::PrintError("%s", *stream.ToCString());
3332}
3333
3334
3335void RegExpEngine::DotPrint(const char* label,
3336 RegExpNode* node,
3337 bool ignore_case) {
3338 DotPrinter printer(ignore_case);
3339 printer.PrintNode(label, node);
3340}
3341
3342
3343#endif // DEBUG
3344
3345
3346// -------------------------------------------------------------------
3347// Tree to graph conversion
3348
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003349static const int kSpaceRangeCount = 20;
3350static const int kSpaceRangeAsciiCount = 4;
3351static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3352 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3353 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3354
3355static const int kWordRangeCount = 8;
3356static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3357 '_', 'a', 'z' };
3358
3359static const int kDigitRangeCount = 2;
3360static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3361
3362static const int kLineTerminatorRangeCount = 6;
3363static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3364 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003365
3366RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003367 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003368 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3369 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003370 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003371}
3372
3373
3374RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003375 RegExpNode* on_success) {
3376 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003377}
3378
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003379static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3380 const uc16* special_class,
3381 int length) {
3382 ASSERT(ranges->length() != 0);
3383 ASSERT(length != 0);
3384 ASSERT(special_class[0] != 0);
3385 if (ranges->length() != (length >> 1) + 1) {
3386 return false;
3387 }
3388 CharacterRange range = ranges->at(0);
3389 if (range.from() != 0) {
3390 return false;
3391 }
3392 for (int i = 0; i < length; i += 2) {
3393 if (special_class[i] != (range.to() + 1)) {
3394 return false;
3395 }
3396 range = ranges->at((i >> 1) + 1);
3397 if (special_class[i+1] != range.from() - 1) {
3398 return false;
3399 }
3400 }
3401 if (range.to() != 0xffff) {
3402 return false;
3403 }
3404 return true;
3405}
3406
3407
3408static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3409 const uc16* special_class,
3410 int length) {
3411 if (ranges->length() * 2 != length) {
3412 return false;
3413 }
3414 for (int i = 0; i < length; i += 2) {
3415 CharacterRange range = ranges->at(i >> 1);
3416 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3417 return false;
3418 }
3419 }
3420 return true;
3421}
3422
3423
3424bool RegExpCharacterClass::is_standard() {
3425 // TODO(lrn): Remove need for this function, by not throwing away information
3426 // along the way.
3427 if (is_negated_) {
3428 return false;
3429 }
3430 if (set_.is_standard()) {
3431 return true;
3432 }
3433 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3434 set_.set_standard_set_type('s');
3435 return true;
3436 }
3437 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3438 set_.set_standard_set_type('S');
3439 return true;
3440 }
3441 if (CompareInverseRanges(set_.ranges(),
3442 kLineTerminatorRanges,
3443 kLineTerminatorRangeCount)) {
3444 set_.set_standard_set_type('.');
3445 return true;
3446 }
3447 return false;
3448}
3449
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003450
3451RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003452 RegExpNode* on_success) {
3453 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003454}
3455
3456
3457RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003458 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003459 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3460 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003461 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003462 for (int i = 0; i < length; i++) {
3463 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003464 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003465 result->AddAlternative(alternative);
3466 }
3467 return result;
3468}
3469
3470
3471RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003472 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003473 return ToNode(min(),
3474 max(),
3475 is_greedy(),
3476 body(),
3477 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003478 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003479}
3480
3481
3482RegExpNode* RegExpQuantifier::ToNode(int min,
3483 int max,
3484 bool is_greedy,
3485 RegExpTree* body,
3486 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003487 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003488 // x{f, t} becomes this:
3489 //
3490 // (r++)<-.
3491 // | `
3492 // | (x)
3493 // v ^
3494 // (r=0)-->(?)---/ [if r < t]
3495 // |
3496 // [if r >= f] \----> ...
3497 //
3498 //
3499 // TODO(someone): clear captures on repetition and handle empty
3500 // matches.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003501
3502 // 15.10.2.5 RepeatMatcher algorithm.
3503 // The parser has already eliminated the case where max is 0. In the case
3504 // where max_match is zero the parser has removed the quantifier if min was
3505 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3506
3507 // If we know that we cannot match zero length then things are a little
3508 // simpler since we don't need to make the special zero length match check
3509 // from step 2.1. If the min and max are small we can unroll a little in
3510 // this case.
3511 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3512 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3513 if (max == 0) return on_success; // This can happen due to recursion.
3514 if (body->min_match() > 0) {
3515 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3516 int new_max = (max == kInfinity) ? max : max - min;
3517 // Recurse once to get the loop or optional matches after the fixed ones.
3518 RegExpNode* answer =
3519 ToNode(0, new_max, is_greedy, body, compiler, on_success);
3520 // Unroll the forced matches from 0 to min. This can cause chains of
3521 // TextNodes (which the parser does not generate). These should be
3522 // combined if it turns out they hinder good code generation.
3523 for (int i = 0; i < min; i++) {
3524 answer = body->ToNode(compiler, answer);
3525 }
3526 return answer;
3527 }
3528 if (max <= kMaxUnrolledMaxMatches) {
3529 ASSERT(min == 0);
3530 // Unroll the optional matches up to max.
3531 RegExpNode* answer = on_success;
3532 for (int i = 0; i < max; i++) {
3533 ChoiceNode* alternation = new ChoiceNode(2);
3534 if (is_greedy) {
3535 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3536 answer)));
3537 alternation->AddAlternative(GuardedAlternative(on_success));
3538 } else {
3539 alternation->AddAlternative(GuardedAlternative(on_success));
3540 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3541 answer)));
3542 }
3543 answer = alternation;
3544 }
3545 return answer;
3546 }
3547 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003548 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003549 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003550 bool needs_counter = has_min || has_max;
3551 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003552 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003553 RegExpNode* loop_return = needs_counter
3554 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3555 : static_cast<RegExpNode*>(center);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003556 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003557 GuardedAlternative body_alt(body_node);
3558 if (has_max) {
3559 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3560 body_alt.AddGuard(body_guard);
3561 }
3562 GuardedAlternative rest_alt(on_success);
3563 if (has_min) {
3564 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3565 rest_alt.AddGuard(rest_guard);
3566 }
3567 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003568 center->AddLoopAlternative(body_alt);
3569 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003570 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003571 center->AddContinueAlternative(rest_alt);
3572 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003573 }
3574 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003575 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003576 } else {
3577 return center;
3578 }
3579}
3580
3581
3582RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003583 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003584 NodeInfo info;
3585 switch (type()) {
3586 case START_OF_LINE:
3587 info.follows_newline_interest = true;
3588 break;
3589 case START_OF_INPUT:
3590 info.follows_start_interest = true;
3591 break;
3592 case BOUNDARY: case NON_BOUNDARY:
3593 info.follows_word_interest = true;
3594 break;
3595 case END_OF_INPUT:
3596 info.at_end = true;
3597 break;
3598 case END_OF_LINE:
3599 // This is wrong but has the effect of making the compiler abort.
3600 info.at_end = true;
3601 }
3602 return on_success->PropagateForward(&info);
3603}
3604
3605
3606RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003607 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003608 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3609 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003610 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003611}
3612
3613
3614RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003615 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003616 return on_success;
3617}
3618
3619
3620RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003621 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003622 int stack_pointer_register = compiler->AllocateRegister();
3623 int position_register = compiler->AllocateRegister();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003624 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003625 if (is_positive()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003626 return ActionNode::BeginSubmatch(
3627 stack_pointer_register,
3628 position_register,
3629 body()->ToNode(
3630 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003631 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3632 position_register,
3633 on_success)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003634 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003635 // We use a ChoiceNode for a negative lookahead because it has most of
3636 // the characteristics we need. It has the body of the lookahead as its
3637 // first alternative and the expression after the lookahead of the second
3638 // alternative. If the first alternative succeeds then the
3639 // NegativeSubmatchSuccess will unwind the stack including everything the
3640 // choice node set up and backtrack. If the first alternative fails then
3641 // the second alternative is tried, which is exactly the desired result
3642 // for a negative lookahead. In the case where the dispatch table
3643 // determines that the first alternative cannot match we will save time
3644 // by not trying it. Things are not quite so well-optimized if the
3645 // dispatch table determines that the second alternative cannot match.
3646 // In this case we could optimize by immediately backtracking.
3647 ChoiceNode* choice_node = new ChoiceNode(2);
3648 GuardedAlternative body_alt(
3649 body()->ToNode(
3650 compiler,
3651 success = new NegativeSubmatchSuccess(stack_pointer_register,
3652 position_register)));
3653 choice_node->AddAlternative(body_alt);
3654 choice_node->AddAlternative(GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003655 return ActionNode::BeginSubmatch(stack_pointer_register,
3656 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003657 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003658 }
3659}
3660
3661
3662RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003663 RegExpNode* on_success) {
3664 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003665}
3666
3667
3668RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3669 int index,
3670 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003671 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003672 int start_reg = RegExpCapture::StartRegister(index);
3673 int end_reg = RegExpCapture::EndRegister(index);
3674 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003675 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003676 return ActionNode::StorePosition(start_reg, body_node);
3677}
3678
3679
3680RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003681 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003682 ZoneList<RegExpTree*>* children = nodes();
3683 RegExpNode* current = on_success;
3684 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003685 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003686 }
3687 return current;
3688}
3689
3690
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003691static void AddClass(const uc16* elmv,
3692 int elmc,
3693 ZoneList<CharacterRange>* ranges) {
3694 for (int i = 0; i < elmc; i += 2) {
3695 ASSERT(elmv[i] <= elmv[i + 1]);
3696 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3697 }
3698}
3699
3700
3701static void AddClassNegated(const uc16 *elmv,
3702 int elmc,
3703 ZoneList<CharacterRange>* ranges) {
3704 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003705 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003706 uc16 last = 0x0000;
3707 for (int i = 0; i < elmc; i += 2) {
3708 ASSERT(last <= elmv[i] - 1);
3709 ASSERT(elmv[i] <= elmv[i + 1]);
3710 ranges->Add(CharacterRange(last, elmv[i] - 1));
3711 last = elmv[i + 1] + 1;
3712 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003713 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003714}
3715
3716
3717void CharacterRange::AddClassEscape(uc16 type,
3718 ZoneList<CharacterRange>* ranges) {
3719 switch (type) {
3720 case 's':
3721 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3722 break;
3723 case 'S':
3724 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3725 break;
3726 case 'w':
3727 AddClass(kWordRanges, kWordRangeCount, ranges);
3728 break;
3729 case 'W':
3730 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3731 break;
3732 case 'd':
3733 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3734 break;
3735 case 'D':
3736 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3737 break;
3738 case '.':
3739 AddClassNegated(kLineTerminatorRanges,
3740 kLineTerminatorRangeCount,
3741 ranges);
3742 break;
3743 // This is not a character range as defined by the spec but a
3744 // convenient shorthand for a character class that matches any
3745 // character.
3746 case '*':
3747 ranges->Add(CharacterRange::Everything());
3748 break;
3749 default:
3750 UNREACHABLE();
3751 }
3752}
3753
3754
3755Vector<const uc16> CharacterRange::GetWordBounds() {
3756 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3757}
3758
3759
3760class CharacterRangeSplitter {
3761 public:
3762 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3763 ZoneList<CharacterRange>** excluded)
3764 : included_(included),
3765 excluded_(excluded) { }
3766 void Call(uc16 from, DispatchTable::Entry entry);
3767
3768 static const int kInBase = 0;
3769 static const int kInOverlay = 1;
3770
3771 private:
3772 ZoneList<CharacterRange>** included_;
3773 ZoneList<CharacterRange>** excluded_;
3774};
3775
3776
3777void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3778 if (!entry.out_set()->Get(kInBase)) return;
3779 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3780 ? included_
3781 : excluded_;
3782 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3783 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3784}
3785
3786
3787void CharacterRange::Split(ZoneList<CharacterRange>* base,
3788 Vector<const uc16> overlay,
3789 ZoneList<CharacterRange>** included,
3790 ZoneList<CharacterRange>** excluded) {
3791 ASSERT_EQ(NULL, *included);
3792 ASSERT_EQ(NULL, *excluded);
3793 DispatchTable table;
3794 for (int i = 0; i < base->length(); i++)
3795 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3796 for (int i = 0; i < overlay.length(); i += 2) {
3797 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3798 CharacterRangeSplitter::kInOverlay);
3799 }
3800 CharacterRangeSplitter callback(included, excluded);
3801 table.ForEach(&callback);
3802}
3803
3804
3805void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3806 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3807 if (IsSingleton()) {
3808 // If this is a singleton we just expand the one character.
3809 int length = uncanonicalize.get(from(), '\0', chars);
3810 for (int i = 0; i < length; i++) {
3811 uc32 chr = chars[i];
3812 if (chr != from()) {
3813 ranges->Add(CharacterRange::Singleton(chars[i]));
3814 }
3815 }
3816 } else if (from() <= kRangeCanonicalizeMax &&
3817 to() <= kRangeCanonicalizeMax) {
3818 // If this is a range we expand the characters block by block,
3819 // expanding contiguous subranges (blocks) one at a time.
3820 // The approach is as follows. For a given start character we
3821 // look up the block that contains it, for instance 'a' if the
3822 // start character is 'c'. A block is characterized by the property
3823 // that all characters uncanonicalize in the same way as the first
3824 // element, except that each entry in the result is incremented
3825 // by the distance from the first element. So a-z is a block
3826 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3827 // uncanonicalizes to ['a' + k, 'A' + k].
3828 // Once we've found the start point we look up its uncanonicalization
3829 // and produce a range for each element. For instance for [c-f]
3830 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3831 // add a range if it is not already contained in the input, so [c-f]
3832 // will be skipped but [C-F] will be added. If this range is not
3833 // completely contained in a block we do this for all the blocks
3834 // covered by the range.
3835 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3836 // First, look up the block that contains the 'from' character.
3837 int length = canonrange.get(from(), '\0', range);
3838 if (length == 0) {
3839 range[0] = from();
3840 } else {
3841 ASSERT_EQ(1, length);
3842 }
3843 int pos = from();
3844 // The start of the current block. Note that except for the first
3845 // iteration 'start' is always equal to 'pos'.
3846 int start;
3847 // If it is not the start point of a block the entry contains the
3848 // offset of the character from the start point.
3849 if ((range[0] & kStartMarker) == 0) {
3850 start = pos - range[0];
3851 } else {
3852 start = pos;
3853 }
3854 // Then we add the ranges on at a time, incrementing the current
3855 // position to be after the last block each time. The position
3856 // always points to the start of a block.
3857 while (pos < to()) {
3858 length = canonrange.get(start, '\0', range);
3859 if (length == 0) {
3860 range[0] = start;
3861 } else {
3862 ASSERT_EQ(1, length);
3863 }
3864 ASSERT((range[0] & kStartMarker) != 0);
3865 // The start point of a block contains the distance to the end
3866 // of the range.
3867 int block_end = start + (range[0] & kPayloadMask) - 1;
3868 int end = (block_end > to()) ? to() : block_end;
3869 length = uncanonicalize.get(start, '\0', range);
3870 for (int i = 0; i < length; i++) {
3871 uc32 c = range[i];
3872 uc16 range_from = c + (pos - start);
3873 uc16 range_to = c + (end - start);
3874 if (!(from() <= range_from && range_to <= to())) {
3875 ranges->Add(CharacterRange(range_from, range_to));
3876 }
3877 }
3878 start = pos = block_end + 1;
3879 }
3880 } else {
3881 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3882 }
3883}
3884
3885
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003886ZoneList<CharacterRange>* CharacterSet::ranges() {
3887 if (ranges_ == NULL) {
3888 ranges_ = new ZoneList<CharacterRange>(2);
3889 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
3890 }
3891 return ranges_;
3892}
3893
3894
3895
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003896// -------------------------------------------------------------------
3897// Interest propagation
3898
3899
3900RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
3901 for (int i = 0; i < siblings_.length(); i++) {
3902 RegExpNode* sibling = siblings_.Get(i);
3903 if (sibling->info()->Matches(info))
3904 return sibling;
3905 }
3906 return NULL;
3907}
3908
3909
3910RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
3911 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003912 siblings_.Ensure(this);
3913 RegExpNode* result = TryGetSibling(info);
3914 if (result != NULL) return result;
3915 result = this->Clone();
3916 NodeInfo* new_info = result->info();
3917 new_info->ResetCompilationState();
3918 new_info->AddFromPreceding(info);
3919 AddSibling(result);
3920 *cloned = true;
3921 return result;
3922}
3923
3924
3925template <class C>
3926static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
3927 NodeInfo full_info(*node->info());
3928 full_info.AddFromPreceding(info);
3929 bool cloned = false;
3930 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
3931}
3932
3933
3934RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
3935 NodeInfo full_info(*this->info());
3936 full_info.AddFromPreceding(info);
3937 bool cloned = false;
3938 ActionNode* action = EnsureSibling(this, &full_info, &cloned);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003939 action->set_on_success(action->on_success()->PropagateForward(info));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003940 return action;
3941}
3942
3943
3944RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
3945 NodeInfo full_info(*this->info());
3946 full_info.AddFromPreceding(info);
3947 bool cloned = false;
3948 ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
3949 if (cloned) {
3950 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
3951 int count = old_alternatives->length();
3952 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
3953 for (int i = 0; i < count; i++) {
3954 GuardedAlternative alternative = old_alternatives->at(i);
3955 alternative.set_node(alternative.node()->PropagateForward(info));
3956 choice->alternatives()->Add(alternative);
3957 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003958 }
3959 return choice;
3960}
3961
3962
3963RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
3964 return PropagateToEndpoint(this, info);
3965}
3966
3967
3968RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
3969 NodeInfo full_info(*this->info());
3970 full_info.AddFromPreceding(info);
3971 bool cloned = false;
3972 BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
3973 if (cloned) {
3974 // TODO(erikcorry): A back reference has to have two successors (by default
3975 // the same node). The first is used if the back reference matches a non-
3976 // empty back reference, the second if it matches an empty one. This
3977 // doesn't matter for at_end, which is the only one implemented right now,
3978 // but it will matter for other pieces of info.
3979 back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
3980 }
3981 return back_ref;
3982}
3983
3984
3985RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
3986 return PropagateToEndpoint(this, info);
3987}
3988
3989
3990// -------------------------------------------------------------------
3991// Splay tree
3992
3993
3994OutSet* OutSet::Extend(unsigned value) {
3995 if (Get(value))
3996 return this;
3997 if (successors() != NULL) {
3998 for (int i = 0; i < successors()->length(); i++) {
3999 OutSet* successor = successors()->at(i);
4000 if (successor->Get(value))
4001 return successor;
4002 }
4003 } else {
4004 successors_ = new ZoneList<OutSet*>(2);
4005 }
4006 OutSet* result = new OutSet(first_, remaining_);
4007 result->Set(value);
4008 successors()->Add(result);
4009 return result;
4010}
4011
4012
4013void OutSet::Set(unsigned value) {
4014 if (value < kFirstLimit) {
4015 first_ |= (1 << value);
4016 } else {
4017 if (remaining_ == NULL)
4018 remaining_ = new ZoneList<unsigned>(1);
4019 if (remaining_->is_empty() || !remaining_->Contains(value))
4020 remaining_->Add(value);
4021 }
4022}
4023
4024
4025bool OutSet::Get(unsigned value) {
4026 if (value < kFirstLimit) {
4027 return (first_ & (1 << value)) != 0;
4028 } else if (remaining_ == NULL) {
4029 return false;
4030 } else {
4031 return remaining_->Contains(value);
4032 }
4033}
4034
4035
4036const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4037const DispatchTable::Entry DispatchTable::Config::kNoValue;
4038
4039
4040void DispatchTable::AddRange(CharacterRange full_range, int value) {
4041 CharacterRange current = full_range;
4042 if (tree()->is_empty()) {
4043 // If this is the first range we just insert into the table.
4044 ZoneSplayTree<Config>::Locator loc;
4045 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4046 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4047 return;
4048 }
4049 // First see if there is a range to the left of this one that
4050 // overlaps.
4051 ZoneSplayTree<Config>::Locator loc;
4052 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4053 Entry* entry = &loc.value();
4054 // If we've found a range that overlaps with this one, and it
4055 // starts strictly to the left of this one, we have to fix it
4056 // because the following code only handles ranges that start on
4057 // or after the start point of the range we're adding.
4058 if (entry->from() < current.from() && entry->to() >= current.from()) {
4059 // Snap the overlapping range in half around the start point of
4060 // the range we're adding.
4061 CharacterRange left(entry->from(), current.from() - 1);
4062 CharacterRange right(current.from(), entry->to());
4063 // The left part of the overlapping range doesn't overlap.
4064 // Truncate the whole entry to be just the left part.
4065 entry->set_to(left.to());
4066 // The right part is the one that overlaps. We add this part
4067 // to the map and let the next step deal with merging it with
4068 // the range we're adding.
4069 ZoneSplayTree<Config>::Locator loc;
4070 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4071 loc.set_value(Entry(right.from(),
4072 right.to(),
4073 entry->out_set()));
4074 }
4075 }
4076 while (current.is_valid()) {
4077 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4078 (loc.value().from() <= current.to()) &&
4079 (loc.value().to() >= current.from())) {
4080 Entry* entry = &loc.value();
4081 // We have overlap. If there is space between the start point of
4082 // the range we're adding and where the overlapping range starts
4083 // then we have to add a range covering just that space.
4084 if (current.from() < entry->from()) {
4085 ZoneSplayTree<Config>::Locator ins;
4086 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4087 ins.set_value(Entry(current.from(),
4088 entry->from() - 1,
4089 empty()->Extend(value)));
4090 current.set_from(entry->from());
4091 }
4092 ASSERT_EQ(current.from(), entry->from());
4093 // If the overlapping range extends beyond the one we want to add
4094 // we have to snap the right part off and add it separately.
4095 if (entry->to() > current.to()) {
4096 ZoneSplayTree<Config>::Locator ins;
4097 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4098 ins.set_value(Entry(current.to() + 1,
4099 entry->to(),
4100 entry->out_set()));
4101 entry->set_to(current.to());
4102 }
4103 ASSERT(entry->to() <= current.to());
4104 // The overlapping range is now completely contained by the range
4105 // we're adding so we can just update it and move the start point
4106 // of the range we're adding just past it.
4107 entry->AddValue(value);
4108 // Bail out if the last interval ended at 0xFFFF since otherwise
4109 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004110 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004111 break;
4112 ASSERT(entry->to() + 1 > current.from());
4113 current.set_from(entry->to() + 1);
4114 } else {
4115 // There is no overlap so we can just add the range
4116 ZoneSplayTree<Config>::Locator ins;
4117 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4118 ins.set_value(Entry(current.from(),
4119 current.to(),
4120 empty()->Extend(value)));
4121 break;
4122 }
4123 }
4124}
4125
4126
4127OutSet* DispatchTable::Get(uc16 value) {
4128 ZoneSplayTree<Config>::Locator loc;
4129 if (!tree()->FindGreatestLessThan(value, &loc))
4130 return empty();
4131 Entry* entry = &loc.value();
4132 if (value <= entry->to())
4133 return entry->out_set();
4134 else
4135 return empty();
4136}
4137
4138
4139// -------------------------------------------------------------------
4140// Analysis
4141
4142
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004143void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004144 if (that->info()->been_analyzed || that->info()->being_analyzed)
4145 return;
4146 that->info()->being_analyzed = true;
4147 that->Accept(this);
4148 that->info()->being_analyzed = false;
4149 that->info()->been_analyzed = true;
4150}
4151
4152
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004153void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004154 // nothing to do
4155}
4156
4157
ager@chromium.org8bb60582008-12-11 12:02:20 +00004158void TextNode::CalculateOffsets() {
4159 int element_count = elements()->length();
4160 // Set up the offsets of the elements relative to the start. This is a fixed
4161 // quantity since a TextNode can only contain fixed-width things.
4162 int cp_offset = 0;
4163 for (int i = 0; i < element_count; i++) {
4164 TextElement& elm = elements()->at(i);
4165 elm.cp_offset = cp_offset;
4166 if (elm.type == TextElement::ATOM) {
4167 cp_offset += elm.data.u_atom->data().length();
4168 } else {
4169 cp_offset++;
4170 Vector<const uc16> quarks = elm.data.u_atom->data();
4171 }
4172 }
4173}
4174
4175
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004176void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004177 if (ignore_case_) {
4178 that->MakeCaseIndependent();
4179 }
4180 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004181 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004182}
4183
4184
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004185void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004186 RegExpNode* target = that->on_success();
4187 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004188 // If the next node is interested in what it follows then this node
4189 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004190 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004191}
4192
4193
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004194void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004195 NodeInfo* info = that->info();
4196 for (int i = 0; i < that->alternatives()->length(); i++) {
4197 RegExpNode* node = that->alternatives()->at(i).node();
4198 EnsureAnalyzed(node);
4199 // Anything the following nodes need to know has to be known by
4200 // this node also, so it can pass it on.
4201 info->AddFromFollowing(node->info());
4202 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004203}
4204
4205
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004206void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4207 NodeInfo* info = that->info();
4208 for (int i = 0; i < that->alternatives()->length(); i++) {
4209 RegExpNode* node = that->alternatives()->at(i).node();
4210 if (node != that->loop_node()) {
4211 EnsureAnalyzed(node);
4212 info->AddFromFollowing(node->info());
4213 }
4214 }
4215 // Check the loop last since it may need the value of this node
4216 // to get a correct result.
4217 EnsureAnalyzed(that->loop_node());
4218 info->AddFromFollowing(that->loop_node()->info());
4219}
4220
4221
4222void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004223 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004224}
4225
4226
4227// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004228// Dispatch table construction
4229
4230
4231void DispatchTableConstructor::VisitEnd(EndNode* that) {
4232 AddRange(CharacterRange::Everything());
4233}
4234
4235
4236void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4237 node->set_being_calculated(true);
4238 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4239 for (int i = 0; i < alternatives->length(); i++) {
4240 set_choice_index(i);
4241 alternatives->at(i).node()->Accept(this);
4242 }
4243 node->set_being_calculated(false);
4244}
4245
4246
4247class AddDispatchRange {
4248 public:
4249 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4250 : constructor_(constructor) { }
4251 void Call(uc32 from, DispatchTable::Entry entry);
4252 private:
4253 DispatchTableConstructor* constructor_;
4254};
4255
4256
4257void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4258 CharacterRange range(from, entry.to());
4259 constructor_->AddRange(range);
4260}
4261
4262
4263void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4264 if (node->being_calculated())
4265 return;
4266 DispatchTable* table = node->GetTable(ignore_case_);
4267 AddDispatchRange adder(this);
4268 table->ForEach(&adder);
4269}
4270
4271
4272void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4273 // TODO(160): Find the node that we refer back to and propagate its start
4274 // set back to here. For now we just accept anything.
4275 AddRange(CharacterRange::Everything());
4276}
4277
4278
4279
4280static int CompareRangeByFrom(const CharacterRange* a,
4281 const CharacterRange* b) {
4282 return Compare<uc16>(a->from(), b->from());
4283}
4284
4285
4286void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4287 ranges->Sort(CompareRangeByFrom);
4288 uc16 last = 0;
4289 for (int i = 0; i < ranges->length(); i++) {
4290 CharacterRange range = ranges->at(i);
4291 if (last < range.from())
4292 AddRange(CharacterRange(last, range.from() - 1));
4293 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004294 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004295 return;
4296 } else {
4297 last = range.to() + 1;
4298 }
4299 }
4300 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004301 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004302}
4303
4304
4305void DispatchTableConstructor::VisitText(TextNode* that) {
4306 TextElement elm = that->elements()->at(0);
4307 switch (elm.type) {
4308 case TextElement::ATOM: {
4309 uc16 c = elm.data.u_atom->data()[0];
4310 AddRange(CharacterRange(c, c));
4311 break;
4312 }
4313 case TextElement::CHAR_CLASS: {
4314 RegExpCharacterClass* tree = elm.data.u_char_class;
4315 ZoneList<CharacterRange>* ranges = tree->ranges();
4316 if (tree->is_negated()) {
4317 AddInverse(ranges);
4318 } else {
4319 for (int i = 0; i < ranges->length(); i++)
4320 AddRange(ranges->at(i));
4321 }
4322 break;
4323 }
4324 default: {
4325 UNIMPLEMENTED();
4326 }
4327 }
4328}
4329
4330
4331void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004332 RegExpNode* target = that->on_success();
4333 target->Accept(this);
4334}
4335
4336
ager@chromium.org8bb60582008-12-11 12:02:20 +00004337Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004338 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004339 bool is_multiline,
4340 Handle<String> pattern,
4341 bool is_ascii) {
4342 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004343 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004344 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004345 0,
4346 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004347 compiler.accept());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004348 // Add a .*? at the beginning, outside the body capture.
4349 // Note: We could choose to not add this if the regexp is anchored at
4350 // the start of the input but I'm not sure how best to do that and
4351 // since we don't even handle ^ yet I'm saving that optimization for
4352 // later.
4353 RegExpNode* node = RegExpQuantifier::ToNode(0,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004354 RegExpTree::kInfinity,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004355 false,
4356 new RegExpCharacterClass('*'),
4357 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004358 captured_body);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004359 data->node = node;
4360 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004361 analysis.EnsureAnalyzed(node);
4362
4363 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004364
4365 if (is_multiline && !FLAG_attempt_multiline_irregexp) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004366 return Handle<FixedArray>::null();
4367 }
4368
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004369 if (FLAG_irregexp_native) {
4370#ifdef ARM
4371 // Unimplemented, fall-through to bytecode implementation.
4372#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004373 RegExpMacroAssemblerIA32::Mode mode;
4374 if (is_ascii) {
4375 mode = RegExpMacroAssemblerIA32::ASCII;
4376 } else {
4377 mode = RegExpMacroAssemblerIA32::UC16;
4378 }
4379 RegExpMacroAssemblerIA32 macro_assembler(mode,
4380 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004381 return compiler.Assemble(&macro_assembler,
4382 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004383 data->capture_count,
4384 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004385#endif
4386 }
4387 EmbeddedVector<byte, 1024> codes;
4388 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4389 return compiler.Assemble(&macro_assembler,
4390 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004391 data->capture_count,
4392 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004393}
4394
4395
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004396}} // namespace v8::internal