blob: 3046b96afa650561afaa37163a4fdd81a5ee6515 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
43
44#ifdef ARM
45#include "regexp-macro-assembler-arm.h"
46#else // IA32
47#include "macro-assembler-ia32.h"
48#include "regexp-macro-assembler-ia32.h"
49#endif
50
51#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000052
53// Including pcre.h undefines DEBUG to avoid getting debug output from
54// the JSCRE implementation. Make sure to redefine it in debug mode
55// after having included the header file.
56#ifdef DEBUG
57#include "third_party/jscre/pcre.h"
58#define DEBUG
59#else
60#include "third_party/jscre/pcre.h"
61#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000062
ager@chromium.orga74f0da2008-12-03 16:05:52 +000063
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000064namespace v8 { namespace internal {
65
66
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000067static Failure* malloc_failure;
68
69static void* JSREMalloc(size_t size) {
70 Object* obj = Heap::AllocateByteArray(size);
71
72 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
73 // will return NULL to the caller, performs GC there.
74 // Also pass failure information to the caller.
75 if (obj->IsFailure()) {
76 malloc_failure = Failure::cast(obj);
77 return NULL;
78 }
79
80 // Note: object is unrooted, the caller of jsRegExpCompile must
81 // create a handle for the return value before doing heap allocation.
82 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
83}
84
85
86static void JSREFree(void* p) {
87 USE(p); // Do nothing, memory is garbage collected.
88}
89
90
91String* RegExpImpl::last_ascii_string_ = NULL;
92String* RegExpImpl::two_byte_cached_string_ = NULL;
93
94
95void RegExpImpl::NewSpaceCollectionPrologue() {
96 // The two byte string is always in the old space. The Ascii string may be
97 // in either place. If it is in the old space we don't need to do anything.
98 if (Heap::InNewSpace(last_ascii_string_)) {
99 // Invalidate the cache.
100 last_ascii_string_ = NULL;
101 two_byte_cached_string_ = NULL;
102 }
103}
104
105
106void RegExpImpl::OldSpaceCollectionPrologue() {
107 last_ascii_string_ = NULL;
108 two_byte_cached_string_ = NULL;
109}
110
111
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000112Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
113 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000114 Handle<String> flags,
115 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000116 // Ensure that the constructor function has been loaded.
117 if (!constructor->IsLoaded()) {
118 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000119 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000120 }
121 // Call the construct code with 2 arguments.
122 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
123 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000124 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000125}
126
127
128// Converts a source string to a 16 bit flat string or a SlicedString containing
129// a 16 bit flat string).
130Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
131 if (*subject == last_ascii_string_) {
132 ASSERT(two_byte_cached_string_ != NULL);
133 return Handle<String>(String::cast(two_byte_cached_string_));
134 }
135 Handle<String> two_byte_string = StringToTwoByte(subject);
136 last_ascii_string_ = *subject;
137 two_byte_cached_string_ = *two_byte_string;
138 return two_byte_string;
139}
140
141
142// Converts a source string to a 16 bit flat string or a SlicedString containing
143// a 16 bit flat string).
144Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000145 StringShape shape(*pattern);
146 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000147 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000148 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000149 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000150 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000151 String::cast(ConsString::cast(*pattern)->first()) :
152 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000153 ASSERT(flat_string->IsString());
154 StringShape flat_shape(*flat_string);
155 ASSERT(!flat_shape.IsCons());
156 ASSERT(flat_shape.IsSequential() ||
157 flat_shape.IsSliced() ||
158 flat_shape.IsExternal());
159 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000160 return flat_string;
161 }
162
ager@chromium.org870a0b62008-11-04 11:43:05 +0000163 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000164 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000165 Factory::NewRawTwoByteString(len, TENURED);
166 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
167 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000168 return two_byte_string;
169}
170
171
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000172static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
173 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000174 StringShape shape(*str);
175 for (int i = 0; i < str->length(shape); i++) {
176 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000177 case 'i':
178 flags |= JSRegExp::IGNORE_CASE;
179 break;
180 case 'g':
181 flags |= JSRegExp::GLOBAL;
182 break;
183 case 'm':
184 flags |= JSRegExp::MULTILINE;
185 break;
186 }
187 }
188 return JSRegExp::Flags(flags);
189}
190
191
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000192static inline void ThrowRegExpException(Handle<JSRegExp> re,
193 Handle<String> pattern,
194 Handle<String> error_text,
195 const char* message) {
196 Handle<JSArray> array = Factory::NewJSArray(2);
197 SetElement(array, 0, pattern);
198 SetElement(array, 1, error_text);
199 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
200 Top::Throw(*regexp_err);
201}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000202
203
204Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
205 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000206 Handle<String> flag_str) {
207 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
208 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
209 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000210 LOG(RegExpCompileEvent(re, in_cache));
211
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000212 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000213 if (in_cache) {
214 re->set_data(*cached);
215 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000216 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000217 FlattenString(pattern);
218 ZoneScope zone_scope(DELETE_ON_EXIT);
219 RegExpParseResult parse_result;
220 FlatStringReader reader(pattern);
221 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
222 // Throw an exception if we fail to parse the pattern.
223 ThrowRegExpException(re,
224 pattern,
225 parse_result.error,
226 "malformed_regexp");
227 return Handle<Object>();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000228 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000229 RegExpAtom* atom = parse_result.tree->AsAtom();
230 if (atom != NULL && !flags.is_ignore_case()) {
231 if (parse_result.has_character_escapes) {
232 Vector<const uc16> atom_pattern = atom->data();
233 Handle<String> atom_string =
234 Factory::NewStringFromTwoByte(atom_pattern);
235 result = AtomCompile(re, pattern, flags, atom_string);
236 } else {
237 result = AtomCompile(re, pattern, flags, pattern);
238 }
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000239 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000240 RegExpNode* node = NULL;
241 Handle<FixedArray> irregexp_data =
242 RegExpEngine::Compile(&parse_result,
243 &node,
244 flags.is_ignore_case(),
245 flags.is_multiline());
246 if (irregexp_data.is_null()) {
247 if (FLAG_disable_jscre) {
248 UNIMPLEMENTED();
249 }
250 result = JscrePrepare(re, pattern, flags);
251 } else {
252 result = IrregexpPrepare(re, pattern, flags, irregexp_data);
253 }
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000254 }
255 Object* data = re->data();
256 if (data->IsFixedArray()) {
257 // If compilation succeeded then the data is set on the regexp
258 // and we can store it in the cache.
259 Handle<FixedArray> data(FixedArray::cast(re->data()));
260 CompilationCache::PutRegExp(pattern, flags, data);
261 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000262 }
263
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000264 return result;
265}
266
267
268Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
269 Handle<String> subject,
270 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000271 switch (regexp->TypeTag()) {
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000272 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000273 if (FLAG_disable_jscre) {
274 UNIMPLEMENTED();
275 }
276 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000277 case JSRegExp::ATOM:
278 return AtomExec(regexp, subject, index);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000279 case JSRegExp::IRREGEXP:
280 return IrregexpExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000281 default:
282 UNREACHABLE();
283 return Handle<Object>();
284 }
285}
286
287
288Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
289 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000290 switch (regexp->TypeTag()) {
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000291 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000292 if (FLAG_disable_jscre) {
293 UNIMPLEMENTED();
294 }
295 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000296 case JSRegExp::ATOM:
297 return AtomExecGlobal(regexp, subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000298 case JSRegExp::IRREGEXP:
299 return IrregexpExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000300 default:
301 UNREACHABLE();
302 return Handle<Object>();
303 }
304}
305
306
307Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000308 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000309 JSRegExp::Flags flags,
310 Handle<String> match_pattern) {
311 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000312 return re;
313}
314
315
316Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
317 Handle<String> subject,
318 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000319 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000320
321 uint32_t start_index;
322 if (!Array::IndexFromObject(*index, &start_index)) {
323 return Handle<Smi>(Smi::FromInt(-1));
324 }
325
326 LOG(RegExpExecEvent(re, start_index, subject));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000327 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000328 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000329
330 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000331 array->set(0, Smi::FromInt(value));
332 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000333 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000334}
335
336
337Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
338 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000339 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000340 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000341 int index = 0;
342 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000343 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000344 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000345 while (true) {
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000346 LOG(RegExpExecEvent(re, index, subject));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000347 int value = -1;
348 if (index + needle_length <= subject_length) {
349 value = Runtime::StringMatch(subject, needle, index);
350 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000351 if (value == -1) break;
352 HandleScope scope;
353 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000354
355 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000356 array->set(0, Smi::FromInt(value));
357 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000358 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000359 SetElement(result, match_count, pair);
360 match_count++;
361 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000362 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000363 }
364 return result;
365}
366
367
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000368Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
369 Handle<String> pattern,
370 JSRegExp::Flags flags) {
371 Handle<Object> value(Heap::undefined_value());
372 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
373 return re;
374}
375
376
377Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
378 Handle<String> pattern,
379 JSRegExp::Flags flags,
380 Handle<FixedArray> irregexp_data) {
381 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);
382 return re;
383}
384
385
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000386static inline Object* DoCompile(String* pattern,
387 JSRegExp::Flags flags,
388 unsigned* number_of_captures,
389 const char** error_message,
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000390 v8::jscre::JscreRegExp** code) {
391 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
392 ? v8::jscre::JSRegExpIgnoreCase
393 : v8::jscre::JSRegExpDoNotIgnoreCase;
394 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
395 ? v8::jscre::JSRegExpMultiline
396 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000397 *error_message = NULL;
398 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000399 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
400 pattern->length(),
401 case_option,
402 multiline_option,
403 number_of_captures,
404 error_message,
405 &JSREMalloc,
406 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000407 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
408 malloc_failure->IsOutOfMemoryFailure())) {
409 return malloc_failure;
410 } else {
411 // It doesn't matter which object we return here, we just need to return
412 // a non-failure to indicate to the GC-retry code that there was no
413 // allocation failure.
414 return pattern;
415 }
416}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000417
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000418
419void CompileWithRetryAfterGC(Handle<String> pattern,
420 JSRegExp::Flags flags,
421 unsigned* number_of_captures,
422 const char** error_message,
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000423 v8::jscre::JscreRegExp** code) {
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000424 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,
425 flags,
426 number_of_captures,
427 error_message,
428 code));
429}
430
431
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000432Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
433 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
434 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
435
436 Handle<String> pattern(re->Pattern());
437 JSRegExp::Flags flags = re->GetFlags();
438
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000439 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
440
441 unsigned number_of_captures;
442 const char* error_message = NULL;
443
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000444 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000445 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000446
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000447 CompileWithRetryAfterGC(two_byte_pattern,
448 flags,
449 &number_of_captures,
450 &error_message,
451 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000452
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000453 if (code == NULL) {
454 // Throw an exception.
455 Handle<JSArray> array = Factory::NewJSArray(2);
456 SetElement(array, 0, pattern);
457 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(
458 (error_message == NULL) ? "Unknown regexp error" : error_message)));
459 Handle<Object> regexp_err =
460 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000461 Top::Throw(*regexp_err);
462 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000463 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000464
465 // Convert the return address to a ByteArray pointer.
466 Handle<ByteArray> internal(
467 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
468
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000469 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
470 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
471 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000472 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
473
474 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000475}
476
477
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000478Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
479 int num_captures,
480 Handle<String> two_byte_subject,
481 int previous_index,
482 int* offsets_vector,
483 int offsets_vector_length) {
484#ifdef DEBUG
485 if (FLAG_trace_regexp_bytecodes) {
486 String* pattern = regexp->Pattern();
487 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
488 PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));
489 }
490#endif
491 ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());
492 ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));
493 bool rc;
494
495 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
496 offsets_vector[i] = -1;
497 }
498
499 LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));
500
501 FixedArray* irregexp =
502 FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));
503 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
504
505 switch (tag) {
506 case RegExpMacroAssembler::kIA32Implementation: {
507#ifndef ARM
508 Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));
509 Address start_addr =
510 Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();
511 int string_offset =
512 start_addr - reinterpret_cast<Address>(*two_byte_subject);
513 int start_offset = string_offset + previous_index * sizeof(uc16);
514 int end_offset =
515 string_offset + two_byte_subject->length() * sizeof(uc16);
516 rc = RegExpMacroAssemblerIA32::Execute(code,
517 two_byte_subject.location(),
518 start_offset,
519 end_offset,
520 offsets_vector,
521 previous_index == 0);
522 if (rc) {
523 // Capture values are relative to start_offset only.
524 for (int i = 0; i < offsets_vector_length; i++) {
525 if (offsets_vector[i] >= 0) {
526 offsets_vector[i] += previous_index;
527 }
528 }
529 }
530 break;
531#else
532 UNIMPLEMENTED();
533 rc = false;
534 break;
535#endif
536 }
537 case RegExpMacroAssembler::kBytecodeImplementation: {
538 Handle<ByteArray> byte_codes = IrregexpCode(regexp);
539
540 rc = IrregexpInterpreter::Match(byte_codes,
541 two_byte_subject,
542 offsets_vector,
543 previous_index);
544 break;
545 }
546 case RegExpMacroAssembler::kARMImplementation:
547 default:
548 UNREACHABLE();
549 rc = false;
550 break;
551 }
552
553 if (!rc) {
554 return Factory::null_value();
555 }
556
557 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
558 // The captures come in (start, end+1) pairs.
559 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
560 array->set(i, Smi::FromInt(offsets_vector[i]));
561 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
562 }
563 return Factory::NewJSArrayWithElements(array);
564}
565
566
567Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
568 int num_captures,
569 Handle<String> subject,
570 int previous_index,
571 const uc16* two_byte_subject,
572 int* offsets_vector,
573 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000574 int rc;
575 {
576 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000577 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000578 const v8::jscre::JscreRegExp* js_regexp =
579 reinterpret_cast<v8::jscre::JscreRegExp*>(
580 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000581
kasperl@chromium.orgb9123622008-09-17 14:05:56 +0000582 LOG(RegExpExecEvent(regexp, previous_index, subject));
583
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000584 rc = v8::jscre::jsRegExpExecute(js_regexp,
585 two_byte_subject,
586 subject->length(),
587 previous_index,
588 offsets_vector,
589 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000590 }
591
592 // The KJS JavaScript engine returns null (ie, a failed match) when
593 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000594 if (rc == v8::jscre::JSRegExpErrorNoMatch
595 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000596 return Factory::null_value();
597 }
598
599 // Other JSRE errors:
600 if (rc < 0) {
601 // Throw an exception.
602 Handle<Object> code(Smi::FromInt(rc));
603 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
604 Handle<Object> regexp_err(
605 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
606 return Handle<Object>(Top::Throw(*regexp_err));
607 }
608
ager@chromium.org7c537e22008-10-16 08:43:32 +0000609 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000610 // The captures come in (start, end+1) pairs.
611 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000612 array->set(i, Smi::FromInt(offsets_vector[i]));
613 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000614 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000615 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000616}
617
618
619class OffsetsVector {
620 public:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000621 inline OffsetsVector(int num_registers)
622 : offsets_vector_length_(num_registers) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000623 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
624 vector_ = NewArray<int>(offsets_vector_length_);
625 } else {
626 vector_ = static_offsets_vector_;
627 }
628 }
629
630
631 inline ~OffsetsVector() {
632 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
633 DeleteArray(vector_);
634 vector_ = NULL;
635 }
636 }
637
638
639 inline int* vector() {
640 return vector_;
641 }
642
643
644 inline int length() {
645 return offsets_vector_length_;
646 }
647
648 private:
649 int* vector_;
650 int offsets_vector_length_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000651 static const int kStaticOffsetsVectorSize = 50;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000652 static int static_offsets_vector_[kStaticOffsetsVectorSize];
653};
654
655
656int OffsetsVector::static_offsets_vector_[
657 OffsetsVector::kStaticOffsetsVectorSize];
658
659
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000660Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
661 Handle<String> subject,
662 Handle<Object> index) {
663 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
664 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000665
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000666 // Prepare space for the return values.
667 int number_of_registers = IrregexpNumberOfRegisters(regexp);
668 OffsetsVector offsets(number_of_registers);
669
670 int num_captures = IrregexpNumberOfCaptures(regexp);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000671
672 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
673
674 Handle<String> subject16 = CachedStringToTwoByte(subject);
675
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000676 Handle<Object> result(IrregexpExecOnce(regexp,
677 num_captures,
678 subject16,
679 previous_index,
680 offsets.vector(),
681 offsets.length()));
682 return result;
683}
684
685
686Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
687 Handle<String> subject,
688 Handle<Object> index) {
689 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
690 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
691 Handle<Object> compile_result = JscreCompile(regexp);
692 if (compile_result.is_null()) return compile_result;
693 }
694 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
695
696 int num_captures = JscreNumberOfCaptures(regexp);
697
698 OffsetsVector offsets((num_captures + 1) * 3);
699
700 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
701
702 Handle<String> subject16 = CachedStringToTwoByte(subject);
703
704 Handle<Object> result(JscreExecOnce(regexp,
705 num_captures,
706 subject,
707 previous_index,
708 subject16->GetTwoByteData(),
709 offsets.vector(),
710 offsets.length()));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000711
712 return result;
713}
714
715
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000716Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
717 Handle<String> subject) {
718 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
719 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000720
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000721 // Prepare space for the return values.
722 int number_of_registers = IrregexpNumberOfRegisters(regexp);
723 OffsetsVector offsets(number_of_registers);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000724
725 int previous_index = 0;
726
ager@chromium.org7c537e22008-10-16 08:43:32 +0000727 Handle<JSArray> result = Factory::NewJSArray(0);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000728 int i = 0;
729 Handle<Object> matches;
730
731 Handle<String> subject16 = CachedStringToTwoByte(subject);
732
733 do {
734 if (previous_index > subject->length() || previous_index < 0) {
735 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
736 // string length, there is no match.
737 matches = Factory::null_value();
738 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000739 matches = IrregexpExecOnce(regexp,
740 IrregexpNumberOfCaptures(regexp),
741 subject16,
742 previous_index,
743 offsets.vector(),
744 offsets.length());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000745
746 if (matches->IsJSArray()) {
747 SetElement(result, i, matches);
748 i++;
749 previous_index = offsets.vector()[1];
750 if (offsets.vector()[0] == offsets.vector()[1]) {
751 previous_index++;
752 }
753 }
754 }
755 } while (matches->IsJSArray());
756
757 // If we exited the loop with an exception, throw it.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000758 if (matches->IsNull()) {
759 // Exited loop normally.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000760 return result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000761 } else {
762 // Exited loop with the exception in matches.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000763 return matches;
764 }
765}
766
767
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000768Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
769 Handle<String> subject) {
770 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
771 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
772 Handle<Object> compile_result = JscreCompile(regexp);
773 if (compile_result.is_null()) return compile_result;
774 }
775 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
776
777 // Prepare space for the return values.
778 int num_captures = JscreNumberOfCaptures(regexp);
779
780 OffsetsVector offsets((num_captures + 1) * 3);
781
782 int previous_index = 0;
783
784 Handle<JSArray> result = Factory::NewJSArray(0);
785 int i = 0;
786 Handle<Object> matches;
787
788 Handle<String> subject16 = CachedStringToTwoByte(subject);
789
790 do {
791 if (previous_index > subject->length() || previous_index < 0) {
792 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
793 // string length, there is no match.
794 matches = Factory::null_value();
795 } else {
796 matches = JscreExecOnce(regexp,
797 num_captures,
798 subject,
799 previous_index,
800 subject16->GetTwoByteData(),
801 offsets.vector(),
802 offsets.length());
803
804 if (matches->IsJSArray()) {
805 SetElement(result, i, matches);
806 i++;
807 previous_index = offsets.vector()[1];
808 if (offsets.vector()[0] == offsets.vector()[1]) {
809 previous_index++;
810 }
811 }
812 }
813 } while (matches->IsJSArray());
814
815 // If we exited the loop with an exception, throw it.
816 if (matches->IsNull()) {
817 // Exited loop normally.
818 return result;
819 } else {
820 // Exited loop with the exception in matches.
821 return matches;
822 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000823}
824
825
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000826int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000827 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000828 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000829}
830
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000831
832ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
833 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
834 return ByteArray::cast(value->get(kJscreInternalIndex));
835}
836
837
838int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {
839 FixedArray* value =
840 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
841 return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();
842}
843
844
845int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {
846 FixedArray* value =
847 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
848 return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();
849}
850
851
852Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {
853 FixedArray* value =
854 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
855 return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));
856}
857
858
859// -------------------------------------------------------------------
860// Implmentation of the Irregexp regular expression engine.
861
862
863void RegExpTree::AppendToText(RegExpText* text) {
864 UNREACHABLE();
865}
866
867
868void RegExpAtom::AppendToText(RegExpText* text) {
869 text->AddElement(TextElement::Atom(this));
870}
871
872
873void RegExpCharacterClass::AppendToText(RegExpText* text) {
874 text->AddElement(TextElement::CharClass(this));
875}
876
877
878void RegExpText::AppendToText(RegExpText* text) {
879 for (int i = 0; i < elements()->length(); i++)
880 text->AddElement(elements()->at(i));
881}
882
883
884TextElement TextElement::Atom(RegExpAtom* atom) {
885 TextElement result = TextElement(ATOM);
886 result.data.u_atom = atom;
887 return result;
888}
889
890
891TextElement TextElement::CharClass(
892 RegExpCharacterClass* char_class) {
893 TextElement result = TextElement(CHAR_CLASS);
894 result.data.u_char_class = char_class;
895 return result;
896}
897
898
899DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
900 if (table_ == NULL) {
901 table_ = new DispatchTable();
902 DispatchTableConstructor cons(table_, ignore_case);
903 cons.BuildTable(this);
904 }
905 return table_;
906}
907
908
909class RegExpCompiler {
910 public:
911 RegExpCompiler(int capture_count, bool ignore_case);
912
913 int AllocateRegister() { return next_register_++; }
914
915 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
916 RegExpNode* start,
917 int capture_count);
918
919 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
920
921 static const int kImplementationOffset = 0;
922 static const int kNumberOfRegistersOffset = 0;
923 static const int kCodeOffset = 1;
924
925 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
926 EndNode* accept() { return accept_; }
927 EndNode* backtrack() { return backtrack_; }
928
929 static const int kMaxRecursion = 100;
930 inline int recursion_depth() { return recursion_depth_; }
931 inline void IncrementRecursionDepth() { recursion_depth_++; }
932 inline void DecrementRecursionDepth() { recursion_depth_--; }
933
934 inline bool ignore_case() { return ignore_case_; }
935
936 private:
937 EndNode* accept_;
938 EndNode* backtrack_;
939 int next_register_;
940 List<RegExpNode*>* work_list_;
941 int recursion_depth_;
942 RegExpMacroAssembler* macro_assembler_;
943 bool ignore_case_;
944};
945
946
947// Attempts to compile the regexp using an Irregexp code generator. Returns
948// a fixed array or a null handle depending on whether it succeeded.
949RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
950 : next_register_(2 * (capture_count + 1)),
951 work_list_(NULL),
952 recursion_depth_(0),
953 ignore_case_(ignore_case) {
954 accept_ = new EndNode(EndNode::ACCEPT);
955 backtrack_ = new EndNode(EndNode::BACKTRACK);
956}
957
958
959Handle<FixedArray> RegExpCompiler::Assemble(
960 RegExpMacroAssembler* macro_assembler,
961 RegExpNode* start,
962 int capture_count) {
963#ifdef DEBUG
964 if (FLAG_trace_regexp_assembler)
965 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
966 else
967#endif
968 macro_assembler_ = macro_assembler;
969 List <RegExpNode*> work_list(0);
970 work_list_ = &work_list;
971 Label fail;
972 macro_assembler_->PushBacktrack(&fail);
973 if (!start->GoTo(this)) {
974 fail.Unuse();
975 return Handle<FixedArray>::null();
976 }
977 while (!work_list.is_empty()) {
978 if (!work_list.RemoveLast()->GoTo(this)) {
979 fail.Unuse();
980 return Handle<FixedArray>::null();
981 }
982 }
983 macro_assembler_->Bind(&fail);
984 macro_assembler_->Fail();
985 Handle<FixedArray> array =
986 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
987 array->set(RegExpImpl::kIrregexpImplementationIndex,
988 Smi::FromInt(macro_assembler_->Implementation()));
989 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
990 Smi::FromInt(next_register_));
991 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
992 Smi::FromInt(capture_count));
993 Handle<Object> code = macro_assembler_->GetCode();
994 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
995 work_list_ = NULL;
996#ifdef DEBUG
997 if (FLAG_trace_regexp_assembler) {
998 delete macro_assembler_;
999 }
1000#endif
1001 return array;
1002}
1003
1004
1005bool RegExpNode::GoTo(RegExpCompiler* compiler) {
1006 // TODO(erikcorry): Implement support.
1007 if (info_.follows_word_interest ||
1008 info_.follows_newline_interest ||
1009 info_.follows_start_interest) {
1010 return false;
1011 }
1012 if (label_.is_bound()) {
1013 compiler->macro_assembler()->GoTo(&label_);
1014 return true;
1015 } else {
1016 if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {
1017 compiler->macro_assembler()->GoTo(&label_);
1018 compiler->AddWork(this);
1019 return true;
1020 } else {
1021 compiler->IncrementRecursionDepth();
1022 bool how_it_went = Emit(compiler);
1023 compiler->DecrementRecursionDepth();
1024 return how_it_went;
1025 }
1026 }
1027}
1028
1029
1030// EndNodes are special. Because they can be very common and they are very
1031// short we normally inline them. That is, if we are asked to emit a GoTo
1032// we just emit the entire node. Since they don't have successors this
1033// works.
1034bool EndNode::GoTo(RegExpCompiler* compiler) {
1035 if (info()->follows_word_interest ||
1036 info()->follows_newline_interest ||
1037 info()->follows_start_interest) {
1038 return false;
1039 }
1040 return Emit(compiler);
1041}
1042
1043
1044Label* RegExpNode::label() {
1045 return &label_;
1046}
1047
1048
1049bool EndNode::Emit(RegExpCompiler* compiler) {
1050 RegExpMacroAssembler* macro = compiler->macro_assembler();
1051 switch (action_) {
1052 case ACCEPT:
1053 if (!label()->is_bound()) Bind(macro);
1054 if (info()->at_end) {
1055 Label succeed;
1056 // LoadCurrentCharacter will go to the label if we are at the end of the
1057 // input string.
1058 macro->LoadCurrentCharacter(0, &succeed);
1059 macro->Backtrack();
1060 macro->Bind(&succeed);
1061 }
1062 macro->Succeed();
1063 return true;
1064 case BACKTRACK:
1065 if (!label()->is_bound()) Bind(macro);
1066 ASSERT(!info()->at_end);
1067 macro->Backtrack();
1068 return true;
1069 }
1070 return false;
1071}
1072
1073
1074void GuardedAlternative::AddGuard(Guard* guard) {
1075 if (guards_ == NULL)
1076 guards_ = new ZoneList<Guard*>(1);
1077 guards_->Add(guard);
1078}
1079
1080
1081ActionNode* ActionNode::StoreRegister(int reg,
1082 int val,
1083 RegExpNode* on_success) {
1084 ActionNode* result = new ActionNode(STORE_REGISTER, on_success);
1085 result->data_.u_store_register.reg = reg;
1086 result->data_.u_store_register.value = val;
1087 return result;
1088}
1089
1090
1091ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1092 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1093 result->data_.u_increment_register.reg = reg;
1094 return result;
1095}
1096
1097
1098ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1099 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1100 result->data_.u_position_register.reg = reg;
1101 return result;
1102}
1103
1104
1105ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {
1106 ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);
1107 result->data_.u_position_register.reg = reg;
1108 return result;
1109}
1110
1111
1112ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1113 int position_reg,
1114 RegExpNode* on_success) {
1115 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1116 result->data_.u_submatch.stack_pointer_register = stack_reg;
1117 result->data_.u_submatch.current_position_register = position_reg;
1118 return result;
1119}
1120
1121
1122ActionNode* ActionNode::EscapeSubmatch(int stack_reg,
1123 bool restore_position,
1124 int position_reg,
1125 RegExpNode* on_success) {
1126 ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);
1127 result->data_.u_submatch.stack_pointer_register = stack_reg;
1128 if (restore_position) {
1129 result->data_.u_submatch.current_position_register = position_reg;
1130 } else {
1131 result->data_.u_submatch.current_position_register = -1;
1132 }
1133 return result;
1134}
1135
1136
1137#define DEFINE_ACCEPT(Type) \
1138 void Type##Node::Accept(NodeVisitor* visitor) { \
1139 visitor->Visit##Type(this); \
1140 }
1141FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1142#undef DEFINE_ACCEPT
1143
1144
1145// -------------------------------------------------------------------
1146// Emit code.
1147
1148
1149void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1150 Guard* guard,
1151 Label* on_failure) {
1152 switch (guard->op()) {
1153 case Guard::LT:
1154 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);
1155 break;
1156 case Guard::GEQ:
1157 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);
1158 break;
1159 }
1160}
1161
1162
1163static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1164static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1165
1166
1167static inline void EmitAtomNonLetters(
1168 RegExpMacroAssembler* macro_assembler,
1169 TextElement elm,
1170 Vector<const uc16> quarks,
1171 Label* on_failure,
1172 int cp_offset) {
1173 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1174 for (int i = quarks.length() - 1; i >= 0; i--) {
1175 uc16 c = quarks[i];
1176 int length = uncanonicalize.get(c, '\0', chars);
1177 if (length <= 1) {
1178 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1179 macro_assembler->CheckNotCharacter(c, on_failure);
1180 }
1181 }
1182}
1183
1184
1185static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1186 uc16 c1,
1187 uc16 c2,
1188 Label* on_failure) {
1189 uc16 exor = c1 ^ c2;
1190 // Check whether exor has only one bit set.
1191 if (((exor - 1) & exor) == 0) {
1192 // If c1 and c2 differ only by one bit.
1193 // Ecma262UnCanonicalize always gives the highest number last.
1194 ASSERT(c2 > c1);
1195 macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
1196 return true;
1197 }
1198 ASSERT(c2 > c1);
1199 uc16 diff = c2 - c1;
1200 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1201 // If the characters differ by 2^n but don't differ by one bit then
1202 // subtract the difference from the found character, then do the or
1203 // trick. We avoid the theoretical case where negative numbers are
1204 // involved in order to simplify code generation.
1205 macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
1206 diff,
1207 on_failure);
1208 return true;
1209 }
1210 return false;
1211}
1212
1213
1214static inline void EmitAtomLetters(
1215 RegExpMacroAssembler* macro_assembler,
1216 TextElement elm,
1217 Vector<const uc16> quarks,
1218 Label* on_failure,
1219 int cp_offset) {
1220 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1221 for (int i = quarks.length() - 1; i >= 0; i--) {
1222 uc16 c = quarks[i];
1223 int length = uncanonicalize.get(c, '\0', chars);
1224 if (length <= 1) continue;
1225 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1226 Label ok;
1227 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1228 switch (length) {
1229 case 2: {
1230 if (ShortCutEmitCharacterPair(macro_assembler,
1231 chars[0],
1232 chars[1],
1233 on_failure)) {
1234 ok.Unuse();
1235 } else {
1236 macro_assembler->CheckCharacter(chars[0], &ok);
1237 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1238 macro_assembler->Bind(&ok);
1239 }
1240 break;
1241 }
1242 case 4:
1243 macro_assembler->CheckCharacter(chars[3], &ok);
1244 // Fall through!
1245 case 3:
1246 macro_assembler->CheckCharacter(chars[0], &ok);
1247 macro_assembler->CheckCharacter(chars[1], &ok);
1248 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1249 macro_assembler->Bind(&ok);
1250 break;
1251 default:
1252 UNREACHABLE();
1253 break;
1254 }
1255 }
1256}
1257
1258
1259static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1260 RegExpCharacterClass* cc,
1261 int cp_offset,
1262 Label* on_failure) {
1263 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1264 cp_offset++;
1265
1266 ZoneList<CharacterRange>* ranges = cc->ranges();
1267
1268 Label success;
1269
1270 Label* char_is_in_class =
1271 cc->is_negated() ? on_failure : &success;
1272
1273 int range_count = ranges->length();
1274
1275 if (range_count == 0) {
1276 if (!cc->is_negated()) {
1277 macro_assembler->GoTo(on_failure);
1278 }
1279 return;
1280 }
1281
1282 for (int i = 0; i < range_count - 1; i++) {
1283 CharacterRange& range = ranges->at(i);
1284 Label next_range;
1285 uc16 from = range.from();
1286 uc16 to = range.to();
1287 if (to == from) {
1288 macro_assembler->CheckCharacter(to, char_is_in_class);
1289 } else {
1290 if (from != 0) {
1291 macro_assembler->CheckCharacterLT(from, &next_range);
1292 }
1293 if (to != 0xffff) {
1294 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1295 } else {
1296 macro_assembler->GoTo(char_is_in_class);
1297 }
1298 }
1299 macro_assembler->Bind(&next_range);
1300 }
1301
1302 CharacterRange& range = ranges->at(range_count - 1);
1303 uc16 from = range.from();
1304 uc16 to = range.to();
1305
1306 if (to == from) {
1307 if (cc->is_negated()) {
1308 macro_assembler->CheckCharacter(to, on_failure);
1309 } else {
1310 macro_assembler->CheckNotCharacter(to, on_failure);
1311 }
1312 } else {
1313 if (from != 0) {
1314 if (cc->is_negated()) {
1315 macro_assembler->CheckCharacterLT(from, &success);
1316 } else {
1317 macro_assembler->CheckCharacterLT(from, on_failure);
1318 }
1319 }
1320 if (to != 0xffff) {
1321 if (cc->is_negated()) {
1322 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1323 } else {
1324 macro_assembler->CheckCharacterGT(to, on_failure);
1325 }
1326 } else {
1327 if (cc->is_negated()) {
1328 macro_assembler->GoTo(on_failure);
1329 }
1330 }
1331 }
1332 macro_assembler->Bind(&success);
1333}
1334
1335
1336
1337bool TextNode::Emit(RegExpCompiler* compiler) {
1338 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1339 Bind(macro_assembler);
1340 int element_count = elms_->length();
1341 ASSERT(element_count != 0);
1342 int cp_offset = 0;
1343 if (info()->at_end) {
1344 macro_assembler->Backtrack();
1345 return true;
1346 }
1347 // First, handle straight character matches.
1348 for (int i = 0; i < element_count; i++) {
1349 TextElement elm = elms_->at(i);
1350 if (elm.type == TextElement::ATOM) {
1351 Vector<const uc16> quarks = elm.data.u_atom->data();
1352 if (compiler->ignore_case()) {
1353 EmitAtomNonLetters(macro_assembler,
1354 elm,
1355 quarks,
1356 on_failure_->label(),
1357 cp_offset);
1358 } else {
1359 macro_assembler->CheckCharacters(quarks,
1360 cp_offset,
1361 on_failure_->label());
1362 }
1363 cp_offset += quarks.length();
1364 } else {
1365 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
1366 cp_offset++;
1367 }
1368 }
1369 // Second, handle case independent letter matches if any.
1370 if (compiler->ignore_case()) {
1371 cp_offset = 0;
1372 for (int i = 0; i < element_count; i++) {
1373 TextElement elm = elms_->at(i);
1374 if (elm.type == TextElement::ATOM) {
1375 Vector<const uc16> quarks = elm.data.u_atom->data();
1376 EmitAtomLetters(macro_assembler,
1377 elm,
1378 quarks,
1379 on_failure_->label(),
1380 cp_offset);
1381 cp_offset += quarks.length();
1382 } else {
1383 cp_offset++;
1384 }
1385 }
1386 }
1387 // If the fast character matches passed then do the character classes.
1388 cp_offset = 0;
1389 for (int i = 0; i < element_count; i++) {
1390 TextElement elm = elms_->at(i);
1391 if (elm.type == TextElement::CHAR_CLASS) {
1392 RegExpCharacterClass* cc = elm.data.u_char_class;
1393 EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());
1394 cp_offset++;
1395 } else {
1396 cp_offset += elm.data.u_atom->data().length();
1397 }
1398 }
1399
1400 compiler->AddWork(on_failure_);
1401 macro_assembler->AdvanceCurrentPosition(cp_offset);
1402 return on_success()->GoTo(compiler);
1403}
1404
1405
1406void TextNode::MakeCaseIndependent() {
1407 int element_count = elms_->length();
1408 for (int i = 0; i < element_count; i++) {
1409 TextElement elm = elms_->at(i);
1410 if (elm.type == TextElement::CHAR_CLASS) {
1411 RegExpCharacterClass* cc = elm.data.u_char_class;
1412 ZoneList<CharacterRange>* ranges = cc->ranges();
1413 int range_count = ranges->length();
1414 for (int i = 0; i < range_count; i++) {
1415 ranges->at(i).AddCaseEquivalents(ranges);
1416 }
1417 }
1418 }
1419}
1420
1421
1422bool ChoiceNode::Emit(RegExpCompiler* compiler) {
1423 int choice_count = alternatives_->length();
1424 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1425 Bind(macro_assembler);
1426 // For now we just call all choices one after the other. The idea ultimately
1427 // is to use the Dispatch table to try only the relevant ones.
1428 for (int i = 0; i < choice_count - 1; i++) {
1429 GuardedAlternative alternative = alternatives_->at(i);
1430 Label after;
1431 Label after_no_pop_cp;
1432 ZoneList<Guard*>* guards = alternative.guards();
1433 if (guards != NULL) {
1434 int guard_count = guards->length();
1435 for (int j = 0; j < guard_count; j++) {
1436 GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);
1437 }
1438 }
1439 macro_assembler->PushCurrentPosition();
1440 macro_assembler->PushBacktrack(&after);
1441 if (!alternative.node()->GoTo(compiler)) {
1442 after.Unuse();
1443 after_no_pop_cp.Unuse();
1444 return false;
1445 }
1446 macro_assembler->Bind(&after);
1447 macro_assembler->PopCurrentPosition();
1448 macro_assembler->Bind(&after_no_pop_cp);
1449 }
1450 GuardedAlternative alternative = alternatives_->at(choice_count - 1);
1451 ZoneList<Guard*>* guards = alternative.guards();
1452 if (guards != NULL) {
1453 int guard_count = guards->length();
1454 for (int j = 0; j < guard_count; j++) {
1455 GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());
1456 }
1457 }
1458 if (!on_failure_->IsBacktrack()) {
1459 ASSERT_NOT_NULL(on_failure_ -> label());
1460 macro_assembler->PushBacktrack(on_failure_->label());
1461 compiler->AddWork(on_failure_);
1462 }
1463 if (!alternative.node()->GoTo(compiler)) {
1464 return false;
1465 }
1466 return true;
1467}
1468
1469
1470bool ActionNode::Emit(RegExpCompiler* compiler) {
1471 RegExpMacroAssembler* macro = compiler->macro_assembler();
1472 Bind(macro);
1473 switch (type_) {
1474 case STORE_REGISTER:
1475 macro->SetRegister(data_.u_store_register.reg,
1476 data_.u_store_register.value);
1477 break;
1478 case INCREMENT_REGISTER: {
1479 Label undo;
1480 macro->PushBacktrack(&undo);
1481 macro->AdvanceRegister(data_.u_increment_register.reg, 1);
1482 bool ok = on_success()->GoTo(compiler);
1483 if (!ok) {
1484 undo.Unuse();
1485 return false;
1486 }
1487 macro->Bind(&undo);
1488 macro->AdvanceRegister(data_.u_increment_register.reg, -1);
1489 macro->Backtrack();
1490 break;
1491 }
1492 case STORE_POSITION: {
1493 Label undo;
1494 macro->PushRegister(data_.u_position_register.reg);
1495 macro->PushBacktrack(&undo);
1496 macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);
1497 bool ok = on_success()->GoTo(compiler);
1498 if (!ok) {
1499 undo.Unuse();
1500 return false;
1501 }
1502 macro->Bind(&undo);
1503 macro->PopRegister(data_.u_position_register.reg);
1504 macro->Backtrack();
1505 break;
1506 }
1507 case RESTORE_POSITION:
1508 macro->ReadCurrentPositionFromRegister(
1509 data_.u_position_register.reg);
1510 break;
1511 case BEGIN_SUBMATCH:
1512 macro->WriteCurrentPositionToRegister(
1513 data_.u_submatch.current_position_register);
1514 macro->WriteStackPointerToRegister(
1515 data_.u_submatch.stack_pointer_register);
1516 break;
1517 case ESCAPE_SUBMATCH:
1518 if (info()->at_end) {
1519 Label at_end;
1520 // Load current character jumps to the label if we are beyond the string
1521 // end.
1522 macro->LoadCurrentCharacter(0, &at_end);
1523 macro->Backtrack();
1524 macro->Bind(&at_end);
1525 }
1526 if (data_.u_submatch.current_position_register != -1) {
1527 macro->ReadCurrentPositionFromRegister(
1528 data_.u_submatch.current_position_register);
1529 }
1530 macro->ReadStackPointerFromRegister(
1531 data_.u_submatch.stack_pointer_register);
1532 break;
1533 default:
1534 UNREACHABLE();
1535 return false;
1536 }
1537 return on_success()->GoTo(compiler);
1538}
1539
1540
1541bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
1542 RegExpMacroAssembler* macro = compiler->macro_assembler();
1543 Bind(macro);
1544 ASSERT_EQ(start_reg_ + 1, end_reg_);
1545 if (info()->at_end) {
1546 // If we are constrained to match at the end of the input then succeed
1547 // iff the back reference is empty.
1548 macro->CheckNotRegistersEqual(start_reg_, end_reg_, on_failure_->label());
1549 } else {
1550 if (compiler->ignore_case()) {
1551 macro->CheckNotBackReferenceIgnoreCase(start_reg_, on_failure_->label());
1552 } else {
1553 macro->CheckNotBackReference(start_reg_, on_failure_->label());
1554 }
1555 }
1556 return on_success()->GoTo(compiler);
1557}
1558
1559
1560// -------------------------------------------------------------------
1561// Dot/dotty output
1562
1563
1564#ifdef DEBUG
1565
1566
1567class DotPrinter: public NodeVisitor {
1568 public:
1569 explicit DotPrinter(bool ignore_case)
1570 : ignore_case_(ignore_case),
1571 stream_(&alloc_) { }
1572 void PrintNode(const char* label, RegExpNode* node);
1573 void Visit(RegExpNode* node);
1574 void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);
1575 void PrintAttributes(RegExpNode* from);
1576 StringStream* stream() { return &stream_; }
1577#define DECLARE_VISIT(Type) \
1578 virtual void Visit##Type(Type##Node* that);
1579FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1580#undef DECLARE_VISIT
1581 private:
1582 bool ignore_case_;
1583 HeapStringAllocator alloc_;
1584 StringStream stream_;
1585};
1586
1587
1588void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
1589 stream()->Add("digraph G {\n graph [label=\"");
1590 for (int i = 0; label[i]; i++) {
1591 switch (label[i]) {
1592 case '\\':
1593 stream()->Add("\\\\");
1594 break;
1595 case '"':
1596 stream()->Add("\"");
1597 break;
1598 default:
1599 stream()->Put(label[i]);
1600 break;
1601 }
1602 }
1603 stream()->Add("\"];\n");
1604 Visit(node);
1605 stream()->Add("}\n");
1606 printf("%s", *(stream()->ToCString()));
1607}
1608
1609
1610void DotPrinter::Visit(RegExpNode* node) {
1611 if (node->info()->visited) return;
1612 node->info()->visited = true;
1613 node->Accept(this);
1614}
1615
1616
1617void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
1618 if (on_failure->IsBacktrack()) return;
1619 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
1620 Visit(on_failure);
1621}
1622
1623
1624class TableEntryBodyPrinter {
1625 public:
1626 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
1627 : stream_(stream), choice_(choice) { }
1628 void Call(uc16 from, DispatchTable::Entry entry) {
1629 OutSet* out_set = entry.out_set();
1630 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1631 if (out_set->Get(i)) {
1632 stream()->Add(" n%p:s%io%i -> n%p;\n",
1633 choice(),
1634 from,
1635 i,
1636 choice()->alternatives()->at(i).node());
1637 }
1638 }
1639 }
1640 private:
1641 StringStream* stream() { return stream_; }
1642 ChoiceNode* choice() { return choice_; }
1643 StringStream* stream_;
1644 ChoiceNode* choice_;
1645};
1646
1647
1648class TableEntryHeaderPrinter {
1649 public:
1650 explicit TableEntryHeaderPrinter(StringStream* stream)
1651 : first_(true), stream_(stream) { }
1652 void Call(uc16 from, DispatchTable::Entry entry) {
1653 if (first_) {
1654 first_ = false;
1655 } else {
1656 stream()->Add("|");
1657 }
1658 stream()->Add("{\\%k-\\%k|{", from, entry.to());
1659 OutSet* out_set = entry.out_set();
1660 int priority = 0;
1661 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1662 if (out_set->Get(i)) {
1663 if (priority > 0) stream()->Add("|");
1664 stream()->Add("<s%io%i> %i", from, i, priority);
1665 priority++;
1666 }
1667 }
1668 stream()->Add("}}");
1669 }
1670 private:
1671 bool first_;
1672 StringStream* stream() { return stream_; }
1673 StringStream* stream_;
1674};
1675
1676
1677class AttributePrinter {
1678 public:
1679 explicit AttributePrinter(DotPrinter* out)
1680 : out_(out), first_(true) { }
1681 void PrintSeparator() {
1682 if (first_) {
1683 first_ = false;
1684 } else {
1685 out_->stream()->Add("|");
1686 }
1687 }
1688 void PrintBit(const char* name, bool value) {
1689 if (!value) return;
1690 PrintSeparator();
1691 out_->stream()->Add("{%s}", name);
1692 }
1693 void PrintPositive(const char* name, int value) {
1694 if (value < 0) return;
1695 PrintSeparator();
1696 out_->stream()->Add("{%s|%x}", name, value);
1697 }
1698 private:
1699 DotPrinter* out_;
1700 bool first_;
1701};
1702
1703
1704void DotPrinter::PrintAttributes(RegExpNode* that) {
1705 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
1706 "margin=0.1, fontsize=10, label=\"{",
1707 that);
1708 AttributePrinter printer(this);
1709 NodeInfo* info = that->info();
1710 printer.PrintBit("NI", info->follows_newline_interest);
1711 printer.PrintBit("WI", info->follows_word_interest);
1712 printer.PrintBit("SI", info->follows_start_interest);
1713 printer.PrintBit("DN", info->determine_newline);
1714 printer.PrintBit("DW", info->determine_word);
1715 printer.PrintBit("DS", info->determine_start);
1716 printer.PrintBit("DDN", info->does_determine_newline);
1717 printer.PrintBit("DDW", info->does_determine_word);
1718 printer.PrintBit("DDS", info->does_determine_start);
1719 printer.PrintPositive("IW", info->is_word);
1720 printer.PrintPositive("IN", info->is_newline);
1721 printer.PrintPositive("FN", info->follows_newline);
1722 printer.PrintPositive("FW", info->follows_word);
1723 printer.PrintPositive("FS", info->follows_start);
1724 Label* label = that->label();
1725 if (label->is_bound())
1726 printer.PrintPositive("@", label->pos());
1727 stream()->Add("}\"];\n");
1728 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
1729 "arrowhead=none];\n", that, that);
1730}
1731
1732
1733static const bool kPrintDispatchTable = false;
1734void DotPrinter::VisitChoice(ChoiceNode* that) {
1735 if (kPrintDispatchTable) {
1736 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
1737 TableEntryHeaderPrinter header_printer(stream());
1738 that->GetTable(ignore_case_)->ForEach(&header_printer);
1739 stream()->Add("\"]\n", that);
1740 PrintAttributes(that);
1741 TableEntryBodyPrinter body_printer(stream(), that);
1742 that->GetTable(ignore_case_)->ForEach(&body_printer);
1743 PrintOnFailure(that, that->on_failure());
1744 } else {
1745 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
1746 for (int i = 0; i < that->alternatives()->length(); i++) {
1747 GuardedAlternative alt = that->alternatives()->at(i);
1748 stream()->Add(" n%p -> n%p;\n", that, alt.node());
1749 }
1750 }
1751 for (int i = 0; i < that->alternatives()->length(); i++) {
1752 GuardedAlternative alt = that->alternatives()->at(i);
1753 alt.node()->Accept(this);
1754 }
1755}
1756
1757
1758void DotPrinter::VisitText(TextNode* that) {
1759 stream()->Add(" n%p [label=\"", that);
1760 for (int i = 0; i < that->elements()->length(); i++) {
1761 if (i > 0) stream()->Add(" ");
1762 TextElement elm = that->elements()->at(i);
1763 switch (elm.type) {
1764 case TextElement::ATOM: {
1765 stream()->Add("'%w'", elm.data.u_atom->data());
1766 break;
1767 }
1768 case TextElement::CHAR_CLASS: {
1769 RegExpCharacterClass* node = elm.data.u_char_class;
1770 stream()->Add("[");
1771 if (node->is_negated())
1772 stream()->Add("^");
1773 for (int j = 0; j < node->ranges()->length(); j++) {
1774 CharacterRange range = node->ranges()->at(j);
1775 stream()->Add("%k-%k", range.from(), range.to());
1776 }
1777 stream()->Add("]");
1778 break;
1779 }
1780 default:
1781 UNREACHABLE();
1782 }
1783 }
1784 stream()->Add("\", shape=box, peripheries=2];\n");
1785 PrintAttributes(that);
1786 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1787 Visit(that->on_success());
1788 PrintOnFailure(that, that->on_failure());
1789}
1790
1791
1792void DotPrinter::VisitBackReference(BackReferenceNode* that) {
1793 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
1794 that,
1795 that->start_register(),
1796 that->end_register());
1797 PrintAttributes(that);
1798 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1799 Visit(that->on_success());
1800 PrintOnFailure(that, that->on_failure());
1801}
1802
1803
1804void DotPrinter::VisitEnd(EndNode* that) {
1805 stream()->Add(" n%p [style=bold, shape=point];\n", that);
1806 PrintAttributes(that);
1807}
1808
1809
1810void DotPrinter::VisitAction(ActionNode* that) {
1811 stream()->Add(" n%p [", that);
1812 switch (that->type_) {
1813 case ActionNode::STORE_REGISTER:
1814 stream()->Add("label=\"$%i:=%i\", shape=octagon",
1815 that->data_.u_store_register.reg,
1816 that->data_.u_store_register.value);
1817 break;
1818 case ActionNode::INCREMENT_REGISTER:
1819 stream()->Add("label=\"$%i++\", shape=octagon",
1820 that->data_.u_increment_register.reg);
1821 break;
1822 case ActionNode::STORE_POSITION:
1823 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
1824 that->data_.u_position_register.reg);
1825 break;
1826 case ActionNode::RESTORE_POSITION:
1827 stream()->Add("label=\"$pos:=$%i\", shape=octagon",
1828 that->data_.u_position_register.reg);
1829 break;
1830 case ActionNode::BEGIN_SUBMATCH:
1831 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
1832 that->data_.u_submatch.current_position_register);
1833 break;
1834 case ActionNode::ESCAPE_SUBMATCH:
1835 stream()->Add("label=\"escape\", shape=septagon");
1836 break;
1837 }
1838 stream()->Add("];\n");
1839 PrintAttributes(that);
1840 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1841 Visit(that->on_success());
1842}
1843
1844
1845class DispatchTableDumper {
1846 public:
1847 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
1848 void Call(uc16 key, DispatchTable::Entry entry);
1849 StringStream* stream() { return stream_; }
1850 private:
1851 StringStream* stream_;
1852};
1853
1854
1855void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
1856 stream()->Add("[%k-%k]: {", key, entry.to());
1857 OutSet* set = entry.out_set();
1858 bool first = true;
1859 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1860 if (set->Get(i)) {
1861 if (first) {
1862 first = false;
1863 } else {
1864 stream()->Add(", ");
1865 }
1866 stream()->Add("%i", i);
1867 }
1868 }
1869 stream()->Add("}\n");
1870}
1871
1872
1873void DispatchTable::Dump() {
1874 HeapStringAllocator alloc;
1875 StringStream stream(&alloc);
1876 DispatchTableDumper dumper(&stream);
1877 tree()->ForEach(&dumper);
1878 OS::PrintError("%s", *stream.ToCString());
1879}
1880
1881
1882void RegExpEngine::DotPrint(const char* label,
1883 RegExpNode* node,
1884 bool ignore_case) {
1885 DotPrinter printer(ignore_case);
1886 printer.PrintNode(label, node);
1887}
1888
1889
1890#endif // DEBUG
1891
1892
1893// -------------------------------------------------------------------
1894// Tree to graph conversion
1895
1896
1897RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
1898 RegExpNode* on_success,
1899 RegExpNode* on_failure) {
1900 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1901 elms->Add(TextElement::Atom(this));
1902 return new TextNode(elms, on_success, on_failure);
1903}
1904
1905
1906RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
1907 RegExpNode* on_success,
1908 RegExpNode* on_failure) {
1909 return new TextNode(elements(), on_success, on_failure);
1910}
1911
1912
1913RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
1914 RegExpNode* on_success,
1915 RegExpNode* on_failure) {
1916 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1917 elms->Add(TextElement::CharClass(this));
1918 return new TextNode(elms, on_success, on_failure);
1919}
1920
1921
1922RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
1923 RegExpNode* on_success,
1924 RegExpNode* on_failure) {
1925 ZoneList<RegExpTree*>* alternatives = this->alternatives();
1926 int length = alternatives->length();
1927 ChoiceNode* result = new ChoiceNode(length, on_failure);
1928 for (int i = 0; i < length; i++) {
1929 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
1930 on_success,
1931 on_failure));
1932 result->AddAlternative(alternative);
1933 }
1934 return result;
1935}
1936
1937
1938RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
1939 RegExpNode* on_success,
1940 RegExpNode* on_failure) {
1941 return ToNode(min(),
1942 max(),
1943 is_greedy(),
1944 body(),
1945 compiler,
1946 on_success,
1947 on_failure);
1948}
1949
1950
1951RegExpNode* RegExpQuantifier::ToNode(int min,
1952 int max,
1953 bool is_greedy,
1954 RegExpTree* body,
1955 RegExpCompiler* compiler,
1956 RegExpNode* on_success,
1957 RegExpNode* on_failure) {
1958 // x{f, t} becomes this:
1959 //
1960 // (r++)<-.
1961 // | `
1962 // | (x)
1963 // v ^
1964 // (r=0)-->(?)---/ [if r < t]
1965 // |
1966 // [if r >= f] \----> ...
1967 //
1968 //
1969 // TODO(someone): clear captures on repetition and handle empty
1970 // matches.
1971 bool has_min = min > 0;
1972 bool has_max = max < RegExpQuantifier::kInfinity;
1973 bool needs_counter = has_min || has_max;
1974 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
1975 ChoiceNode* center = new ChoiceNode(2, on_failure);
1976 RegExpNode* loop_return = needs_counter
1977 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
1978 : static_cast<RegExpNode*>(center);
1979 RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);
1980 GuardedAlternative body_alt(body_node);
1981 if (has_max) {
1982 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
1983 body_alt.AddGuard(body_guard);
1984 }
1985 GuardedAlternative rest_alt(on_success);
1986 if (has_min) {
1987 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
1988 rest_alt.AddGuard(rest_guard);
1989 }
1990 if (is_greedy) {
1991 center->AddAlternative(body_alt);
1992 center->AddAlternative(rest_alt);
1993 } else {
1994 center->AddAlternative(rest_alt);
1995 center->AddAlternative(body_alt);
1996 }
1997 if (needs_counter) {
1998 return ActionNode::StoreRegister(reg_ctr, 0, center);
1999 } else {
2000 return center;
2001 }
2002}
2003
2004
2005RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
2006 RegExpNode* on_success,
2007 RegExpNode* on_failure) {
2008 NodeInfo info;
2009 switch (type()) {
2010 case START_OF_LINE:
2011 info.follows_newline_interest = true;
2012 break;
2013 case START_OF_INPUT:
2014 info.follows_start_interest = true;
2015 break;
2016 case BOUNDARY: case NON_BOUNDARY:
2017 info.follows_word_interest = true;
2018 break;
2019 case END_OF_INPUT:
2020 info.at_end = true;
2021 break;
2022 case END_OF_LINE:
2023 // This is wrong but has the effect of making the compiler abort.
2024 info.at_end = true;
2025 }
2026 return on_success->PropagateForward(&info);
2027}
2028
2029
2030RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
2031 RegExpNode* on_success,
2032 RegExpNode* on_failure) {
2033 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
2034 RegExpCapture::EndRegister(index()),
2035 on_success,
2036 on_failure);
2037}
2038
2039
2040RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
2041 RegExpNode* on_success,
2042 RegExpNode* on_failure) {
2043 return on_success;
2044}
2045
2046
2047RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
2048 RegExpNode* on_success,
2049 RegExpNode* on_failure) {
2050 int stack_pointer_register = compiler->AllocateRegister();
2051 int position_register = compiler->AllocateRegister();
2052 if (is_positive()) {
2053 // begin submatch scope
2054 // $reg = $pos
2055 // if [body]
2056 // then
2057 // $pos = $reg
2058 // escape submatch scope (drop all backtracks created in scope)
2059 // succeed
2060 // else
2061 // end submatch scope (nothing to clean up, just exit the scope)
2062 // fail
2063 return ActionNode::BeginSubmatch(
2064 stack_pointer_register,
2065 position_register,
2066 body()->ToNode(
2067 compiler,
2068 ActionNode::EscapeSubmatch(
2069 stack_pointer_register,
2070 true, // Also restore input position.
2071 position_register,
2072 on_success),
2073 on_failure));
2074 } else {
2075 // begin submatch scope
2076 // try
2077 // first if (body)
2078 // then
2079 // escape submatch scope
2080 // fail
2081 // else
2082 // backtrack
2083 // second
2084 // end submatch scope
2085 // restore current position
2086 // succeed
2087 ChoiceNode* try_node =
2088 new ChoiceNode(1, ActionNode::RestorePosition(position_register,
2089 on_success));
2090 RegExpNode* body_node = body()->ToNode(
2091 compiler,
2092 ActionNode::EscapeSubmatch(stack_pointer_register,
2093 false, // Don't also restore position
2094 0, // Unused arguments.
2095 on_failure),
2096 compiler->backtrack());
2097 GuardedAlternative body_alt(body_node);
2098 try_node->AddAlternative(body_alt);
2099 return ActionNode::BeginSubmatch(stack_pointer_register,
2100 position_register,
2101 try_node);
2102 }
2103}
2104
2105
2106RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
2107 RegExpNode* on_success,
2108 RegExpNode* on_failure) {
2109 return ToNode(body(), index(), compiler, on_success, on_failure);
2110}
2111
2112
2113RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
2114 int index,
2115 RegExpCompiler* compiler,
2116 RegExpNode* on_success,
2117 RegExpNode* on_failure) {
2118 int start_reg = RegExpCapture::StartRegister(index);
2119 int end_reg = RegExpCapture::EndRegister(index);
2120 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
2121 RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);
2122 return ActionNode::StorePosition(start_reg, body_node);
2123}
2124
2125
2126RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
2127 RegExpNode* on_success,
2128 RegExpNode* on_failure) {
2129 ZoneList<RegExpTree*>* children = nodes();
2130 RegExpNode* current = on_success;
2131 for (int i = children->length() - 1; i >= 0; i--) {
2132 current = children->at(i)->ToNode(compiler, current, on_failure);
2133 }
2134 return current;
2135}
2136
2137
2138static const int kSpaceRangeCount = 20;
2139static const uc16 kSpaceRanges[kSpaceRangeCount] = {
2140 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
2141 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
2142 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
2143};
2144
2145
2146static const int kWordRangeCount = 8;
2147static const uc16 kWordRanges[kWordRangeCount] = {
2148 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
2149};
2150
2151
2152static const int kDigitRangeCount = 2;
2153static const uc16 kDigitRanges[kDigitRangeCount] = {
2154 '0', '9'
2155};
2156
2157
2158static const int kLineTerminatorRangeCount = 6;
2159static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
2160 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
2161};
2162
2163
2164static void AddClass(const uc16* elmv,
2165 int elmc,
2166 ZoneList<CharacterRange>* ranges) {
2167 for (int i = 0; i < elmc; i += 2) {
2168 ASSERT(elmv[i] <= elmv[i + 1]);
2169 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
2170 }
2171}
2172
2173
2174static void AddClassNegated(const uc16 *elmv,
2175 int elmc,
2176 ZoneList<CharacterRange>* ranges) {
2177 ASSERT(elmv[0] != 0x0000);
2178 ASSERT(elmv[elmc-1] != 0xFFFF);
2179 uc16 last = 0x0000;
2180 for (int i = 0; i < elmc; i += 2) {
2181 ASSERT(last <= elmv[i] - 1);
2182 ASSERT(elmv[i] <= elmv[i + 1]);
2183 ranges->Add(CharacterRange(last, elmv[i] - 1));
2184 last = elmv[i + 1] + 1;
2185 }
2186 ranges->Add(CharacterRange(last, 0xFFFF));
2187}
2188
2189
2190void CharacterRange::AddClassEscape(uc16 type,
2191 ZoneList<CharacterRange>* ranges) {
2192 switch (type) {
2193 case 's':
2194 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
2195 break;
2196 case 'S':
2197 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
2198 break;
2199 case 'w':
2200 AddClass(kWordRanges, kWordRangeCount, ranges);
2201 break;
2202 case 'W':
2203 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
2204 break;
2205 case 'd':
2206 AddClass(kDigitRanges, kDigitRangeCount, ranges);
2207 break;
2208 case 'D':
2209 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
2210 break;
2211 case '.':
2212 AddClassNegated(kLineTerminatorRanges,
2213 kLineTerminatorRangeCount,
2214 ranges);
2215 break;
2216 // This is not a character range as defined by the spec but a
2217 // convenient shorthand for a character class that matches any
2218 // character.
2219 case '*':
2220 ranges->Add(CharacterRange::Everything());
2221 break;
2222 default:
2223 UNREACHABLE();
2224 }
2225}
2226
2227
2228Vector<const uc16> CharacterRange::GetWordBounds() {
2229 return Vector<const uc16>(kWordRanges, kWordRangeCount);
2230}
2231
2232
2233class CharacterRangeSplitter {
2234 public:
2235 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
2236 ZoneList<CharacterRange>** excluded)
2237 : included_(included),
2238 excluded_(excluded) { }
2239 void Call(uc16 from, DispatchTable::Entry entry);
2240
2241 static const int kInBase = 0;
2242 static const int kInOverlay = 1;
2243
2244 private:
2245 ZoneList<CharacterRange>** included_;
2246 ZoneList<CharacterRange>** excluded_;
2247};
2248
2249
2250void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
2251 if (!entry.out_set()->Get(kInBase)) return;
2252 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
2253 ? included_
2254 : excluded_;
2255 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
2256 (*target)->Add(CharacterRange(entry.from(), entry.to()));
2257}
2258
2259
2260void CharacterRange::Split(ZoneList<CharacterRange>* base,
2261 Vector<const uc16> overlay,
2262 ZoneList<CharacterRange>** included,
2263 ZoneList<CharacterRange>** excluded) {
2264 ASSERT_EQ(NULL, *included);
2265 ASSERT_EQ(NULL, *excluded);
2266 DispatchTable table;
2267 for (int i = 0; i < base->length(); i++)
2268 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
2269 for (int i = 0; i < overlay.length(); i += 2) {
2270 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
2271 CharacterRangeSplitter::kInOverlay);
2272 }
2273 CharacterRangeSplitter callback(included, excluded);
2274 table.ForEach(&callback);
2275}
2276
2277
2278void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
2279 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2280 if (IsSingleton()) {
2281 // If this is a singleton we just expand the one character.
2282 int length = uncanonicalize.get(from(), '\0', chars);
2283 for (int i = 0; i < length; i++) {
2284 uc32 chr = chars[i];
2285 if (chr != from()) {
2286 ranges->Add(CharacterRange::Singleton(chars[i]));
2287 }
2288 }
2289 } else if (from() <= kRangeCanonicalizeMax &&
2290 to() <= kRangeCanonicalizeMax) {
2291 // If this is a range we expand the characters block by block,
2292 // expanding contiguous subranges (blocks) one at a time.
2293 // The approach is as follows. For a given start character we
2294 // look up the block that contains it, for instance 'a' if the
2295 // start character is 'c'. A block is characterized by the property
2296 // that all characters uncanonicalize in the same way as the first
2297 // element, except that each entry in the result is incremented
2298 // by the distance from the first element. So a-z is a block
2299 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
2300 // uncanonicalizes to ['a' + k, 'A' + k].
2301 // Once we've found the start point we look up its uncanonicalization
2302 // and produce a range for each element. For instance for [c-f]
2303 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
2304 // add a range if it is not already contained in the input, so [c-f]
2305 // will be skipped but [C-F] will be added. If this range is not
2306 // completely contained in a block we do this for all the blocks
2307 // covered by the range.
2308 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2309 // First, look up the block that contains the 'from' character.
2310 int length = canonrange.get(from(), '\0', range);
2311 if (length == 0) {
2312 range[0] = from();
2313 } else {
2314 ASSERT_EQ(1, length);
2315 }
2316 int pos = from();
2317 // The start of the current block. Note that except for the first
2318 // iteration 'start' is always equal to 'pos'.
2319 int start;
2320 // If it is not the start point of a block the entry contains the
2321 // offset of the character from the start point.
2322 if ((range[0] & kStartMarker) == 0) {
2323 start = pos - range[0];
2324 } else {
2325 start = pos;
2326 }
2327 // Then we add the ranges on at a time, incrementing the current
2328 // position to be after the last block each time. The position
2329 // always points to the start of a block.
2330 while (pos < to()) {
2331 length = canonrange.get(start, '\0', range);
2332 if (length == 0) {
2333 range[0] = start;
2334 } else {
2335 ASSERT_EQ(1, length);
2336 }
2337 ASSERT((range[0] & kStartMarker) != 0);
2338 // The start point of a block contains the distance to the end
2339 // of the range.
2340 int block_end = start + (range[0] & kPayloadMask) - 1;
2341 int end = (block_end > to()) ? to() : block_end;
2342 length = uncanonicalize.get(start, '\0', range);
2343 for (int i = 0; i < length; i++) {
2344 uc32 c = range[i];
2345 uc16 range_from = c + (pos - start);
2346 uc16 range_to = c + (end - start);
2347 if (!(from() <= range_from && range_to <= to())) {
2348 ranges->Add(CharacterRange(range_from, range_to));
2349 }
2350 }
2351 start = pos = block_end + 1;
2352 }
2353 } else {
2354 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
2355 }
2356}
2357
2358
2359// -------------------------------------------------------------------
2360// Interest propagation
2361
2362
2363RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
2364 for (int i = 0; i < siblings_.length(); i++) {
2365 RegExpNode* sibling = siblings_.Get(i);
2366 if (sibling->info()->Matches(info))
2367 return sibling;
2368 }
2369 return NULL;
2370}
2371
2372
2373RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
2374 ASSERT_EQ(false, *cloned);
2375 ASSERT(!info->HasAssertions());
2376 siblings_.Ensure(this);
2377 RegExpNode* result = TryGetSibling(info);
2378 if (result != NULL) return result;
2379 result = this->Clone();
2380 NodeInfo* new_info = result->info();
2381 new_info->ResetCompilationState();
2382 new_info->AddFromPreceding(info);
2383 AddSibling(result);
2384 *cloned = true;
2385 return result;
2386}
2387
2388
2389template <class C>
2390static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
2391 NodeInfo full_info(*node->info());
2392 full_info.AddFromPreceding(info);
2393 bool cloned = false;
2394 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
2395}
2396
2397
2398RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
2399 NodeInfo full_info(*this->info());
2400 full_info.AddFromPreceding(info);
2401 bool cloned = false;
2402 ActionNode* action = EnsureSibling(this, &full_info, &cloned);
2403 if (cloned && type_ != ESCAPE_SUBMATCH) {
2404 action->set_on_success(action->on_success()->PropagateForward(info));
2405 }
2406 return action;
2407}
2408
2409
2410RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
2411 NodeInfo full_info(*this->info());
2412 full_info.AddFromPreceding(info);
2413 bool cloned = false;
2414 ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
2415 if (cloned) {
2416 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
2417 int count = old_alternatives->length();
2418 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
2419 for (int i = 0; i < count; i++) {
2420 GuardedAlternative alternative = old_alternatives->at(i);
2421 alternative.set_node(alternative.node()->PropagateForward(info));
2422 choice->alternatives()->Add(alternative);
2423 }
2424 if (!choice->on_failure_->IsBacktrack()) {
2425 choice->on_failure_ = choice->on_failure_->PropagateForward(info);
2426 }
2427 }
2428 return choice;
2429}
2430
2431
2432RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
2433 return PropagateToEndpoint(this, info);
2434}
2435
2436
2437RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
2438 NodeInfo full_info(*this->info());
2439 full_info.AddFromPreceding(info);
2440 bool cloned = false;
2441 BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
2442 if (cloned) {
2443 // TODO(erikcorry): A back reference has to have two successors (by default
2444 // the same node). The first is used if the back reference matches a non-
2445 // empty back reference, the second if it matches an empty one. This
2446 // doesn't matter for at_end, which is the only one implemented right now,
2447 // but it will matter for other pieces of info.
2448 back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
2449 }
2450 return back_ref;
2451}
2452
2453
2454RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
2455 return PropagateToEndpoint(this, info);
2456}
2457
2458
2459// -------------------------------------------------------------------
2460// Splay tree
2461
2462
2463OutSet* OutSet::Extend(unsigned value) {
2464 if (Get(value))
2465 return this;
2466 if (successors() != NULL) {
2467 for (int i = 0; i < successors()->length(); i++) {
2468 OutSet* successor = successors()->at(i);
2469 if (successor->Get(value))
2470 return successor;
2471 }
2472 } else {
2473 successors_ = new ZoneList<OutSet*>(2);
2474 }
2475 OutSet* result = new OutSet(first_, remaining_);
2476 result->Set(value);
2477 successors()->Add(result);
2478 return result;
2479}
2480
2481
2482void OutSet::Set(unsigned value) {
2483 if (value < kFirstLimit) {
2484 first_ |= (1 << value);
2485 } else {
2486 if (remaining_ == NULL)
2487 remaining_ = new ZoneList<unsigned>(1);
2488 if (remaining_->is_empty() || !remaining_->Contains(value))
2489 remaining_->Add(value);
2490 }
2491}
2492
2493
2494bool OutSet::Get(unsigned value) {
2495 if (value < kFirstLimit) {
2496 return (first_ & (1 << value)) != 0;
2497 } else if (remaining_ == NULL) {
2498 return false;
2499 } else {
2500 return remaining_->Contains(value);
2501 }
2502}
2503
2504
2505const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
2506const DispatchTable::Entry DispatchTable::Config::kNoValue;
2507
2508
2509void DispatchTable::AddRange(CharacterRange full_range, int value) {
2510 CharacterRange current = full_range;
2511 if (tree()->is_empty()) {
2512 // If this is the first range we just insert into the table.
2513 ZoneSplayTree<Config>::Locator loc;
2514 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
2515 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
2516 return;
2517 }
2518 // First see if there is a range to the left of this one that
2519 // overlaps.
2520 ZoneSplayTree<Config>::Locator loc;
2521 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
2522 Entry* entry = &loc.value();
2523 // If we've found a range that overlaps with this one, and it
2524 // starts strictly to the left of this one, we have to fix it
2525 // because the following code only handles ranges that start on
2526 // or after the start point of the range we're adding.
2527 if (entry->from() < current.from() && entry->to() >= current.from()) {
2528 // Snap the overlapping range in half around the start point of
2529 // the range we're adding.
2530 CharacterRange left(entry->from(), current.from() - 1);
2531 CharacterRange right(current.from(), entry->to());
2532 // The left part of the overlapping range doesn't overlap.
2533 // Truncate the whole entry to be just the left part.
2534 entry->set_to(left.to());
2535 // The right part is the one that overlaps. We add this part
2536 // to the map and let the next step deal with merging it with
2537 // the range we're adding.
2538 ZoneSplayTree<Config>::Locator loc;
2539 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
2540 loc.set_value(Entry(right.from(),
2541 right.to(),
2542 entry->out_set()));
2543 }
2544 }
2545 while (current.is_valid()) {
2546 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
2547 (loc.value().from() <= current.to()) &&
2548 (loc.value().to() >= current.from())) {
2549 Entry* entry = &loc.value();
2550 // We have overlap. If there is space between the start point of
2551 // the range we're adding and where the overlapping range starts
2552 // then we have to add a range covering just that space.
2553 if (current.from() < entry->from()) {
2554 ZoneSplayTree<Config>::Locator ins;
2555 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2556 ins.set_value(Entry(current.from(),
2557 entry->from() - 1,
2558 empty()->Extend(value)));
2559 current.set_from(entry->from());
2560 }
2561 ASSERT_EQ(current.from(), entry->from());
2562 // If the overlapping range extends beyond the one we want to add
2563 // we have to snap the right part off and add it separately.
2564 if (entry->to() > current.to()) {
2565 ZoneSplayTree<Config>::Locator ins;
2566 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
2567 ins.set_value(Entry(current.to() + 1,
2568 entry->to(),
2569 entry->out_set()));
2570 entry->set_to(current.to());
2571 }
2572 ASSERT(entry->to() <= current.to());
2573 // The overlapping range is now completely contained by the range
2574 // we're adding so we can just update it and move the start point
2575 // of the range we're adding just past it.
2576 entry->AddValue(value);
2577 // Bail out if the last interval ended at 0xFFFF since otherwise
2578 // adding 1 will wrap around to 0.
2579 if (entry->to() == 0xFFFF)
2580 break;
2581 ASSERT(entry->to() + 1 > current.from());
2582 current.set_from(entry->to() + 1);
2583 } else {
2584 // There is no overlap so we can just add the range
2585 ZoneSplayTree<Config>::Locator ins;
2586 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2587 ins.set_value(Entry(current.from(),
2588 current.to(),
2589 empty()->Extend(value)));
2590 break;
2591 }
2592 }
2593}
2594
2595
2596OutSet* DispatchTable::Get(uc16 value) {
2597 ZoneSplayTree<Config>::Locator loc;
2598 if (!tree()->FindGreatestLessThan(value, &loc))
2599 return empty();
2600 Entry* entry = &loc.value();
2601 if (value <= entry->to())
2602 return entry->out_set();
2603 else
2604 return empty();
2605}
2606
2607
2608// -------------------------------------------------------------------
2609// Analysis
2610
2611
2612void Analysis::EnsureAnalyzed(RegExpNode* that) {
2613 if (that->info()->been_analyzed || that->info()->being_analyzed)
2614 return;
2615 that->info()->being_analyzed = true;
2616 that->Accept(this);
2617 that->info()->being_analyzed = false;
2618 that->info()->been_analyzed = true;
2619}
2620
2621
2622void Analysis::VisitEnd(EndNode* that) {
2623 // nothing to do
2624}
2625
2626
2627void Analysis::VisitText(TextNode* that) {
2628 if (ignore_case_) {
2629 that->MakeCaseIndependent();
2630 }
2631 EnsureAnalyzed(that->on_success());
2632 EnsureAnalyzed(that->on_failure());
2633 NodeInfo* info = that->info();
2634 NodeInfo* next_info = that->on_success()->info();
2635 // If the following node is interested in what it follows then this
2636 // node must determine it.
2637 info->determine_newline = next_info->follows_newline_interest;
2638 info->determine_word = next_info->follows_word_interest;
2639 info->determine_start = next_info->follows_start_interest;
2640}
2641
2642
2643void Analysis::VisitAction(ActionNode* that) {
2644 EnsureAnalyzed(that->on_success());
2645 // If the next node is interested in what it follows then this node
2646 // has to be interested too so it can pass the information on.
2647 that->info()->AddFromFollowing(that->on_success()->info());
2648}
2649
2650
2651void Analysis::VisitChoice(ChoiceNode* that) {
2652 NodeInfo* info = that->info();
2653 for (int i = 0; i < that->alternatives()->length(); i++) {
2654 RegExpNode* node = that->alternatives()->at(i).node();
2655 EnsureAnalyzed(node);
2656 // Anything the following nodes need to know has to be known by
2657 // this node also, so it can pass it on.
2658 info->AddFromFollowing(node->info());
2659 }
2660 EnsureAnalyzed(that->on_failure());
2661}
2662
2663
2664void Analysis::VisitBackReference(BackReferenceNode* that) {
2665 EnsureAnalyzed(that->on_success());
2666 EnsureAnalyzed(that->on_failure());
2667}
2668
2669
2670// -------------------------------------------------------------------
2671// Assumption expansion
2672
2673
2674RegExpNode* RegExpNode::EnsureExpanded(NodeInfo* info) {
2675 siblings_.Ensure(this);
2676 NodeInfo new_info = *this->info();
2677 if (new_info.follows_word_interest)
2678 new_info.follows_word = info->follows_word;
2679 if (new_info.follows_newline_interest)
2680 new_info.follows_newline = info->follows_newline;
2681 // If the following node should determine something we need to get
2682 // a sibling that determines it.
2683 new_info.does_determine_newline = new_info.determine_newline;
2684 new_info.does_determine_word = new_info.determine_word;
2685 new_info.does_determine_start = new_info.determine_start;
2686 RegExpNode* sibling = TryGetSibling(&new_info);
2687 if (sibling == NULL) {
2688 sibling = ExpandLocal(&new_info);
2689 siblings_.Add(sibling);
2690 sibling->info()->being_expanded = true;
2691 sibling->ExpandChildren();
2692 sibling->info()->being_expanded = false;
2693 sibling->info()->been_expanded = true;
2694 } else {
2695 NodeInfo* sib_info = sibling->info();
2696 if (!sib_info->been_expanded && !sib_info->being_expanded) {
2697 sibling->info()->being_expanded = true;
2698 sibling->ExpandChildren();
2699 sibling->info()->being_expanded = false;
2700 sibling->info()->been_expanded = true;
2701 }
2702 }
2703 return sibling;
2704}
2705
2706
2707RegExpNode* ChoiceNode::ExpandLocal(NodeInfo* info) {
2708 ChoiceNode* clone = this->Clone();
2709 clone->info()->ResetCompilationState();
2710 clone->info()->AddAssumptions(info);
2711 return clone;
2712}
2713
2714
2715void ChoiceNode::ExpandChildren() {
2716 ZoneList<GuardedAlternative>* alts = alternatives();
2717 ZoneList<GuardedAlternative>* new_alts
2718 = new ZoneList<GuardedAlternative>(alts->length());
2719 for (int i = 0; i < alts->length(); i++) {
2720 GuardedAlternative next = alts->at(i);
2721 next.set_node(next.node()->EnsureExpanded(info()));
2722 new_alts->Add(next);
2723 }
2724 alternatives_ = new_alts;
2725}
2726
2727
2728RegExpNode* TextNode::ExpandLocal(NodeInfo* info) {
2729 TextElement last = elements()->last();
2730 if (last.type == TextElement::CHAR_CLASS) {
2731 RegExpCharacterClass* char_class = last.data.u_char_class;
2732 if (info->does_determine_word) {
2733 ZoneList<CharacterRange>* word = NULL;
2734 ZoneList<CharacterRange>* non_word = NULL;
2735 CharacterRange::Split(char_class->ranges(),
2736 CharacterRange::GetWordBounds(),
2737 &word,
2738 &non_word);
2739 if (non_word == NULL) {
2740 // This node contains no non-word characters so it must be
2741 // all word.
2742 this->info()->is_word = NodeInfo::TRUE;
2743 } else if (word == NULL) {
2744 // Vice versa.
2745 this->info()->is_word = NodeInfo::FALSE;
2746 } else {
2747 // If this character class contains both word and non-word
2748 // characters we need to split it into two.
2749 ChoiceNode* result = new ChoiceNode(2, on_failure());
2750 // Welcome to the family, son!
2751 result->set_siblings(this->siblings());
2752 *result->info() = *this->info();
2753 result->info()->ResetCompilationState();
2754 result->info()->AddAssumptions(info);
2755 RegExpNode* word_node
2756 = new TextNode(new RegExpCharacterClass(word, false),
2757 on_success(),
2758 on_failure());
2759 word_node->info()->determine_word = true;
2760 word_node->info()->does_determine_word = true;
2761 word_node->info()->is_word = NodeInfo::TRUE;
2762 result->alternatives()->Add(GuardedAlternative(word_node));
2763 RegExpNode* non_word_node
2764 = new TextNode(new RegExpCharacterClass(non_word, false),
2765 on_success(),
2766 on_failure());
2767 non_word_node->info()->determine_word = true;
2768 non_word_node->info()->does_determine_word = true;
2769 non_word_node->info()->is_word = NodeInfo::FALSE;
2770 result->alternatives()->Add(GuardedAlternative(non_word_node));
2771 return result;
2772 }
2773 }
2774 }
2775 TextNode* clone = this->Clone();
2776 clone->info()->ResetCompilationState();
2777 clone->info()->AddAssumptions(info);
2778 return clone;
2779}
2780
2781
2782void TextNode::ExpandAtomChildren(RegExpAtom* that) {
2783 NodeInfo new_info = *info();
2784 uc16 last = that->data()[that->data().length() - 1];
2785 if (info()->determine_word) {
2786 new_info.follows_word = IsRegExpWord(last)
2787 ? NodeInfo::TRUE : NodeInfo::FALSE;
2788 } else {
2789 new_info.follows_word = NodeInfo::UNKNOWN;
2790 }
2791 if (info()->determine_newline) {
2792 new_info.follows_newline = IsRegExpNewline(last)
2793 ? NodeInfo::TRUE : NodeInfo::FALSE;
2794 } else {
2795 new_info.follows_newline = NodeInfo::UNKNOWN;
2796 }
2797 if (info()->determine_start) {
2798 new_info.follows_start = NodeInfo::FALSE;
2799 } else {
2800 new_info.follows_start = NodeInfo::UNKNOWN;
2801 }
2802 set_on_success(on_success()->EnsureExpanded(&new_info));
2803}
2804
2805
2806void TextNode::ExpandCharClassChildren(RegExpCharacterClass* that) {
2807 if (info()->does_determine_word) {
2808 // ASSERT(info()->is_word != NodeInfo::UNKNOWN);
2809 NodeInfo next_info = *on_success()->info();
2810 next_info.follows_word = info()->is_word;
2811 set_on_success(on_success()->EnsureExpanded(&next_info));
2812 } else {
2813 set_on_success(on_success()->EnsureExpanded(info()));
2814 }
2815}
2816
2817
2818void TextNode::ExpandChildren() {
2819 TextElement last = elements()->last();
2820 switch (last.type) {
2821 case TextElement::ATOM:
2822 ExpandAtomChildren(last.data.u_atom);
2823 break;
2824 case TextElement::CHAR_CLASS:
2825 ExpandCharClassChildren(last.data.u_char_class);
2826 break;
2827 default:
2828 UNREACHABLE();
2829 }
2830}
2831
2832
2833RegExpNode* ActionNode::ExpandLocal(NodeInfo* info) {
2834 ActionNode* clone = this->Clone();
2835 clone->info()->ResetCompilationState();
2836 clone->info()->AddAssumptions(info);
2837 return clone;
2838}
2839
2840
2841void ActionNode::ExpandChildren() {
2842 set_on_success(on_success()->EnsureExpanded(info()));
2843}
2844
2845
2846RegExpNode* BackReferenceNode::ExpandLocal(NodeInfo* info) {
2847 BackReferenceNode* clone = this->Clone();
2848 clone->info()->ResetCompilationState();
2849 clone->info()->AddAssumptions(info);
2850 return clone;
2851}
2852
2853
2854void BackReferenceNode::ExpandChildren() {
2855 set_on_success(on_success()->EnsureExpanded(info()));
2856}
2857
2858
2859RegExpNode* EndNode::ExpandLocal(NodeInfo* info) {
2860 EndNode* clone = this->Clone();
2861 clone->info()->ResetCompilationState();
2862 clone->info()->AddAssumptions(info);
2863 return clone;
2864}
2865
2866
2867void EndNode::ExpandChildren() {
2868 // nothing to do
2869}
2870
2871
2872// -------------------------------------------------------------------
2873// Dispatch table construction
2874
2875
2876void DispatchTableConstructor::VisitEnd(EndNode* that) {
2877 AddRange(CharacterRange::Everything());
2878}
2879
2880
2881void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
2882 node->set_being_calculated(true);
2883 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
2884 for (int i = 0; i < alternatives->length(); i++) {
2885 set_choice_index(i);
2886 alternatives->at(i).node()->Accept(this);
2887 }
2888 node->set_being_calculated(false);
2889}
2890
2891
2892class AddDispatchRange {
2893 public:
2894 explicit AddDispatchRange(DispatchTableConstructor* constructor)
2895 : constructor_(constructor) { }
2896 void Call(uc32 from, DispatchTable::Entry entry);
2897 private:
2898 DispatchTableConstructor* constructor_;
2899};
2900
2901
2902void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
2903 CharacterRange range(from, entry.to());
2904 constructor_->AddRange(range);
2905}
2906
2907
2908void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
2909 if (node->being_calculated())
2910 return;
2911 DispatchTable* table = node->GetTable(ignore_case_);
2912 AddDispatchRange adder(this);
2913 table->ForEach(&adder);
2914}
2915
2916
2917void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
2918 // TODO(160): Find the node that we refer back to and propagate its start
2919 // set back to here. For now we just accept anything.
2920 AddRange(CharacterRange::Everything());
2921}
2922
2923
2924
2925static int CompareRangeByFrom(const CharacterRange* a,
2926 const CharacterRange* b) {
2927 return Compare<uc16>(a->from(), b->from());
2928}
2929
2930
2931void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
2932 ranges->Sort(CompareRangeByFrom);
2933 uc16 last = 0;
2934 for (int i = 0; i < ranges->length(); i++) {
2935 CharacterRange range = ranges->at(i);
2936 if (last < range.from())
2937 AddRange(CharacterRange(last, range.from() - 1));
2938 if (range.to() >= last) {
2939 if (range.to() == 0xFFFF) {
2940 return;
2941 } else {
2942 last = range.to() + 1;
2943 }
2944 }
2945 }
2946 AddRange(CharacterRange(last, 0xFFFF));
2947}
2948
2949
2950void DispatchTableConstructor::VisitText(TextNode* that) {
2951 TextElement elm = that->elements()->at(0);
2952 switch (elm.type) {
2953 case TextElement::ATOM: {
2954 uc16 c = elm.data.u_atom->data()[0];
2955 AddRange(CharacterRange(c, c));
2956 break;
2957 }
2958 case TextElement::CHAR_CLASS: {
2959 RegExpCharacterClass* tree = elm.data.u_char_class;
2960 ZoneList<CharacterRange>* ranges = tree->ranges();
2961 if (tree->is_negated()) {
2962 AddInverse(ranges);
2963 } else {
2964 for (int i = 0; i < ranges->length(); i++)
2965 AddRange(ranges->at(i));
2966 }
2967 break;
2968 }
2969 default: {
2970 UNIMPLEMENTED();
2971 }
2972 }
2973}
2974
2975
2976void DispatchTableConstructor::VisitAction(ActionNode* that) {
2977 that->on_success()->Accept(this);
2978}
2979
2980
2981Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
2982 RegExpNode** node_return,
2983 bool ignore_case,
2984 bool is_multiline) {
2985 RegExpCompiler compiler(input->capture_count, ignore_case);
2986 // Wrap the body of the regexp in capture #0.
2987 RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
2988 0,
2989 &compiler,
2990 compiler.accept(),
2991 compiler.backtrack());
2992 // Add a .*? at the beginning, outside the body capture.
2993 // Note: We could choose to not add this if the regexp is anchored at
2994 // the start of the input but I'm not sure how best to do that and
2995 // since we don't even handle ^ yet I'm saving that optimization for
2996 // later.
2997 RegExpNode* node = RegExpQuantifier::ToNode(0,
2998 RegExpQuantifier::kInfinity,
2999 false,
3000 new RegExpCharacterClass('*'),
3001 &compiler,
3002 captured_body,
3003 compiler.backtrack());
3004 if (node_return != NULL) *node_return = node;
3005 Analysis analysis(ignore_case);
3006 analysis.EnsureAnalyzed(node);
3007
3008 NodeInfo info = *node->info();
3009 node = node->EnsureExpanded(&info);
3010
3011 if (!FLAG_irregexp) {
3012 return Handle<FixedArray>::null();
3013 }
3014
3015 if (is_multiline && !FLAG_attempt_multiline_irregexp) {
3016 return Handle<FixedArray>::null();
3017 }
3018
3019 if (FLAG_irregexp_native) {
3020#ifdef ARM
3021 // Unimplemented, fall-through to bytecode implementation.
3022#else // IA32
3023 RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,
3024 (input->capture_count + 1) * 2);
3025 return compiler.Assemble(&macro_assembler,
3026 node,
3027 input->capture_count);
3028#endif
3029 }
3030 EmbeddedVector<byte, 1024> codes;
3031 RegExpMacroAssemblerIrregexp macro_assembler(codes);
3032 return compiler.Assemble(&macro_assembler,
3033 node,
3034 input->capture_count);
3035}
3036
3037
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00003038}} // namespace v8::internal