blob: 3ca7139074bb5e32a2e61d1ec2aa51ee5d621142 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000043#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000044
45#ifdef ARM
46#include "regexp-macro-assembler-arm.h"
47#else // IA32
48#include "macro-assembler-ia32.h"
49#include "regexp-macro-assembler-ia32.h"
50#endif
51
52#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000053
54// Including pcre.h undefines DEBUG to avoid getting debug output from
55// the JSCRE implementation. Make sure to redefine it in debug mode
56// after having included the header file.
57#ifdef DEBUG
58#include "third_party/jscre/pcre.h"
59#define DEBUG
60#else
61#include "third_party/jscre/pcre.h"
62#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000063
ager@chromium.orga74f0da2008-12-03 16:05:52 +000064
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000065namespace v8 { namespace internal {
66
67
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000068static Failure* malloc_failure;
69
70static void* JSREMalloc(size_t size) {
71 Object* obj = Heap::AllocateByteArray(size);
72
73 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
74 // will return NULL to the caller, performs GC there.
75 // Also pass failure information to the caller.
76 if (obj->IsFailure()) {
77 malloc_failure = Failure::cast(obj);
78 return NULL;
79 }
80
81 // Note: object is unrooted, the caller of jsRegExpCompile must
82 // create a handle for the return value before doing heap allocation.
83 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
84}
85
86
87static void JSREFree(void* p) {
88 USE(p); // Do nothing, memory is garbage collected.
89}
90
91
92String* RegExpImpl::last_ascii_string_ = NULL;
93String* RegExpImpl::two_byte_cached_string_ = NULL;
94
95
96void RegExpImpl::NewSpaceCollectionPrologue() {
97 // The two byte string is always in the old space. The Ascii string may be
98 // in either place. If it is in the old space we don't need to do anything.
99 if (Heap::InNewSpace(last_ascii_string_)) {
100 // Invalidate the cache.
101 last_ascii_string_ = NULL;
102 two_byte_cached_string_ = NULL;
103 }
104}
105
106
107void RegExpImpl::OldSpaceCollectionPrologue() {
108 last_ascii_string_ = NULL;
109 two_byte_cached_string_ = NULL;
110}
111
112
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000113Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
114 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000115 Handle<String> flags,
116 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000117 // Ensure that the constructor function has been loaded.
118 if (!constructor->IsLoaded()) {
119 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000120 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000121 }
122 // Call the construct code with 2 arguments.
123 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
124 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000125 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000126}
127
128
129// Converts a source string to a 16 bit flat string or a SlicedString containing
130// a 16 bit flat string).
131Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
132 if (*subject == last_ascii_string_) {
133 ASSERT(two_byte_cached_string_ != NULL);
134 return Handle<String>(String::cast(two_byte_cached_string_));
135 }
136 Handle<String> two_byte_string = StringToTwoByte(subject);
137 last_ascii_string_ = *subject;
138 two_byte_cached_string_ = *two_byte_string;
139 return two_byte_string;
140}
141
142
143// Converts a source string to a 16 bit flat string or a SlicedString containing
144// a 16 bit flat string).
145Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000146 StringShape shape(*pattern);
147 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000148 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000149 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000150 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000151 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000152 String::cast(ConsString::cast(*pattern)->first()) :
153 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000154 ASSERT(flat_string->IsString());
155 StringShape flat_shape(*flat_string);
156 ASSERT(!flat_shape.IsCons());
157 ASSERT(flat_shape.IsSequential() ||
158 flat_shape.IsSliced() ||
159 flat_shape.IsExternal());
160 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000161 return flat_string;
162 }
163
ager@chromium.org870a0b62008-11-04 11:43:05 +0000164 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000165 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000166 Factory::NewRawTwoByteString(len, TENURED);
167 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
168 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000169 return two_byte_string;
170}
171
172
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000173static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
174 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000175 StringShape shape(*str);
176 for (int i = 0; i < str->length(shape); i++) {
177 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000178 case 'i':
179 flags |= JSRegExp::IGNORE_CASE;
180 break;
181 case 'g':
182 flags |= JSRegExp::GLOBAL;
183 break;
184 case 'm':
185 flags |= JSRegExp::MULTILINE;
186 break;
187 }
188 }
189 return JSRegExp::Flags(flags);
190}
191
192
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000193static inline void ThrowRegExpException(Handle<JSRegExp> re,
194 Handle<String> pattern,
195 Handle<String> error_text,
196 const char* message) {
197 Handle<JSArray> array = Factory::NewJSArray(2);
198 SetElement(array, 0, pattern);
199 SetElement(array, 1, error_text);
200 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
201 Top::Throw(*regexp_err);
202}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000203
204
ager@chromium.org8bb60582008-12-11 12:02:20 +0000205// Generic RegExp methods. Dispatches to implementation specific methods.
206
207
208class OffsetsVector {
209 public:
210 inline OffsetsVector(int num_registers)
211 : offsets_vector_length_(num_registers) {
212 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
213 vector_ = NewArray<int>(offsets_vector_length_);
214 } else {
215 vector_ = static_offsets_vector_;
216 }
217 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000218 inline ~OffsetsVector() {
219 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
220 DeleteArray(vector_);
221 vector_ = NULL;
222 }
223 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000224 inline int* vector() { return vector_; }
225 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000226
227 private:
228 int* vector_;
229 int offsets_vector_length_;
230 static const int kStaticOffsetsVectorSize = 50;
231 static int static_offsets_vector_[kStaticOffsetsVectorSize];
232};
233
234
235int OffsetsVector::static_offsets_vector_[
236 OffsetsVector::kStaticOffsetsVectorSize];
237
238
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000239Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
240 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000241 Handle<String> flag_str) {
242 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
243 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
244 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000245 LOG(RegExpCompileEvent(re, in_cache));
246
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000247 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000248 if (in_cache) {
249 re->set_data(*cached);
250 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000251 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000252 FlattenString(pattern);
253 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000254 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000255 FlatStringReader reader(pattern);
256 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
257 // Throw an exception if we fail to parse the pattern.
258 ThrowRegExpException(re,
259 pattern,
260 parse_result.error,
261 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000262 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000263 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000264
265 if (parse_result.simple && !flags.is_ignore_case()) {
266 // Parse-tree is a single atom that is equal to the pattern.
267 result = AtomCompile(re, pattern, flags, pattern);
268 } else if (parse_result.tree->IsAtom() &&
269 !flags.is_ignore_case() &&
270 parse_result.capture_count == 0) {
271 RegExpAtom* atom = parse_result.tree->AsAtom();
272 Vector<const uc16> atom_pattern = atom->data();
273 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
274 result = AtomCompile(re, pattern, flags, atom_string);
275 } else if (FLAG_irregexp) {
276 result = IrregexpPrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000277 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000278 result = JscrePrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000279 }
280 Object* data = re->data();
281 if (data->IsFixedArray()) {
282 // If compilation succeeded then the data is set on the regexp
283 // and we can store it in the cache.
284 Handle<FixedArray> data(FixedArray::cast(re->data()));
285 CompilationCache::PutRegExp(pattern, flags, data);
286 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000287 }
288
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000289 return result;
290}
291
292
293Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
294 Handle<String> subject,
295 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000296 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000297 case JSRegExp::ATOM:
298 return AtomExec(regexp, subject, index);
299 case JSRegExp::IRREGEXP: {
300 Handle<Object> result = IrregexpExec(regexp, subject, index);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000301 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000302 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000303 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000304 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000305 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000306 default:
307 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000308 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000309 }
310}
311
312
313Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
314 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000315 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000316 case JSRegExp::ATOM:
317 return AtomExecGlobal(regexp, subject);
318 case JSRegExp::IRREGEXP: {
319 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000320 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000321 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000322 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000323 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000324 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000325 default:
326 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000327 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000328 }
329}
330
331
ager@chromium.org8bb60582008-12-11 12:02:20 +0000332// RegExp Atom implementation: Simple string search using indexOf.
333
334
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000335Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000336 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000337 JSRegExp::Flags flags,
338 Handle<String> match_pattern) {
339 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000340 return re;
341}
342
343
344Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
345 Handle<String> subject,
346 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000347 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000348
349 uint32_t start_index;
350 if (!Array::IndexFromObject(*index, &start_index)) {
351 return Handle<Smi>(Smi::FromInt(-1));
352 }
353
ager@chromium.org7c537e22008-10-16 08:43:32 +0000354 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000355 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000356
357 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000358 array->set(0, Smi::FromInt(value));
359 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000360 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000361}
362
363
364Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
365 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000366 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000367 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000368 int index = 0;
369 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000370 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000371 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000372 while (true) {
ager@chromium.org7c537e22008-10-16 08:43:32 +0000373 int value = -1;
374 if (index + needle_length <= subject_length) {
375 value = Runtime::StringMatch(subject, needle, index);
376 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000377 if (value == -1) break;
378 HandleScope scope;
379 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000380
381 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000382 array->set(0, Smi::FromInt(value));
383 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000384 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000385 SetElement(result, match_count, pair);
386 match_count++;
387 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000388 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000389 }
390 return result;
391}
392
393
ager@chromium.org8bb60582008-12-11 12:02:20 +0000394// JSCRE implementation.
395
396
397int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
398 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
399 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
400}
401
402
403ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
404 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
405 return ByteArray::cast(value->get(kJscreInternalIndex));
406}
407
408
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000409Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
410 Handle<String> pattern,
411 JSRegExp::Flags flags) {
412 Handle<Object> value(Heap::undefined_value());
413 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
414 return re;
415}
416
417
ager@chromium.org8bb60582008-12-11 12:02:20 +0000418static inline Object* JscreDoCompile(String* pattern,
419 JSRegExp::Flags flags,
420 unsigned* number_of_captures,
421 const char** error_message,
422 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000423 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
424 ? v8::jscre::JSRegExpIgnoreCase
425 : v8::jscre::JSRegExpDoNotIgnoreCase;
426 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
427 ? v8::jscre::JSRegExpMultiline
428 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000429 *error_message = NULL;
430 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000431 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
432 pattern->length(),
433 case_option,
434 multiline_option,
435 number_of_captures,
436 error_message,
437 &JSREMalloc,
438 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000439 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000440 malloc_failure->IsOutOfMemoryFailure())) {
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000441 return malloc_failure;
442 } else {
443 // It doesn't matter which object we return here, we just need to return
444 // a non-failure to indicate to the GC-retry code that there was no
445 // allocation failure.
446 return pattern;
447 }
448}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000449
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000450
ager@chromium.org8bb60582008-12-11 12:02:20 +0000451static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
452 JSRegExp::Flags flags,
453 unsigned* number_of_captures,
454 const char** error_message,
455 v8::jscre::JscreRegExp** code) {
456 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
457 flags,
458 number_of_captures,
459 error_message,
460 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000461}
462
463
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000464Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
465 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
466 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
467
468 Handle<String> pattern(re->Pattern());
469 JSRegExp::Flags flags = re->GetFlags();
470
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000471 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
472
473 unsigned number_of_captures;
474 const char* error_message = NULL;
475
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000476 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000477 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000478
ager@chromium.org8bb60582008-12-11 12:02:20 +0000479 JscreCompileWithRetryAfterGC(two_byte_pattern,
480 flags,
481 &number_of_captures,
482 &error_message,
483 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000484
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000485 if (code == NULL) {
486 // Throw an exception.
487 Handle<JSArray> array = Factory::NewJSArray(2);
488 SetElement(array, 0, pattern);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000489 const char* message =
490 (error_message == NULL) ? "Unknown regexp error" : error_message;
491 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000492 Handle<Object> regexp_err =
493 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000494 Top::Throw(*regexp_err);
495 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000496 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000497
498 // Convert the return address to a ByteArray pointer.
499 Handle<ByteArray> internal(
500 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
501
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000502 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
503 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
504 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000505 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
506
507 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000508}
509
510
ager@chromium.org8bb60582008-12-11 12:02:20 +0000511Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
512 Handle<String> subject,
513 Handle<Object> index) {
514 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
515 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
516 Handle<Object> compile_result = JscreCompile(regexp);
517 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000518 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000519 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000520
ager@chromium.org8bb60582008-12-11 12:02:20 +0000521 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000522
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000524
ager@chromium.org8bb60582008-12-11 12:02:20 +0000525 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000526
ager@chromium.org8bb60582008-12-11 12:02:20 +0000527 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000528
ager@chromium.org8bb60582008-12-11 12:02:20 +0000529 return JscreExecOnce(regexp,
530 num_captures,
531 subject,
532 previous_index,
533 subject16->GetTwoByteData(),
534 offsets.vector(),
535 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000536}
537
538
539Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
540 int num_captures,
541 Handle<String> subject,
542 int previous_index,
543 const uc16* two_byte_subject,
544 int* offsets_vector,
545 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000546 int rc;
547 {
548 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000549 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000550 const v8::jscre::JscreRegExp* js_regexp =
551 reinterpret_cast<v8::jscre::JscreRegExp*>(
552 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000553
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000554 rc = v8::jscre::jsRegExpExecute(js_regexp,
555 two_byte_subject,
556 subject->length(),
557 previous_index,
558 offsets_vector,
559 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000560 }
561
562 // The KJS JavaScript engine returns null (ie, a failed match) when
563 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000564 if (rc == v8::jscre::JSRegExpErrorNoMatch
565 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000566 return Factory::null_value();
567 }
568
569 // Other JSRE errors:
570 if (rc < 0) {
571 // Throw an exception.
572 Handle<Object> code(Smi::FromInt(rc));
573 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
574 Handle<Object> regexp_err(
575 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
576 return Handle<Object>(Top::Throw(*regexp_err));
577 }
578
ager@chromium.org7c537e22008-10-16 08:43:32 +0000579 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000580 // The captures come in (start, end+1) pairs.
581 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000582 array->set(i, Smi::FromInt(offsets_vector[i]));
583 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000584 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000585 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000586}
587
588
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000589Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
590 Handle<String> subject) {
591 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
592 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
593 Handle<Object> compile_result = JscreCompile(regexp);
594 if (compile_result.is_null()) return compile_result;
595 }
596 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
597
598 // Prepare space for the return values.
599 int num_captures = JscreNumberOfCaptures(regexp);
600
601 OffsetsVector offsets((num_captures + 1) * 3);
602
603 int previous_index = 0;
604
605 Handle<JSArray> result = Factory::NewJSArray(0);
606 int i = 0;
607 Handle<Object> matches;
608
609 Handle<String> subject16 = CachedStringToTwoByte(subject);
610
611 do {
612 if (previous_index > subject->length() || previous_index < 0) {
613 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
614 // string length, there is no match.
615 matches = Factory::null_value();
616 } else {
617 matches = JscreExecOnce(regexp,
618 num_captures,
619 subject,
620 previous_index,
621 subject16->GetTwoByteData(),
622 offsets.vector(),
623 offsets.length());
624
625 if (matches->IsJSArray()) {
626 SetElement(result, i, matches);
627 i++;
628 previous_index = offsets.vector()[1];
629 if (offsets.vector()[0] == offsets.vector()[1]) {
630 previous_index++;
631 }
632 }
633 }
634 } while (matches->IsJSArray());
635
636 // If we exited the loop with an exception, throw it.
637 if (matches->IsNull()) {
638 // Exited loop normally.
639 return result;
640 } else {
641 // Exited loop with the exception in matches.
642 return matches;
643 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000644}
645
646
ager@chromium.org8bb60582008-12-11 12:02:20 +0000647// Irregexp implementation.
648
649
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000650// Retrieves a compiled version of the regexp for either ASCII or non-ASCII
651// strings. If the compiled version doesn't already exist, it is compiled
652// from the source pattern.
653// Irregexp is not feature complete yet. If there is something in the
654// regexp that the compiler cannot currently handle, an empty
655// handle is returned, but no exception is thrown.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000656static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
657 bool is_ascii) {
658 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
659 Handle<FixedArray> alternatives(
660 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
661 ASSERT_EQ(2, alternatives->length());
662
663 int index = is_ascii ? 0 : 1;
664 Object* entry = alternatives->get(index);
665 if (!entry->IsNull()) {
666 return Handle<FixedArray>(FixedArray::cast(entry));
667 }
668
669 // Compile the RegExp.
670 ZoneScope zone_scope(DELETE_ON_EXIT);
671
672 JSRegExp::Flags flags = re->GetFlags();
673
674 Handle<String> pattern(re->Pattern());
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000675 if (!pattern->IsFlat(StringShape(*pattern))) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000676 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000677 }
678
679 RegExpCompileData compile_data;
680 FlatStringReader reader(pattern);
681 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
682 // Throw an exception if we fail to parse the pattern.
683 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
684 ThrowRegExpException(re,
685 pattern,
686 compile_data.error,
687 "malformed_regexp");
688 return Handle<FixedArray>::null();
689 }
690 Handle<FixedArray> compiled_entry =
691 RegExpEngine::Compile(&compile_data,
692 flags.is_ignore_case(),
693 flags.is_multiline(),
694 pattern,
695 is_ascii);
696 if (!compiled_entry.is_null()) {
697 alternatives->set(index, *compiled_entry);
698 }
699 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000700}
701
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000702
ager@chromium.org8bb60582008-12-11 12:02:20 +0000703int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
704 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000705}
706
707
ager@chromium.org8bb60582008-12-11 12:02:20 +0000708int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
709 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000710}
711
712
ager@chromium.org8bb60582008-12-11 12:02:20 +0000713Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
714 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
715 == RegExpMacroAssembler::kBytecodeImplementation);
716 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000717}
718
719
ager@chromium.org8bb60582008-12-11 12:02:20 +0000720Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
721 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
722 != RegExpMacroAssembler::kBytecodeImplementation);
723 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
724}
725
726
727Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
728 Handle<String> pattern,
729 JSRegExp::Flags flags) {
730 // Make space for ASCII and UC16 versions.
731 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
732 alternatives->set_null(0);
733 alternatives->set_null(1);
734 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
735 return re;
736}
737
738
739Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
740 Handle<String> subject,
741 Handle<Object> index) {
742 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
743 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
744
745 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
746 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
747 if (irregexp.is_null()) {
748 // We can't handle the RegExp with IRRegExp.
749 return Handle<Object>::null();
750 }
751
752 // Prepare space for the return values.
753 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
754 OffsetsVector offsets(number_of_registers);
755
756 int num_captures = IrregexpNumberOfCaptures(irregexp);
757
758 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
759
760#ifdef DEBUG
761 if (FLAG_trace_regexp_bytecodes) {
762 String* pattern = regexp->Pattern();
763 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
764 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
765 }
766#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000767
768 if (!subject->IsFlat(StringShape(*subject))) {
769 FlattenString(subject);
770 }
771
ager@chromium.org8bb60582008-12-11 12:02:20 +0000772 return IrregexpExecOnce(irregexp,
773 num_captures,
774 subject,
775 previous_index,
776 offsets.vector(),
777 offsets.length());
778}
779
780
781Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
782 Handle<String> subject) {
783 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
784
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000785 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000786 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
787 if (irregexp.is_null()) {
788 return Handle<Object>::null();
789 }
790
791 // Prepare space for the return values.
792 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
793 OffsetsVector offsets(number_of_registers);
794
795 int previous_index = 0;
796
797 Handle<JSArray> result = Factory::NewJSArray(0);
798 int i = 0;
799 Handle<Object> matches;
800
ager@chromium.orga8d58b82009-02-02 08:33:31 +0000801 if (!subject->IsFlat(StringShape(*subject))) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000802 FlattenString(subject);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000803 }
804
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000805 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000806 if (previous_index > subject->length() || previous_index < 0) {
807 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
808 // string length, there is no match.
809 matches = Factory::null_value();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000810 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000811 } else {
812#ifdef DEBUG
813 if (FLAG_trace_regexp_bytecodes) {
814 String* pattern = regexp->Pattern();
815 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
816 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
817 }
818#endif
ager@chromium.org8bb60582008-12-11 12:02:20 +0000819 matches = IrregexpExecOnce(irregexp,
820 IrregexpNumberOfCaptures(irregexp),
821 subject,
822 previous_index,
823 offsets.vector(),
824 offsets.length());
825
ager@chromium.org6f10e412009-02-13 10:11:16 +0000826 if (matches.is_null()) {
827 ASSERT(Top::has_pending_exception());
828 return matches;
829 }
830
ager@chromium.org8bb60582008-12-11 12:02:20 +0000831 if (matches->IsJSArray()) {
832 SetElement(result, i, matches);
833 i++;
834 previous_index = offsets.vector()[1];
835 if (offsets.vector()[0] == offsets.vector()[1]) {
836 previous_index++;
837 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000838 } else {
ager@chromium.org6f10e412009-02-13 10:11:16 +0000839 ASSERT(matches->IsNull());
840 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000841 }
842 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000843 }
844}
845
846
847Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
848 int num_captures,
849 Handle<String> subject,
850 int previous_index,
851 int* offsets_vector,
852 int offsets_vector_length) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000853 ASSERT(subject->IsFlat(StringShape(*subject)));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000854 bool rc;
855
856 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
857
ager@chromium.org8bb60582008-12-11 12:02:20 +0000858 switch (tag) {
859 case RegExpMacroAssembler::kIA32Implementation: {
860#ifndef ARM
861 Handle<Code> code = IrregexpNativeCode(irregexp);
862
863 StringShape shape(*subject);
864
865 // Character offsets into string.
866 int start_offset = previous_index;
867 int end_offset = subject->length(shape);
868
869 if (shape.IsCons()) {
870 subject = Handle<String>(ConsString::cast(*subject)->first());
871 } else if (shape.IsSliced()) {
872 SlicedString* slice = SlicedString::cast(*subject);
873 start_offset += slice->start();
874 end_offset += slice->start();
875 subject = Handle<String>(slice->buffer());
876 }
877
878 // String is now either Sequential or External
879 StringShape flatshape(*subject);
880 bool is_ascii = flatshape.IsAsciiRepresentation();
881 int char_size_shift = is_ascii ? 0 : 1;
882
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000883 RegExpMacroAssemblerIA32::Result res;
884
ager@chromium.org8bb60582008-12-11 12:02:20 +0000885 if (flatshape.IsExternal()) {
886 const byte* address;
887 if (is_ascii) {
888 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
889 address = reinterpret_cast<const byte*>(ext->resource()->data());
890 } else {
891 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
892 address = reinterpret_cast<const byte*>(ext->resource()->data());
893 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000894 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000895 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000896 const_cast<Address*>(&address),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000897 start_offset << char_size_shift,
898 end_offset << char_size_shift,
899 offsets_vector,
900 previous_index == 0);
901 } else { // Sequential string
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000902 ASSERT(StringShape(*subject).IsSequential());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000903 Address char_address =
904 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
905 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
906 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000907 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000908 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000909 reinterpret_cast<Address*>(subject.location()),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000910 byte_offset + (start_offset << char_size_shift),
911 byte_offset + (end_offset << char_size_shift),
912 offsets_vector,
913 previous_index == 0);
914 }
915
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000916 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
917 ASSERT(Top::has_pending_exception());
918 return Handle<Object>::null();
919 }
920 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
921
ager@chromium.org8bb60582008-12-11 12:02:20 +0000922 if (rc) {
923 // Capture values are relative to start_offset only.
924 for (int i = 0; i < offsets_vector_length; i++) {
925 if (offsets_vector[i] >= 0) {
926 offsets_vector[i] += previous_index;
927 }
928 }
929 }
930 break;
931#else
932 UNIMPLEMENTED();
933 rc = false;
934 break;
935#endif
936 }
937 case RegExpMacroAssembler::kBytecodeImplementation: {
938 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
939 offsets_vector[i] = -1;
940 }
941 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
942
943 rc = IrregexpInterpreter::Match(byte_codes,
944 subject,
945 offsets_vector,
946 previous_index);
947 break;
948 }
949 case RegExpMacroAssembler::kARMImplementation:
950 default:
951 UNREACHABLE();
952 rc = false;
953 break;
954 }
955
956 if (!rc) {
957 return Factory::null_value();
958 }
959
960 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
961 // The captures come in (start, end+1) pairs.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000962 for (int i = 0; i < 2 * (num_captures + 1); i += 2) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000963 array->set(i, Smi::FromInt(offsets_vector[i]));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000964 array->set(i + 1, Smi::FromInt(offsets_vector[i + 1]));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000965 }
966 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000967}
968
969
970// -------------------------------------------------------------------
971// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000972//
973// The Irregexp regular expression engine is intended to be a complete
974// implementation of ECMAScript regular expressions. It generates either
975// bytecodes or native code.
976
977// The Irregexp regexp engine is structured in three steps.
978// 1) The parser generates an abstract syntax tree. See ast.cc.
979// 2) From the AST a node network is created. The nodes are all
980// subclasses of RegExpNode. The nodes represent states when
981// executing a regular expression. Several optimizations are
982// performed on the node network.
983// 3) From the nodes we generate either byte codes or native code
984// that can actually execute the regular expression (perform
985// the search). The code generation step is described in more
986// detail below.
987
988// Code generation.
989//
990// The nodes are divided into four main categories.
991// * Choice nodes
992// These represent places where the regular expression can
993// match in more than one way. For example on entry to an
994// alternation (foo|bar) or a repetition (*, +, ? or {}).
995// * Action nodes
996// These represent places where some action should be
997// performed. Examples include recording the current position
998// in the input string to a register (in order to implement
999// captures) or other actions on register for example in order
1000// to implement the counters needed for {} repetitions.
1001// * Matching nodes
1002// These attempt to match some element part of the input string.
1003// Examples of elements include character classes, plain strings
1004// or back references.
1005// * End nodes
1006// These are used to implement the actions required on finding
1007// a successful match or failing to find a match.
1008//
1009// The code generated (whether as byte codes or native code) maintains
1010// some state as it runs. This consists of the following elements:
1011//
1012// * The capture registers. Used for string captures.
1013// * Other registers. Used for counters etc.
1014// * The current position.
1015// * The stack of backtracking information. Used when a matching node
1016// fails to find a match and needs to try an alternative.
1017//
1018// Conceptual regular expression execution model:
1019//
1020// There is a simple conceptual model of regular expression execution
1021// which will be presented first. The actual code generated is a more
1022// efficient simulation of the simple conceptual model:
1023//
1024// * Choice nodes are implemented as follows:
1025// For each choice except the last {
1026// push current position
1027// push backtrack code location
1028// <generate code to test for choice>
1029// backtrack code location:
1030// pop current position
1031// }
1032// <generate code to test for last choice>
1033//
1034// * Actions nodes are generated as follows
1035// <push affected registers on backtrack stack>
1036// <generate code to perform action>
1037// push backtrack code location
1038// <generate code to test for following nodes>
1039// backtrack code location:
1040// <pop affected registers to restore their state>
1041// <pop backtrack location from stack and go to it>
1042//
1043// * Matching nodes are generated as follows:
1044// if input string matches at current position
1045// update current position
1046// <generate code to test for following nodes>
1047// else
1048// <pop backtrack location from stack and go to it>
1049//
1050// Thus it can be seen that the current position is saved and restored
1051// by the choice nodes, whereas the registers are saved and restored by
1052// by the action nodes that manipulate them.
1053//
1054// The other interesting aspect of this model is that nodes are generated
1055// at the point where they are needed by a recursive call to Emit(). If
1056// the node has already been code generated then the Emit() call will
1057// generate a jump to the previously generated code instead. In order to
1058// limit recursion it is possible for the Emit() function to put the node
1059// on a work list for later generation and instead generate a jump. The
1060// destination of the jump is resolved later when the code is generated.
1061//
1062// Actual regular expression code generation.
1063//
1064// Code generation is actually more complicated than the above. In order
1065// to improve the efficiency of the generated code some optimizations are
1066// performed
1067//
1068// * Choice nodes have 1-character lookahead.
1069// A choice node looks at the following character and eliminates some of
1070// the choices immediately based on that character. This is not yet
1071// implemented.
1072// * Simple greedy loops store reduced backtracking information.
1073// A quantifier like /.*foo/m will greedily match the whole input. It will
1074// then need to backtrack to a point where it can match "foo". The naive
1075// implementation of this would push each character position onto the
1076// backtracking stack, then pop them off one by one. This would use space
1077// proportional to the length of the input string. However since the "."
1078// can only match in one way and always has a constant length (in this case
1079// of 1) it suffices to store the current position on the top of the stack
1080// once. Matching now becomes merely incrementing the current position and
1081// backtracking becomes decrementing the current position and checking the
1082// result against the stored current position. This is faster and saves
1083// space.
1084// * The current state is virtualized.
1085// This is used to defer expensive operations until it is clear that they
1086// are needed and to generate code for a node more than once, allowing
1087// specialized an efficient versions of the code to be created. This is
1088// explained in the section below.
1089//
1090// Execution state virtualization.
1091//
1092// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +00001093// manipulation in an object called the Trace. The Trace object can record a
1094// current position offset, an optional backtrack code location on the top of
1095// the virtualized backtrack stack and some register changes. When a node is
1096// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +00001097// will emit code to bring the actual state into line with the virtual state.
1098// Avoiding flushing the state can postpone some work (eg updates of capture
1099// registers). Postponing work can save time when executing the regular
1100// expression since it may be found that the work never has to be done as a
1101// failure to match can occur. In addition it is much faster to jump to a
1102// known backtrack code location than it is to pop an unknown backtrack
1103// location from the stack and jump there.
1104//
ager@chromium.org32912102009-01-16 10:38:43 +00001105// The virtual state found in the Trace affects code generation. For example
1106// the virtual state contains the difference between the actual current
1107// position and the virtual current position, and matching code needs to use
1108// this offset to attempt a match in the correct location of the input
1109// string. Therefore code generated for a non-trivial trace is specialized
1110// to that trace. The code generator therefore has the ability to generate
1111// code for each node several times. In order to limit the size of the
1112// generated code there is an arbitrary limit on how many specialized sets of
1113// code may be generated for a given node. If the limit is reached, the
1114// trace is flushed and a generic version of the code for a node is emitted.
1115// This is subsequently used for that node. The code emitted for non-generic
1116// trace is not recorded in the node and so it cannot currently be reused in
1117// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001118
1119
1120void RegExpTree::AppendToText(RegExpText* text) {
1121 UNREACHABLE();
1122}
1123
1124
1125void RegExpAtom::AppendToText(RegExpText* text) {
1126 text->AddElement(TextElement::Atom(this));
1127}
1128
1129
1130void RegExpCharacterClass::AppendToText(RegExpText* text) {
1131 text->AddElement(TextElement::CharClass(this));
1132}
1133
1134
1135void RegExpText::AppendToText(RegExpText* text) {
1136 for (int i = 0; i < elements()->length(); i++)
1137 text->AddElement(elements()->at(i));
1138}
1139
1140
1141TextElement TextElement::Atom(RegExpAtom* atom) {
1142 TextElement result = TextElement(ATOM);
1143 result.data.u_atom = atom;
1144 return result;
1145}
1146
1147
1148TextElement TextElement::CharClass(
1149 RegExpCharacterClass* char_class) {
1150 TextElement result = TextElement(CHAR_CLASS);
1151 result.data.u_char_class = char_class;
1152 return result;
1153}
1154
1155
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001156int TextElement::length() {
1157 if (type == ATOM) {
1158 return data.u_atom->length();
1159 } else {
1160 ASSERT(type == CHAR_CLASS);
1161 return 1;
1162 }
1163}
1164
1165
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001166DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1167 if (table_ == NULL) {
1168 table_ = new DispatchTable();
1169 DispatchTableConstructor cons(table_, ignore_case);
1170 cons.BuildTable(this);
1171 }
1172 return table_;
1173}
1174
1175
1176class RegExpCompiler {
1177 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001178 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001179
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001180 int AllocateRegister() {
1181 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1182 reg_exp_too_big_ = true;
1183 return next_register_;
1184 }
1185 return next_register_++;
1186 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001187
1188 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1189 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001190 int capture_count,
1191 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001192
1193 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1194
1195 static const int kImplementationOffset = 0;
1196 static const int kNumberOfRegistersOffset = 0;
1197 static const int kCodeOffset = 1;
1198
1199 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1200 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001201
1202 static const int kMaxRecursion = 100;
1203 inline int recursion_depth() { return recursion_depth_; }
1204 inline void IncrementRecursionDepth() { recursion_depth_++; }
1205 inline void DecrementRecursionDepth() { recursion_depth_--; }
1206
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001207 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1208
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001209 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001210 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001211
ager@chromium.org32912102009-01-16 10:38:43 +00001212 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001213 private:
1214 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001215 int next_register_;
1216 List<RegExpNode*>* work_list_;
1217 int recursion_depth_;
1218 RegExpMacroAssembler* macro_assembler_;
1219 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001220 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001221 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001222};
1223
1224
1225class RecursionCheck {
1226 public:
1227 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1228 compiler->IncrementRecursionDepth();
1229 }
1230 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1231 private:
1232 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001233};
1234
1235
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001236static Handle<FixedArray> IrregexpRegExpTooBig(Handle<String> pattern) {
1237 Handle<JSArray> array = Factory::NewJSArray(2);
1238 SetElement(array, 0, pattern);
1239 const char* message = "RegExp too big";
1240 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
1241 Handle<Object> regexp_err =
1242 Factory::NewSyntaxError("malformed_regexp", array);
1243 Top::Throw(*regexp_err);
1244 return Handle<FixedArray>();
1245}
1246
1247
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001248// Attempts to compile the regexp using an Irregexp code generator. Returns
1249// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001250RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001251 : next_register_(2 * (capture_count + 1)),
1252 work_list_(NULL),
1253 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001254 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001255 ascii_(ascii),
1256 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001257 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001258 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001259}
1260
1261
1262Handle<FixedArray> RegExpCompiler::Assemble(
1263 RegExpMacroAssembler* macro_assembler,
1264 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001265 int capture_count,
1266 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001267#ifdef DEBUG
1268 if (FLAG_trace_regexp_assembler)
1269 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1270 else
1271#endif
1272 macro_assembler_ = macro_assembler;
1273 List <RegExpNode*> work_list(0);
1274 work_list_ = &work_list;
1275 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001276 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001277 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001278 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001279 macro_assembler_->Bind(&fail);
1280 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001281 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001282 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001283 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001284 if (reg_exp_too_big_) return IrregexpRegExpTooBig(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001285 Handle<FixedArray> array =
1286 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1287 array->set(RegExpImpl::kIrregexpImplementationIndex,
1288 Smi::FromInt(macro_assembler_->Implementation()));
1289 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1290 Smi::FromInt(next_register_));
1291 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1292 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001293 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001294 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1295 work_list_ = NULL;
1296#ifdef DEBUG
1297 if (FLAG_trace_regexp_assembler) {
1298 delete macro_assembler_;
1299 }
1300#endif
1301 return array;
1302}
1303
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001304
ager@chromium.org32912102009-01-16 10:38:43 +00001305bool Trace::DeferredAction::Mentions(int that) {
1306 if (type() == ActionNode::CLEAR_CAPTURES) {
1307 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1308 return range.Contains(that);
1309 } else {
1310 return reg() == that;
1311 }
1312}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001313
ager@chromium.org32912102009-01-16 10:38:43 +00001314
1315bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001316 for (DeferredAction* action = actions_;
1317 action != NULL;
1318 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001319 if (action->Mentions(reg))
1320 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001321 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001322 return false;
1323}
1324
1325
ager@chromium.org32912102009-01-16 10:38:43 +00001326bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1327 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001328 for (DeferredAction* action = actions_;
1329 action != NULL;
1330 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001331 if (action->Mentions(reg)) {
1332 if (action->type() == ActionNode::STORE_POSITION) {
1333 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1334 return true;
1335 } else {
1336 return false;
1337 }
1338 }
1339 }
1340 return false;
1341}
1342
1343
1344int Trace::FindAffectedRegisters(OutSet* affected_registers) {
1345 int max_register = RegExpCompiler::kNoRegister;
1346 for (DeferredAction* action = actions_;
1347 action != NULL;
1348 action = action->next()) {
1349 if (action->type() == ActionNode::CLEAR_CAPTURES) {
1350 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1351 for (int i = range.from(); i <= range.to(); i++)
1352 affected_registers->Set(i);
1353 if (range.to() > max_register) max_register = range.to();
1354 } else {
1355 affected_registers->Set(action->reg());
1356 if (action->reg() > max_register) max_register = action->reg();
1357 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001358 }
1359 return max_register;
1360}
1361
1362
ager@chromium.org32912102009-01-16 10:38:43 +00001363void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1364 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001365 OutSet& registers_to_pop,
1366 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001367 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001368 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1369 else if (registers_to_clear.Get(reg)) {
1370 int clear_to = reg;
1371 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1372 reg--;
1373 }
1374 assembler->ClearRegisters(reg, clear_to);
1375 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001376 }
1377}
1378
1379
ager@chromium.org32912102009-01-16 10:38:43 +00001380void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1381 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001382 OutSet& affected_registers,
1383 OutSet* registers_to_pop,
1384 OutSet* registers_to_clear) {
1385 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1386 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1387
ager@chromium.org8bb60582008-12-11 12:02:20 +00001388 for (int reg = 0; reg <= max_register; reg++) {
1389 if (!affected_registers.Get(reg)) {
1390 continue;
1391 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001392 // Count pushes performed to force a stack limit check occasionally.
1393 int pushes = 0;
1394
1395 // The chronologically first deferred action in the trace
1396 // is used to infer the action needed to restore a register
1397 // to its previous state (or not, if it's safe to ignore it).
1398 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1399 DeferredActionUndoType undo_action = IGNORE;
1400
ager@chromium.org8bb60582008-12-11 12:02:20 +00001401 int value = 0;
1402 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001403 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001404 int store_position = -1;
1405 // This is a little tricky because we are scanning the actions in reverse
1406 // historical order (newest first).
1407 for (DeferredAction* action = actions_;
1408 action != NULL;
1409 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001410 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001411 switch (action->type()) {
1412 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001413 Trace::DeferredSetRegister* psr =
1414 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001415 if (!absolute) {
1416 value += psr->value();
1417 absolute = true;
1418 }
1419 // SET_REGISTER is currently only used for newly introduced loop
1420 // counters. They can have a significant previous value if they
1421 // occour in a loop. TODO(lrn): Propagate this information, so
1422 // we can set undo_action to IGNORE if we know there is no value to
1423 // restore.
1424 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001425 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001426 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001427 break;
1428 }
1429 case ActionNode::INCREMENT_REGISTER:
1430 if (!absolute) {
1431 value++;
1432 }
1433 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001434 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001435 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001436 break;
1437 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001438 Trace::DeferredCapture* pc =
1439 static_cast<Trace::DeferredCapture*>(action);
1440 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001441 store_position = pc->cp_offset();
1442 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001443
1444 // For captures we know that stores and clears alternate.
1445 // Other register, are never cleared, and if the occur
1446 // inside a loop, they might be assigned more than once.
1447 if (reg <= 1) {
1448 // Registers zero and one, aka "capture zero", is
1449 // always set correctly if we succeed. There is no
1450 // need to undo a setting on backtrack, because we
1451 // will set it again or fail.
1452 undo_action = IGNORE;
1453 } else {
1454 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1455 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001456 ASSERT(!absolute);
1457 ASSERT_EQ(value, 0);
1458 break;
1459 }
ager@chromium.org32912102009-01-16 10:38:43 +00001460 case ActionNode::CLEAR_CAPTURES: {
1461 // Since we're scanning in reverse order, if we've already
1462 // set the position we have to ignore historically earlier
1463 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001464 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001465 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001466 }
1467 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001468 ASSERT(!absolute);
1469 ASSERT_EQ(value, 0);
1470 break;
1471 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001472 default:
1473 UNREACHABLE();
1474 break;
1475 }
1476 }
1477 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001478 // Prepare for the undo-action (e.g., push if it's going to be popped).
1479 if (undo_action == RESTORE) {
1480 pushes++;
1481 RegExpMacroAssembler::StackCheckFlag stack_check =
1482 RegExpMacroAssembler::kNoStackLimitCheck;
1483 if (pushes == push_limit) {
1484 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1485 pushes = 0;
1486 }
1487
1488 assembler->PushRegister(reg, stack_check);
1489 registers_to_pop->Set(reg);
1490 } else if (undo_action == CLEAR) {
1491 registers_to_clear->Set(reg);
1492 }
1493 // Perform the chronologically last action (or accumulated increment)
1494 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001495 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001496 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001497 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001498 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001499 } else if (absolute) {
1500 assembler->SetRegister(reg, value);
1501 } else if (value != 0) {
1502 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001503 }
1504 }
1505}
1506
1507
ager@chromium.org8bb60582008-12-11 12:02:20 +00001508// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001509// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001510// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001511void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001512 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001513
iposva@chromium.org245aa852009-02-10 00:49:54 +00001514 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001515
1516 if (actions_ == NULL && backtrack() == NULL) {
1517 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001518 // a normal situation. We may also have to forget some information gained
1519 // through a quick check that was already performed.
1520 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001521 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001522 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001523 successor->Emit(compiler, &new_state);
1524 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001525 }
1526
1527 // Generate deferred actions here along with code to undo them again.
1528 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001529
ager@chromium.org8bb60582008-12-11 12:02:20 +00001530 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001531 OutSet registers_to_pop;
1532 OutSet registers_to_clear;
1533 PerformDeferredActions(assembler,
1534 max_register,
1535 affected_registers,
1536 &registers_to_pop,
1537 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001538 if (backtrack() != NULL) {
1539 // Here we have a concrete backtrack location. These are set up by choice
1540 // nodes and so they indicate that we have a deferred save of the current
1541 // position which we may need to emit here.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001542 assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001543 }
1544 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001545 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001546 }
1547
1548 // Create a new trivial state and generate the node with that.
1549 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001550 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001551 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001552 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001553
1554 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001555 assembler->Bind(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001556 if (backtrack() != NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001557 assembler->PopCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001558 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001559 RestoreAffectedRegisters(assembler,
1560 max_register,
1561 registers_to_pop,
1562 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001563 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001564 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001565 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001566 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001567 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001568}
1569
1570
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001571void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001572 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001573
1574 // Omit flushing the trace. We discard the entire stack frame anyway.
1575
ager@chromium.org8bb60582008-12-11 12:02:20 +00001576 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001577 // We are completely independent of the trace, since we ignore it,
1578 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001579 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001580 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001581
1582 // Throw away everything on the backtrack stack since the start
1583 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001584 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1585 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001586 if (clear_capture_count_ > 0) {
1587 // Clear any captures that might have been performed during the success
1588 // of the body of the negative look-ahead.
1589 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1590 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1591 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001592 // Now that we have unwound the stack we find at the top of the stack the
1593 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001594 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001595}
1596
1597
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001598void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001599 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001600 trace->Flush(compiler, this);
1601 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001602 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001603 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001604 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001605 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001606 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001607 switch (action_) {
1608 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001609 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001610 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001611 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001612 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001613 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001614 case NEGATIVE_SUBMATCH_SUCCESS:
1615 // This case is handled in a different virtual method.
1616 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001617 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001618 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001619}
1620
1621
1622void GuardedAlternative::AddGuard(Guard* guard) {
1623 if (guards_ == NULL)
1624 guards_ = new ZoneList<Guard*>(1);
1625 guards_->Add(guard);
1626}
1627
1628
ager@chromium.org8bb60582008-12-11 12:02:20 +00001629ActionNode* ActionNode::SetRegister(int reg,
1630 int val,
1631 RegExpNode* on_success) {
1632 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001633 result->data_.u_store_register.reg = reg;
1634 result->data_.u_store_register.value = val;
1635 return result;
1636}
1637
1638
1639ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1640 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1641 result->data_.u_increment_register.reg = reg;
1642 return result;
1643}
1644
1645
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001646ActionNode* ActionNode::StorePosition(int reg,
1647 bool is_capture,
1648 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001649 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1650 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001651 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001652 return result;
1653}
1654
1655
ager@chromium.org32912102009-01-16 10:38:43 +00001656ActionNode* ActionNode::ClearCaptures(Interval range,
1657 RegExpNode* on_success) {
1658 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1659 result->data_.u_clear_captures.range_from = range.from();
1660 result->data_.u_clear_captures.range_to = range.to();
1661 return result;
1662}
1663
1664
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001665ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1666 int position_reg,
1667 RegExpNode* on_success) {
1668 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1669 result->data_.u_submatch.stack_pointer_register = stack_reg;
1670 result->data_.u_submatch.current_position_register = position_reg;
1671 return result;
1672}
1673
1674
ager@chromium.org8bb60582008-12-11 12:02:20 +00001675ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1676 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001677 int clear_register_count,
1678 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001679 RegExpNode* on_success) {
1680 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001681 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001682 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001683 result->data_.u_submatch.clear_register_count = clear_register_count;
1684 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001685 return result;
1686}
1687
1688
ager@chromium.org32912102009-01-16 10:38:43 +00001689ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1690 int repetition_register,
1691 int repetition_limit,
1692 RegExpNode* on_success) {
1693 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1694 result->data_.u_empty_match_check.start_register = start_register;
1695 result->data_.u_empty_match_check.repetition_register = repetition_register;
1696 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1697 return result;
1698}
1699
1700
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001701#define DEFINE_ACCEPT(Type) \
1702 void Type##Node::Accept(NodeVisitor* visitor) { \
1703 visitor->Visit##Type(this); \
1704 }
1705FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1706#undef DEFINE_ACCEPT
1707
1708
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001709void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1710 visitor->VisitLoopChoice(this);
1711}
1712
1713
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001714// -------------------------------------------------------------------
1715// Emit code.
1716
1717
1718void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1719 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001720 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001721 switch (guard->op()) {
1722 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001723 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001724 macro_assembler->IfRegisterGE(guard->reg(),
1725 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001726 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001727 break;
1728 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001729 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001730 macro_assembler->IfRegisterLT(guard->reg(),
1731 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001732 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001733 break;
1734 }
1735}
1736
1737
1738static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1739static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1740
1741
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001742// Only emits non-letters (things that don't have case). Only used for case
1743// independent matches.
1744static inline bool EmitAtomNonLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001745 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001746 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001747 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001748 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001749 bool check,
1750 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001752 int length = uncanonicalize.get(c, '\0', chars);
1753 bool checked = false;
1754 if (length <= 1) {
1755 if (!preloaded) {
1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1757 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001758 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001759 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001760 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001761 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001762}
1763
1764
1765static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001766 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001767 uc16 c1,
1768 uc16 c2,
1769 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001770 uc16 char_mask;
1771 if (ascii) {
1772 char_mask = String::kMaxAsciiCharCode;
1773 } else {
1774 char_mask = String::kMaxUC16CharCode;
1775 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001776 uc16 exor = c1 ^ c2;
1777 // Check whether exor has only one bit set.
1778 if (((exor - 1) & exor) == 0) {
1779 // If c1 and c2 differ only by one bit.
1780 // Ecma262UnCanonicalize always gives the highest number last.
1781 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001782 uc16 mask = char_mask ^ exor;
1783 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001784 return true;
1785 }
1786 ASSERT(c2 > c1);
1787 uc16 diff = c2 - c1;
1788 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1789 // If the characters differ by 2^n but don't differ by one bit then
1790 // subtract the difference from the found character, then do the or
1791 // trick. We avoid the theoretical case where negative numbers are
1792 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001793 uc16 mask = char_mask ^ diff;
1794 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1795 diff,
1796 mask,
1797 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001798 return true;
1799 }
1800 return false;
1801}
1802
1803
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001804// Only emits letters (things that have case). Only used for case independent
1805// matches.
1806static inline bool EmitAtomLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001807 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001808 bool ascii,
1809 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001810 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001811 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001812 bool check,
1813 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001815 int length = uncanonicalize.get(c, '\0', chars);
1816 if (length <= 1) return false;
1817 // We may not need to check against the end of the input string
1818 // if this character lies before a character that matched.
1819 if (!preloaded) {
1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001821 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001822 Label ok;
1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1824 switch (length) {
1825 case 2: {
1826 if (ShortCutEmitCharacterPair(macro_assembler,
1827 ascii,
1828 chars[0],
1829 chars[1],
1830 on_failure)) {
1831 } else {
1832 macro_assembler->CheckCharacter(chars[0], &ok);
1833 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1834 macro_assembler->Bind(&ok);
1835 }
1836 break;
1837 }
1838 case 4:
1839 macro_assembler->CheckCharacter(chars[3], &ok);
1840 // Fall through!
1841 case 3:
1842 macro_assembler->CheckCharacter(chars[0], &ok);
1843 macro_assembler->CheckCharacter(chars[1], &ok);
1844 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1845 macro_assembler->Bind(&ok);
1846 break;
1847 default:
1848 UNREACHABLE();
1849 break;
1850 }
1851 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001852}
1853
1854
1855static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1856 RegExpCharacterClass* cc,
1857 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001858 Label* on_failure,
1859 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001860 bool ascii,
1861 bool preloaded) {
1862 if (cc->is_standard() &&
1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1864 cp_offset,
1865 check_offset,
1866 on_failure)) {
1867 return;
1868 }
1869
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001870 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001871 int max_char;
1872 if (ascii) {
1873 max_char = String::kMaxAsciiCharCode;
1874 } else {
1875 max_char = String::kMaxUC16CharCode;
1876 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001877
1878 Label success;
1879
1880 Label* char_is_in_class =
1881 cc->is_negated() ? on_failure : &success;
1882
1883 int range_count = ranges->length();
1884
ager@chromium.org8bb60582008-12-11 12:02:20 +00001885 int last_valid_range = range_count - 1;
1886 while (last_valid_range >= 0) {
1887 CharacterRange& range = ranges->at(last_valid_range);
1888 if (range.from() <= max_char) {
1889 break;
1890 }
1891 last_valid_range--;
1892 }
1893
1894 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001895 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001896 // TODO(plesner): We can remove this when the node level does our
1897 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001898 macro_assembler->GoTo(on_failure);
1899 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001900 if (check_offset) {
1901 macro_assembler->CheckPosition(cp_offset, on_failure);
1902 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001903 return;
1904 }
1905
ager@chromium.org8bb60582008-12-11 12:02:20 +00001906 if (last_valid_range == 0 &&
1907 !cc->is_negated() &&
1908 ranges->at(0).IsEverything(max_char)) {
1909 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001910 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001911 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001912 }
1913 return;
1914 }
1915
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001916 if (!preloaded) {
1917 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001918 }
1919
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001920 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001921 CharacterRange& range = ranges->at(i);
1922 Label next_range;
1923 uc16 from = range.from();
1924 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001925 if (from > max_char) {
1926 continue;
1927 }
1928 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001929 if (to == from) {
1930 macro_assembler->CheckCharacter(to, char_is_in_class);
1931 } else {
1932 if (from != 0) {
1933 macro_assembler->CheckCharacterLT(from, &next_range);
1934 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001935 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001936 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1937 } else {
1938 macro_assembler->GoTo(char_is_in_class);
1939 }
1940 }
1941 macro_assembler->Bind(&next_range);
1942 }
1943
ager@chromium.org8bb60582008-12-11 12:02:20 +00001944 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001945 uc16 from = range.from();
1946 uc16 to = range.to();
1947
ager@chromium.org8bb60582008-12-11 12:02:20 +00001948 if (to > max_char) to = max_char;
1949 ASSERT(to >= from);
1950
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001951 if (to == from) {
1952 if (cc->is_negated()) {
1953 macro_assembler->CheckCharacter(to, on_failure);
1954 } else {
1955 macro_assembler->CheckNotCharacter(to, on_failure);
1956 }
1957 } else {
1958 if (from != 0) {
1959 if (cc->is_negated()) {
1960 macro_assembler->CheckCharacterLT(from, &success);
1961 } else {
1962 macro_assembler->CheckCharacterLT(from, on_failure);
1963 }
1964 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001965 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001966 if (cc->is_negated()) {
1967 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1968 } else {
1969 macro_assembler->CheckCharacterGT(to, on_failure);
1970 }
1971 } else {
1972 if (cc->is_negated()) {
1973 macro_assembler->GoTo(on_failure);
1974 }
1975 }
1976 }
1977 macro_assembler->Bind(&success);
1978}
1979
1980
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001981RegExpNode::~RegExpNode() {
1982}
1983
1984
ager@chromium.org8bb60582008-12-11 12:02:20 +00001985RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001986 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001987 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001988 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001989 return CONTINUE;
1990 }
1991
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001992 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001993 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001994 if (label_.is_bound()) {
1995 // We are being asked to generate a generic version, but that's already
1996 // been done so just go to it.
1997 macro_assembler->GoTo(&label_);
1998 return DONE;
1999 }
2000 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2001 // To avoid too deep recursion we push the node to the work queue and just
2002 // generate a goto here.
2003 compiler->AddWork(this);
2004 macro_assembler->GoTo(&label_);
2005 return DONE;
2006 }
2007 // Generate generic version of the node and bind the label for later use.
2008 macro_assembler->Bind(&label_);
2009 return CONTINUE;
2010 }
2011
2012 // We are being asked to make a non-generic version. Keep track of how many
2013 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00002014 trace_count_++;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002015 if (FLAG_irregexp_optimization &&
2016 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002017 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2018 return CONTINUE;
2019 }
2020
ager@chromium.org32912102009-01-16 10:38:43 +00002021 // If we get here code has been generated for this node too many times or
2022 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00002023 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002024 trace->Flush(compiler, this);
2025 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002026}
2027
2028
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002029int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002030 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2031 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002032 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002033}
2034
2035
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002036int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2037 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2038 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2039}
2040
2041
2042int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2043 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2044 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2045}
2046
2047
2048int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002049 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002050 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002052 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2053 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002054}
2055
2056
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002057int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
2058 int recursion_depth) {
2059 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2060 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2061 // afterwards.
2062 RegExpNode* node = alternatives_->at(1).node();
2063 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
2064}
2065
2066
2067void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2068 QuickCheckDetails* details,
2069 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002070 int filled_in,
2071 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002072 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2073 // afterwards.
2074 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002075 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002076}
2077
2078
2079int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2080 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002081 RegExpNode* ignore_this_node) {
2082 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2083 int min = 100;
2084 int choice_count = alternatives_->length();
2085 for (int i = 0; i < choice_count; i++) {
2086 RegExpNode* node = alternatives_->at(i).node();
2087 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002088 int node_eats_at_least = node->EatsAtLeast(still_to_find,
2089 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002090 if (node_eats_at_least < min) min = node_eats_at_least;
2091 }
2092 return min;
2093}
2094
2095
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002096int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2097 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002098}
2099
2100
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002101int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2102 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002103}
2104
2105
2106// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2107static inline uint32_t SmearBitsRight(uint32_t v) {
2108 v |= v >> 1;
2109 v |= v >> 2;
2110 v |= v >> 4;
2111 v |= v >> 8;
2112 v |= v >> 16;
2113 return v;
2114}
2115
2116
2117bool QuickCheckDetails::Rationalize(bool asc) {
2118 bool found_useful_op = false;
2119 uint32_t char_mask;
2120 if (asc) {
2121 char_mask = String::kMaxAsciiCharCode;
2122 } else {
2123 char_mask = String::kMaxUC16CharCode;
2124 }
2125 mask_ = 0;
2126 value_ = 0;
2127 int char_shift = 0;
2128 for (int i = 0; i < characters_; i++) {
2129 Position* pos = &positions_[i];
2130 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
2131 found_useful_op = true;
2132 }
2133 mask_ |= (pos->mask & char_mask) << char_shift;
2134 value_ |= (pos->value & char_mask) << char_shift;
2135 char_shift += asc ? 8 : 16;
2136 }
2137 return found_useful_op;
2138}
2139
2140
2141bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002142 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002143 bool preload_has_checked_bounds,
2144 Label* on_possible_success,
2145 QuickCheckDetails* details,
2146 bool fall_through_on_failure) {
2147 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002148 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
2149 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002150 if (!details->Rationalize(compiler->ascii())) return false;
2151 uint32_t mask = details->mask();
2152 uint32_t value = details->value();
2153
2154 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2155
ager@chromium.org32912102009-01-16 10:38:43 +00002156 if (trace->characters_preloaded() != details->characters()) {
2157 assembler->LoadCurrentCharacter(trace->cp_offset(),
2158 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002159 !preload_has_checked_bounds,
2160 details->characters());
2161 }
2162
2163
2164 bool need_mask = true;
2165
2166 if (details->characters() == 1) {
2167 // If number of characters preloaded is 1 then we used a byte or 16 bit
2168 // load so the value is already masked down.
2169 uint32_t char_mask;
2170 if (compiler->ascii()) {
2171 char_mask = String::kMaxAsciiCharCode;
2172 } else {
2173 char_mask = String::kMaxUC16CharCode;
2174 }
2175 if ((mask & char_mask) == char_mask) need_mask = false;
2176 mask &= char_mask;
2177 } else {
2178 // For 2-character preloads in ASCII mode we also use a 16 bit load with
2179 // zero extend.
2180 if (details->characters() == 2 && compiler->ascii()) {
2181 if ((mask & 0xffff) == 0xffff) need_mask = false;
2182 } else {
2183 if (mask == 0xffffffff) need_mask = false;
2184 }
2185 }
2186
2187 if (fall_through_on_failure) {
2188 if (need_mask) {
2189 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2190 } else {
2191 assembler->CheckCharacter(value, on_possible_success);
2192 }
2193 } else {
2194 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002195 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002196 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002197 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002198 }
2199 }
2200 return true;
2201}
2202
2203
2204// Here is the meat of GetQuickCheckDetails (see also the comment on the
2205// super-class in the .h file).
2206//
2207// We iterate along the text object, building up for each character a
2208// mask and value that can be used to test for a quick failure to match.
2209// The masks and values for the positions will be combined into a single
2210// machine word for the current character width in order to be used in
2211// generating a quick check.
2212void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2213 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002214 int characters_filled_in,
2215 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002216 ASSERT(characters_filled_in < details->characters());
2217 int characters = details->characters();
2218 int char_mask;
2219 int char_shift;
2220 if (compiler->ascii()) {
2221 char_mask = String::kMaxAsciiCharCode;
2222 char_shift = 8;
2223 } else {
2224 char_mask = String::kMaxUC16CharCode;
2225 char_shift = 16;
2226 }
2227 for (int k = 0; k < elms_->length(); k++) {
2228 TextElement elm = elms_->at(k);
2229 if (elm.type == TextElement::ATOM) {
2230 Vector<const uc16> quarks = elm.data.u_atom->data();
2231 for (int i = 0; i < characters && i < quarks.length(); i++) {
2232 QuickCheckDetails::Position* pos =
2233 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00002234 uc16 c = quarks[i];
2235 if (c > char_mask) {
2236 // If we expect a non-ASCII character from an ASCII string,
2237 // there is no way we can match. Not even case independent
2238 // matching can turn an ASCII character into non-ASCII or
2239 // vice versa.
2240 details->set_cannot_match();
2241 return;
2242 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002243 if (compiler->ignore_case()) {
2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002245 int length = uncanonicalize.get(c, '\0', chars);
2246 if (length < 2) {
2247 // This letter has no case equivalents, so it's nice and simple
2248 // and the mask-compare will determine definitely whether we have
2249 // a match at this character position.
2250 pos->mask = char_mask;
2251 pos->value = c;
2252 pos->determines_perfectly = true;
2253 } else {
2254 uint32_t common_bits = char_mask;
2255 uint32_t bits = chars[0];
2256 for (int j = 1; j < length; j++) {
2257 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2258 common_bits ^= differing_bits;
2259 bits &= common_bits;
2260 }
2261 // If length is 2 and common bits has only one zero in it then
2262 // our mask and compare instruction will determine definitely
2263 // whether we have a match at this character position. Otherwise
2264 // it can only be an approximate check.
2265 uint32_t one_zero = (common_bits | ~char_mask);
2266 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2267 pos->determines_perfectly = true;
2268 }
2269 pos->mask = common_bits;
2270 pos->value = bits;
2271 }
2272 } else {
2273 // Don't ignore case. Nice simple case where the mask-compare will
2274 // determine definitely whether we have a match at this character
2275 // position.
2276 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00002277 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002278 pos->determines_perfectly = true;
2279 }
2280 characters_filled_in++;
2281 ASSERT(characters_filled_in <= details->characters());
2282 if (characters_filled_in == details->characters()) {
2283 return;
2284 }
2285 }
2286 } else {
2287 QuickCheckDetails::Position* pos =
2288 details->positions(characters_filled_in);
2289 RegExpCharacterClass* tree = elm.data.u_char_class;
2290 ZoneList<CharacterRange>* ranges = tree->ranges();
2291 CharacterRange range = ranges->at(0);
2292 if (tree->is_negated()) {
2293 // A quick check uses multi-character mask and compare. There is no
2294 // useful way to incorporate a negative char class into this scheme
2295 // so we just conservatively create a mask and value that will always
2296 // succeed.
2297 pos->mask = 0;
2298 pos->value = 0;
2299 } else {
2300 uint32_t differing_bits = (range.from() ^ range.to());
2301 // A mask and compare is only perfect if the differing bits form a
2302 // number like 00011111 with one single block of trailing 1s.
2303 if ((differing_bits & (differing_bits + 1)) == 0) {
2304 pos->determines_perfectly = true;
2305 }
2306 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2307 uint32_t bits = (range.from() & common_bits);
2308 for (int i = 1; i < ranges->length(); i++) {
2309 // Here we are combining more ranges into the mask and compare
2310 // value. With each new range the mask becomes more sparse and
2311 // so the chances of a false positive rise. A character class
2312 // with multiple ranges is assumed never to be equivalent to a
2313 // mask and compare operation.
2314 pos->determines_perfectly = false;
2315 CharacterRange range = ranges->at(i);
2316 uint32_t new_common_bits = (range.from() ^ range.to());
2317 new_common_bits = ~SmearBitsRight(new_common_bits);
2318 common_bits &= new_common_bits;
2319 bits &= new_common_bits;
2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits;
2321 common_bits ^= differing_bits;
2322 bits &= common_bits;
2323 }
2324 pos->mask = common_bits;
2325 pos->value = bits;
2326 }
2327 characters_filled_in++;
2328 ASSERT(characters_filled_in <= details->characters());
2329 if (characters_filled_in == details->characters()) {
2330 return;
2331 }
2332 }
2333 }
2334 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00002335 on_success()-> GetQuickCheckDetails(details,
2336 compiler,
2337 characters_filled_in,
2338 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002339}
2340
2341
2342void QuickCheckDetails::Clear() {
2343 for (int i = 0; i < characters_; i++) {
2344 positions_[i].mask = 0;
2345 positions_[i].value = 0;
2346 positions_[i].determines_perfectly = false;
2347 }
2348 characters_ = 0;
2349}
2350
2351
2352void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002353 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002354 if (by >= characters_) {
2355 Clear();
2356 return;
2357 }
2358 for (int i = 0; i < characters_ - by; i++) {
2359 positions_[i] = positions_[by + i];
2360 }
2361 for (int i = characters_ - by; i < characters_; i++) {
2362 positions_[i].mask = 0;
2363 positions_[i].value = 0;
2364 positions_[i].determines_perfectly = false;
2365 }
2366 characters_ -= by;
2367 // We could change mask_ and value_ here but we would never advance unless
2368 // they had already been used in a check and they won't be used again because
2369 // it would gain us nothing. So there's no point.
2370}
2371
2372
2373void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2374 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002375 if (other->cannot_match_) {
2376 return;
2377 }
2378 if (cannot_match_) {
2379 *this = *other;
2380 return;
2381 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002382 for (int i = from_index; i < characters_; i++) {
2383 QuickCheckDetails::Position* pos = positions(i);
2384 QuickCheckDetails::Position* other_pos = other->positions(i);
2385 if (pos->mask != other_pos->mask ||
2386 pos->value != other_pos->value ||
2387 !other_pos->determines_perfectly) {
2388 // Our mask-compare operation will be approximate unless we have the
2389 // exact same operation on both sides of the alternation.
2390 pos->determines_perfectly = false;
2391 }
2392 pos->mask &= other_pos->mask;
2393 pos->value &= pos->mask;
2394 other_pos->value &= pos->mask;
2395 uc16 differing_bits = (pos->value ^ other_pos->value);
2396 pos->mask &= ~differing_bits;
2397 pos->value &= pos->mask;
2398 }
2399}
2400
2401
ager@chromium.org32912102009-01-16 10:38:43 +00002402class VisitMarker {
2403 public:
2404 explicit VisitMarker(NodeInfo* info) : info_(info) {
2405 ASSERT(!info->visited);
2406 info->visited = true;
2407 }
2408 ~VisitMarker() {
2409 info_->visited = false;
2410 }
2411 private:
2412 NodeInfo* info_;
2413};
2414
2415
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002416void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2417 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002418 int characters_filled_in,
2419 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002420 if (body_can_be_zero_length_ || info()->visited) return;
2421 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002422 return ChoiceNode::GetQuickCheckDetails(details,
2423 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002424 characters_filled_in,
2425 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002426}
2427
2428
2429void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2430 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002431 int characters_filled_in,
2432 bool not_at_start) {
2433 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002434 int choice_count = alternatives_->length();
2435 ASSERT(choice_count > 0);
2436 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2437 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002438 characters_filled_in,
2439 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002440 for (int i = 1; i < choice_count; i++) {
2441 QuickCheckDetails new_details(details->characters());
2442 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002443 node->GetQuickCheckDetails(&new_details, compiler,
2444 characters_filled_in,
2445 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002446 // Here we merge the quick match details of the two branches.
2447 details->Merge(&new_details, characters_filled_in);
2448 }
2449}
2450
2451
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002452// Check for [0-9A-Z_a-z].
2453static void EmitWordCheck(RegExpMacroAssembler* assembler,
2454 Label* word,
2455 Label* non_word,
2456 bool fall_through_on_word) {
2457 assembler->CheckCharacterGT('z', non_word);
2458 assembler->CheckCharacterLT('0', non_word);
2459 assembler->CheckCharacterGT('a' - 1, word);
2460 assembler->CheckCharacterLT('9' + 1, word);
2461 assembler->CheckCharacterLT('A', non_word);
2462 assembler->CheckCharacterLT('Z' + 1, word);
2463 if (fall_through_on_word) {
2464 assembler->CheckNotCharacter('_', non_word);
2465 } else {
2466 assembler->CheckCharacter('_', word);
2467 }
2468}
2469
2470
2471// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2472// that matches newline or the start of input).
2473static void EmitHat(RegExpCompiler* compiler,
2474 RegExpNode* on_success,
2475 Trace* trace) {
2476 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2477 // We will be loading the previous character into the current character
2478 // register.
2479 Trace new_trace(*trace);
2480 new_trace.InvalidateCurrentCharacter();
2481
2482 Label ok;
2483 if (new_trace.cp_offset() == 0) {
2484 // The start of input counts as a newline in this context, so skip to
2485 // ok if we are at the start.
2486 assembler->CheckAtStart(&ok);
2487 }
2488 // We already checked that we are not at the start of input so it must be
2489 // OK to load the previous character.
2490 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2491 new_trace.backtrack(),
2492 false);
2493 // Newline means \n, \r, 0x2028 or 0x2029.
2494 if (!compiler->ascii()) {
2495 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2496 }
2497 assembler->CheckCharacter('\n', &ok);
2498 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2499 assembler->Bind(&ok);
2500 on_success->Emit(compiler, &new_trace);
2501}
2502
2503
2504// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2505static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2506 RegExpCompiler* compiler,
2507 RegExpNode* on_success,
2508 Trace* trace) {
2509 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2510 Label before_non_word;
2511 Label before_word;
2512 if (trace->characters_preloaded() != 1) {
2513 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2514 }
2515 // Fall through on non-word.
2516 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2517
2518 // We will be loading the previous character into the current character
2519 // register.
2520 Trace new_trace(*trace);
2521 new_trace.InvalidateCurrentCharacter();
2522
2523 Label ok;
2524 Label* boundary;
2525 Label* not_boundary;
2526 if (type == AssertionNode::AT_BOUNDARY) {
2527 boundary = &ok;
2528 not_boundary = new_trace.backtrack();
2529 } else {
2530 not_boundary = &ok;
2531 boundary = new_trace.backtrack();
2532 }
2533
2534 // Next character is not a word character.
2535 assembler->Bind(&before_non_word);
2536 if (new_trace.cp_offset() == 0) {
2537 // The start of input counts as a non-word character, so the question is
2538 // decided if we are at the start.
2539 assembler->CheckAtStart(not_boundary);
2540 }
2541 // We already checked that we are not at the start of input so it must be
2542 // OK to load the previous character.
2543 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2544 &ok, // Unused dummy label in this call.
2545 false);
2546 // Fall through on non-word.
2547 EmitWordCheck(assembler, boundary, not_boundary, false);
2548 assembler->GoTo(not_boundary);
2549
2550 // Next character is a word character.
2551 assembler->Bind(&before_word);
2552 if (new_trace.cp_offset() == 0) {
2553 // The start of input counts as a non-word character, so the question is
2554 // decided if we are at the start.
2555 assembler->CheckAtStart(boundary);
2556 }
2557 // We already checked that we are not at the start of input so it must be
2558 // OK to load the previous character.
2559 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2560 &ok, // Unused dummy label in this call.
2561 false);
2562 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2563 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2564
2565 assembler->Bind(&ok);
2566
2567 on_success->Emit(compiler, &new_trace);
2568}
2569
2570
iposva@chromium.org245aa852009-02-10 00:49:54 +00002571void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2572 RegExpCompiler* compiler,
2573 int filled_in,
2574 bool not_at_start) {
2575 if (type_ == AT_START && not_at_start) {
2576 details->set_cannot_match();
2577 return;
2578 }
2579 return on_success()->GetQuickCheckDetails(details,
2580 compiler,
2581 filled_in,
2582 not_at_start);
2583}
2584
2585
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002586void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2587 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2588 switch (type_) {
2589 case AT_END: {
2590 Label ok;
2591 assembler->CheckPosition(trace->cp_offset(), &ok);
2592 assembler->GoTo(trace->backtrack());
2593 assembler->Bind(&ok);
2594 break;
2595 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002596 case AT_START: {
2597 if (trace->at_start() == Trace::FALSE) {
2598 assembler->GoTo(trace->backtrack());
2599 return;
2600 }
2601 if (trace->at_start() == Trace::UNKNOWN) {
2602 assembler->CheckNotAtStart(trace->backtrack());
2603 Trace at_start_trace = *trace;
2604 at_start_trace.set_at_start(true);
2605 on_success()->Emit(compiler, &at_start_trace);
2606 return;
2607 }
2608 }
2609 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002610 case AFTER_NEWLINE:
2611 EmitHat(compiler, on_success(), trace);
2612 return;
2613 case AT_NON_BOUNDARY:
2614 case AT_BOUNDARY:
2615 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2616 return;
2617 }
2618 on_success()->Emit(compiler, trace);
2619}
2620
2621
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002622// We call this repeatedly to generate code for each pass over the text node.
2623// The passes are in increasing order of difficulty because we hope one
2624// of the first passes will fail in which case we are saved the work of the
2625// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2626// we will check the '%' in the first pass, the case independent 'a' in the
2627// second pass and the character class in the last pass.
2628//
2629// The passes are done from right to left, so for example to test for /bar/
2630// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2631// and then a 'b' with offset 0. This means we can avoid the end-of-input
2632// bounds check most of the time. In the example we only need to check for
2633// end-of-input when loading the putative 'r'.
2634//
2635// A slight complication involves the fact that the first character may already
2636// be fetched into a register by the previous node. In this case we want to
2637// do the test for that character first. We do this in separate passes. The
2638// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2639// pass has been performed then subsequent passes will have true in
2640// first_element_checked to indicate that that character does not need to be
2641// checked again.
2642//
ager@chromium.org32912102009-01-16 10:38:43 +00002643// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002644// contain an AlternativeGeneration object. In this AlternativeGeneration
2645// object we can see details of any quick check that was already passed in
2646// order to get to the code we are now generating. The quick check can involve
2647// loading characters, which means we do not need to recheck the bounds
2648// up to the limit the quick check already checked. In addition the quick
2649// check can have involved a mask and compare operation which may simplify
2650// or obviate the need for further checks at some character positions.
2651void TextNode::TextEmitPass(RegExpCompiler* compiler,
2652 TextEmitPassType pass,
2653 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002654 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002655 bool first_element_checked,
2656 int* checked_up_to) {
2657 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2658 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002659 Label* backtrack = trace->backtrack();
2660 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002661 int element_count = elms_->length();
2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2663 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002664 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002665 if (elm.type == TextElement::ATOM) {
2666 if (pass == NON_ASCII_MATCH ||
2667 pass == CHARACTER_MATCH ||
2668 pass == CASE_CHARACTER_MATCH) {
2669 Vector<const uc16> quarks = elm.data.u_atom->data();
2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2671 bool bound_checked = true; // Most ops will check their bounds.
2672 if (first_element_checked && i == 0 && j == 0) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002673 if (pass == NON_ASCII_MATCH) {
2674 ASSERT(ascii);
2675 if (quarks[j] > String::kMaxAsciiCharCode) {
2676 assembler->GoTo(backtrack);
2677 return;
2678 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002679 } else {
ager@chromium.org6f10e412009-02-13 10:11:16 +00002680 if (quick_check != NULL &&
2681 elm.cp_offset + j < quick_check->characters() &&
2682 quick_check->positions(elm.cp_offset + j)->
2683 determines_perfectly) {
2684 continue;
2685 }
2686 if (pass == CHARACTER_MATCH) {
2687 if (compiler->ignore_case()) {
2688 bound_checked = EmitAtomNonLetter(
2689 assembler,
2690 quarks[j],
2691 backtrack,
2692 cp_offset + j,
2693 *checked_up_to < cp_offset + j,
2694 preloaded);
2695 } else {
2696 if (!preloaded) {
2697 assembler->LoadCurrentCharacter(
2698 cp_offset + j,
2699 backtrack,
2700 *checked_up_to < cp_offset + j);
2701 }
2702 assembler->CheckNotCharacter(quarks[j], backtrack);
2703 }
2704 } else {
2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
2706 ASSERT(compiler->ignore_case());
2707 bound_checked = EmitAtomLetter(assembler,
2708 compiler->ascii(),
2709 quarks[j],
2710 backtrack,
2711 cp_offset + j,
2712 *checked_up_to < cp_offset + j,
2713 preloaded);
2714 }
2715 if (bound_checked) {
2716 if (cp_offset + j > *checked_up_to) {
2717 *checked_up_to = cp_offset + j;
2718 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002719 }
2720 }
2721 }
2722 }
2723 } else {
2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2725 if (first_element_checked && i == 0) continue;
2726 if (quick_check != NULL &&
2727 elm.cp_offset < quick_check->characters() &&
2728 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2729 continue;
2730 }
2731 if (pass == CHARACTER_CLASS_MATCH) {
2732 RegExpCharacterClass* cc = elm.data.u_char_class;
2733 EmitCharClass(assembler,
2734 cc,
2735 cp_offset,
2736 backtrack,
2737 *checked_up_to < cp_offset,
2738 ascii,
2739 preloaded);
2740 if (cp_offset > *checked_up_to) {
2741 *checked_up_to = cp_offset;
2742 }
2743 }
2744 }
2745 }
2746}
2747
2748
2749int TextNode::Length() {
2750 TextElement elm = elms_->last();
2751 ASSERT(elm.cp_offset >= 0);
2752 if (elm.type == TextElement::ATOM) {
2753 return elm.cp_offset + elm.data.u_atom->data().length();
2754 } else {
2755 return elm.cp_offset + 1;
2756 }
2757}
2758
2759
ager@chromium.org8bb60582008-12-11 12:02:20 +00002760// This generates the code to match a text node. A text node can contain
2761// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002762// way) and character classes. For efficiency we do not do this in a single
2763// pass from left to right. Instead we pass over the text node several times,
2764// emitting code for some character positions every time. See the comment on
2765// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002766void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002767 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002768 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002769 ASSERT(limit_result == CONTINUE);
2770
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2772 compiler->SetRegExpTooBig();
2773 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002774 }
2775
2776 if (compiler->ascii()) {
2777 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002779 }
2780
2781 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002782 int bound_checked_to = trace->cp_offset() - 1;
2783 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002784
2785 // If a character is preloaded into the current character register then
2786 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002787 if (trace->characters_preloaded() == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002788 TextEmitPass(compiler,
2789 CHARACTER_MATCH,
2790 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002791 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002792 false,
2793 &bound_checked_to);
2794 if (compiler->ignore_case()) {
2795 TextEmitPass(compiler,
2796 CASE_CHARACTER_MATCH,
2797 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002798 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002799 false,
2800 &bound_checked_to);
2801 }
2802 TextEmitPass(compiler,
2803 CHARACTER_CLASS_MATCH,
2804 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002805 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002806 false,
2807 &bound_checked_to);
2808 first_elt_done = true;
2809 }
2810
2811 TextEmitPass(compiler,
2812 CHARACTER_MATCH,
2813 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002814 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002815 first_elt_done,
2816 &bound_checked_to);
2817 if (compiler->ignore_case()) {
2818 TextEmitPass(compiler,
2819 CASE_CHARACTER_MATCH,
2820 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002821 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002822 first_elt_done,
2823 &bound_checked_to);
2824 }
2825 TextEmitPass(compiler,
2826 CHARACTER_CLASS_MATCH,
2827 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002828 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002829 first_elt_done,
2830 &bound_checked_to);
2831
ager@chromium.org32912102009-01-16 10:38:43 +00002832 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002833 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002835 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002836 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002837}
2838
2839
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002840void Trace::InvalidateCurrentCharacter() {
2841 characters_preloaded_ = 0;
2842}
2843
2844
2845void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002846 ASSERT(by > 0);
2847 // We don't have an instruction for shifting the current character register
2848 // down or for using a shifted value for anything so lets just forget that
2849 // we preloaded any characters into it.
2850 characters_preloaded_ = 0;
2851 // Adjust the offsets of the quick check performed information. This
2852 // information is used to find out what we already determined about the
2853 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002854 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002855 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002856 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2857 compiler->SetRegExpTooBig();
2858 cp_offset_ = 0;
2859 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002860 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002861}
2862
2863
2864void TextNode::MakeCaseIndependent() {
2865 int element_count = elms_->length();
2866 for (int i = 0; i < element_count; i++) {
2867 TextElement elm = elms_->at(i);
2868 if (elm.type == TextElement::CHAR_CLASS) {
2869 RegExpCharacterClass* cc = elm.data.u_char_class;
2870 ZoneList<CharacterRange>* ranges = cc->ranges();
2871 int range_count = ranges->length();
2872 for (int i = 0; i < range_count; i++) {
2873 ranges->at(i).AddCaseEquivalents(ranges);
2874 }
2875 }
2876 }
2877}
2878
2879
ager@chromium.org8bb60582008-12-11 12:02:20 +00002880int TextNode::GreedyLoopTextLength() {
2881 TextElement elm = elms_->at(elms_->length() - 1);
2882 if (elm.type == TextElement::CHAR_CLASS) {
2883 return elm.cp_offset + 1;
2884 } else {
2885 return elm.cp_offset + elm.data.u_atom->data().length();
2886 }
2887}
2888
2889
2890// Finds the fixed match length of a sequence of nodes that goes from
2891// this alternative and back to this choice node. If there are variable
2892// length nodes or other complications in the way then return a sentinel
2893// value indicating that a greedy loop cannot be constructed.
2894int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2895 int length = 0;
2896 RegExpNode* node = alternative->node();
2897 // Later we will generate code for all these text nodes using recursion
2898 // so we have to limit the max number.
2899 int recursion_depth = 0;
2900 while (node != this) {
2901 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2902 return kNodeIsTooComplexForGreedyLoops;
2903 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002904 int node_length = node->GreedyLoopTextLength();
2905 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2906 return kNodeIsTooComplexForGreedyLoops;
2907 }
2908 length += node_length;
2909 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2910 node = seq_node->on_success();
2911 }
2912 return length;
2913}
2914
2915
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002916void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2917 ASSERT_EQ(loop_node_, NULL);
2918 AddAlternative(alt);
2919 loop_node_ = alt.node();
2920}
2921
2922
2923void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2924 ASSERT_EQ(continue_node_, NULL);
2925 AddAlternative(alt);
2926 continue_node_ = alt.node();
2927}
2928
2929
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002930void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002931 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002932 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002933 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2934 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2935 // Update the counter-based backtracking info on the stack. This is an
2936 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002937 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002938 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002939 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002940 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002941 }
ager@chromium.org32912102009-01-16 10:38:43 +00002942 ASSERT(trace->stop_node() == NULL);
2943 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002944 trace->Flush(compiler, this);
2945 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002946 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002947 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002948}
2949
2950
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002951int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002952 int preload_characters = EatsAtLeast(4, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002953#ifdef CAN_READ_UNALIGNED
2954 bool ascii = compiler->ascii();
2955 if (ascii) {
2956 if (preload_characters > 4) preload_characters = 4;
2957 // We can't preload 3 characters because there is no machine instruction
2958 // to do that. We can't just load 4 because we could be reading
2959 // beyond the end of the string, which could cause a memory fault.
2960 if (preload_characters == 3) preload_characters = 2;
2961 } else {
2962 if (preload_characters > 2) preload_characters = 2;
2963 }
2964#else
2965 if (preload_characters > 1) preload_characters = 1;
2966#endif
2967 return preload_characters;
2968}
2969
2970
2971// This class is used when generating the alternatives in a choice node. It
2972// records the way the alternative is being code generated.
2973class AlternativeGeneration: public Malloced {
2974 public:
2975 AlternativeGeneration()
2976 : possible_success(),
2977 expects_preload(false),
2978 after(),
2979 quick_check_details() { }
2980 Label possible_success;
2981 bool expects_preload;
2982 Label after;
2983 QuickCheckDetails quick_check_details;
2984};
2985
2986
2987// Creates a list of AlternativeGenerations. If the list has a reasonable
2988// size then it is on the stack, otherwise the excess is on the heap.
2989class AlternativeGenerationList {
2990 public:
2991 explicit AlternativeGenerationList(int count)
2992 : alt_gens_(count) {
2993 for (int i = 0; i < count && i < kAFew; i++) {
2994 alt_gens_.Add(a_few_alt_gens_ + i);
2995 }
2996 for (int i = kAFew; i < count; i++) {
2997 alt_gens_.Add(new AlternativeGeneration());
2998 }
2999 }
3000 ~AlternativeGenerationList() {
3001 for (int i = 0; i < alt_gens_.length(); i++) {
3002 alt_gens_[i]->possible_success.Unuse();
3003 alt_gens_[i]->after.Unuse();
3004 }
3005 for (int i = kAFew; i < alt_gens_.length(); i++) {
3006 delete alt_gens_[i];
3007 alt_gens_[i] = NULL;
3008 }
3009 }
3010
3011 AlternativeGeneration* at(int i) {
3012 return alt_gens_[i];
3013 }
3014 private:
3015 static const int kAFew = 10;
3016 ZoneList<AlternativeGeneration*> alt_gens_;
3017 AlternativeGeneration a_few_alt_gens_[kAFew];
3018};
3019
3020
3021/* Code generation for choice nodes.
3022 *
3023 * We generate quick checks that do a mask and compare to eliminate a
3024 * choice. If the quick check succeeds then it jumps to the continuation to
3025 * do slow checks and check subsequent nodes. If it fails (the common case)
3026 * it falls through to the next choice.
3027 *
3028 * Here is the desired flow graph. Nodes directly below each other imply
3029 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3030 * 3 doesn't have a quick check so we have to call the slow check.
3031 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3032 * regexp continuation is generated directly after the Sn node, up to the
3033 * next GoTo if we decide to reuse some already generated code. Some
3034 * nodes expect preload_characters to be preloaded into the current
3035 * character register. R nodes do this preloading. Vertices are marked
3036 * F for failures and S for success (possible success in the case of quick
3037 * nodes). L, V, < and > are used as arrow heads.
3038 *
3039 * ----------> R
3040 * |
3041 * V
3042 * Q1 -----> S1
3043 * | S /
3044 * F| /
3045 * | F/
3046 * | /
3047 * | R
3048 * | /
3049 * V L
3050 * Q2 -----> S2
3051 * | S /
3052 * F| /
3053 * | F/
3054 * | /
3055 * | R
3056 * | /
3057 * V L
3058 * S3
3059 * |
3060 * F|
3061 * |
3062 * R
3063 * |
3064 * backtrack V
3065 * <----------Q4
3066 * \ F |
3067 * \ |S
3068 * \ F V
3069 * \-----S4
3070 *
3071 * For greedy loops we reverse our expectation and expect to match rather
3072 * than fail. Therefore we want the loop code to look like this (U is the
3073 * unwind code that steps back in the greedy loop). The following alternatives
3074 * look the same as above.
3075 * _____
3076 * / \
3077 * V |
3078 * ----------> S1 |
3079 * /| |
3080 * / |S |
3081 * F/ \_____/
3082 * /
3083 * |<-----------
3084 * | \
3085 * V \
3086 * Q2 ---> S2 \
3087 * | S / |
3088 * F| / |
3089 * | F/ |
3090 * | / |
3091 * | R |
3092 * | / |
3093 * F VL |
3094 * <------U |
3095 * back |S |
3096 * \______________/
3097 */
3098
3099
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003100void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003101 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3102 int choice_count = alternatives_->length();
3103#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003104 for (int i = 0; i < choice_count - 1; i++) {
3105 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003106 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003107 int guard_count = (guards == NULL) ? 0 : guards->length();
3108 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003109 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003110 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003111 }
3112#endif
3113
ager@chromium.org32912102009-01-16 10:38:43 +00003114 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003115 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003116 ASSERT(limit_result == CONTINUE);
3117
3118 RecursionCheck rc(compiler);
3119
ager@chromium.org32912102009-01-16 10:38:43 +00003120 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003121
3122 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
3123 bool greedy_loop = false;
3124 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00003125 Trace counter_backtrack_trace;
3126 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003127 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3128
ager@chromium.org8bb60582008-12-11 12:02:20 +00003129 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3130 // Here we have special handling for greedy loops containing only text nodes
3131 // and other simple nodes. These are handled by pushing the current
3132 // position on the stack and then incrementing the current position each
3133 // time around the switch. On backtrack we decrement the current position
3134 // and check it against the pushed value. This avoids pushing backtrack
3135 // information for each iteration of the loop, which could take up a lot of
3136 // space.
3137 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00003138 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003139 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00003140 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003141 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00003142 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003143 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00003144 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003145 Label loop_label;
3146 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00003147 greedy_match_trace.set_stop_node(this);
3148 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003149 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003150 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003151 }
3152
3153 Label second_choice; // For use in greedy matches.
3154 macro_assembler->Bind(&second_choice);
3155
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003156 int first_normal_choice = greedy_loop ? 1 : 0;
3157
3158 int preload_characters = CalculatePreloadCharacters(compiler);
3159 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00003160 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003161 bool preload_has_checked_bounds = preload_is_current;
3162
3163 AlternativeGenerationList alt_gens(choice_count);
3164
ager@chromium.org8bb60582008-12-11 12:02:20 +00003165 // For now we just call all choices one after the other. The idea ultimately
3166 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003167 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003168 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00003169 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003170 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003171 ZoneList<Guard*>* guards = alternative.guards();
3172 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00003173 Trace new_trace(*current_trace);
3174 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003175 preload_characters :
3176 0);
3177 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00003178 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003179 }
ager@chromium.org32912102009-01-16 10:38:43 +00003180 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00003181 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003182 alt_gen->expects_preload = preload_is_current;
3183 bool generate_full_check_inline = false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003184 if (FLAG_irregexp_optimization &&
3185 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003186 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003187 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003188 preload_has_checked_bounds,
3189 &alt_gen->possible_success,
3190 &alt_gen->quick_check_details,
3191 i < choice_count - 1)) {
3192 // Quick check was generated for this choice.
3193 preload_is_current = true;
3194 preload_has_checked_bounds = true;
3195 // On the last choice in the ChoiceNode we generated the quick
3196 // check to fall through on possible success. So now we need to
3197 // generate the full check inline.
3198 if (i == choice_count - 1) {
3199 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003200 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3201 new_trace.set_characters_preloaded(preload_characters);
3202 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003203 generate_full_check_inline = true;
3204 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00003205 } else if (alt_gen->quick_check_details.cannot_match()) {
3206 if (i == choice_count - 1 && !greedy_loop) {
3207 macro_assembler->GoTo(trace->backtrack());
3208 }
3209 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003210 } else {
3211 // No quick check was generated. Put the full code here.
3212 // If this is not the first choice then there could be slow checks from
3213 // previous cases that go here when they fail. There's no reason to
3214 // insist that they preload characters since the slow check we are about
3215 // to generate probably can't use it.
3216 if (i != first_normal_choice) {
3217 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00003218 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003219 }
3220 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00003221 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003222 }
3223 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003224 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003225 if (generate_full_check_inline) {
3226 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003227 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003228 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003229 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003230 preload_is_current = false;
3231 }
3232 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003233 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003234 if (greedy_loop) {
3235 macro_assembler->Bind(&greedy_loop_label);
3236 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00003237 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00003238 // Otherwise try the second priority at an earlier position.
3239 macro_assembler->AdvanceCurrentPosition(-text_length);
3240 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003241 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003242 // At this point we need to generate slow checks for the alternatives where
3243 // the quick check was inlined. We can recognize these because the associated
3244 // label was bound.
3245 for (int i = first_normal_choice; i < choice_count - 1; i++) {
3246 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003247 EmitOutOfLineContinuation(compiler,
3248 current_trace,
3249 alternatives_->at(i),
3250 alt_gen,
3251 preload_characters,
3252 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003253 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003254}
3255
3256
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003257void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003258 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003259 GuardedAlternative alternative,
3260 AlternativeGeneration* alt_gen,
3261 int preload_characters,
3262 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003263 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003264
3265 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3266 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003267 Trace out_of_line_trace(*trace);
3268 out_of_line_trace.set_characters_preloaded(preload_characters);
3269 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003270 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003271 ZoneList<Guard*>* guards = alternative.guards();
3272 int guard_count = (guards == NULL) ? 0 : guards->length();
3273 if (next_expects_preload) {
3274 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00003275 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003276 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003277 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003278 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003279 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003280 macro_assembler->Bind(&reload_current_char);
3281 // Reload the current character, since the next quick check expects that.
3282 // We don't need to check bounds here because we only get into this
3283 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00003284 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003285 NULL,
3286 false,
3287 preload_characters);
3288 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003289 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003290 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003291 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003292 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003293 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003294 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003295 }
3296}
3297
3298
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003299void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003300 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003301 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003302 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003303 ASSERT(limit_result == CONTINUE);
3304
3305 RecursionCheck rc(compiler);
3306
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003307 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003308 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00003309 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003310 new_capture(data_.u_position_register.reg,
3311 data_.u_position_register.is_capture,
3312 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003313 Trace new_trace = *trace;
3314 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003315 on_success()->Emit(compiler, &new_trace);
3316 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003317 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003318 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003319 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003320 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00003321 Trace new_trace = *trace;
3322 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003323 on_success()->Emit(compiler, &new_trace);
3324 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003325 }
3326 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003327 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003328 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00003329 Trace new_trace = *trace;
3330 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003331 on_success()->Emit(compiler, &new_trace);
3332 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003333 }
3334 case CLEAR_CAPTURES: {
3335 Trace::DeferredClearCaptures
3336 new_capture(Interval(data_.u_clear_captures.range_from,
3337 data_.u_clear_captures.range_to));
3338 Trace new_trace = *trace;
3339 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003340 on_success()->Emit(compiler, &new_trace);
3341 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003342 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003343 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003344 if (!trace->is_trivial()) {
3345 trace->Flush(compiler, this);
3346 } else {
3347 assembler->WriteCurrentPositionToRegister(
3348 data_.u_submatch.current_position_register, 0);
3349 assembler->WriteStackPointerToRegister(
3350 data_.u_submatch.stack_pointer_register);
3351 on_success()->Emit(compiler, trace);
3352 }
3353 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003354 case EMPTY_MATCH_CHECK: {
3355 int start_pos_reg = data_.u_empty_match_check.start_register;
3356 int stored_pos = 0;
3357 int rep_reg = data_.u_empty_match_check.repetition_register;
3358 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3359 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3360 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3361 // If we know we haven't advanced and there is no minimum we
3362 // can just backtrack immediately.
3363 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00003364 } else if (know_dist && stored_pos < trace->cp_offset()) {
3365 // If we know we've advanced we can generate the continuation
3366 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003367 on_success()->Emit(compiler, trace);
3368 } else if (!trace->is_trivial()) {
3369 trace->Flush(compiler, this);
3370 } else {
3371 Label skip_empty_check;
3372 // If we have a minimum number of repetitions we check the current
3373 // number first and skip the empty check if it's not enough.
3374 if (has_minimum) {
3375 int limit = data_.u_empty_match_check.repetition_limit;
3376 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3377 }
3378 // If the match is empty we bail out, otherwise we fall through
3379 // to the on-success continuation.
3380 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3381 trace->backtrack());
3382 assembler->Bind(&skip_empty_check);
3383 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003384 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003385 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003386 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003387 case POSITIVE_SUBMATCH_SUCCESS: {
3388 if (!trace->is_trivial()) {
3389 trace->Flush(compiler, this);
3390 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003391 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003392 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003393 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003394 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003395 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003396 int clear_register_count = data_.u_submatch.clear_register_count;
3397 if (clear_register_count == 0) {
3398 on_success()->Emit(compiler, trace);
3399 return;
3400 }
3401 int clear_registers_from = data_.u_submatch.clear_register_from;
3402 Label clear_registers_backtrack;
3403 Trace new_trace = *trace;
3404 new_trace.set_backtrack(&clear_registers_backtrack);
3405 on_success()->Emit(compiler, &new_trace);
3406
3407 assembler->Bind(&clear_registers_backtrack);
3408 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3409 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3410
3411 ASSERT(trace->backtrack() == NULL);
3412 assembler->Backtrack();
3413 return;
3414 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003415 default:
3416 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003417 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003418}
3419
3420
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003421void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003422 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003423 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003424 trace->Flush(compiler, this);
3425 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003426 }
3427
ager@chromium.org32912102009-01-16 10:38:43 +00003428 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003429 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003430 ASSERT(limit_result == CONTINUE);
3431
3432 RecursionCheck rc(compiler);
3433
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003434 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003435 if (compiler->ignore_case()) {
3436 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3437 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003438 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003439 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003440 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003441 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003442}
3443
3444
3445// -------------------------------------------------------------------
3446// Dot/dotty output
3447
3448
3449#ifdef DEBUG
3450
3451
3452class DotPrinter: public NodeVisitor {
3453 public:
3454 explicit DotPrinter(bool ignore_case)
3455 : ignore_case_(ignore_case),
3456 stream_(&alloc_) { }
3457 void PrintNode(const char* label, RegExpNode* node);
3458 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003459 void PrintAttributes(RegExpNode* from);
3460 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003461 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003462#define DECLARE_VISIT(Type) \
3463 virtual void Visit##Type(Type##Node* that);
3464FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3465#undef DECLARE_VISIT
3466 private:
3467 bool ignore_case_;
3468 HeapStringAllocator alloc_;
3469 StringStream stream_;
3470};
3471
3472
3473void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3474 stream()->Add("digraph G {\n graph [label=\"");
3475 for (int i = 0; label[i]; i++) {
3476 switch (label[i]) {
3477 case '\\':
3478 stream()->Add("\\\\");
3479 break;
3480 case '"':
3481 stream()->Add("\"");
3482 break;
3483 default:
3484 stream()->Put(label[i]);
3485 break;
3486 }
3487 }
3488 stream()->Add("\"];\n");
3489 Visit(node);
3490 stream()->Add("}\n");
3491 printf("%s", *(stream()->ToCString()));
3492}
3493
3494
3495void DotPrinter::Visit(RegExpNode* node) {
3496 if (node->info()->visited) return;
3497 node->info()->visited = true;
3498 node->Accept(this);
3499}
3500
3501
3502void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003503 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3504 Visit(on_failure);
3505}
3506
3507
3508class TableEntryBodyPrinter {
3509 public:
3510 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3511 : stream_(stream), choice_(choice) { }
3512 void Call(uc16 from, DispatchTable::Entry entry) {
3513 OutSet* out_set = entry.out_set();
3514 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3515 if (out_set->Get(i)) {
3516 stream()->Add(" n%p:s%io%i -> n%p;\n",
3517 choice(),
3518 from,
3519 i,
3520 choice()->alternatives()->at(i).node());
3521 }
3522 }
3523 }
3524 private:
3525 StringStream* stream() { return stream_; }
3526 ChoiceNode* choice() { return choice_; }
3527 StringStream* stream_;
3528 ChoiceNode* choice_;
3529};
3530
3531
3532class TableEntryHeaderPrinter {
3533 public:
3534 explicit TableEntryHeaderPrinter(StringStream* stream)
3535 : first_(true), stream_(stream) { }
3536 void Call(uc16 from, DispatchTable::Entry entry) {
3537 if (first_) {
3538 first_ = false;
3539 } else {
3540 stream()->Add("|");
3541 }
3542 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3543 OutSet* out_set = entry.out_set();
3544 int priority = 0;
3545 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3546 if (out_set->Get(i)) {
3547 if (priority > 0) stream()->Add("|");
3548 stream()->Add("<s%io%i> %i", from, i, priority);
3549 priority++;
3550 }
3551 }
3552 stream()->Add("}}");
3553 }
3554 private:
3555 bool first_;
3556 StringStream* stream() { return stream_; }
3557 StringStream* stream_;
3558};
3559
3560
3561class AttributePrinter {
3562 public:
3563 explicit AttributePrinter(DotPrinter* out)
3564 : out_(out), first_(true) { }
3565 void PrintSeparator() {
3566 if (first_) {
3567 first_ = false;
3568 } else {
3569 out_->stream()->Add("|");
3570 }
3571 }
3572 void PrintBit(const char* name, bool value) {
3573 if (!value) return;
3574 PrintSeparator();
3575 out_->stream()->Add("{%s}", name);
3576 }
3577 void PrintPositive(const char* name, int value) {
3578 if (value < 0) return;
3579 PrintSeparator();
3580 out_->stream()->Add("{%s|%x}", name, value);
3581 }
3582 private:
3583 DotPrinter* out_;
3584 bool first_;
3585};
3586
3587
3588void DotPrinter::PrintAttributes(RegExpNode* that) {
3589 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3590 "margin=0.1, fontsize=10, label=\"{",
3591 that);
3592 AttributePrinter printer(this);
3593 NodeInfo* info = that->info();
3594 printer.PrintBit("NI", info->follows_newline_interest);
3595 printer.PrintBit("WI", info->follows_word_interest);
3596 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003597 Label* label = that->label();
3598 if (label->is_bound())
3599 printer.PrintPositive("@", label->pos());
3600 stream()->Add("}\"];\n");
3601 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3602 "arrowhead=none];\n", that, that);
3603}
3604
3605
3606static const bool kPrintDispatchTable = false;
3607void DotPrinter::VisitChoice(ChoiceNode* that) {
3608 if (kPrintDispatchTable) {
3609 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3610 TableEntryHeaderPrinter header_printer(stream());
3611 that->GetTable(ignore_case_)->ForEach(&header_printer);
3612 stream()->Add("\"]\n", that);
3613 PrintAttributes(that);
3614 TableEntryBodyPrinter body_printer(stream(), that);
3615 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003616 } else {
3617 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3618 for (int i = 0; i < that->alternatives()->length(); i++) {
3619 GuardedAlternative alt = that->alternatives()->at(i);
3620 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3621 }
3622 }
3623 for (int i = 0; i < that->alternatives()->length(); i++) {
3624 GuardedAlternative alt = that->alternatives()->at(i);
3625 alt.node()->Accept(this);
3626 }
3627}
3628
3629
3630void DotPrinter::VisitText(TextNode* that) {
3631 stream()->Add(" n%p [label=\"", that);
3632 for (int i = 0; i < that->elements()->length(); i++) {
3633 if (i > 0) stream()->Add(" ");
3634 TextElement elm = that->elements()->at(i);
3635 switch (elm.type) {
3636 case TextElement::ATOM: {
3637 stream()->Add("'%w'", elm.data.u_atom->data());
3638 break;
3639 }
3640 case TextElement::CHAR_CLASS: {
3641 RegExpCharacterClass* node = elm.data.u_char_class;
3642 stream()->Add("[");
3643 if (node->is_negated())
3644 stream()->Add("^");
3645 for (int j = 0; j < node->ranges()->length(); j++) {
3646 CharacterRange range = node->ranges()->at(j);
3647 stream()->Add("%k-%k", range.from(), range.to());
3648 }
3649 stream()->Add("]");
3650 break;
3651 }
3652 default:
3653 UNREACHABLE();
3654 }
3655 }
3656 stream()->Add("\", shape=box, peripheries=2];\n");
3657 PrintAttributes(that);
3658 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3659 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003660}
3661
3662
3663void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3664 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3665 that,
3666 that->start_register(),
3667 that->end_register());
3668 PrintAttributes(that);
3669 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3670 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003671}
3672
3673
3674void DotPrinter::VisitEnd(EndNode* that) {
3675 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3676 PrintAttributes(that);
3677}
3678
3679
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003680void DotPrinter::VisitAssertion(AssertionNode* that) {
3681 stream()->Add(" n%p [", that);
3682 switch (that->type()) {
3683 case AssertionNode::AT_END:
3684 stream()->Add("label=\"$\", shape=septagon");
3685 break;
3686 case AssertionNode::AT_START:
3687 stream()->Add("label=\"^\", shape=septagon");
3688 break;
3689 case AssertionNode::AT_BOUNDARY:
3690 stream()->Add("label=\"\\b\", shape=septagon");
3691 break;
3692 case AssertionNode::AT_NON_BOUNDARY:
3693 stream()->Add("label=\"\\B\", shape=septagon");
3694 break;
3695 case AssertionNode::AFTER_NEWLINE:
3696 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3697 break;
3698 }
3699 stream()->Add("];\n");
3700 PrintAttributes(that);
3701 RegExpNode* successor = that->on_success();
3702 stream()->Add(" n%p -> n%p;\n", that, successor);
3703 Visit(successor);
3704}
3705
3706
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003707void DotPrinter::VisitAction(ActionNode* that) {
3708 stream()->Add(" n%p [", that);
3709 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003710 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003711 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3712 that->data_.u_store_register.reg,
3713 that->data_.u_store_register.value);
3714 break;
3715 case ActionNode::INCREMENT_REGISTER:
3716 stream()->Add("label=\"$%i++\", shape=octagon",
3717 that->data_.u_increment_register.reg);
3718 break;
3719 case ActionNode::STORE_POSITION:
3720 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3721 that->data_.u_position_register.reg);
3722 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003723 case ActionNode::BEGIN_SUBMATCH:
3724 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3725 that->data_.u_submatch.current_position_register);
3726 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003727 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003728 stream()->Add("label=\"escape\", shape=septagon");
3729 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003730 case ActionNode::EMPTY_MATCH_CHECK:
3731 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3732 that->data_.u_empty_match_check.start_register,
3733 that->data_.u_empty_match_check.repetition_register,
3734 that->data_.u_empty_match_check.repetition_limit);
3735 break;
3736 case ActionNode::CLEAR_CAPTURES: {
3737 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3738 that->data_.u_clear_captures.range_from,
3739 that->data_.u_clear_captures.range_to);
3740 break;
3741 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003742 }
3743 stream()->Add("];\n");
3744 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003745 RegExpNode* successor = that->on_success();
3746 stream()->Add(" n%p -> n%p;\n", that, successor);
3747 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003748}
3749
3750
3751class DispatchTableDumper {
3752 public:
3753 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3754 void Call(uc16 key, DispatchTable::Entry entry);
3755 StringStream* stream() { return stream_; }
3756 private:
3757 StringStream* stream_;
3758};
3759
3760
3761void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3762 stream()->Add("[%k-%k]: {", key, entry.to());
3763 OutSet* set = entry.out_set();
3764 bool first = true;
3765 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3766 if (set->Get(i)) {
3767 if (first) {
3768 first = false;
3769 } else {
3770 stream()->Add(", ");
3771 }
3772 stream()->Add("%i", i);
3773 }
3774 }
3775 stream()->Add("}\n");
3776}
3777
3778
3779void DispatchTable::Dump() {
3780 HeapStringAllocator alloc;
3781 StringStream stream(&alloc);
3782 DispatchTableDumper dumper(&stream);
3783 tree()->ForEach(&dumper);
3784 OS::PrintError("%s", *stream.ToCString());
3785}
3786
3787
3788void RegExpEngine::DotPrint(const char* label,
3789 RegExpNode* node,
3790 bool ignore_case) {
3791 DotPrinter printer(ignore_case);
3792 printer.PrintNode(label, node);
3793}
3794
3795
3796#endif // DEBUG
3797
3798
3799// -------------------------------------------------------------------
3800// Tree to graph conversion
3801
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003802static const int kSpaceRangeCount = 20;
3803static const int kSpaceRangeAsciiCount = 4;
3804static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3805 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3806 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3807
3808static const int kWordRangeCount = 8;
3809static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3810 '_', 'a', 'z' };
3811
3812static const int kDigitRangeCount = 2;
3813static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3814
3815static const int kLineTerminatorRangeCount = 6;
3816static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3817 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003818
3819RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003820 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003821 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3822 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003823 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003824}
3825
3826
3827RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003828 RegExpNode* on_success) {
3829 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003830}
3831
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003832static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3833 const uc16* special_class,
3834 int length) {
3835 ASSERT(ranges->length() != 0);
3836 ASSERT(length != 0);
3837 ASSERT(special_class[0] != 0);
3838 if (ranges->length() != (length >> 1) + 1) {
3839 return false;
3840 }
3841 CharacterRange range = ranges->at(0);
3842 if (range.from() != 0) {
3843 return false;
3844 }
3845 for (int i = 0; i < length; i += 2) {
3846 if (special_class[i] != (range.to() + 1)) {
3847 return false;
3848 }
3849 range = ranges->at((i >> 1) + 1);
3850 if (special_class[i+1] != range.from() - 1) {
3851 return false;
3852 }
3853 }
3854 if (range.to() != 0xffff) {
3855 return false;
3856 }
3857 return true;
3858}
3859
3860
3861static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3862 const uc16* special_class,
3863 int length) {
3864 if (ranges->length() * 2 != length) {
3865 return false;
3866 }
3867 for (int i = 0; i < length; i += 2) {
3868 CharacterRange range = ranges->at(i >> 1);
3869 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3870 return false;
3871 }
3872 }
3873 return true;
3874}
3875
3876
3877bool RegExpCharacterClass::is_standard() {
3878 // TODO(lrn): Remove need for this function, by not throwing away information
3879 // along the way.
3880 if (is_negated_) {
3881 return false;
3882 }
3883 if (set_.is_standard()) {
3884 return true;
3885 }
3886 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3887 set_.set_standard_set_type('s');
3888 return true;
3889 }
3890 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3891 set_.set_standard_set_type('S');
3892 return true;
3893 }
3894 if (CompareInverseRanges(set_.ranges(),
3895 kLineTerminatorRanges,
3896 kLineTerminatorRangeCount)) {
3897 set_.set_standard_set_type('.');
3898 return true;
3899 }
3900 return false;
3901}
3902
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003903
3904RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003905 RegExpNode* on_success) {
3906 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003907}
3908
3909
3910RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003911 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003912 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3913 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003914 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003915 for (int i = 0; i < length; i++) {
3916 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003917 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003918 result->AddAlternative(alternative);
3919 }
3920 return result;
3921}
3922
3923
3924RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003925 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003926 return ToNode(min(),
3927 max(),
3928 is_greedy(),
3929 body(),
3930 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003931 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003932}
3933
3934
3935RegExpNode* RegExpQuantifier::ToNode(int min,
3936 int max,
3937 bool is_greedy,
3938 RegExpTree* body,
3939 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003940 RegExpNode* on_success,
3941 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003942 // x{f, t} becomes this:
3943 //
3944 // (r++)<-.
3945 // | `
3946 // | (x)
3947 // v ^
3948 // (r=0)-->(?)---/ [if r < t]
3949 // |
3950 // [if r >= f] \----> ...
3951 //
3952 //
3953 // TODO(someone): clear captures on repetition and handle empty
3954 // matches.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003955
3956 // 15.10.2.5 RepeatMatcher algorithm.
3957 // The parser has already eliminated the case where max is 0. In the case
3958 // where max_match is zero the parser has removed the quantifier if min was
3959 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3960
3961 // If we know that we cannot match zero length then things are a little
3962 // simpler since we don't need to make the special zero length match check
3963 // from step 2.1. If the min and max are small we can unroll a little in
3964 // this case.
3965 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3966 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3967 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003968 bool body_can_be_empty = (body->min_match() == 0);
3969 int body_start_reg = RegExpCompiler::kNoRegister;
3970 Interval capture_registers = body->CaptureRegisters();
3971 bool needs_capture_clearing = !capture_registers.is_empty();
3972 if (body_can_be_empty) {
3973 body_start_reg = compiler->AllocateRegister();
iposva@chromium.org245aa852009-02-10 00:49:54 +00003974 } else if (FLAG_irregexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003975 // Only unroll if there are no captures and the body can't be
3976 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003977 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3978 int new_max = (max == kInfinity) ? max : max - min;
3979 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003980 RegExpNode* answer = ToNode(
3981 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003982 // Unroll the forced matches from 0 to min. This can cause chains of
3983 // TextNodes (which the parser does not generate). These should be
3984 // combined if it turns out they hinder good code generation.
3985 for (int i = 0; i < min; i++) {
3986 answer = body->ToNode(compiler, answer);
3987 }
3988 return answer;
3989 }
3990 if (max <= kMaxUnrolledMaxMatches) {
3991 ASSERT(min == 0);
3992 // Unroll the optional matches up to max.
3993 RegExpNode* answer = on_success;
3994 for (int i = 0; i < max; i++) {
3995 ChoiceNode* alternation = new ChoiceNode(2);
3996 if (is_greedy) {
3997 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3998 answer)));
3999 alternation->AddAlternative(GuardedAlternative(on_success));
4000 } else {
4001 alternation->AddAlternative(GuardedAlternative(on_success));
4002 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
4003 answer)));
4004 }
4005 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00004006 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004007 }
4008 return answer;
4009 }
4010 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004011 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004012 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004013 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00004014 int reg_ctr = needs_counter
4015 ? compiler->AllocateRegister()
4016 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004017 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00004018 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004019 RegExpNode* loop_return = needs_counter
4020 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4021 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00004022 if (body_can_be_empty) {
4023 // If the body can be empty we need to check if it was and then
4024 // backtrack.
4025 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4026 reg_ctr,
4027 min,
4028 loop_return);
4029 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004030 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00004031 if (body_can_be_empty) {
4032 // If the body can be empty we need to store the start position
4033 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004034 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00004035 }
4036 if (needs_capture_clearing) {
4037 // Before entering the body of this loop we need to clear captures.
4038 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4039 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004040 GuardedAlternative body_alt(body_node);
4041 if (has_max) {
4042 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
4043 body_alt.AddGuard(body_guard);
4044 }
4045 GuardedAlternative rest_alt(on_success);
4046 if (has_min) {
4047 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
4048 rest_alt.AddGuard(rest_guard);
4049 }
4050 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004051 center->AddLoopAlternative(body_alt);
4052 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004053 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004054 center->AddContinueAlternative(rest_alt);
4055 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004056 }
4057 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004058 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004059 } else {
4060 return center;
4061 }
4062}
4063
4064
4065RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004066 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004067 NodeInfo info;
4068 switch (type()) {
4069 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004070 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004071 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004072 return AssertionNode::AtStart(on_success);
4073 case BOUNDARY:
4074 return AssertionNode::AtBoundary(on_success);
4075 case NON_BOUNDARY:
4076 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004077 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004078 return AssertionNode::AtEnd(on_success);
4079 case END_OF_LINE: {
4080 // Compile $ in multiline regexps as an alternation with a positive
4081 // lookahead in one side and an end-of-input on the other side.
4082 // We need two registers for the lookahead.
4083 int stack_pointer_register = compiler->AllocateRegister();
4084 int position_register = compiler->AllocateRegister();
4085 // The ChoiceNode to distinguish between a newline and end-of-input.
4086 ChoiceNode* result = new ChoiceNode(2);
4087 // Create a newline atom.
4088 ZoneList<CharacterRange>* newline_ranges =
4089 new ZoneList<CharacterRange>(3);
4090 CharacterRange::AddClassEscape('n', newline_ranges);
4091 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
4092 TextNode* newline_matcher = new TextNode(
4093 newline_atom,
4094 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4095 position_register,
4096 0, // No captures inside.
4097 -1, // Ignored if no captures.
4098 on_success));
4099 // Create an end-of-input matcher.
4100 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4101 stack_pointer_register,
4102 position_register,
4103 newline_matcher);
4104 // Add the two alternatives to the ChoiceNode.
4105 GuardedAlternative eol_alternative(end_of_line);
4106 result->AddAlternative(eol_alternative);
4107 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4108 result->AddAlternative(end_alternative);
4109 return result;
4110 }
4111 default:
4112 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004113 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004114 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004115}
4116
4117
4118RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004119 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004120 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
4121 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00004122 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004123}
4124
4125
4126RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004127 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004128 return on_success;
4129}
4130
4131
4132RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004133 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004134 int stack_pointer_register = compiler->AllocateRegister();
4135 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004136
4137 const int registers_per_capture = 2;
4138 const int register_of_first_capture = 2;
4139 int register_count = capture_count_ * registers_per_capture;
4140 int register_start =
4141 register_of_first_capture + capture_from_ * registers_per_capture;
4142
ager@chromium.org8bb60582008-12-11 12:02:20 +00004143 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004144 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004145 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004146 stack_pointer_register,
4147 position_register,
4148 body()->ToNode(
4149 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004150 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4151 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004152 register_count,
4153 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004154 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004155 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004156 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004157 // We use a ChoiceNode for a negative lookahead because it has most of
4158 // the characteristics we need. It has the body of the lookahead as its
4159 // first alternative and the expression after the lookahead of the second
4160 // alternative. If the first alternative succeeds then the
4161 // NegativeSubmatchSuccess will unwind the stack including everything the
4162 // choice node set up and backtrack. If the first alternative fails then
4163 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004164 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
4165 // ChoiceNode that knows to ignore the first exit when calculating quick
4166 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004167 GuardedAlternative body_alt(
4168 body()->ToNode(
4169 compiler,
4170 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004171 position_register,
4172 register_count,
4173 register_start)));
4174 ChoiceNode* choice_node =
4175 new NegativeLookaheadChoiceNode(body_alt,
4176 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004177 return ActionNode::BeginSubmatch(stack_pointer_register,
4178 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004179 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004180 }
4181}
4182
4183
4184RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004185 RegExpNode* on_success) {
4186 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004187}
4188
4189
4190RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4191 int index,
4192 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004193 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004194 int start_reg = RegExpCapture::StartRegister(index);
4195 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004196 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004197 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004198 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004199}
4200
4201
4202RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004203 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004204 ZoneList<RegExpTree*>* children = nodes();
4205 RegExpNode* current = on_success;
4206 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004207 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004208 }
4209 return current;
4210}
4211
4212
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004213static void AddClass(const uc16* elmv,
4214 int elmc,
4215 ZoneList<CharacterRange>* ranges) {
4216 for (int i = 0; i < elmc; i += 2) {
4217 ASSERT(elmv[i] <= elmv[i + 1]);
4218 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4219 }
4220}
4221
4222
4223static void AddClassNegated(const uc16 *elmv,
4224 int elmc,
4225 ZoneList<CharacterRange>* ranges) {
4226 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004227 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004228 uc16 last = 0x0000;
4229 for (int i = 0; i < elmc; i += 2) {
4230 ASSERT(last <= elmv[i] - 1);
4231 ASSERT(elmv[i] <= elmv[i + 1]);
4232 ranges->Add(CharacterRange(last, elmv[i] - 1));
4233 last = elmv[i + 1] + 1;
4234 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004235 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004236}
4237
4238
4239void CharacterRange::AddClassEscape(uc16 type,
4240 ZoneList<CharacterRange>* ranges) {
4241 switch (type) {
4242 case 's':
4243 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4244 break;
4245 case 'S':
4246 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4247 break;
4248 case 'w':
4249 AddClass(kWordRanges, kWordRangeCount, ranges);
4250 break;
4251 case 'W':
4252 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4253 break;
4254 case 'd':
4255 AddClass(kDigitRanges, kDigitRangeCount, ranges);
4256 break;
4257 case 'D':
4258 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4259 break;
4260 case '.':
4261 AddClassNegated(kLineTerminatorRanges,
4262 kLineTerminatorRangeCount,
4263 ranges);
4264 break;
4265 // This is not a character range as defined by the spec but a
4266 // convenient shorthand for a character class that matches any
4267 // character.
4268 case '*':
4269 ranges->Add(CharacterRange::Everything());
4270 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004271 // This is the set of characters matched by the $ and ^ symbols
4272 // in multiline mode.
4273 case 'n':
4274 AddClass(kLineTerminatorRanges,
4275 kLineTerminatorRangeCount,
4276 ranges);
4277 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004278 default:
4279 UNREACHABLE();
4280 }
4281}
4282
4283
4284Vector<const uc16> CharacterRange::GetWordBounds() {
4285 return Vector<const uc16>(kWordRanges, kWordRangeCount);
4286}
4287
4288
4289class CharacterRangeSplitter {
4290 public:
4291 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4292 ZoneList<CharacterRange>** excluded)
4293 : included_(included),
4294 excluded_(excluded) { }
4295 void Call(uc16 from, DispatchTable::Entry entry);
4296
4297 static const int kInBase = 0;
4298 static const int kInOverlay = 1;
4299
4300 private:
4301 ZoneList<CharacterRange>** included_;
4302 ZoneList<CharacterRange>** excluded_;
4303};
4304
4305
4306void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4307 if (!entry.out_set()->Get(kInBase)) return;
4308 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4309 ? included_
4310 : excluded_;
4311 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4312 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4313}
4314
4315
4316void CharacterRange::Split(ZoneList<CharacterRange>* base,
4317 Vector<const uc16> overlay,
4318 ZoneList<CharacterRange>** included,
4319 ZoneList<CharacterRange>** excluded) {
4320 ASSERT_EQ(NULL, *included);
4321 ASSERT_EQ(NULL, *excluded);
4322 DispatchTable table;
4323 for (int i = 0; i < base->length(); i++)
4324 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4325 for (int i = 0; i < overlay.length(); i += 2) {
4326 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4327 CharacterRangeSplitter::kInOverlay);
4328 }
4329 CharacterRangeSplitter callback(included, excluded);
4330 table.ForEach(&callback);
4331}
4332
4333
4334void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
4335 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4336 if (IsSingleton()) {
4337 // If this is a singleton we just expand the one character.
4338 int length = uncanonicalize.get(from(), '\0', chars);
4339 for (int i = 0; i < length; i++) {
4340 uc32 chr = chars[i];
4341 if (chr != from()) {
4342 ranges->Add(CharacterRange::Singleton(chars[i]));
4343 }
4344 }
4345 } else if (from() <= kRangeCanonicalizeMax &&
4346 to() <= kRangeCanonicalizeMax) {
4347 // If this is a range we expand the characters block by block,
4348 // expanding contiguous subranges (blocks) one at a time.
4349 // The approach is as follows. For a given start character we
4350 // look up the block that contains it, for instance 'a' if the
4351 // start character is 'c'. A block is characterized by the property
4352 // that all characters uncanonicalize in the same way as the first
4353 // element, except that each entry in the result is incremented
4354 // by the distance from the first element. So a-z is a block
4355 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
4356 // uncanonicalizes to ['a' + k, 'A' + k].
4357 // Once we've found the start point we look up its uncanonicalization
4358 // and produce a range for each element. For instance for [c-f]
4359 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
4360 // add a range if it is not already contained in the input, so [c-f]
4361 // will be skipped but [C-F] will be added. If this range is not
4362 // completely contained in a block we do this for all the blocks
4363 // covered by the range.
4364 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4365 // First, look up the block that contains the 'from' character.
4366 int length = canonrange.get(from(), '\0', range);
4367 if (length == 0) {
4368 range[0] = from();
4369 } else {
4370 ASSERT_EQ(1, length);
4371 }
4372 int pos = from();
4373 // The start of the current block. Note that except for the first
4374 // iteration 'start' is always equal to 'pos'.
4375 int start;
4376 // If it is not the start point of a block the entry contains the
4377 // offset of the character from the start point.
4378 if ((range[0] & kStartMarker) == 0) {
4379 start = pos - range[0];
4380 } else {
4381 start = pos;
4382 }
4383 // Then we add the ranges on at a time, incrementing the current
4384 // position to be after the last block each time. The position
4385 // always points to the start of a block.
4386 while (pos < to()) {
4387 length = canonrange.get(start, '\0', range);
4388 if (length == 0) {
4389 range[0] = start;
4390 } else {
4391 ASSERT_EQ(1, length);
4392 }
4393 ASSERT((range[0] & kStartMarker) != 0);
4394 // The start point of a block contains the distance to the end
4395 // of the range.
4396 int block_end = start + (range[0] & kPayloadMask) - 1;
4397 int end = (block_end > to()) ? to() : block_end;
4398 length = uncanonicalize.get(start, '\0', range);
4399 for (int i = 0; i < length; i++) {
4400 uc32 c = range[i];
4401 uc16 range_from = c + (pos - start);
4402 uc16 range_to = c + (end - start);
4403 if (!(from() <= range_from && range_to <= to())) {
4404 ranges->Add(CharacterRange(range_from, range_to));
4405 }
4406 }
4407 start = pos = block_end + 1;
4408 }
4409 } else {
4410 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
4411 }
4412}
4413
4414
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004415ZoneList<CharacterRange>* CharacterSet::ranges() {
4416 if (ranges_ == NULL) {
4417 ranges_ = new ZoneList<CharacterRange>(2);
4418 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4419 }
4420 return ranges_;
4421}
4422
4423
4424
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004425// -------------------------------------------------------------------
4426// Interest propagation
4427
4428
4429RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4430 for (int i = 0; i < siblings_.length(); i++) {
4431 RegExpNode* sibling = siblings_.Get(i);
4432 if (sibling->info()->Matches(info))
4433 return sibling;
4434 }
4435 return NULL;
4436}
4437
4438
4439RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4440 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004441 siblings_.Ensure(this);
4442 RegExpNode* result = TryGetSibling(info);
4443 if (result != NULL) return result;
4444 result = this->Clone();
4445 NodeInfo* new_info = result->info();
4446 new_info->ResetCompilationState();
4447 new_info->AddFromPreceding(info);
4448 AddSibling(result);
4449 *cloned = true;
4450 return result;
4451}
4452
4453
4454template <class C>
4455static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4456 NodeInfo full_info(*node->info());
4457 full_info.AddFromPreceding(info);
4458 bool cloned = false;
4459 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4460}
4461
4462
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004463// -------------------------------------------------------------------
4464// Splay tree
4465
4466
4467OutSet* OutSet::Extend(unsigned value) {
4468 if (Get(value))
4469 return this;
4470 if (successors() != NULL) {
4471 for (int i = 0; i < successors()->length(); i++) {
4472 OutSet* successor = successors()->at(i);
4473 if (successor->Get(value))
4474 return successor;
4475 }
4476 } else {
4477 successors_ = new ZoneList<OutSet*>(2);
4478 }
4479 OutSet* result = new OutSet(first_, remaining_);
4480 result->Set(value);
4481 successors()->Add(result);
4482 return result;
4483}
4484
4485
4486void OutSet::Set(unsigned value) {
4487 if (value < kFirstLimit) {
4488 first_ |= (1 << value);
4489 } else {
4490 if (remaining_ == NULL)
4491 remaining_ = new ZoneList<unsigned>(1);
4492 if (remaining_->is_empty() || !remaining_->Contains(value))
4493 remaining_->Add(value);
4494 }
4495}
4496
4497
4498bool OutSet::Get(unsigned value) {
4499 if (value < kFirstLimit) {
4500 return (first_ & (1 << value)) != 0;
4501 } else if (remaining_ == NULL) {
4502 return false;
4503 } else {
4504 return remaining_->Contains(value);
4505 }
4506}
4507
4508
4509const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4510const DispatchTable::Entry DispatchTable::Config::kNoValue;
4511
4512
4513void DispatchTable::AddRange(CharacterRange full_range, int value) {
4514 CharacterRange current = full_range;
4515 if (tree()->is_empty()) {
4516 // If this is the first range we just insert into the table.
4517 ZoneSplayTree<Config>::Locator loc;
4518 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4519 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4520 return;
4521 }
4522 // First see if there is a range to the left of this one that
4523 // overlaps.
4524 ZoneSplayTree<Config>::Locator loc;
4525 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4526 Entry* entry = &loc.value();
4527 // If we've found a range that overlaps with this one, and it
4528 // starts strictly to the left of this one, we have to fix it
4529 // because the following code only handles ranges that start on
4530 // or after the start point of the range we're adding.
4531 if (entry->from() < current.from() && entry->to() >= current.from()) {
4532 // Snap the overlapping range in half around the start point of
4533 // the range we're adding.
4534 CharacterRange left(entry->from(), current.from() - 1);
4535 CharacterRange right(current.from(), entry->to());
4536 // The left part of the overlapping range doesn't overlap.
4537 // Truncate the whole entry to be just the left part.
4538 entry->set_to(left.to());
4539 // The right part is the one that overlaps. We add this part
4540 // to the map and let the next step deal with merging it with
4541 // the range we're adding.
4542 ZoneSplayTree<Config>::Locator loc;
4543 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4544 loc.set_value(Entry(right.from(),
4545 right.to(),
4546 entry->out_set()));
4547 }
4548 }
4549 while (current.is_valid()) {
4550 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4551 (loc.value().from() <= current.to()) &&
4552 (loc.value().to() >= current.from())) {
4553 Entry* entry = &loc.value();
4554 // We have overlap. If there is space between the start point of
4555 // the range we're adding and where the overlapping range starts
4556 // then we have to add a range covering just that space.
4557 if (current.from() < entry->from()) {
4558 ZoneSplayTree<Config>::Locator ins;
4559 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4560 ins.set_value(Entry(current.from(),
4561 entry->from() - 1,
4562 empty()->Extend(value)));
4563 current.set_from(entry->from());
4564 }
4565 ASSERT_EQ(current.from(), entry->from());
4566 // If the overlapping range extends beyond the one we want to add
4567 // we have to snap the right part off and add it separately.
4568 if (entry->to() > current.to()) {
4569 ZoneSplayTree<Config>::Locator ins;
4570 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4571 ins.set_value(Entry(current.to() + 1,
4572 entry->to(),
4573 entry->out_set()));
4574 entry->set_to(current.to());
4575 }
4576 ASSERT(entry->to() <= current.to());
4577 // The overlapping range is now completely contained by the range
4578 // we're adding so we can just update it and move the start point
4579 // of the range we're adding just past it.
4580 entry->AddValue(value);
4581 // Bail out if the last interval ended at 0xFFFF since otherwise
4582 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004583 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004584 break;
4585 ASSERT(entry->to() + 1 > current.from());
4586 current.set_from(entry->to() + 1);
4587 } else {
4588 // There is no overlap so we can just add the range
4589 ZoneSplayTree<Config>::Locator ins;
4590 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4591 ins.set_value(Entry(current.from(),
4592 current.to(),
4593 empty()->Extend(value)));
4594 break;
4595 }
4596 }
4597}
4598
4599
4600OutSet* DispatchTable::Get(uc16 value) {
4601 ZoneSplayTree<Config>::Locator loc;
4602 if (!tree()->FindGreatestLessThan(value, &loc))
4603 return empty();
4604 Entry* entry = &loc.value();
4605 if (value <= entry->to())
4606 return entry->out_set();
4607 else
4608 return empty();
4609}
4610
4611
4612// -------------------------------------------------------------------
4613// Analysis
4614
4615
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004616void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004617 if (that->info()->been_analyzed || that->info()->being_analyzed)
4618 return;
4619 that->info()->being_analyzed = true;
4620 that->Accept(this);
4621 that->info()->being_analyzed = false;
4622 that->info()->been_analyzed = true;
4623}
4624
4625
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004626void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004627 // nothing to do
4628}
4629
4630
ager@chromium.org8bb60582008-12-11 12:02:20 +00004631void TextNode::CalculateOffsets() {
4632 int element_count = elements()->length();
4633 // Set up the offsets of the elements relative to the start. This is a fixed
4634 // quantity since a TextNode can only contain fixed-width things.
4635 int cp_offset = 0;
4636 for (int i = 0; i < element_count; i++) {
4637 TextElement& elm = elements()->at(i);
4638 elm.cp_offset = cp_offset;
4639 if (elm.type == TextElement::ATOM) {
4640 cp_offset += elm.data.u_atom->data().length();
4641 } else {
4642 cp_offset++;
4643 Vector<const uc16> quarks = elm.data.u_atom->data();
4644 }
4645 }
4646}
4647
4648
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004649void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004650 if (ignore_case_) {
4651 that->MakeCaseIndependent();
4652 }
4653 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004654 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004655}
4656
4657
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004658void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004659 RegExpNode* target = that->on_success();
4660 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004661 // If the next node is interested in what it follows then this node
4662 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004663 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004664}
4665
4666
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004667void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004668 NodeInfo* info = that->info();
4669 for (int i = 0; i < that->alternatives()->length(); i++) {
4670 RegExpNode* node = that->alternatives()->at(i).node();
4671 EnsureAnalyzed(node);
4672 // Anything the following nodes need to know has to be known by
4673 // this node also, so it can pass it on.
4674 info->AddFromFollowing(node->info());
4675 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004676}
4677
4678
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004679void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4680 NodeInfo* info = that->info();
4681 for (int i = 0; i < that->alternatives()->length(); i++) {
4682 RegExpNode* node = that->alternatives()->at(i).node();
4683 if (node != that->loop_node()) {
4684 EnsureAnalyzed(node);
4685 info->AddFromFollowing(node->info());
4686 }
4687 }
4688 // Check the loop last since it may need the value of this node
4689 // to get a correct result.
4690 EnsureAnalyzed(that->loop_node());
4691 info->AddFromFollowing(that->loop_node()->info());
4692}
4693
4694
4695void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004696 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004697}
4698
4699
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004700void Analysis::VisitAssertion(AssertionNode* that) {
4701 EnsureAnalyzed(that->on_success());
4702}
4703
4704
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004705// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004706// Dispatch table construction
4707
4708
4709void DispatchTableConstructor::VisitEnd(EndNode* that) {
4710 AddRange(CharacterRange::Everything());
4711}
4712
4713
4714void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4715 node->set_being_calculated(true);
4716 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4717 for (int i = 0; i < alternatives->length(); i++) {
4718 set_choice_index(i);
4719 alternatives->at(i).node()->Accept(this);
4720 }
4721 node->set_being_calculated(false);
4722}
4723
4724
4725class AddDispatchRange {
4726 public:
4727 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4728 : constructor_(constructor) { }
4729 void Call(uc32 from, DispatchTable::Entry entry);
4730 private:
4731 DispatchTableConstructor* constructor_;
4732};
4733
4734
4735void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4736 CharacterRange range(from, entry.to());
4737 constructor_->AddRange(range);
4738}
4739
4740
4741void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4742 if (node->being_calculated())
4743 return;
4744 DispatchTable* table = node->GetTable(ignore_case_);
4745 AddDispatchRange adder(this);
4746 table->ForEach(&adder);
4747}
4748
4749
4750void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4751 // TODO(160): Find the node that we refer back to and propagate its start
4752 // set back to here. For now we just accept anything.
4753 AddRange(CharacterRange::Everything());
4754}
4755
4756
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004757void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4758 RegExpNode* target = that->on_success();
4759 target->Accept(this);
4760}
4761
4762
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004763
4764static int CompareRangeByFrom(const CharacterRange* a,
4765 const CharacterRange* b) {
4766 return Compare<uc16>(a->from(), b->from());
4767}
4768
4769
4770void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4771 ranges->Sort(CompareRangeByFrom);
4772 uc16 last = 0;
4773 for (int i = 0; i < ranges->length(); i++) {
4774 CharacterRange range = ranges->at(i);
4775 if (last < range.from())
4776 AddRange(CharacterRange(last, range.from() - 1));
4777 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004778 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004779 return;
4780 } else {
4781 last = range.to() + 1;
4782 }
4783 }
4784 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004785 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004786}
4787
4788
4789void DispatchTableConstructor::VisitText(TextNode* that) {
4790 TextElement elm = that->elements()->at(0);
4791 switch (elm.type) {
4792 case TextElement::ATOM: {
4793 uc16 c = elm.data.u_atom->data()[0];
4794 AddRange(CharacterRange(c, c));
4795 break;
4796 }
4797 case TextElement::CHAR_CLASS: {
4798 RegExpCharacterClass* tree = elm.data.u_char_class;
4799 ZoneList<CharacterRange>* ranges = tree->ranges();
4800 if (tree->is_negated()) {
4801 AddInverse(ranges);
4802 } else {
4803 for (int i = 0; i < ranges->length(); i++)
4804 AddRange(ranges->at(i));
4805 }
4806 break;
4807 }
4808 default: {
4809 UNIMPLEMENTED();
4810 }
4811 }
4812}
4813
4814
4815void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004816 RegExpNode* target = that->on_success();
4817 target->Accept(this);
4818}
4819
4820
ager@chromium.org8bb60582008-12-11 12:02:20 +00004821Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004822 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004823 bool is_multiline,
4824 Handle<String> pattern,
4825 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004826 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
4827 return IrregexpRegExpTooBig(pattern);
4828 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004829 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004830 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004831 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004832 0,
4833 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004834 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004835 RegExpNode* node = captured_body;
4836 if (!data->tree->IsAnchored()) {
4837 // Add a .*? at the beginning, outside the body capture, unless
4838 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004839 RegExpNode* loop_node =
4840 RegExpQuantifier::ToNode(0,
4841 RegExpTree::kInfinity,
4842 false,
4843 new RegExpCharacterClass('*'),
4844 &compiler,
4845 captured_body,
4846 data->contains_anchor);
4847
4848 if (data->contains_anchor) {
4849 // Unroll loop once, to take care of the case that might start
4850 // at the start of input.
4851 ChoiceNode* first_step_node = new ChoiceNode(2);
4852 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4853 first_step_node->AddAlternative(GuardedAlternative(
4854 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4855 node = first_step_node;
4856 } else {
4857 node = loop_node;
4858 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004859 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004860 data->node = node;
4861 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004862 analysis.EnsureAnalyzed(node);
4863
4864 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004865
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004866 if (FLAG_irregexp_native) {
4867#ifdef ARM
4868 // Unimplemented, fall-through to bytecode implementation.
4869#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004870 RegExpMacroAssemblerIA32::Mode mode;
4871 if (is_ascii) {
4872 mode = RegExpMacroAssemblerIA32::ASCII;
4873 } else {
4874 mode = RegExpMacroAssemblerIA32::UC16;
4875 }
4876 RegExpMacroAssemblerIA32 macro_assembler(mode,
4877 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004878 return compiler.Assemble(&macro_assembler,
4879 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004880 data->capture_count,
4881 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004882#endif
4883 }
4884 EmbeddedVector<byte, 1024> codes;
4885 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4886 return compiler.Assemble(&macro_assembler,
4887 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004888 data->capture_count,
4889 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004890}
4891
4892
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004893}} // namespace v8::internal