blob: aa11a6950cc84656975f85aa752526a13d2b10a5 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000043#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000044
45#ifdef ARM
46#include "regexp-macro-assembler-arm.h"
47#else // IA32
48#include "macro-assembler-ia32.h"
49#include "regexp-macro-assembler-ia32.h"
50#endif
51
52#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000053
54// Including pcre.h undefines DEBUG to avoid getting debug output from
55// the JSCRE implementation. Make sure to redefine it in debug mode
56// after having included the header file.
57#ifdef DEBUG
58#include "third_party/jscre/pcre.h"
59#define DEBUG
60#else
61#include "third_party/jscre/pcre.h"
62#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000063
ager@chromium.orga74f0da2008-12-03 16:05:52 +000064
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000065namespace v8 { namespace internal {
66
67
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000068static Failure* malloc_failure;
69
70static void* JSREMalloc(size_t size) {
71 Object* obj = Heap::AllocateByteArray(size);
72
73 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
74 // will return NULL to the caller, performs GC there.
75 // Also pass failure information to the caller.
76 if (obj->IsFailure()) {
77 malloc_failure = Failure::cast(obj);
78 return NULL;
79 }
80
81 // Note: object is unrooted, the caller of jsRegExpCompile must
82 // create a handle for the return value before doing heap allocation.
83 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
84}
85
86
87static void JSREFree(void* p) {
88 USE(p); // Do nothing, memory is garbage collected.
89}
90
91
92String* RegExpImpl::last_ascii_string_ = NULL;
93String* RegExpImpl::two_byte_cached_string_ = NULL;
94
95
96void RegExpImpl::NewSpaceCollectionPrologue() {
97 // The two byte string is always in the old space. The Ascii string may be
98 // in either place. If it is in the old space we don't need to do anything.
99 if (Heap::InNewSpace(last_ascii_string_)) {
100 // Invalidate the cache.
101 last_ascii_string_ = NULL;
102 two_byte_cached_string_ = NULL;
103 }
104}
105
106
107void RegExpImpl::OldSpaceCollectionPrologue() {
108 last_ascii_string_ = NULL;
109 two_byte_cached_string_ = NULL;
110}
111
112
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000113Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
114 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000115 Handle<String> flags,
116 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000117 // Ensure that the constructor function has been loaded.
118 if (!constructor->IsLoaded()) {
119 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000120 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000121 }
122 // Call the construct code with 2 arguments.
123 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
124 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000125 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000126}
127
128
129// Converts a source string to a 16 bit flat string or a SlicedString containing
130// a 16 bit flat string).
131Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
132 if (*subject == last_ascii_string_) {
133 ASSERT(two_byte_cached_string_ != NULL);
134 return Handle<String>(String::cast(two_byte_cached_string_));
135 }
136 Handle<String> two_byte_string = StringToTwoByte(subject);
137 last_ascii_string_ = *subject;
138 two_byte_cached_string_ = *two_byte_string;
139 return two_byte_string;
140}
141
142
143// Converts a source string to a 16 bit flat string or a SlicedString containing
144// a 16 bit flat string).
145Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000146 StringShape shape(*pattern);
147 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000148 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000149 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000150 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000151 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000152 String::cast(ConsString::cast(*pattern)->first()) :
153 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000154 ASSERT(flat_string->IsString());
155 StringShape flat_shape(*flat_string);
156 ASSERT(!flat_shape.IsCons());
157 ASSERT(flat_shape.IsSequential() ||
158 flat_shape.IsSliced() ||
159 flat_shape.IsExternal());
160 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000161 return flat_string;
162 }
163
ager@chromium.org870a0b62008-11-04 11:43:05 +0000164 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000165 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000166 Factory::NewRawTwoByteString(len, TENURED);
167 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
168 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000169 return two_byte_string;
170}
171
172
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000173static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
174 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000175 StringShape shape(*str);
176 for (int i = 0; i < str->length(shape); i++) {
177 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000178 case 'i':
179 flags |= JSRegExp::IGNORE_CASE;
180 break;
181 case 'g':
182 flags |= JSRegExp::GLOBAL;
183 break;
184 case 'm':
185 flags |= JSRegExp::MULTILINE;
186 break;
187 }
188 }
189 return JSRegExp::Flags(flags);
190}
191
192
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000193static inline void ThrowRegExpException(Handle<JSRegExp> re,
194 Handle<String> pattern,
195 Handle<String> error_text,
196 const char* message) {
197 Handle<JSArray> array = Factory::NewJSArray(2);
198 SetElement(array, 0, pattern);
199 SetElement(array, 1, error_text);
200 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
201 Top::Throw(*regexp_err);
202}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000203
204
ager@chromium.org8bb60582008-12-11 12:02:20 +0000205// Generic RegExp methods. Dispatches to implementation specific methods.
206
207
208class OffsetsVector {
209 public:
210 inline OffsetsVector(int num_registers)
211 : offsets_vector_length_(num_registers) {
212 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
213 vector_ = NewArray<int>(offsets_vector_length_);
214 } else {
215 vector_ = static_offsets_vector_;
216 }
217 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000218 inline ~OffsetsVector() {
219 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
220 DeleteArray(vector_);
221 vector_ = NULL;
222 }
223 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000224 inline int* vector() { return vector_; }
225 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000226
227 private:
228 int* vector_;
229 int offsets_vector_length_;
230 static const int kStaticOffsetsVectorSize = 50;
231 static int static_offsets_vector_[kStaticOffsetsVectorSize];
232};
233
234
235int OffsetsVector::static_offsets_vector_[
236 OffsetsVector::kStaticOffsetsVectorSize];
237
238
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000239Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
240 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000241 Handle<String> flag_str) {
242 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
243 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
244 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000245 LOG(RegExpCompileEvent(re, in_cache));
246
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000247 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000248 if (in_cache) {
249 re->set_data(*cached);
250 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000251 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000252 FlattenString(pattern);
253 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000254 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000255 FlatStringReader reader(pattern);
256 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
257 // Throw an exception if we fail to parse the pattern.
258 ThrowRegExpException(re,
259 pattern,
260 parse_result.error,
261 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000262 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000263 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000264
265 if (parse_result.simple && !flags.is_ignore_case()) {
266 // Parse-tree is a single atom that is equal to the pattern.
267 result = AtomCompile(re, pattern, flags, pattern);
268 } else if (parse_result.tree->IsAtom() &&
269 !flags.is_ignore_case() &&
270 parse_result.capture_count == 0) {
271 RegExpAtom* atom = parse_result.tree->AsAtom();
272 Vector<const uc16> atom_pattern = atom->data();
273 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
274 result = AtomCompile(re, pattern, flags, atom_string);
275 } else if (FLAG_irregexp) {
276 result = IrregexpPrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000277 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000278 result = JscrePrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000279 }
280 Object* data = re->data();
281 if (data->IsFixedArray()) {
282 // If compilation succeeded then the data is set on the regexp
283 // and we can store it in the cache.
284 Handle<FixedArray> data(FixedArray::cast(re->data()));
285 CompilationCache::PutRegExp(pattern, flags, data);
286 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000287 }
288
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000289 return result;
290}
291
292
293Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
294 Handle<String> subject,
295 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000296 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000297 case JSRegExp::ATOM:
298 return AtomExec(regexp, subject, index);
299 case JSRegExp::IRREGEXP: {
300 Handle<Object> result = IrregexpExec(regexp, subject, index);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000301 if (!result.is_null() || Top::has_pending_exception()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000302 return result;
303 }
304 // We couldn't handle the regexp using Irregexp, so fall back
305 // on JSCRE.
306 // Reset the JSRegExp to use JSCRE.
307 JscrePrepare(regexp,
308 Handle<String>(regexp->Pattern()),
309 regexp->GetFlags());
310 // Fall-through to JSCRE.
311 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000312 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000313 if (FLAG_disable_jscre) {
314 UNIMPLEMENTED();
315 }
316 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000317 default:
318 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000319 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000320 }
321}
322
323
324Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
325 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000326 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000327 case JSRegExp::ATOM:
328 return AtomExecGlobal(regexp, subject);
329 case JSRegExp::IRREGEXP: {
330 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000331 if (!result.is_null() || Top::has_pending_exception()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000332 return result;
333 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000334 // Empty handle as result but no exception thrown means that
335 // the regexp contains features not yet handled by the irregexp
336 // compiler.
337 // We have to fall back on JSCRE. Reset the JSRegExp to use JSCRE.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000338 JscrePrepare(regexp,
339 Handle<String>(regexp->Pattern()),
340 regexp->GetFlags());
341 // Fall-through to JSCRE.
342 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000343 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000344 if (FLAG_disable_jscre) {
345 UNIMPLEMENTED();
346 }
347 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000348 default:
349 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000350 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000351 }
352}
353
354
ager@chromium.org8bb60582008-12-11 12:02:20 +0000355// RegExp Atom implementation: Simple string search using indexOf.
356
357
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000358Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000359 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000360 JSRegExp::Flags flags,
361 Handle<String> match_pattern) {
362 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000363 return re;
364}
365
366
367Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
368 Handle<String> subject,
369 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000370 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000371
372 uint32_t start_index;
373 if (!Array::IndexFromObject(*index, &start_index)) {
374 return Handle<Smi>(Smi::FromInt(-1));
375 }
376
ager@chromium.org7c537e22008-10-16 08:43:32 +0000377 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000378 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000379
380 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000381 array->set(0, Smi::FromInt(value));
382 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000383 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000384}
385
386
387Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
388 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000389 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000390 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000391 int index = 0;
392 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000393 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000394 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000395 while (true) {
ager@chromium.org7c537e22008-10-16 08:43:32 +0000396 int value = -1;
397 if (index + needle_length <= subject_length) {
398 value = Runtime::StringMatch(subject, needle, index);
399 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000400 if (value == -1) break;
401 HandleScope scope;
402 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000403
404 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000405 array->set(0, Smi::FromInt(value));
406 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000407 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000408 SetElement(result, match_count, pair);
409 match_count++;
410 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000411 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000412 }
413 return result;
414}
415
416
ager@chromium.org8bb60582008-12-11 12:02:20 +0000417// JSCRE implementation.
418
419
420int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
421 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
422 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
423}
424
425
426ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
427 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
428 return ByteArray::cast(value->get(kJscreInternalIndex));
429}
430
431
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000432Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
433 Handle<String> pattern,
434 JSRegExp::Flags flags) {
435 Handle<Object> value(Heap::undefined_value());
436 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
437 return re;
438}
439
440
ager@chromium.org8bb60582008-12-11 12:02:20 +0000441static inline Object* JscreDoCompile(String* pattern,
442 JSRegExp::Flags flags,
443 unsigned* number_of_captures,
444 const char** error_message,
445 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000446 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
447 ? v8::jscre::JSRegExpIgnoreCase
448 : v8::jscre::JSRegExpDoNotIgnoreCase;
449 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
450 ? v8::jscre::JSRegExpMultiline
451 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000452 *error_message = NULL;
453 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000454 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
455 pattern->length(),
456 case_option,
457 multiline_option,
458 number_of_captures,
459 error_message,
460 &JSREMalloc,
461 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000462 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
463 malloc_failure->IsOutOfMemoryFailure())) {
464 return malloc_failure;
465 } else {
466 // It doesn't matter which object we return here, we just need to return
467 // a non-failure to indicate to the GC-retry code that there was no
468 // allocation failure.
469 return pattern;
470 }
471}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000472
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000473
ager@chromium.org8bb60582008-12-11 12:02:20 +0000474static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
475 JSRegExp::Flags flags,
476 unsigned* number_of_captures,
477 const char** error_message,
478 v8::jscre::JscreRegExp** code) {
479 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
480 flags,
481 number_of_captures,
482 error_message,
483 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000484}
485
486
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000487Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
488 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
489 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
490
491 Handle<String> pattern(re->Pattern());
492 JSRegExp::Flags flags = re->GetFlags();
493
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000494 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
495
496 unsigned number_of_captures;
497 const char* error_message = NULL;
498
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000499 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000500 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000501
ager@chromium.org8bb60582008-12-11 12:02:20 +0000502 JscreCompileWithRetryAfterGC(two_byte_pattern,
503 flags,
504 &number_of_captures,
505 &error_message,
506 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000507
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000508 if (code == NULL) {
509 // Throw an exception.
510 Handle<JSArray> array = Factory::NewJSArray(2);
511 SetElement(array, 0, pattern);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000512 const char* message =
513 (error_message == NULL) ? "Unknown regexp error" : error_message;
514 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000515 Handle<Object> regexp_err =
516 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000517 Top::Throw(*regexp_err);
518 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000519 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000520
521 // Convert the return address to a ByteArray pointer.
522 Handle<ByteArray> internal(
523 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
524
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000525 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
526 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
527 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000528 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
529
530 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000531}
532
533
ager@chromium.org8bb60582008-12-11 12:02:20 +0000534Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
535 Handle<String> subject,
536 Handle<Object> index) {
537 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
538 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
539 Handle<Object> compile_result = JscreCompile(regexp);
540 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000541 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000542 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000543
ager@chromium.org8bb60582008-12-11 12:02:20 +0000544 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000545
ager@chromium.org8bb60582008-12-11 12:02:20 +0000546 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000547
ager@chromium.org8bb60582008-12-11 12:02:20 +0000548 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000549
ager@chromium.org8bb60582008-12-11 12:02:20 +0000550 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000551
ager@chromium.org8bb60582008-12-11 12:02:20 +0000552 return JscreExecOnce(regexp,
553 num_captures,
554 subject,
555 previous_index,
556 subject16->GetTwoByteData(),
557 offsets.vector(),
558 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000559}
560
561
562Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
563 int num_captures,
564 Handle<String> subject,
565 int previous_index,
566 const uc16* two_byte_subject,
567 int* offsets_vector,
568 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000569 int rc;
570 {
571 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000572 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000573 const v8::jscre::JscreRegExp* js_regexp =
574 reinterpret_cast<v8::jscre::JscreRegExp*>(
575 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000576
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000577 rc = v8::jscre::jsRegExpExecute(js_regexp,
578 two_byte_subject,
579 subject->length(),
580 previous_index,
581 offsets_vector,
582 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000583 }
584
585 // The KJS JavaScript engine returns null (ie, a failed match) when
586 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000587 if (rc == v8::jscre::JSRegExpErrorNoMatch
588 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000589 return Factory::null_value();
590 }
591
592 // Other JSRE errors:
593 if (rc < 0) {
594 // Throw an exception.
595 Handle<Object> code(Smi::FromInt(rc));
596 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
597 Handle<Object> regexp_err(
598 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
599 return Handle<Object>(Top::Throw(*regexp_err));
600 }
601
ager@chromium.org7c537e22008-10-16 08:43:32 +0000602 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000603 // The captures come in (start, end+1) pairs.
604 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000605 array->set(i, Smi::FromInt(offsets_vector[i]));
606 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000607 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000608 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000609}
610
611
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000612Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
613 Handle<String> subject) {
614 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
615 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
616 Handle<Object> compile_result = JscreCompile(regexp);
617 if (compile_result.is_null()) return compile_result;
618 }
619 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
620
621 // Prepare space for the return values.
622 int num_captures = JscreNumberOfCaptures(regexp);
623
624 OffsetsVector offsets((num_captures + 1) * 3);
625
626 int previous_index = 0;
627
628 Handle<JSArray> result = Factory::NewJSArray(0);
629 int i = 0;
630 Handle<Object> matches;
631
632 Handle<String> subject16 = CachedStringToTwoByte(subject);
633
634 do {
635 if (previous_index > subject->length() || previous_index < 0) {
636 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
637 // string length, there is no match.
638 matches = Factory::null_value();
639 } else {
640 matches = JscreExecOnce(regexp,
641 num_captures,
642 subject,
643 previous_index,
644 subject16->GetTwoByteData(),
645 offsets.vector(),
646 offsets.length());
647
648 if (matches->IsJSArray()) {
649 SetElement(result, i, matches);
650 i++;
651 previous_index = offsets.vector()[1];
652 if (offsets.vector()[0] == offsets.vector()[1]) {
653 previous_index++;
654 }
655 }
656 }
657 } while (matches->IsJSArray());
658
659 // If we exited the loop with an exception, throw it.
660 if (matches->IsNull()) {
661 // Exited loop normally.
662 return result;
663 } else {
664 // Exited loop with the exception in matches.
665 return matches;
666 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000667}
668
669
ager@chromium.org8bb60582008-12-11 12:02:20 +0000670// Irregexp implementation.
671
672
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000673// Retrieves a compiled version of the regexp for either ASCII or non-ASCII
674// strings. If the compiled version doesn't already exist, it is compiled
675// from the source pattern.
676// Irregexp is not feature complete yet. If there is something in the
677// regexp that the compiler cannot currently handle, an empty
678// handle is returned, but no exception is thrown.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000679static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
680 bool is_ascii) {
681 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
682 Handle<FixedArray> alternatives(
683 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
684 ASSERT_EQ(2, alternatives->length());
685
686 int index = is_ascii ? 0 : 1;
687 Object* entry = alternatives->get(index);
688 if (!entry->IsNull()) {
689 return Handle<FixedArray>(FixedArray::cast(entry));
690 }
691
692 // Compile the RegExp.
693 ZoneScope zone_scope(DELETE_ON_EXIT);
694
695 JSRegExp::Flags flags = re->GetFlags();
696
697 Handle<String> pattern(re->Pattern());
698 StringShape shape(*pattern);
699 if (!pattern->IsFlat(shape)) {
700 pattern->Flatten(shape);
701 }
702
703 RegExpCompileData compile_data;
704 FlatStringReader reader(pattern);
705 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
706 // Throw an exception if we fail to parse the pattern.
707 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
708 ThrowRegExpException(re,
709 pattern,
710 compile_data.error,
711 "malformed_regexp");
712 return Handle<FixedArray>::null();
713 }
714 Handle<FixedArray> compiled_entry =
715 RegExpEngine::Compile(&compile_data,
716 flags.is_ignore_case(),
717 flags.is_multiline(),
718 pattern,
719 is_ascii);
720 if (!compiled_entry.is_null()) {
721 alternatives->set(index, *compiled_entry);
722 }
723 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000724}
725
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000726
ager@chromium.org8bb60582008-12-11 12:02:20 +0000727int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
728 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000729}
730
731
ager@chromium.org8bb60582008-12-11 12:02:20 +0000732int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
733 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000734}
735
736
ager@chromium.org8bb60582008-12-11 12:02:20 +0000737Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
738 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
739 == RegExpMacroAssembler::kBytecodeImplementation);
740 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000741}
742
743
ager@chromium.org8bb60582008-12-11 12:02:20 +0000744Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
745 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
746 != RegExpMacroAssembler::kBytecodeImplementation);
747 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
748}
749
750
751Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
752 Handle<String> pattern,
753 JSRegExp::Flags flags) {
754 // Make space for ASCII and UC16 versions.
755 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
756 alternatives->set_null(0);
757 alternatives->set_null(1);
758 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
759 return re;
760}
761
762
763Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
764 Handle<String> subject,
765 Handle<Object> index) {
766 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
767 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
768
769 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
770 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
771 if (irregexp.is_null()) {
772 // We can't handle the RegExp with IRRegExp.
773 return Handle<Object>::null();
774 }
775
776 // Prepare space for the return values.
777 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
778 OffsetsVector offsets(number_of_registers);
779
780 int num_captures = IrregexpNumberOfCaptures(irregexp);
781
782 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
783
784#ifdef DEBUG
785 if (FLAG_trace_regexp_bytecodes) {
786 String* pattern = regexp->Pattern();
787 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
788 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
789 }
790#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000791
792 if (!subject->IsFlat(StringShape(*subject))) {
793 FlattenString(subject);
794 }
795
ager@chromium.org8bb60582008-12-11 12:02:20 +0000796 return IrregexpExecOnce(irregexp,
797 num_captures,
798 subject,
799 previous_index,
800 offsets.vector(),
801 offsets.length());
802}
803
804
805Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
806 Handle<String> subject) {
807 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
808
809 StringShape shape(*subject);
810 bool is_ascii = shape.IsAsciiRepresentation();
811 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
812 if (irregexp.is_null()) {
813 return Handle<Object>::null();
814 }
815
816 // Prepare space for the return values.
817 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
818 OffsetsVector offsets(number_of_registers);
819
820 int previous_index = 0;
821
822 Handle<JSArray> result = Factory::NewJSArray(0);
823 int i = 0;
824 Handle<Object> matches;
825
826 if (!subject->IsFlat(shape)) {
827 subject->Flatten(shape);
828 }
829
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000830 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000831 if (previous_index > subject->length() || previous_index < 0) {
832 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
833 // string length, there is no match.
834 matches = Factory::null_value();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000835 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000836 } else {
837#ifdef DEBUG
838 if (FLAG_trace_regexp_bytecodes) {
839 String* pattern = regexp->Pattern();
840 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
841 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
842 }
843#endif
ager@chromium.org8bb60582008-12-11 12:02:20 +0000844 matches = IrregexpExecOnce(irregexp,
845 IrregexpNumberOfCaptures(irregexp),
846 subject,
847 previous_index,
848 offsets.vector(),
849 offsets.length());
850
851 if (matches->IsJSArray()) {
852 SetElement(result, i, matches);
853 i++;
854 previous_index = offsets.vector()[1];
855 if (offsets.vector()[0] == offsets.vector()[1]) {
856 previous_index++;
857 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000858 } else if (matches->IsNull()) {
859 return result;
860 } else {
861 return matches;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000862 }
863 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000864 }
865}
866
867
868Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
869 int num_captures,
870 Handle<String> subject,
871 int previous_index,
872 int* offsets_vector,
873 int offsets_vector_length) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000874 ASSERT(subject->IsFlat(StringShape(*subject)));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000875 bool rc;
876
877 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
878
ager@chromium.org8bb60582008-12-11 12:02:20 +0000879 switch (tag) {
880 case RegExpMacroAssembler::kIA32Implementation: {
881#ifndef ARM
882 Handle<Code> code = IrregexpNativeCode(irregexp);
883
884 StringShape shape(*subject);
885
886 // Character offsets into string.
887 int start_offset = previous_index;
888 int end_offset = subject->length(shape);
889
890 if (shape.IsCons()) {
891 subject = Handle<String>(ConsString::cast(*subject)->first());
892 } else if (shape.IsSliced()) {
893 SlicedString* slice = SlicedString::cast(*subject);
894 start_offset += slice->start();
895 end_offset += slice->start();
896 subject = Handle<String>(slice->buffer());
897 }
898
899 // String is now either Sequential or External
900 StringShape flatshape(*subject);
901 bool is_ascii = flatshape.IsAsciiRepresentation();
902 int char_size_shift = is_ascii ? 0 : 1;
903
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000904 RegExpMacroAssemblerIA32::Result res;
905
ager@chromium.org8bb60582008-12-11 12:02:20 +0000906 if (flatshape.IsExternal()) {
907 const byte* address;
908 if (is_ascii) {
909 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
910 address = reinterpret_cast<const byte*>(ext->resource()->data());
911 } else {
912 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
913 address = reinterpret_cast<const byte*>(ext->resource()->data());
914 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000915 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000916 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000917 const_cast<Address*>(&address),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000918 start_offset << char_size_shift,
919 end_offset << char_size_shift,
920 offsets_vector,
921 previous_index == 0);
922 } else { // Sequential string
923 Address char_address =
924 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
925 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
926 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000927 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000928 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000929 reinterpret_cast<Address*>(subject.location()),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000930 byte_offset + (start_offset << char_size_shift),
931 byte_offset + (end_offset << char_size_shift),
932 offsets_vector,
933 previous_index == 0);
934 }
935
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000936 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
937 ASSERT(Top::has_pending_exception());
938 return Handle<Object>::null();
939 }
940 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
941
ager@chromium.org8bb60582008-12-11 12:02:20 +0000942 if (rc) {
943 // Capture values are relative to start_offset only.
944 for (int i = 0; i < offsets_vector_length; i++) {
945 if (offsets_vector[i] >= 0) {
946 offsets_vector[i] += previous_index;
947 }
948 }
949 }
950 break;
951#else
952 UNIMPLEMENTED();
953 rc = false;
954 break;
955#endif
956 }
957 case RegExpMacroAssembler::kBytecodeImplementation: {
958 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
959 offsets_vector[i] = -1;
960 }
961 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
962
963 rc = IrregexpInterpreter::Match(byte_codes,
964 subject,
965 offsets_vector,
966 previous_index);
967 break;
968 }
969 case RegExpMacroAssembler::kARMImplementation:
970 default:
971 UNREACHABLE();
972 rc = false;
973 break;
974 }
975
976 if (!rc) {
977 return Factory::null_value();
978 }
979
980 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
981 // The captures come in (start, end+1) pairs.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000982 for (int i = 0; i < 2 * (num_captures + 1); i += 2) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000983 array->set(i, Smi::FromInt(offsets_vector[i]));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000984 array->set(i + 1, Smi::FromInt(offsets_vector[i + 1]));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000985 }
986 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000987}
988
989
990// -------------------------------------------------------------------
991// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000992//
993// The Irregexp regular expression engine is intended to be a complete
994// implementation of ECMAScript regular expressions. It generates either
995// bytecodes or native code.
996
997// The Irregexp regexp engine is structured in three steps.
998// 1) The parser generates an abstract syntax tree. See ast.cc.
999// 2) From the AST a node network is created. The nodes are all
1000// subclasses of RegExpNode. The nodes represent states when
1001// executing a regular expression. Several optimizations are
1002// performed on the node network.
1003// 3) From the nodes we generate either byte codes or native code
1004// that can actually execute the regular expression (perform
1005// the search). The code generation step is described in more
1006// detail below.
1007
1008// Code generation.
1009//
1010// The nodes are divided into four main categories.
1011// * Choice nodes
1012// These represent places where the regular expression can
1013// match in more than one way. For example on entry to an
1014// alternation (foo|bar) or a repetition (*, +, ? or {}).
1015// * Action nodes
1016// These represent places where some action should be
1017// performed. Examples include recording the current position
1018// in the input string to a register (in order to implement
1019// captures) or other actions on register for example in order
1020// to implement the counters needed for {} repetitions.
1021// * Matching nodes
1022// These attempt to match some element part of the input string.
1023// Examples of elements include character classes, plain strings
1024// or back references.
1025// * End nodes
1026// These are used to implement the actions required on finding
1027// a successful match or failing to find a match.
1028//
1029// The code generated (whether as byte codes or native code) maintains
1030// some state as it runs. This consists of the following elements:
1031//
1032// * The capture registers. Used for string captures.
1033// * Other registers. Used for counters etc.
1034// * The current position.
1035// * The stack of backtracking information. Used when a matching node
1036// fails to find a match and needs to try an alternative.
1037//
1038// Conceptual regular expression execution model:
1039//
1040// There is a simple conceptual model of regular expression execution
1041// which will be presented first. The actual code generated is a more
1042// efficient simulation of the simple conceptual model:
1043//
1044// * Choice nodes are implemented as follows:
1045// For each choice except the last {
1046// push current position
1047// push backtrack code location
1048// <generate code to test for choice>
1049// backtrack code location:
1050// pop current position
1051// }
1052// <generate code to test for last choice>
1053//
1054// * Actions nodes are generated as follows
1055// <push affected registers on backtrack stack>
1056// <generate code to perform action>
1057// push backtrack code location
1058// <generate code to test for following nodes>
1059// backtrack code location:
1060// <pop affected registers to restore their state>
1061// <pop backtrack location from stack and go to it>
1062//
1063// * Matching nodes are generated as follows:
1064// if input string matches at current position
1065// update current position
1066// <generate code to test for following nodes>
1067// else
1068// <pop backtrack location from stack and go to it>
1069//
1070// Thus it can be seen that the current position is saved and restored
1071// by the choice nodes, whereas the registers are saved and restored by
1072// by the action nodes that manipulate them.
1073//
1074// The other interesting aspect of this model is that nodes are generated
1075// at the point where they are needed by a recursive call to Emit(). If
1076// the node has already been code generated then the Emit() call will
1077// generate a jump to the previously generated code instead. In order to
1078// limit recursion it is possible for the Emit() function to put the node
1079// on a work list for later generation and instead generate a jump. The
1080// destination of the jump is resolved later when the code is generated.
1081//
1082// Actual regular expression code generation.
1083//
1084// Code generation is actually more complicated than the above. In order
1085// to improve the efficiency of the generated code some optimizations are
1086// performed
1087//
1088// * Choice nodes have 1-character lookahead.
1089// A choice node looks at the following character and eliminates some of
1090// the choices immediately based on that character. This is not yet
1091// implemented.
1092// * Simple greedy loops store reduced backtracking information.
1093// A quantifier like /.*foo/m will greedily match the whole input. It will
1094// then need to backtrack to a point where it can match "foo". The naive
1095// implementation of this would push each character position onto the
1096// backtracking stack, then pop them off one by one. This would use space
1097// proportional to the length of the input string. However since the "."
1098// can only match in one way and always has a constant length (in this case
1099// of 1) it suffices to store the current position on the top of the stack
1100// once. Matching now becomes merely incrementing the current position and
1101// backtracking becomes decrementing the current position and checking the
1102// result against the stored current position. This is faster and saves
1103// space.
1104// * The current state is virtualized.
1105// This is used to defer expensive operations until it is clear that they
1106// are needed and to generate code for a node more than once, allowing
1107// specialized an efficient versions of the code to be created. This is
1108// explained in the section below.
1109//
1110// Execution state virtualization.
1111//
1112// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +00001113// manipulation in an object called the Trace. The Trace object can record a
1114// current position offset, an optional backtrack code location on the top of
1115// the virtualized backtrack stack and some register changes. When a node is
1116// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +00001117// will emit code to bring the actual state into line with the virtual state.
1118// Avoiding flushing the state can postpone some work (eg updates of capture
1119// registers). Postponing work can save time when executing the regular
1120// expression since it may be found that the work never has to be done as a
1121// failure to match can occur. In addition it is much faster to jump to a
1122// known backtrack code location than it is to pop an unknown backtrack
1123// location from the stack and jump there.
1124//
ager@chromium.org32912102009-01-16 10:38:43 +00001125// The virtual state found in the Trace affects code generation. For example
1126// the virtual state contains the difference between the actual current
1127// position and the virtual current position, and matching code needs to use
1128// this offset to attempt a match in the correct location of the input
1129// string. Therefore code generated for a non-trivial trace is specialized
1130// to that trace. The code generator therefore has the ability to generate
1131// code for each node several times. In order to limit the size of the
1132// generated code there is an arbitrary limit on how many specialized sets of
1133// code may be generated for a given node. If the limit is reached, the
1134// trace is flushed and a generic version of the code for a node is emitted.
1135// This is subsequently used for that node. The code emitted for non-generic
1136// trace is not recorded in the node and so it cannot currently be reused in
1137// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001138
1139
1140void RegExpTree::AppendToText(RegExpText* text) {
1141 UNREACHABLE();
1142}
1143
1144
1145void RegExpAtom::AppendToText(RegExpText* text) {
1146 text->AddElement(TextElement::Atom(this));
1147}
1148
1149
1150void RegExpCharacterClass::AppendToText(RegExpText* text) {
1151 text->AddElement(TextElement::CharClass(this));
1152}
1153
1154
1155void RegExpText::AppendToText(RegExpText* text) {
1156 for (int i = 0; i < elements()->length(); i++)
1157 text->AddElement(elements()->at(i));
1158}
1159
1160
1161TextElement TextElement::Atom(RegExpAtom* atom) {
1162 TextElement result = TextElement(ATOM);
1163 result.data.u_atom = atom;
1164 return result;
1165}
1166
1167
1168TextElement TextElement::CharClass(
1169 RegExpCharacterClass* char_class) {
1170 TextElement result = TextElement(CHAR_CLASS);
1171 result.data.u_char_class = char_class;
1172 return result;
1173}
1174
1175
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001176int TextElement::length() {
1177 if (type == ATOM) {
1178 return data.u_atom->length();
1179 } else {
1180 ASSERT(type == CHAR_CLASS);
1181 return 1;
1182 }
1183}
1184
1185
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001186DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1187 if (table_ == NULL) {
1188 table_ = new DispatchTable();
1189 DispatchTableConstructor cons(table_, ignore_case);
1190 cons.BuildTable(this);
1191 }
1192 return table_;
1193}
1194
1195
1196class RegExpCompiler {
1197 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001198 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001199
1200 int AllocateRegister() { return next_register_++; }
1201
1202 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1203 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001204 int capture_count,
1205 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001206
1207 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1208
1209 static const int kImplementationOffset = 0;
1210 static const int kNumberOfRegistersOffset = 0;
1211 static const int kCodeOffset = 1;
1212
1213 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1214 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001215
1216 static const int kMaxRecursion = 100;
1217 inline int recursion_depth() { return recursion_depth_; }
1218 inline void IncrementRecursionDepth() { recursion_depth_++; }
1219 inline void DecrementRecursionDepth() { recursion_depth_--; }
1220
1221 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001222 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001223
ager@chromium.org32912102009-01-16 10:38:43 +00001224 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001225 private:
1226 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001227 int next_register_;
1228 List<RegExpNode*>* work_list_;
1229 int recursion_depth_;
1230 RegExpMacroAssembler* macro_assembler_;
1231 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001232 bool ascii_;
1233};
1234
1235
1236class RecursionCheck {
1237 public:
1238 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1239 compiler->IncrementRecursionDepth();
1240 }
1241 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1242 private:
1243 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001244};
1245
1246
1247// Attempts to compile the regexp using an Irregexp code generator. Returns
1248// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001249RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001250 : next_register_(2 * (capture_count + 1)),
1251 work_list_(NULL),
1252 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001253 ignore_case_(ignore_case),
1254 ascii_(ascii) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001255 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001256}
1257
1258
1259Handle<FixedArray> RegExpCompiler::Assemble(
1260 RegExpMacroAssembler* macro_assembler,
1261 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001262 int capture_count,
1263 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001264#ifdef DEBUG
1265 if (FLAG_trace_regexp_assembler)
1266 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1267 else
1268#endif
1269 macro_assembler_ = macro_assembler;
1270 List <RegExpNode*> work_list(0);
1271 work_list_ = &work_list;
1272 Label fail;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001273 macro_assembler->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001274 Trace new_trace;
1275 if (!start->Emit(this, &new_trace)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001276 fail.Unuse();
1277 return Handle<FixedArray>::null();
1278 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001279 macro_assembler_->Bind(&fail);
1280 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001281 while (!work_list.is_empty()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001282 if (!work_list.RemoveLast()->Emit(this, &new_trace)) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001283 return Handle<FixedArray>::null();
1284 }
1285 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001286 Handle<FixedArray> array =
1287 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1288 array->set(RegExpImpl::kIrregexpImplementationIndex,
1289 Smi::FromInt(macro_assembler_->Implementation()));
1290 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1291 Smi::FromInt(next_register_));
1292 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1293 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001294 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001295 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1296 work_list_ = NULL;
1297#ifdef DEBUG
1298 if (FLAG_trace_regexp_assembler) {
1299 delete macro_assembler_;
1300 }
1301#endif
1302 return array;
1303}
1304
ager@chromium.org32912102009-01-16 10:38:43 +00001305bool Trace::DeferredAction::Mentions(int that) {
1306 if (type() == ActionNode::CLEAR_CAPTURES) {
1307 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1308 return range.Contains(that);
1309 } else {
1310 return reg() == that;
1311 }
1312}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001313
ager@chromium.org32912102009-01-16 10:38:43 +00001314
1315bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001316 for (DeferredAction* action = actions_;
1317 action != NULL;
1318 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001319 if (action->Mentions(reg))
1320 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001321 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001322 return false;
1323}
1324
1325
ager@chromium.org32912102009-01-16 10:38:43 +00001326bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1327 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001328 for (DeferredAction* action = actions_;
1329 action != NULL;
1330 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001331 if (action->Mentions(reg)) {
1332 if (action->type() == ActionNode::STORE_POSITION) {
1333 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1334 return true;
1335 } else {
1336 return false;
1337 }
1338 }
1339 }
1340 return false;
1341}
1342
1343
1344int Trace::FindAffectedRegisters(OutSet* affected_registers) {
1345 int max_register = RegExpCompiler::kNoRegister;
1346 for (DeferredAction* action = actions_;
1347 action != NULL;
1348 action = action->next()) {
1349 if (action->type() == ActionNode::CLEAR_CAPTURES) {
1350 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1351 for (int i = range.from(); i <= range.to(); i++)
1352 affected_registers->Set(i);
1353 if (range.to() > max_register) max_register = range.to();
1354 } else {
1355 affected_registers->Set(action->reg());
1356 if (action->reg() > max_register) max_register = action->reg();
1357 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001358 }
1359 return max_register;
1360}
1361
1362
ager@chromium.org32912102009-01-16 10:38:43 +00001363void Trace::PushAffectedRegisters(RegExpMacroAssembler* assembler,
1364 int max_register,
1365 OutSet& affected_registers) {
1366 // Stay safe and check every half times the limit.
1367 // (Round up in case the limit is 1).
1368 int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1369 for (int reg = 0, pushes = 0; reg <= max_register; reg++) {
1370 if (affected_registers.Get(reg)) {
1371 pushes++;
1372 RegExpMacroAssembler::StackCheckFlag check_stack_limit =
1373 (pushes % push_limit) == 0 ?
1374 RegExpMacroAssembler::kCheckStackLimit :
1375 RegExpMacroAssembler::kNoStackLimitCheck;
1376 assembler->PushRegister(reg, check_stack_limit);
1377 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001378 }
1379}
1380
1381
ager@chromium.org32912102009-01-16 10:38:43 +00001382void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1383 int max_register,
1384 OutSet& affected_registers) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001385 for (int reg = max_register; reg >= 0; reg--) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001386 if (affected_registers.Get(reg)) assembler->PopRegister(reg);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001387 }
1388}
1389
1390
ager@chromium.org32912102009-01-16 10:38:43 +00001391void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1392 int max_register,
1393 OutSet& affected_registers) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001394 for (int reg = 0; reg <= max_register; reg++) {
1395 if (!affected_registers.Get(reg)) {
1396 continue;
1397 }
1398 int value = 0;
1399 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001400 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001401 int store_position = -1;
1402 // This is a little tricky because we are scanning the actions in reverse
1403 // historical order (newest first).
1404 for (DeferredAction* action = actions_;
1405 action != NULL;
1406 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001407 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001408 switch (action->type()) {
1409 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001410 Trace::DeferredSetRegister* psr =
1411 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001412 value += psr->value();
1413 absolute = true;
1414 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001415 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001416 break;
1417 }
1418 case ActionNode::INCREMENT_REGISTER:
1419 if (!absolute) {
1420 value++;
1421 }
1422 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001423 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001424 break;
1425 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001426 Trace::DeferredCapture* pc =
1427 static_cast<Trace::DeferredCapture*>(action);
1428 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001429 store_position = pc->cp_offset();
1430 }
1431 ASSERT(!absolute);
1432 ASSERT_EQ(value, 0);
1433 break;
1434 }
ager@chromium.org32912102009-01-16 10:38:43 +00001435 case ActionNode::CLEAR_CAPTURES: {
1436 // Since we're scanning in reverse order, if we've already
1437 // set the position we have to ignore historically earlier
1438 // clearing operations.
1439 if (store_position == -1)
1440 clear = true;
1441 ASSERT(!absolute);
1442 ASSERT_EQ(value, 0);
1443 break;
1444 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001445 default:
1446 UNREACHABLE();
1447 break;
1448 }
1449 }
1450 }
1451 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001452 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001453 } else if (clear) {
1454 assembler->ClearRegister(reg);
1455 } else if (absolute) {
1456 assembler->SetRegister(reg, value);
1457 } else if (value != 0) {
1458 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001459 }
1460 }
1461}
1462
1463
ager@chromium.org8bb60582008-12-11 12:02:20 +00001464// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001465// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001466// generate generic code.
ager@chromium.org32912102009-01-16 10:38:43 +00001467bool Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001468 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001469
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001470 ASSERT(actions_ != NULL ||
1471 cp_offset_ != 0 ||
1472 backtrack() != NULL ||
1473 characters_preloaded_ != 0 ||
1474 quick_check_performed_.characters() != 0 ||
1475 bound_checked_up_to_ != 0);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001476
1477 if (actions_ == NULL && backtrack() == NULL) {
1478 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001479 // a normal situation. We may also have to forget some information gained
1480 // through a quick check that was already performed.
1481 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001482 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001483 Trace new_state;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001484 return successor->Emit(compiler, &new_state);
1485 }
1486
1487 // Generate deferred actions here along with code to undo them again.
1488 OutSet affected_registers;
1489 int max_register = FindAffectedRegisters(&affected_registers);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001490 PushAffectedRegisters(assembler, max_register, affected_registers);
1491 PerformDeferredActions(assembler, max_register, affected_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001492 if (backtrack() != NULL) {
1493 // Here we have a concrete backtrack location. These are set up by choice
1494 // nodes and so they indicate that we have a deferred save of the current
1495 // position which we may need to emit here.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001496 assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001497 }
1498 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001499 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001500 }
1501
1502 // Create a new trivial state and generate the node with that.
1503 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001504 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001505 Trace new_state;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001506 bool ok = successor->Emit(compiler, &new_state);
1507
1508 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001509 assembler->Bind(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001510 if (!ok) return false;
1511 if (backtrack() != NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001512 assembler->PopCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001513 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001514 RestoreAffectedRegisters(assembler, max_register, affected_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001515 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001516 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001517 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001518 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001519 }
1520
1521 return true;
1522}
1523
1524
ager@chromium.org32912102009-01-16 10:38:43 +00001525void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001526 if (info()->at_end) {
1527 Label succeed;
1528 // LoadCurrentCharacter will go to the label if we are at the end of the
1529 // input string.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001530 assembler->LoadCurrentCharacter(0, &succeed);
ager@chromium.org32912102009-01-16 10:38:43 +00001531 assembler->GoTo(trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001532 assembler->Bind(&succeed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001533 }
1534}
1535
1536
ager@chromium.org32912102009-01-16 10:38:43 +00001537bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1538 if (!trace->is_trivial()) {
1539 return trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001540 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001541 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001542 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001543 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001544 }
ager@chromium.org32912102009-01-16 10:38:43 +00001545 EmitInfoChecks(assembler, trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001546 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1547 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001548 // Now that we have unwound the stack we find at the top of the stack the
1549 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001550 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001551 return true;
1552}
1553
1554
ager@chromium.org32912102009-01-16 10:38:43 +00001555bool EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1556 if (!trace->is_trivial()) {
1557 return trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001558 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001559 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001560 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001561 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001562 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001563 switch (action_) {
1564 case ACCEPT:
ager@chromium.org32912102009-01-16 10:38:43 +00001565 EmitInfoChecks(assembler, trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001566 assembler->Succeed();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001567 return true;
1568 case BACKTRACK:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001569 ASSERT(!info()->at_end);
ager@chromium.org32912102009-01-16 10:38:43 +00001570 assembler->GoTo(trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001571 return true;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001572 case NEGATIVE_SUBMATCH_SUCCESS:
1573 // This case is handled in a different virtual method.
1574 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001575 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001576 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001577 return false;
1578}
1579
1580
1581void GuardedAlternative::AddGuard(Guard* guard) {
1582 if (guards_ == NULL)
1583 guards_ = new ZoneList<Guard*>(1);
1584 guards_->Add(guard);
1585}
1586
1587
ager@chromium.org8bb60582008-12-11 12:02:20 +00001588ActionNode* ActionNode::SetRegister(int reg,
1589 int val,
1590 RegExpNode* on_success) {
1591 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001592 result->data_.u_store_register.reg = reg;
1593 result->data_.u_store_register.value = val;
1594 return result;
1595}
1596
1597
1598ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1599 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1600 result->data_.u_increment_register.reg = reg;
1601 return result;
1602}
1603
1604
1605ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1606 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1607 result->data_.u_position_register.reg = reg;
1608 return result;
1609}
1610
1611
ager@chromium.org32912102009-01-16 10:38:43 +00001612ActionNode* ActionNode::ClearCaptures(Interval range,
1613 RegExpNode* on_success) {
1614 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1615 result->data_.u_clear_captures.range_from = range.from();
1616 result->data_.u_clear_captures.range_to = range.to();
1617 return result;
1618}
1619
1620
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001621ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1622 int position_reg,
1623 RegExpNode* on_success) {
1624 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1625 result->data_.u_submatch.stack_pointer_register = stack_reg;
1626 result->data_.u_submatch.current_position_register = position_reg;
1627 return result;
1628}
1629
1630
ager@chromium.org8bb60582008-12-11 12:02:20 +00001631ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1632 int position_reg,
1633 RegExpNode* on_success) {
1634 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001635 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001636 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001637 return result;
1638}
1639
1640
ager@chromium.org32912102009-01-16 10:38:43 +00001641ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1642 int repetition_register,
1643 int repetition_limit,
1644 RegExpNode* on_success) {
1645 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1646 result->data_.u_empty_match_check.start_register = start_register;
1647 result->data_.u_empty_match_check.repetition_register = repetition_register;
1648 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1649 return result;
1650}
1651
1652
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001653#define DEFINE_ACCEPT(Type) \
1654 void Type##Node::Accept(NodeVisitor* visitor) { \
1655 visitor->Visit##Type(this); \
1656 }
1657FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1658#undef DEFINE_ACCEPT
1659
1660
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001661void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1662 visitor->VisitLoopChoice(this);
1663}
1664
1665
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001666// -------------------------------------------------------------------
1667// Emit code.
1668
1669
1670void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1671 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001672 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001673 switch (guard->op()) {
1674 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001675 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001676 macro_assembler->IfRegisterGE(guard->reg(),
1677 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001678 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001679 break;
1680 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001681 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001682 macro_assembler->IfRegisterLT(guard->reg(),
1683 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001684 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001685 break;
1686 }
1687}
1688
1689
1690static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1691static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1692
1693
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001694// Only emits non-letters (things that don't have case). Only used for case
1695// independent matches.
1696static inline bool EmitAtomNonLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001697 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001698 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001699 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001700 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001701 bool check,
1702 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001703 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001704 int length = uncanonicalize.get(c, '\0', chars);
1705 bool checked = false;
1706 if (length <= 1) {
1707 if (!preloaded) {
1708 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1709 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001710 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001711 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001712 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001713 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001714}
1715
1716
1717static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001718 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001719 uc16 c1,
1720 uc16 c2,
1721 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001722 uc16 char_mask;
1723 if (ascii) {
1724 char_mask = String::kMaxAsciiCharCode;
1725 } else {
1726 char_mask = String::kMaxUC16CharCode;
1727 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001728 uc16 exor = c1 ^ c2;
1729 // Check whether exor has only one bit set.
1730 if (((exor - 1) & exor) == 0) {
1731 // If c1 and c2 differ only by one bit.
1732 // Ecma262UnCanonicalize always gives the highest number last.
1733 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001734 uc16 mask = char_mask ^ exor;
1735 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001736 return true;
1737 }
1738 ASSERT(c2 > c1);
1739 uc16 diff = c2 - c1;
1740 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1741 // If the characters differ by 2^n but don't differ by one bit then
1742 // subtract the difference from the found character, then do the or
1743 // trick. We avoid the theoretical case where negative numbers are
1744 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001745 uc16 mask = char_mask ^ diff;
1746 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1747 diff,
1748 mask,
1749 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001750 return true;
1751 }
1752 return false;
1753}
1754
1755
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001756// Only emits letters (things that have case). Only used for case independent
1757// matches.
1758static inline bool EmitAtomLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001759 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001760 bool ascii,
1761 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001762 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001763 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001764 bool check,
1765 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001766 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001767 int length = uncanonicalize.get(c, '\0', chars);
1768 if (length <= 1) return false;
1769 // We may not need to check against the end of the input string
1770 // if this character lies before a character that matched.
1771 if (!preloaded) {
1772 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001773 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001774 Label ok;
1775 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1776 switch (length) {
1777 case 2: {
1778 if (ShortCutEmitCharacterPair(macro_assembler,
1779 ascii,
1780 chars[0],
1781 chars[1],
1782 on_failure)) {
1783 } else {
1784 macro_assembler->CheckCharacter(chars[0], &ok);
1785 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1786 macro_assembler->Bind(&ok);
1787 }
1788 break;
1789 }
1790 case 4:
1791 macro_assembler->CheckCharacter(chars[3], &ok);
1792 // Fall through!
1793 case 3:
1794 macro_assembler->CheckCharacter(chars[0], &ok);
1795 macro_assembler->CheckCharacter(chars[1], &ok);
1796 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1797 macro_assembler->Bind(&ok);
1798 break;
1799 default:
1800 UNREACHABLE();
1801 break;
1802 }
1803 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001804}
1805
1806
1807static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1808 RegExpCharacterClass* cc,
1809 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001810 Label* on_failure,
1811 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001812 bool ascii,
1813 bool preloaded) {
1814 if (cc->is_standard() &&
1815 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1816 cp_offset,
1817 check_offset,
1818 on_failure)) {
1819 return;
1820 }
1821
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001822 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001823 int max_char;
1824 if (ascii) {
1825 max_char = String::kMaxAsciiCharCode;
1826 } else {
1827 max_char = String::kMaxUC16CharCode;
1828 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001829
1830 Label success;
1831
1832 Label* char_is_in_class =
1833 cc->is_negated() ? on_failure : &success;
1834
1835 int range_count = ranges->length();
1836
ager@chromium.org8bb60582008-12-11 12:02:20 +00001837 int last_valid_range = range_count - 1;
1838 while (last_valid_range >= 0) {
1839 CharacterRange& range = ranges->at(last_valid_range);
1840 if (range.from() <= max_char) {
1841 break;
1842 }
1843 last_valid_range--;
1844 }
1845
1846 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001847 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001848 // TODO(plesner): We can remove this when the node level does our
1849 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001850 macro_assembler->GoTo(on_failure);
1851 }
1852 return;
1853 }
1854
ager@chromium.org8bb60582008-12-11 12:02:20 +00001855 if (last_valid_range == 0 &&
1856 !cc->is_negated() &&
1857 ranges->at(0).IsEverything(max_char)) {
1858 // This is a common case hit by non-anchored expressions.
1859 // TODO(erikcorry): We should have a macro assembler instruction that just
1860 // checks for end of string without loading the character.
1861 if (check_offset) {
1862 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1863 }
1864 return;
1865 }
1866
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001867 if (!preloaded) {
1868 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001869 }
1870
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001871 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001872 CharacterRange& range = ranges->at(i);
1873 Label next_range;
1874 uc16 from = range.from();
1875 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001876 if (from > max_char) {
1877 continue;
1878 }
1879 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001880 if (to == from) {
1881 macro_assembler->CheckCharacter(to, char_is_in_class);
1882 } else {
1883 if (from != 0) {
1884 macro_assembler->CheckCharacterLT(from, &next_range);
1885 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001886 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001887 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1888 } else {
1889 macro_assembler->GoTo(char_is_in_class);
1890 }
1891 }
1892 macro_assembler->Bind(&next_range);
1893 }
1894
ager@chromium.org8bb60582008-12-11 12:02:20 +00001895 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001896 uc16 from = range.from();
1897 uc16 to = range.to();
1898
ager@chromium.org8bb60582008-12-11 12:02:20 +00001899 if (to > max_char) to = max_char;
1900 ASSERT(to >= from);
1901
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001902 if (to == from) {
1903 if (cc->is_negated()) {
1904 macro_assembler->CheckCharacter(to, on_failure);
1905 } else {
1906 macro_assembler->CheckNotCharacter(to, on_failure);
1907 }
1908 } else {
1909 if (from != 0) {
1910 if (cc->is_negated()) {
1911 macro_assembler->CheckCharacterLT(from, &success);
1912 } else {
1913 macro_assembler->CheckCharacterLT(from, on_failure);
1914 }
1915 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001916 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001917 if (cc->is_negated()) {
1918 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1919 } else {
1920 macro_assembler->CheckCharacterGT(to, on_failure);
1921 }
1922 } else {
1923 if (cc->is_negated()) {
1924 macro_assembler->GoTo(on_failure);
1925 }
1926 }
1927 }
1928 macro_assembler->Bind(&success);
1929}
1930
1931
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001932RegExpNode::~RegExpNode() {
1933}
1934
1935
ager@chromium.org8bb60582008-12-11 12:02:20 +00001936RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001937 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001938 // TODO(erikcorry): Implement support.
1939 if (info_.follows_word_interest ||
1940 info_.follows_newline_interest ||
1941 info_.follows_start_interest) {
1942 return FAIL;
1943 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001944
ager@chromium.org8bb60582008-12-11 12:02:20 +00001945 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001946 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001947 return CONTINUE;
1948 }
1949
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001950 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001951 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001952 if (label_.is_bound()) {
1953 // We are being asked to generate a generic version, but that's already
1954 // been done so just go to it.
1955 macro_assembler->GoTo(&label_);
1956 return DONE;
1957 }
1958 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1959 // To avoid too deep recursion we push the node to the work queue and just
1960 // generate a goto here.
1961 compiler->AddWork(this);
1962 macro_assembler->GoTo(&label_);
1963 return DONE;
1964 }
1965 // Generate generic version of the node and bind the label for later use.
1966 macro_assembler->Bind(&label_);
1967 return CONTINUE;
1968 }
1969
1970 // We are being asked to make a non-generic version. Keep track of how many
1971 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00001972 trace_count_++;
1973 if (trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00001974 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1975 return CONTINUE;
1976 }
1977
ager@chromium.org32912102009-01-16 10:38:43 +00001978 // If we get here code has been generated for this node too many times or
1979 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00001980 // generic versions above can handle deep recursion properly.
ager@chromium.org32912102009-01-16 10:38:43 +00001981 bool ok = trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001982 return ok ? DONE : FAIL;
1983}
1984
1985
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001986int ActionNode::EatsAtLeast(int recursion_depth) {
1987 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1988 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1989 return on_success()->EatsAtLeast(recursion_depth + 1);
1990}
1991
1992
1993int TextNode::EatsAtLeast(int recursion_depth) {
1994 int answer = Length();
1995 if (answer >= 4) return answer;
1996 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1997 return answer + on_success()->EatsAtLeast(recursion_depth + 1);
1998}
1999
2000
2001int ChoiceNode::EatsAtLeastHelper(int recursion_depth,
2002 RegExpNode* ignore_this_node) {
2003 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2004 int min = 100;
2005 int choice_count = alternatives_->length();
2006 for (int i = 0; i < choice_count; i++) {
2007 RegExpNode* node = alternatives_->at(i).node();
2008 if (node == ignore_this_node) continue;
2009 int node_eats_at_least = node->EatsAtLeast(recursion_depth + 1);
2010 if (node_eats_at_least < min) min = node_eats_at_least;
2011 }
2012 return min;
2013}
2014
2015
2016int LoopChoiceNode::EatsAtLeast(int recursion_depth) {
2017 return EatsAtLeastHelper(recursion_depth, loop_node_);
2018}
2019
2020
2021int ChoiceNode::EatsAtLeast(int recursion_depth) {
2022 return EatsAtLeastHelper(recursion_depth, NULL);
2023}
2024
2025
2026// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2027static inline uint32_t SmearBitsRight(uint32_t v) {
2028 v |= v >> 1;
2029 v |= v >> 2;
2030 v |= v >> 4;
2031 v |= v >> 8;
2032 v |= v >> 16;
2033 return v;
2034}
2035
2036
2037bool QuickCheckDetails::Rationalize(bool asc) {
2038 bool found_useful_op = false;
2039 uint32_t char_mask;
2040 if (asc) {
2041 char_mask = String::kMaxAsciiCharCode;
2042 } else {
2043 char_mask = String::kMaxUC16CharCode;
2044 }
2045 mask_ = 0;
2046 value_ = 0;
2047 int char_shift = 0;
2048 for (int i = 0; i < characters_; i++) {
2049 Position* pos = &positions_[i];
2050 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
2051 found_useful_op = true;
2052 }
2053 mask_ |= (pos->mask & char_mask) << char_shift;
2054 value_ |= (pos->value & char_mask) << char_shift;
2055 char_shift += asc ? 8 : 16;
2056 }
2057 return found_useful_op;
2058}
2059
2060
2061bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002062 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002063 bool preload_has_checked_bounds,
2064 Label* on_possible_success,
2065 QuickCheckDetails* details,
2066 bool fall_through_on_failure) {
2067 if (details->characters() == 0) return false;
2068 GetQuickCheckDetails(details, compiler, 0);
2069 if (!details->Rationalize(compiler->ascii())) return false;
2070 uint32_t mask = details->mask();
2071 uint32_t value = details->value();
2072
2073 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2074
ager@chromium.org32912102009-01-16 10:38:43 +00002075 if (trace->characters_preloaded() != details->characters()) {
2076 assembler->LoadCurrentCharacter(trace->cp_offset(),
2077 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002078 !preload_has_checked_bounds,
2079 details->characters());
2080 }
2081
2082
2083 bool need_mask = true;
2084
2085 if (details->characters() == 1) {
2086 // If number of characters preloaded is 1 then we used a byte or 16 bit
2087 // load so the value is already masked down.
2088 uint32_t char_mask;
2089 if (compiler->ascii()) {
2090 char_mask = String::kMaxAsciiCharCode;
2091 } else {
2092 char_mask = String::kMaxUC16CharCode;
2093 }
2094 if ((mask & char_mask) == char_mask) need_mask = false;
2095 mask &= char_mask;
2096 } else {
2097 // For 2-character preloads in ASCII mode we also use a 16 bit load with
2098 // zero extend.
2099 if (details->characters() == 2 && compiler->ascii()) {
2100 if ((mask & 0xffff) == 0xffff) need_mask = false;
2101 } else {
2102 if (mask == 0xffffffff) need_mask = false;
2103 }
2104 }
2105
2106 if (fall_through_on_failure) {
2107 if (need_mask) {
2108 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2109 } else {
2110 assembler->CheckCharacter(value, on_possible_success);
2111 }
2112 } else {
2113 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002114 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002115 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002116 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002117 }
2118 }
2119 return true;
2120}
2121
2122
2123// Here is the meat of GetQuickCheckDetails (see also the comment on the
2124// super-class in the .h file).
2125//
2126// We iterate along the text object, building up for each character a
2127// mask and value that can be used to test for a quick failure to match.
2128// The masks and values for the positions will be combined into a single
2129// machine word for the current character width in order to be used in
2130// generating a quick check.
2131void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2132 RegExpCompiler* compiler,
2133 int characters_filled_in) {
2134 ASSERT(characters_filled_in < details->characters());
2135 int characters = details->characters();
2136 int char_mask;
2137 int char_shift;
2138 if (compiler->ascii()) {
2139 char_mask = String::kMaxAsciiCharCode;
2140 char_shift = 8;
2141 } else {
2142 char_mask = String::kMaxUC16CharCode;
2143 char_shift = 16;
2144 }
2145 for (int k = 0; k < elms_->length(); k++) {
2146 TextElement elm = elms_->at(k);
2147 if (elm.type == TextElement::ATOM) {
2148 Vector<const uc16> quarks = elm.data.u_atom->data();
2149 for (int i = 0; i < characters && i < quarks.length(); i++) {
2150 QuickCheckDetails::Position* pos =
2151 details->positions(characters_filled_in);
2152 if (compiler->ignore_case()) {
2153 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2154 uc16 c = quarks[i];
2155 int length = uncanonicalize.get(c, '\0', chars);
2156 if (length < 2) {
2157 // This letter has no case equivalents, so it's nice and simple
2158 // and the mask-compare will determine definitely whether we have
2159 // a match at this character position.
2160 pos->mask = char_mask;
2161 pos->value = c;
2162 pos->determines_perfectly = true;
2163 } else {
2164 uint32_t common_bits = char_mask;
2165 uint32_t bits = chars[0];
2166 for (int j = 1; j < length; j++) {
2167 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2168 common_bits ^= differing_bits;
2169 bits &= common_bits;
2170 }
2171 // If length is 2 and common bits has only one zero in it then
2172 // our mask and compare instruction will determine definitely
2173 // whether we have a match at this character position. Otherwise
2174 // it can only be an approximate check.
2175 uint32_t one_zero = (common_bits | ~char_mask);
2176 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2177 pos->determines_perfectly = true;
2178 }
2179 pos->mask = common_bits;
2180 pos->value = bits;
2181 }
2182 } else {
2183 // Don't ignore case. Nice simple case where the mask-compare will
2184 // determine definitely whether we have a match at this character
2185 // position.
2186 pos->mask = char_mask;
2187 pos->value = quarks[i];
2188 pos->determines_perfectly = true;
2189 }
2190 characters_filled_in++;
2191 ASSERT(characters_filled_in <= details->characters());
2192 if (characters_filled_in == details->characters()) {
2193 return;
2194 }
2195 }
2196 } else {
2197 QuickCheckDetails::Position* pos =
2198 details->positions(characters_filled_in);
2199 RegExpCharacterClass* tree = elm.data.u_char_class;
2200 ZoneList<CharacterRange>* ranges = tree->ranges();
2201 CharacterRange range = ranges->at(0);
2202 if (tree->is_negated()) {
2203 // A quick check uses multi-character mask and compare. There is no
2204 // useful way to incorporate a negative char class into this scheme
2205 // so we just conservatively create a mask and value that will always
2206 // succeed.
2207 pos->mask = 0;
2208 pos->value = 0;
2209 } else {
2210 uint32_t differing_bits = (range.from() ^ range.to());
2211 // A mask and compare is only perfect if the differing bits form a
2212 // number like 00011111 with one single block of trailing 1s.
2213 if ((differing_bits & (differing_bits + 1)) == 0) {
2214 pos->determines_perfectly = true;
2215 }
2216 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2217 uint32_t bits = (range.from() & common_bits);
2218 for (int i = 1; i < ranges->length(); i++) {
2219 // Here we are combining more ranges into the mask and compare
2220 // value. With each new range the mask becomes more sparse and
2221 // so the chances of a false positive rise. A character class
2222 // with multiple ranges is assumed never to be equivalent to a
2223 // mask and compare operation.
2224 pos->determines_perfectly = false;
2225 CharacterRange range = ranges->at(i);
2226 uint32_t new_common_bits = (range.from() ^ range.to());
2227 new_common_bits = ~SmearBitsRight(new_common_bits);
2228 common_bits &= new_common_bits;
2229 bits &= new_common_bits;
2230 uint32_t differing_bits = (range.from() & common_bits) ^ bits;
2231 common_bits ^= differing_bits;
2232 bits &= common_bits;
2233 }
2234 pos->mask = common_bits;
2235 pos->value = bits;
2236 }
2237 characters_filled_in++;
2238 ASSERT(characters_filled_in <= details->characters());
2239 if (characters_filled_in == details->characters()) {
2240 return;
2241 }
2242 }
2243 }
2244 ASSERT(characters_filled_in != details->characters());
2245 on_success()-> GetQuickCheckDetails(details, compiler, characters_filled_in);
2246}
2247
2248
2249void QuickCheckDetails::Clear() {
2250 for (int i = 0; i < characters_; i++) {
2251 positions_[i].mask = 0;
2252 positions_[i].value = 0;
2253 positions_[i].determines_perfectly = false;
2254 }
2255 characters_ = 0;
2256}
2257
2258
2259void QuickCheckDetails::Advance(int by, bool ascii) {
2260 ASSERT(by > 0);
2261 if (by >= characters_) {
2262 Clear();
2263 return;
2264 }
2265 for (int i = 0; i < characters_ - by; i++) {
2266 positions_[i] = positions_[by + i];
2267 }
2268 for (int i = characters_ - by; i < characters_; i++) {
2269 positions_[i].mask = 0;
2270 positions_[i].value = 0;
2271 positions_[i].determines_perfectly = false;
2272 }
2273 characters_ -= by;
2274 // We could change mask_ and value_ here but we would never advance unless
2275 // they had already been used in a check and they won't be used again because
2276 // it would gain us nothing. So there's no point.
2277}
2278
2279
2280void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2281 ASSERT(characters_ == other->characters_);
2282 for (int i = from_index; i < characters_; i++) {
2283 QuickCheckDetails::Position* pos = positions(i);
2284 QuickCheckDetails::Position* other_pos = other->positions(i);
2285 if (pos->mask != other_pos->mask ||
2286 pos->value != other_pos->value ||
2287 !other_pos->determines_perfectly) {
2288 // Our mask-compare operation will be approximate unless we have the
2289 // exact same operation on both sides of the alternation.
2290 pos->determines_perfectly = false;
2291 }
2292 pos->mask &= other_pos->mask;
2293 pos->value &= pos->mask;
2294 other_pos->value &= pos->mask;
2295 uc16 differing_bits = (pos->value ^ other_pos->value);
2296 pos->mask &= ~differing_bits;
2297 pos->value &= pos->mask;
2298 }
2299}
2300
2301
ager@chromium.org32912102009-01-16 10:38:43 +00002302class VisitMarker {
2303 public:
2304 explicit VisitMarker(NodeInfo* info) : info_(info) {
2305 ASSERT(!info->visited);
2306 info->visited = true;
2307 }
2308 ~VisitMarker() {
2309 info_->visited = false;
2310 }
2311 private:
2312 NodeInfo* info_;
2313};
2314
2315
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002316void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2317 RegExpCompiler* compiler,
2318 int characters_filled_in) {
ager@chromium.org32912102009-01-16 10:38:43 +00002319 if (body_can_be_zero_length_ || info()->visited) return;
2320 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002321 return ChoiceNode::GetQuickCheckDetails(details,
2322 compiler,
2323 characters_filled_in);
2324}
2325
2326
2327void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2328 RegExpCompiler* compiler,
2329 int characters_filled_in) {
2330 int choice_count = alternatives_->length();
2331 ASSERT(choice_count > 0);
2332 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2333 compiler,
2334 characters_filled_in);
2335 for (int i = 1; i < choice_count; i++) {
2336 QuickCheckDetails new_details(details->characters());
2337 RegExpNode* node = alternatives_->at(i).node();
2338 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in);
2339 // Here we merge the quick match details of the two branches.
2340 details->Merge(&new_details, characters_filled_in);
2341 }
2342}
2343
2344
2345// We call this repeatedly to generate code for each pass over the text node.
2346// The passes are in increasing order of difficulty because we hope one
2347// of the first passes will fail in which case we are saved the work of the
2348// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2349// we will check the '%' in the first pass, the case independent 'a' in the
2350// second pass and the character class in the last pass.
2351//
2352// The passes are done from right to left, so for example to test for /bar/
2353// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2354// and then a 'b' with offset 0. This means we can avoid the end-of-input
2355// bounds check most of the time. In the example we only need to check for
2356// end-of-input when loading the putative 'r'.
2357//
2358// A slight complication involves the fact that the first character may already
2359// be fetched into a register by the previous node. In this case we want to
2360// do the test for that character first. We do this in separate passes. The
2361// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2362// pass has been performed then subsequent passes will have true in
2363// first_element_checked to indicate that that character does not need to be
2364// checked again.
2365//
ager@chromium.org32912102009-01-16 10:38:43 +00002366// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002367// contain an AlternativeGeneration object. In this AlternativeGeneration
2368// object we can see details of any quick check that was already passed in
2369// order to get to the code we are now generating. The quick check can involve
2370// loading characters, which means we do not need to recheck the bounds
2371// up to the limit the quick check already checked. In addition the quick
2372// check can have involved a mask and compare operation which may simplify
2373// or obviate the need for further checks at some character positions.
2374void TextNode::TextEmitPass(RegExpCompiler* compiler,
2375 TextEmitPassType pass,
2376 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002377 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002378 bool first_element_checked,
2379 int* checked_up_to) {
2380 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2381 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002382 Label* backtrack = trace->backtrack();
2383 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002384 int element_count = elms_->length();
2385 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2386 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002387 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002388 if (elm.type == TextElement::ATOM) {
2389 if (pass == NON_ASCII_MATCH ||
2390 pass == CHARACTER_MATCH ||
2391 pass == CASE_CHARACTER_MATCH) {
2392 Vector<const uc16> quarks = elm.data.u_atom->data();
2393 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2394 bool bound_checked = true; // Most ops will check their bounds.
2395 if (first_element_checked && i == 0 && j == 0) continue;
2396 if (quick_check != NULL &&
2397 elm.cp_offset + j < quick_check->characters() &&
2398 quick_check->positions(elm.cp_offset + j)->determines_perfectly) {
2399 continue;
2400 }
2401 if (pass == NON_ASCII_MATCH) {
2402 ASSERT(ascii);
2403 if (quarks[j] > String::kMaxAsciiCharCode) {
2404 assembler->GoTo(backtrack);
2405 return;
2406 }
2407 } else if (pass == CHARACTER_MATCH) {
2408 if (compiler->ignore_case()) {
2409 bound_checked = EmitAtomNonLetter(assembler,
2410 quarks[j],
2411 backtrack,
2412 cp_offset + j,
2413 *checked_up_to < cp_offset + j,
2414 preloaded);
2415 } else {
2416 if (!preloaded) {
2417 assembler->LoadCurrentCharacter(cp_offset + j,
2418 backtrack,
2419 *checked_up_to < cp_offset + j);
2420 }
2421 assembler->CheckNotCharacter(quarks[j], backtrack);
2422 }
2423 } else {
2424 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
2425 ASSERT(compiler->ignore_case());
2426 bound_checked = EmitAtomLetter(assembler,
2427 compiler->ascii(),
2428 quarks[j],
2429 backtrack,
2430 cp_offset + j,
2431 *checked_up_to < cp_offset + j,
2432 preloaded);
2433 }
2434 if (pass != NON_ASCII_MATCH && bound_checked) {
2435 if (cp_offset + j > *checked_up_to) {
2436 *checked_up_to = cp_offset + j;
2437 }
2438 }
2439 }
2440 }
2441 } else {
2442 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2443 if (first_element_checked && i == 0) continue;
2444 if (quick_check != NULL &&
2445 elm.cp_offset < quick_check->characters() &&
2446 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2447 continue;
2448 }
2449 if (pass == CHARACTER_CLASS_MATCH) {
2450 RegExpCharacterClass* cc = elm.data.u_char_class;
2451 EmitCharClass(assembler,
2452 cc,
2453 cp_offset,
2454 backtrack,
2455 *checked_up_to < cp_offset,
2456 ascii,
2457 preloaded);
2458 if (cp_offset > *checked_up_to) {
2459 *checked_up_to = cp_offset;
2460 }
2461 }
2462 }
2463 }
2464}
2465
2466
2467int TextNode::Length() {
2468 TextElement elm = elms_->last();
2469 ASSERT(elm.cp_offset >= 0);
2470 if (elm.type == TextElement::ATOM) {
2471 return elm.cp_offset + elm.data.u_atom->data().length();
2472 } else {
2473 return elm.cp_offset + 1;
2474 }
2475}
2476
2477
ager@chromium.org8bb60582008-12-11 12:02:20 +00002478// This generates the code to match a text node. A text node can contain
2479// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002480// way) and character classes. For efficiency we do not do this in a single
2481// pass from left to right. Instead we pass over the text node several times,
2482// emitting code for some character positions every time. See the comment on
2483// TextEmitPass for details.
ager@chromium.org32912102009-01-16 10:38:43 +00002484bool TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2485 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002486 if (limit_result == FAIL) return false;
2487 if (limit_result == DONE) return true;
2488 ASSERT(limit_result == CONTINUE);
2489
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002490 if (info()->follows_word_interest ||
2491 info()->follows_newline_interest ||
2492 info()->follows_start_interest) {
2493 return false;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002494 }
2495
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002496 if (info()->at_end) {
ager@chromium.org32912102009-01-16 10:38:43 +00002497 compiler->macro_assembler()->GoTo(trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002498 return true;
2499 }
2500
2501 if (compiler->ascii()) {
2502 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002503 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002504 }
2505
2506 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002507 int bound_checked_to = trace->cp_offset() - 1;
2508 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002509
2510 // If a character is preloaded into the current character register then
2511 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002512 if (trace->characters_preloaded() == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002513 TextEmitPass(compiler,
2514 CHARACTER_MATCH,
2515 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002516 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002517 false,
2518 &bound_checked_to);
2519 if (compiler->ignore_case()) {
2520 TextEmitPass(compiler,
2521 CASE_CHARACTER_MATCH,
2522 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002523 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002524 false,
2525 &bound_checked_to);
2526 }
2527 TextEmitPass(compiler,
2528 CHARACTER_CLASS_MATCH,
2529 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002530 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002531 false,
2532 &bound_checked_to);
2533 first_elt_done = true;
2534 }
2535
2536 TextEmitPass(compiler,
2537 CHARACTER_MATCH,
2538 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002539 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002540 first_elt_done,
2541 &bound_checked_to);
2542 if (compiler->ignore_case()) {
2543 TextEmitPass(compiler,
2544 CASE_CHARACTER_MATCH,
2545 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002546 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002547 first_elt_done,
2548 &bound_checked_to);
2549 }
2550 TextEmitPass(compiler,
2551 CHARACTER_CLASS_MATCH,
2552 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002553 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002554 first_elt_done,
2555 &bound_checked_to);
2556
ager@chromium.org32912102009-01-16 10:38:43 +00002557 Trace successor_trace(*trace);
2558 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler->ascii());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002559 RecursionCheck rc(compiler);
ager@chromium.org32912102009-01-16 10:38:43 +00002560 return on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002561}
2562
2563
ager@chromium.org32912102009-01-16 10:38:43 +00002564void Trace::AdvanceCurrentPositionInTrace(int by, bool ascii) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002565 ASSERT(by > 0);
2566 // We don't have an instruction for shifting the current character register
2567 // down or for using a shifted value for anything so lets just forget that
2568 // we preloaded any characters into it.
2569 characters_preloaded_ = 0;
2570 // Adjust the offsets of the quick check performed information. This
2571 // information is used to find out what we already determined about the
2572 // characters by means of mask and compare.
2573 quick_check_performed_.Advance(by, ascii);
2574 cp_offset_ += by;
2575 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002576}
2577
2578
2579void TextNode::MakeCaseIndependent() {
2580 int element_count = elms_->length();
2581 for (int i = 0; i < element_count; i++) {
2582 TextElement elm = elms_->at(i);
2583 if (elm.type == TextElement::CHAR_CLASS) {
2584 RegExpCharacterClass* cc = elm.data.u_char_class;
2585 ZoneList<CharacterRange>* ranges = cc->ranges();
2586 int range_count = ranges->length();
2587 for (int i = 0; i < range_count; i++) {
2588 ranges->at(i).AddCaseEquivalents(ranges);
2589 }
2590 }
2591 }
2592}
2593
2594
ager@chromium.org8bb60582008-12-11 12:02:20 +00002595int TextNode::GreedyLoopTextLength() {
2596 TextElement elm = elms_->at(elms_->length() - 1);
2597 if (elm.type == TextElement::CHAR_CLASS) {
2598 return elm.cp_offset + 1;
2599 } else {
2600 return elm.cp_offset + elm.data.u_atom->data().length();
2601 }
2602}
2603
2604
2605// Finds the fixed match length of a sequence of nodes that goes from
2606// this alternative and back to this choice node. If there are variable
2607// length nodes or other complications in the way then return a sentinel
2608// value indicating that a greedy loop cannot be constructed.
2609int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2610 int length = 0;
2611 RegExpNode* node = alternative->node();
2612 // Later we will generate code for all these text nodes using recursion
2613 // so we have to limit the max number.
2614 int recursion_depth = 0;
2615 while (node != this) {
2616 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2617 return kNodeIsTooComplexForGreedyLoops;
2618 }
2619 NodeInfo* info = node->info();
2620 if (info->follows_word_interest ||
2621 info->follows_newline_interest ||
2622 info->follows_start_interest) {
2623 return kNodeIsTooComplexForGreedyLoops;
2624 }
2625 int node_length = node->GreedyLoopTextLength();
2626 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2627 return kNodeIsTooComplexForGreedyLoops;
2628 }
2629 length += node_length;
2630 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2631 node = seq_node->on_success();
2632 }
2633 return length;
2634}
2635
2636
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002637void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2638 ASSERT_EQ(loop_node_, NULL);
2639 AddAlternative(alt);
2640 loop_node_ = alt.node();
2641}
2642
2643
2644void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2645 ASSERT_EQ(continue_node_, NULL);
2646 AddAlternative(alt);
2647 continue_node_ = alt.node();
2648}
2649
2650
ager@chromium.org32912102009-01-16 10:38:43 +00002651bool LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002652 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002653 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002654 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2655 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2656 // Update the counter-based backtracking info on the stack. This is an
2657 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002658 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002659 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002660 macro_assembler->GoTo(trace->loop_label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002661 return true;
2662 }
ager@chromium.org32912102009-01-16 10:38:43 +00002663 ASSERT(trace->stop_node() == NULL);
2664 if (!trace->is_trivial()) {
2665 return trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002666 }
ager@chromium.org32912102009-01-16 10:38:43 +00002667 return ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002668}
2669
2670
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002671int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
2672 int preload_characters = EatsAtLeast(0);
2673#ifdef CAN_READ_UNALIGNED
2674 bool ascii = compiler->ascii();
2675 if (ascii) {
2676 if (preload_characters > 4) preload_characters = 4;
2677 // We can't preload 3 characters because there is no machine instruction
2678 // to do that. We can't just load 4 because we could be reading
2679 // beyond the end of the string, which could cause a memory fault.
2680 if (preload_characters == 3) preload_characters = 2;
2681 } else {
2682 if (preload_characters > 2) preload_characters = 2;
2683 }
2684#else
2685 if (preload_characters > 1) preload_characters = 1;
2686#endif
2687 return preload_characters;
2688}
2689
2690
2691// This class is used when generating the alternatives in a choice node. It
2692// records the way the alternative is being code generated.
2693class AlternativeGeneration: public Malloced {
2694 public:
2695 AlternativeGeneration()
2696 : possible_success(),
2697 expects_preload(false),
2698 after(),
2699 quick_check_details() { }
2700 Label possible_success;
2701 bool expects_preload;
2702 Label after;
2703 QuickCheckDetails quick_check_details;
2704};
2705
2706
2707// Creates a list of AlternativeGenerations. If the list has a reasonable
2708// size then it is on the stack, otherwise the excess is on the heap.
2709class AlternativeGenerationList {
2710 public:
2711 explicit AlternativeGenerationList(int count)
2712 : alt_gens_(count) {
2713 for (int i = 0; i < count && i < kAFew; i++) {
2714 alt_gens_.Add(a_few_alt_gens_ + i);
2715 }
2716 for (int i = kAFew; i < count; i++) {
2717 alt_gens_.Add(new AlternativeGeneration());
2718 }
2719 }
2720 ~AlternativeGenerationList() {
2721 for (int i = 0; i < alt_gens_.length(); i++) {
2722 alt_gens_[i]->possible_success.Unuse();
2723 alt_gens_[i]->after.Unuse();
2724 }
2725 for (int i = kAFew; i < alt_gens_.length(); i++) {
2726 delete alt_gens_[i];
2727 alt_gens_[i] = NULL;
2728 }
2729 }
2730
2731 AlternativeGeneration* at(int i) {
2732 return alt_gens_[i];
2733 }
2734 private:
2735 static const int kAFew = 10;
2736 ZoneList<AlternativeGeneration*> alt_gens_;
2737 AlternativeGeneration a_few_alt_gens_[kAFew];
2738};
2739
2740
2741/* Code generation for choice nodes.
2742 *
2743 * We generate quick checks that do a mask and compare to eliminate a
2744 * choice. If the quick check succeeds then it jumps to the continuation to
2745 * do slow checks and check subsequent nodes. If it fails (the common case)
2746 * it falls through to the next choice.
2747 *
2748 * Here is the desired flow graph. Nodes directly below each other imply
2749 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2750 * 3 doesn't have a quick check so we have to call the slow check.
2751 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2752 * regexp continuation is generated directly after the Sn node, up to the
2753 * next GoTo if we decide to reuse some already generated code. Some
2754 * nodes expect preload_characters to be preloaded into the current
2755 * character register. R nodes do this preloading. Vertices are marked
2756 * F for failures and S for success (possible success in the case of quick
2757 * nodes). L, V, < and > are used as arrow heads.
2758 *
2759 * ----------> R
2760 * |
2761 * V
2762 * Q1 -----> S1
2763 * | S /
2764 * F| /
2765 * | F/
2766 * | /
2767 * | R
2768 * | /
2769 * V L
2770 * Q2 -----> S2
2771 * | S /
2772 * F| /
2773 * | F/
2774 * | /
2775 * | R
2776 * | /
2777 * V L
2778 * S3
2779 * |
2780 * F|
2781 * |
2782 * R
2783 * |
2784 * backtrack V
2785 * <----------Q4
2786 * \ F |
2787 * \ |S
2788 * \ F V
2789 * \-----S4
2790 *
2791 * For greedy loops we reverse our expectation and expect to match rather
2792 * than fail. Therefore we want the loop code to look like this (U is the
2793 * unwind code that steps back in the greedy loop). The following alternatives
2794 * look the same as above.
2795 * _____
2796 * / \
2797 * V |
2798 * ----------> S1 |
2799 * /| |
2800 * / |S |
2801 * F/ \_____/
2802 * /
2803 * |<-----------
2804 * | \
2805 * V \
2806 * Q2 ---> S2 \
2807 * | S / |
2808 * F| / |
2809 * | F/ |
2810 * | / |
2811 * | R |
2812 * | / |
2813 * F VL |
2814 * <------U |
2815 * back |S |
2816 * \______________/
2817 */
2818
2819
ager@chromium.org32912102009-01-16 10:38:43 +00002820bool ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002821 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2822 int choice_count = alternatives_->length();
2823#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002824 for (int i = 0; i < choice_count - 1; i++) {
2825 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002826 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002827 int guard_count = (guards == NULL) ? 0 : guards->length();
2828 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002829 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002830 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002831 }
2832#endif
2833
ager@chromium.org32912102009-01-16 10:38:43 +00002834 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002835 if (limit_result == DONE) return true;
2836 if (limit_result == FAIL) return false;
2837 ASSERT(limit_result == CONTINUE);
2838
2839 RecursionCheck rc(compiler);
2840
ager@chromium.org32912102009-01-16 10:38:43 +00002841 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002842
2843 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2844 bool greedy_loop = false;
2845 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00002846 Trace counter_backtrack_trace;
2847 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002848 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2849 // Here we have special handling for greedy loops containing only text nodes
2850 // and other simple nodes. These are handled by pushing the current
2851 // position on the stack and then incrementing the current position each
2852 // time around the switch. On backtrack we decrement the current position
2853 // and check it against the pushed value. This avoids pushing backtrack
2854 // information for each iteration of the loop, which could take up a lot of
2855 // space.
2856 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00002857 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002858 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00002859 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002860 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00002861 Trace greedy_match_trace;
2862 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002863 Label loop_label;
2864 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00002865 greedy_match_trace.set_stop_node(this);
2866 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002867 bool ok = alternatives_->at(0).node()->Emit(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002868 &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002869 macro_assembler->Bind(&greedy_match_failed);
2870 if (!ok) {
2871 greedy_loop_label.Unuse();
2872 return false;
2873 }
2874 }
2875
2876 Label second_choice; // For use in greedy matches.
2877 macro_assembler->Bind(&second_choice);
2878
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002879 int first_normal_choice = greedy_loop ? 1 : 0;
2880
2881 int preload_characters = CalculatePreloadCharacters(compiler);
2882 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00002883 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002884 bool preload_has_checked_bounds = preload_is_current;
2885
2886 AlternativeGenerationList alt_gens(choice_count);
2887
ager@chromium.org8bb60582008-12-11 12:02:20 +00002888 // For now we just call all choices one after the other. The idea ultimately
2889 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002890 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002891 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002892 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002893 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002894 ZoneList<Guard*>* guards = alternative.guards();
2895 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00002896 Trace new_trace(*current_trace);
2897 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002898 preload_characters :
2899 0);
2900 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00002901 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002902 }
ager@chromium.org32912102009-01-16 10:38:43 +00002903 new_trace.quick_check_performed()->Clear();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002904 alt_gen->expects_preload = preload_is_current;
2905 bool generate_full_check_inline = false;
2906 if (alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002907 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002908 preload_has_checked_bounds,
2909 &alt_gen->possible_success,
2910 &alt_gen->quick_check_details,
2911 i < choice_count - 1)) {
2912 // Quick check was generated for this choice.
2913 preload_is_current = true;
2914 preload_has_checked_bounds = true;
2915 // On the last choice in the ChoiceNode we generated the quick
2916 // check to fall through on possible success. So now we need to
2917 // generate the full check inline.
2918 if (i == choice_count - 1) {
2919 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002920 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2921 new_trace.set_characters_preloaded(preload_characters);
2922 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002923 generate_full_check_inline = true;
2924 }
2925 } else {
2926 // No quick check was generated. Put the full code here.
2927 // If this is not the first choice then there could be slow checks from
2928 // previous cases that go here when they fail. There's no reason to
2929 // insist that they preload characters since the slow check we are about
2930 // to generate probably can't use it.
2931 if (i != first_normal_choice) {
2932 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002933 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002934 }
2935 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00002936 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002937 }
2938 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002939 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002940 if (generate_full_check_inline) {
2941 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002942 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002943 }
ager@chromium.org32912102009-01-16 10:38:43 +00002944 if (!alternative.node()->Emit(compiler, &new_trace)) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002945 greedy_loop_label.Unuse();
2946 return false;
2947 }
2948 preload_is_current = false;
2949 }
2950 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002951 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002952 if (greedy_loop) {
2953 macro_assembler->Bind(&greedy_loop_label);
2954 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00002955 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002956 // Otherwise try the second priority at an earlier position.
2957 macro_assembler->AdvanceCurrentPosition(-text_length);
2958 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002959 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002960 // At this point we need to generate slow checks for the alternatives where
2961 // the quick check was inlined. We can recognize these because the associated
2962 // label was bound.
2963 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2964 AlternativeGeneration* alt_gen = alt_gens.at(i);
2965 if (!EmitOutOfLineContinuation(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002966 current_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002967 alternatives_->at(i),
2968 alt_gen,
2969 preload_characters,
2970 alt_gens.at(i + 1)->expects_preload)) {
2971 return false;
2972 }
2973 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002974 return true;
2975}
2976
2977
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002978bool ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002979 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002980 GuardedAlternative alternative,
2981 AlternativeGeneration* alt_gen,
2982 int preload_characters,
2983 bool next_expects_preload) {
2984 if (!alt_gen->possible_success.is_linked()) return true;
2985
2986 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2987 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002988 Trace out_of_line_trace(*trace);
2989 out_of_line_trace.set_characters_preloaded(preload_characters);
2990 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002991 ZoneList<Guard*>* guards = alternative.guards();
2992 int guard_count = (guards == NULL) ? 0 : guards->length();
2993 if (next_expects_preload) {
2994 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00002995 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002996 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002997 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002998 }
ager@chromium.org32912102009-01-16 10:38:43 +00002999 bool ok = alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003000 macro_assembler->Bind(&reload_current_char);
3001 // Reload the current character, since the next quick check expects that.
3002 // We don't need to check bounds here because we only get into this
3003 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00003004 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003005 NULL,
3006 false,
3007 preload_characters);
3008 macro_assembler->GoTo(&(alt_gen->after));
3009 return ok;
3010 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003011 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003012 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003013 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003014 }
ager@chromium.org32912102009-01-16 10:38:43 +00003015 return alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003016 }
3017}
3018
3019
ager@chromium.org32912102009-01-16 10:38:43 +00003020bool ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003021 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003022 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003023 if (limit_result == DONE) return true;
3024 if (limit_result == FAIL) return false;
3025 ASSERT(limit_result == CONTINUE);
3026
3027 RecursionCheck rc(compiler);
3028
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003029 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003030 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00003031 Trace::DeferredCapture
3032 new_capture(data_.u_position_register.reg, trace);
3033 Trace new_trace = *trace;
3034 new_trace.add_action(&new_capture);
3035 return on_success()->Emit(compiler, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003036 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003037 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003038 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003039 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00003040 Trace new_trace = *trace;
3041 new_trace.add_action(&new_increment);
3042 return on_success()->Emit(compiler, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003043 }
3044 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003045 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003046 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00003047 Trace new_trace = *trace;
3048 new_trace.add_action(&new_set);
3049 return on_success()->Emit(compiler, &new_trace);
3050 }
3051 case CLEAR_CAPTURES: {
3052 Trace::DeferredClearCaptures
3053 new_capture(Interval(data_.u_clear_captures.range_from,
3054 data_.u_clear_captures.range_to));
3055 Trace new_trace = *trace;
3056 new_trace.add_action(&new_capture);
3057 return on_success()->Emit(compiler, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003058 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003059 case BEGIN_SUBMATCH:
ager@chromium.org32912102009-01-16 10:38:43 +00003060 if (!trace->is_trivial()) return trace->Flush(compiler, this);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003061 assembler->WriteCurrentPositionToRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003062 data_.u_submatch.current_position_register, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003063 assembler->WriteStackPointerToRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003064 data_.u_submatch.stack_pointer_register);
ager@chromium.org32912102009-01-16 10:38:43 +00003065 return on_success()->Emit(compiler, trace);
3066 case EMPTY_MATCH_CHECK: {
3067 int start_pos_reg = data_.u_empty_match_check.start_register;
3068 int stored_pos = 0;
3069 int rep_reg = data_.u_empty_match_check.repetition_register;
3070 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3071 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3072 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3073 // If we know we haven't advanced and there is no minimum we
3074 // can just backtrack immediately.
3075 assembler->GoTo(trace->backtrack());
3076 return true;
3077 } else if (know_dist && stored_pos < trace->cp_offset()) {
3078 // If we know we've advanced we can generate the continuation
3079 // immediately.
3080 return on_success()->Emit(compiler, trace);
3081 }
3082 if (!trace->is_trivial()) return trace->Flush(compiler, this);
3083 Label skip_empty_check;
3084 // If we have a minimum number of repetitions we check the current
3085 // number first and skip the empty check if it's not enough.
3086 if (has_minimum) {
3087 int limit = data_.u_empty_match_check.repetition_limit;
3088 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3089 }
3090 // If the match is empty we bail out, otherwise we fall through
3091 // to the on-success continuation.
3092 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3093 trace->backtrack());
3094 assembler->Bind(&skip_empty_check);
3095 return on_success()->Emit(compiler, trace);
3096 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003097 case POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.org32912102009-01-16 10:38:43 +00003098 if (!trace->is_trivial()) return trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003099 // TODO(erikcorry): Implement support.
3100 if (info()->follows_word_interest ||
3101 info()->follows_newline_interest ||
3102 info()->follows_start_interest) {
3103 return false;
3104 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003105 if (info()->at_end) {
3106 Label at_end;
3107 // Load current character jumps to the label if we are beyond the string
3108 // end.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003109 assembler->LoadCurrentCharacter(0, &at_end);
ager@chromium.org32912102009-01-16 10:38:43 +00003110 assembler->GoTo(trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003111 assembler->Bind(&at_end);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003112 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003113 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003114 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003115 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003116 data_.u_submatch.stack_pointer_register);
ager@chromium.org32912102009-01-16 10:38:43 +00003117 return on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003118 default:
3119 UNREACHABLE();
3120 return false;
3121 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003122}
3123
3124
ager@chromium.org32912102009-01-16 10:38:43 +00003125bool BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003126 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003127 if (!trace->is_trivial()) {
3128 return trace->Flush(compiler, this);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003129 }
3130
ager@chromium.org32912102009-01-16 10:38:43 +00003131 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003132 if (limit_result == DONE) return true;
3133 if (limit_result == FAIL) return false;
3134 ASSERT(limit_result == CONTINUE);
3135
3136 RecursionCheck rc(compiler);
3137
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003138 ASSERT_EQ(start_reg_ + 1, end_reg_);
3139 if (info()->at_end) {
3140 // If we are constrained to match at the end of the input then succeed
3141 // iff the back reference is empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003142 assembler->CheckNotRegistersEqual(start_reg_,
3143 end_reg_,
ager@chromium.org32912102009-01-16 10:38:43 +00003144 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003145 } else {
3146 if (compiler->ignore_case()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003147 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
ager@chromium.org32912102009-01-16 10:38:43 +00003148 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003149 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003150 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003151 }
3152 }
ager@chromium.org32912102009-01-16 10:38:43 +00003153 return on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003154}
3155
3156
3157// -------------------------------------------------------------------
3158// Dot/dotty output
3159
3160
3161#ifdef DEBUG
3162
3163
3164class DotPrinter: public NodeVisitor {
3165 public:
3166 explicit DotPrinter(bool ignore_case)
3167 : ignore_case_(ignore_case),
3168 stream_(&alloc_) { }
3169 void PrintNode(const char* label, RegExpNode* node);
3170 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003171 void PrintAttributes(RegExpNode* from);
3172 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003173 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003174#define DECLARE_VISIT(Type) \
3175 virtual void Visit##Type(Type##Node* that);
3176FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3177#undef DECLARE_VISIT
3178 private:
3179 bool ignore_case_;
3180 HeapStringAllocator alloc_;
3181 StringStream stream_;
3182};
3183
3184
3185void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3186 stream()->Add("digraph G {\n graph [label=\"");
3187 for (int i = 0; label[i]; i++) {
3188 switch (label[i]) {
3189 case '\\':
3190 stream()->Add("\\\\");
3191 break;
3192 case '"':
3193 stream()->Add("\"");
3194 break;
3195 default:
3196 stream()->Put(label[i]);
3197 break;
3198 }
3199 }
3200 stream()->Add("\"];\n");
3201 Visit(node);
3202 stream()->Add("}\n");
3203 printf("%s", *(stream()->ToCString()));
3204}
3205
3206
3207void DotPrinter::Visit(RegExpNode* node) {
3208 if (node->info()->visited) return;
3209 node->info()->visited = true;
3210 node->Accept(this);
3211}
3212
3213
3214void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003215 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3216 Visit(on_failure);
3217}
3218
3219
3220class TableEntryBodyPrinter {
3221 public:
3222 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3223 : stream_(stream), choice_(choice) { }
3224 void Call(uc16 from, DispatchTable::Entry entry) {
3225 OutSet* out_set = entry.out_set();
3226 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3227 if (out_set->Get(i)) {
3228 stream()->Add(" n%p:s%io%i -> n%p;\n",
3229 choice(),
3230 from,
3231 i,
3232 choice()->alternatives()->at(i).node());
3233 }
3234 }
3235 }
3236 private:
3237 StringStream* stream() { return stream_; }
3238 ChoiceNode* choice() { return choice_; }
3239 StringStream* stream_;
3240 ChoiceNode* choice_;
3241};
3242
3243
3244class TableEntryHeaderPrinter {
3245 public:
3246 explicit TableEntryHeaderPrinter(StringStream* stream)
3247 : first_(true), stream_(stream) { }
3248 void Call(uc16 from, DispatchTable::Entry entry) {
3249 if (first_) {
3250 first_ = false;
3251 } else {
3252 stream()->Add("|");
3253 }
3254 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3255 OutSet* out_set = entry.out_set();
3256 int priority = 0;
3257 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3258 if (out_set->Get(i)) {
3259 if (priority > 0) stream()->Add("|");
3260 stream()->Add("<s%io%i> %i", from, i, priority);
3261 priority++;
3262 }
3263 }
3264 stream()->Add("}}");
3265 }
3266 private:
3267 bool first_;
3268 StringStream* stream() { return stream_; }
3269 StringStream* stream_;
3270};
3271
3272
3273class AttributePrinter {
3274 public:
3275 explicit AttributePrinter(DotPrinter* out)
3276 : out_(out), first_(true) { }
3277 void PrintSeparator() {
3278 if (first_) {
3279 first_ = false;
3280 } else {
3281 out_->stream()->Add("|");
3282 }
3283 }
3284 void PrintBit(const char* name, bool value) {
3285 if (!value) return;
3286 PrintSeparator();
3287 out_->stream()->Add("{%s}", name);
3288 }
3289 void PrintPositive(const char* name, int value) {
3290 if (value < 0) return;
3291 PrintSeparator();
3292 out_->stream()->Add("{%s|%x}", name, value);
3293 }
3294 private:
3295 DotPrinter* out_;
3296 bool first_;
3297};
3298
3299
3300void DotPrinter::PrintAttributes(RegExpNode* that) {
3301 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3302 "margin=0.1, fontsize=10, label=\"{",
3303 that);
3304 AttributePrinter printer(this);
3305 NodeInfo* info = that->info();
3306 printer.PrintBit("NI", info->follows_newline_interest);
3307 printer.PrintBit("WI", info->follows_word_interest);
3308 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003309 Label* label = that->label();
3310 if (label->is_bound())
3311 printer.PrintPositive("@", label->pos());
3312 stream()->Add("}\"];\n");
3313 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3314 "arrowhead=none];\n", that, that);
3315}
3316
3317
3318static const bool kPrintDispatchTable = false;
3319void DotPrinter::VisitChoice(ChoiceNode* that) {
3320 if (kPrintDispatchTable) {
3321 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3322 TableEntryHeaderPrinter header_printer(stream());
3323 that->GetTable(ignore_case_)->ForEach(&header_printer);
3324 stream()->Add("\"]\n", that);
3325 PrintAttributes(that);
3326 TableEntryBodyPrinter body_printer(stream(), that);
3327 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003328 } else {
3329 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3330 for (int i = 0; i < that->alternatives()->length(); i++) {
3331 GuardedAlternative alt = that->alternatives()->at(i);
3332 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3333 }
3334 }
3335 for (int i = 0; i < that->alternatives()->length(); i++) {
3336 GuardedAlternative alt = that->alternatives()->at(i);
3337 alt.node()->Accept(this);
3338 }
3339}
3340
3341
3342void DotPrinter::VisitText(TextNode* that) {
3343 stream()->Add(" n%p [label=\"", that);
3344 for (int i = 0; i < that->elements()->length(); i++) {
3345 if (i > 0) stream()->Add(" ");
3346 TextElement elm = that->elements()->at(i);
3347 switch (elm.type) {
3348 case TextElement::ATOM: {
3349 stream()->Add("'%w'", elm.data.u_atom->data());
3350 break;
3351 }
3352 case TextElement::CHAR_CLASS: {
3353 RegExpCharacterClass* node = elm.data.u_char_class;
3354 stream()->Add("[");
3355 if (node->is_negated())
3356 stream()->Add("^");
3357 for (int j = 0; j < node->ranges()->length(); j++) {
3358 CharacterRange range = node->ranges()->at(j);
3359 stream()->Add("%k-%k", range.from(), range.to());
3360 }
3361 stream()->Add("]");
3362 break;
3363 }
3364 default:
3365 UNREACHABLE();
3366 }
3367 }
3368 stream()->Add("\", shape=box, peripheries=2];\n");
3369 PrintAttributes(that);
3370 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3371 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003372}
3373
3374
3375void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3376 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3377 that,
3378 that->start_register(),
3379 that->end_register());
3380 PrintAttributes(that);
3381 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3382 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003383}
3384
3385
3386void DotPrinter::VisitEnd(EndNode* that) {
3387 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3388 PrintAttributes(that);
3389}
3390
3391
3392void DotPrinter::VisitAction(ActionNode* that) {
3393 stream()->Add(" n%p [", that);
3394 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003395 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003396 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3397 that->data_.u_store_register.reg,
3398 that->data_.u_store_register.value);
3399 break;
3400 case ActionNode::INCREMENT_REGISTER:
3401 stream()->Add("label=\"$%i++\", shape=octagon",
3402 that->data_.u_increment_register.reg);
3403 break;
3404 case ActionNode::STORE_POSITION:
3405 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3406 that->data_.u_position_register.reg);
3407 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003408 case ActionNode::BEGIN_SUBMATCH:
3409 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3410 that->data_.u_submatch.current_position_register);
3411 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003412 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003413 stream()->Add("label=\"escape\", shape=septagon");
3414 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003415 case ActionNode::EMPTY_MATCH_CHECK:
3416 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3417 that->data_.u_empty_match_check.start_register,
3418 that->data_.u_empty_match_check.repetition_register,
3419 that->data_.u_empty_match_check.repetition_limit);
3420 break;
3421 case ActionNode::CLEAR_CAPTURES: {
3422 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3423 that->data_.u_clear_captures.range_from,
3424 that->data_.u_clear_captures.range_to);
3425 break;
3426 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003427 }
3428 stream()->Add("];\n");
3429 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003430 RegExpNode* successor = that->on_success();
3431 stream()->Add(" n%p -> n%p;\n", that, successor);
3432 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003433}
3434
3435
3436class DispatchTableDumper {
3437 public:
3438 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3439 void Call(uc16 key, DispatchTable::Entry entry);
3440 StringStream* stream() { return stream_; }
3441 private:
3442 StringStream* stream_;
3443};
3444
3445
3446void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3447 stream()->Add("[%k-%k]: {", key, entry.to());
3448 OutSet* set = entry.out_set();
3449 bool first = true;
3450 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3451 if (set->Get(i)) {
3452 if (first) {
3453 first = false;
3454 } else {
3455 stream()->Add(", ");
3456 }
3457 stream()->Add("%i", i);
3458 }
3459 }
3460 stream()->Add("}\n");
3461}
3462
3463
3464void DispatchTable::Dump() {
3465 HeapStringAllocator alloc;
3466 StringStream stream(&alloc);
3467 DispatchTableDumper dumper(&stream);
3468 tree()->ForEach(&dumper);
3469 OS::PrintError("%s", *stream.ToCString());
3470}
3471
3472
3473void RegExpEngine::DotPrint(const char* label,
3474 RegExpNode* node,
3475 bool ignore_case) {
3476 DotPrinter printer(ignore_case);
3477 printer.PrintNode(label, node);
3478}
3479
3480
3481#endif // DEBUG
3482
3483
3484// -------------------------------------------------------------------
3485// Tree to graph conversion
3486
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003487static const int kSpaceRangeCount = 20;
3488static const int kSpaceRangeAsciiCount = 4;
3489static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3490 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3491 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3492
3493static const int kWordRangeCount = 8;
3494static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3495 '_', 'a', 'z' };
3496
3497static const int kDigitRangeCount = 2;
3498static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3499
3500static const int kLineTerminatorRangeCount = 6;
3501static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3502 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003503
3504RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003505 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003506 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3507 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003508 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003509}
3510
3511
3512RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003513 RegExpNode* on_success) {
3514 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003515}
3516
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003517static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3518 const uc16* special_class,
3519 int length) {
3520 ASSERT(ranges->length() != 0);
3521 ASSERT(length != 0);
3522 ASSERT(special_class[0] != 0);
3523 if (ranges->length() != (length >> 1) + 1) {
3524 return false;
3525 }
3526 CharacterRange range = ranges->at(0);
3527 if (range.from() != 0) {
3528 return false;
3529 }
3530 for (int i = 0; i < length; i += 2) {
3531 if (special_class[i] != (range.to() + 1)) {
3532 return false;
3533 }
3534 range = ranges->at((i >> 1) + 1);
3535 if (special_class[i+1] != range.from() - 1) {
3536 return false;
3537 }
3538 }
3539 if (range.to() != 0xffff) {
3540 return false;
3541 }
3542 return true;
3543}
3544
3545
3546static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3547 const uc16* special_class,
3548 int length) {
3549 if (ranges->length() * 2 != length) {
3550 return false;
3551 }
3552 for (int i = 0; i < length; i += 2) {
3553 CharacterRange range = ranges->at(i >> 1);
3554 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3555 return false;
3556 }
3557 }
3558 return true;
3559}
3560
3561
3562bool RegExpCharacterClass::is_standard() {
3563 // TODO(lrn): Remove need for this function, by not throwing away information
3564 // along the way.
3565 if (is_negated_) {
3566 return false;
3567 }
3568 if (set_.is_standard()) {
3569 return true;
3570 }
3571 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3572 set_.set_standard_set_type('s');
3573 return true;
3574 }
3575 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3576 set_.set_standard_set_type('S');
3577 return true;
3578 }
3579 if (CompareInverseRanges(set_.ranges(),
3580 kLineTerminatorRanges,
3581 kLineTerminatorRangeCount)) {
3582 set_.set_standard_set_type('.');
3583 return true;
3584 }
3585 return false;
3586}
3587
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003588
3589RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003590 RegExpNode* on_success) {
3591 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003592}
3593
3594
3595RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003596 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003597 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3598 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003599 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003600 for (int i = 0; i < length; i++) {
3601 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003602 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003603 result->AddAlternative(alternative);
3604 }
3605 return result;
3606}
3607
3608
3609RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003610 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003611 return ToNode(min(),
3612 max(),
3613 is_greedy(),
3614 body(),
3615 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003616 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003617}
3618
3619
3620RegExpNode* RegExpQuantifier::ToNode(int min,
3621 int max,
3622 bool is_greedy,
3623 RegExpTree* body,
3624 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003625 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003626 // x{f, t} becomes this:
3627 //
3628 // (r++)<-.
3629 // | `
3630 // | (x)
3631 // v ^
3632 // (r=0)-->(?)---/ [if r < t]
3633 // |
3634 // [if r >= f] \----> ...
3635 //
3636 //
3637 // TODO(someone): clear captures on repetition and handle empty
3638 // matches.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003639
3640 // 15.10.2.5 RepeatMatcher algorithm.
3641 // The parser has already eliminated the case where max is 0. In the case
3642 // where max_match is zero the parser has removed the quantifier if min was
3643 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3644
3645 // If we know that we cannot match zero length then things are a little
3646 // simpler since we don't need to make the special zero length match check
3647 // from step 2.1. If the min and max are small we can unroll a little in
3648 // this case.
3649 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3650 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3651 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003652 bool body_can_be_empty = (body->min_match() == 0);
3653 int body_start_reg = RegExpCompiler::kNoRegister;
3654 Interval capture_registers = body->CaptureRegisters();
3655 bool needs_capture_clearing = !capture_registers.is_empty();
3656 if (body_can_be_empty) {
3657 body_start_reg = compiler->AllocateRegister();
3658 } else if (!needs_capture_clearing) {
3659 // Only unroll if there are no captures and the body can't be
3660 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003661 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3662 int new_max = (max == kInfinity) ? max : max - min;
3663 // Recurse once to get the loop or optional matches after the fixed ones.
3664 RegExpNode* answer =
3665 ToNode(0, new_max, is_greedy, body, compiler, on_success);
3666 // Unroll the forced matches from 0 to min. This can cause chains of
3667 // TextNodes (which the parser does not generate). These should be
3668 // combined if it turns out they hinder good code generation.
3669 for (int i = 0; i < min; i++) {
3670 answer = body->ToNode(compiler, answer);
3671 }
3672 return answer;
3673 }
3674 if (max <= kMaxUnrolledMaxMatches) {
3675 ASSERT(min == 0);
3676 // Unroll the optional matches up to max.
3677 RegExpNode* answer = on_success;
3678 for (int i = 0; i < max; i++) {
3679 ChoiceNode* alternation = new ChoiceNode(2);
3680 if (is_greedy) {
3681 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3682 answer)));
3683 alternation->AddAlternative(GuardedAlternative(on_success));
3684 } else {
3685 alternation->AddAlternative(GuardedAlternative(on_success));
3686 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3687 answer)));
3688 }
3689 answer = alternation;
3690 }
3691 return answer;
3692 }
3693 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003694 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003695 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003696 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003697 int reg_ctr = needs_counter
3698 ? compiler->AllocateRegister()
3699 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003700 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003701 RegExpNode* loop_return = needs_counter
3702 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3703 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003704 if (body_can_be_empty) {
3705 // If the body can be empty we need to check if it was and then
3706 // backtrack.
3707 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3708 reg_ctr,
3709 min,
3710 loop_return);
3711 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003712 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003713 if (body_can_be_empty) {
3714 // If the body can be empty we need to store the start position
3715 // so we can bail out if it was empty.
3716 body_node = ActionNode::StorePosition(body_start_reg, body_node);
3717 }
3718 if (needs_capture_clearing) {
3719 // Before entering the body of this loop we need to clear captures.
3720 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3721 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003722 GuardedAlternative body_alt(body_node);
3723 if (has_max) {
3724 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3725 body_alt.AddGuard(body_guard);
3726 }
3727 GuardedAlternative rest_alt(on_success);
3728 if (has_min) {
3729 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3730 rest_alt.AddGuard(rest_guard);
3731 }
3732 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003733 center->AddLoopAlternative(body_alt);
3734 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003735 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003736 center->AddContinueAlternative(rest_alt);
3737 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003738 }
3739 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003740 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003741 } else {
3742 return center;
3743 }
3744}
3745
3746
3747RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003748 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003749 NodeInfo info;
3750 switch (type()) {
3751 case START_OF_LINE:
3752 info.follows_newline_interest = true;
3753 break;
3754 case START_OF_INPUT:
3755 info.follows_start_interest = true;
3756 break;
3757 case BOUNDARY: case NON_BOUNDARY:
3758 info.follows_word_interest = true;
3759 break;
3760 case END_OF_INPUT:
3761 info.at_end = true;
3762 break;
3763 case END_OF_LINE:
3764 // This is wrong but has the effect of making the compiler abort.
3765 info.at_end = true;
3766 }
3767 return on_success->PropagateForward(&info);
3768}
3769
3770
3771RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003772 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003773 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3774 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003775 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003776}
3777
3778
3779RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003780 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003781 return on_success;
3782}
3783
3784
3785RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003786 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003787 int stack_pointer_register = compiler->AllocateRegister();
3788 int position_register = compiler->AllocateRegister();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003789 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003790 if (is_positive()) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003791 return ActionNode::BeginSubmatch(
3792 stack_pointer_register,
3793 position_register,
3794 body()->ToNode(
3795 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003796 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3797 position_register,
3798 on_success)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003799 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003800 // We use a ChoiceNode for a negative lookahead because it has most of
3801 // the characteristics we need. It has the body of the lookahead as its
3802 // first alternative and the expression after the lookahead of the second
3803 // alternative. If the first alternative succeeds then the
3804 // NegativeSubmatchSuccess will unwind the stack including everything the
3805 // choice node set up and backtrack. If the first alternative fails then
3806 // the second alternative is tried, which is exactly the desired result
3807 // for a negative lookahead. In the case where the dispatch table
3808 // determines that the first alternative cannot match we will save time
3809 // by not trying it. Things are not quite so well-optimized if the
3810 // dispatch table determines that the second alternative cannot match.
3811 // In this case we could optimize by immediately backtracking.
3812 ChoiceNode* choice_node = new ChoiceNode(2);
3813 GuardedAlternative body_alt(
3814 body()->ToNode(
3815 compiler,
3816 success = new NegativeSubmatchSuccess(stack_pointer_register,
3817 position_register)));
3818 choice_node->AddAlternative(body_alt);
3819 choice_node->AddAlternative(GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003820 return ActionNode::BeginSubmatch(stack_pointer_register,
3821 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003822 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003823 }
3824}
3825
3826
3827RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003828 RegExpNode* on_success) {
3829 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003830}
3831
3832
3833RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3834 int index,
3835 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003836 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003837 int start_reg = RegExpCapture::StartRegister(index);
3838 int end_reg = RegExpCapture::EndRegister(index);
3839 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003840 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003841 return ActionNode::StorePosition(start_reg, body_node);
3842}
3843
3844
3845RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003846 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003847 ZoneList<RegExpTree*>* children = nodes();
3848 RegExpNode* current = on_success;
3849 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003850 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003851 }
3852 return current;
3853}
3854
3855
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003856static void AddClass(const uc16* elmv,
3857 int elmc,
3858 ZoneList<CharacterRange>* ranges) {
3859 for (int i = 0; i < elmc; i += 2) {
3860 ASSERT(elmv[i] <= elmv[i + 1]);
3861 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3862 }
3863}
3864
3865
3866static void AddClassNegated(const uc16 *elmv,
3867 int elmc,
3868 ZoneList<CharacterRange>* ranges) {
3869 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003870 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003871 uc16 last = 0x0000;
3872 for (int i = 0; i < elmc; i += 2) {
3873 ASSERT(last <= elmv[i] - 1);
3874 ASSERT(elmv[i] <= elmv[i + 1]);
3875 ranges->Add(CharacterRange(last, elmv[i] - 1));
3876 last = elmv[i + 1] + 1;
3877 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003878 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003879}
3880
3881
3882void CharacterRange::AddClassEscape(uc16 type,
3883 ZoneList<CharacterRange>* ranges) {
3884 switch (type) {
3885 case 's':
3886 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3887 break;
3888 case 'S':
3889 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3890 break;
3891 case 'w':
3892 AddClass(kWordRanges, kWordRangeCount, ranges);
3893 break;
3894 case 'W':
3895 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3896 break;
3897 case 'd':
3898 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3899 break;
3900 case 'D':
3901 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3902 break;
3903 case '.':
3904 AddClassNegated(kLineTerminatorRanges,
3905 kLineTerminatorRangeCount,
3906 ranges);
3907 break;
3908 // This is not a character range as defined by the spec but a
3909 // convenient shorthand for a character class that matches any
3910 // character.
3911 case '*':
3912 ranges->Add(CharacterRange::Everything());
3913 break;
3914 default:
3915 UNREACHABLE();
3916 }
3917}
3918
3919
3920Vector<const uc16> CharacterRange::GetWordBounds() {
3921 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3922}
3923
3924
3925class CharacterRangeSplitter {
3926 public:
3927 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3928 ZoneList<CharacterRange>** excluded)
3929 : included_(included),
3930 excluded_(excluded) { }
3931 void Call(uc16 from, DispatchTable::Entry entry);
3932
3933 static const int kInBase = 0;
3934 static const int kInOverlay = 1;
3935
3936 private:
3937 ZoneList<CharacterRange>** included_;
3938 ZoneList<CharacterRange>** excluded_;
3939};
3940
3941
3942void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3943 if (!entry.out_set()->Get(kInBase)) return;
3944 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3945 ? included_
3946 : excluded_;
3947 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3948 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3949}
3950
3951
3952void CharacterRange::Split(ZoneList<CharacterRange>* base,
3953 Vector<const uc16> overlay,
3954 ZoneList<CharacterRange>** included,
3955 ZoneList<CharacterRange>** excluded) {
3956 ASSERT_EQ(NULL, *included);
3957 ASSERT_EQ(NULL, *excluded);
3958 DispatchTable table;
3959 for (int i = 0; i < base->length(); i++)
3960 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3961 for (int i = 0; i < overlay.length(); i += 2) {
3962 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3963 CharacterRangeSplitter::kInOverlay);
3964 }
3965 CharacterRangeSplitter callback(included, excluded);
3966 table.ForEach(&callback);
3967}
3968
3969
3970void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3971 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3972 if (IsSingleton()) {
3973 // If this is a singleton we just expand the one character.
3974 int length = uncanonicalize.get(from(), '\0', chars);
3975 for (int i = 0; i < length; i++) {
3976 uc32 chr = chars[i];
3977 if (chr != from()) {
3978 ranges->Add(CharacterRange::Singleton(chars[i]));
3979 }
3980 }
3981 } else if (from() <= kRangeCanonicalizeMax &&
3982 to() <= kRangeCanonicalizeMax) {
3983 // If this is a range we expand the characters block by block,
3984 // expanding contiguous subranges (blocks) one at a time.
3985 // The approach is as follows. For a given start character we
3986 // look up the block that contains it, for instance 'a' if the
3987 // start character is 'c'. A block is characterized by the property
3988 // that all characters uncanonicalize in the same way as the first
3989 // element, except that each entry in the result is incremented
3990 // by the distance from the first element. So a-z is a block
3991 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3992 // uncanonicalizes to ['a' + k, 'A' + k].
3993 // Once we've found the start point we look up its uncanonicalization
3994 // and produce a range for each element. For instance for [c-f]
3995 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3996 // add a range if it is not already contained in the input, so [c-f]
3997 // will be skipped but [C-F] will be added. If this range is not
3998 // completely contained in a block we do this for all the blocks
3999 // covered by the range.
4000 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4001 // First, look up the block that contains the 'from' character.
4002 int length = canonrange.get(from(), '\0', range);
4003 if (length == 0) {
4004 range[0] = from();
4005 } else {
4006 ASSERT_EQ(1, length);
4007 }
4008 int pos = from();
4009 // The start of the current block. Note that except for the first
4010 // iteration 'start' is always equal to 'pos'.
4011 int start;
4012 // If it is not the start point of a block the entry contains the
4013 // offset of the character from the start point.
4014 if ((range[0] & kStartMarker) == 0) {
4015 start = pos - range[0];
4016 } else {
4017 start = pos;
4018 }
4019 // Then we add the ranges on at a time, incrementing the current
4020 // position to be after the last block each time. The position
4021 // always points to the start of a block.
4022 while (pos < to()) {
4023 length = canonrange.get(start, '\0', range);
4024 if (length == 0) {
4025 range[0] = start;
4026 } else {
4027 ASSERT_EQ(1, length);
4028 }
4029 ASSERT((range[0] & kStartMarker) != 0);
4030 // The start point of a block contains the distance to the end
4031 // of the range.
4032 int block_end = start + (range[0] & kPayloadMask) - 1;
4033 int end = (block_end > to()) ? to() : block_end;
4034 length = uncanonicalize.get(start, '\0', range);
4035 for (int i = 0; i < length; i++) {
4036 uc32 c = range[i];
4037 uc16 range_from = c + (pos - start);
4038 uc16 range_to = c + (end - start);
4039 if (!(from() <= range_from && range_to <= to())) {
4040 ranges->Add(CharacterRange(range_from, range_to));
4041 }
4042 }
4043 start = pos = block_end + 1;
4044 }
4045 } else {
4046 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
4047 }
4048}
4049
4050
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004051ZoneList<CharacterRange>* CharacterSet::ranges() {
4052 if (ranges_ == NULL) {
4053 ranges_ = new ZoneList<CharacterRange>(2);
4054 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4055 }
4056 return ranges_;
4057}
4058
4059
4060
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004061// -------------------------------------------------------------------
4062// Interest propagation
4063
4064
4065RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4066 for (int i = 0; i < siblings_.length(); i++) {
4067 RegExpNode* sibling = siblings_.Get(i);
4068 if (sibling->info()->Matches(info))
4069 return sibling;
4070 }
4071 return NULL;
4072}
4073
4074
4075RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4076 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004077 siblings_.Ensure(this);
4078 RegExpNode* result = TryGetSibling(info);
4079 if (result != NULL) return result;
4080 result = this->Clone();
4081 NodeInfo* new_info = result->info();
4082 new_info->ResetCompilationState();
4083 new_info->AddFromPreceding(info);
4084 AddSibling(result);
4085 *cloned = true;
4086 return result;
4087}
4088
4089
4090template <class C>
4091static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4092 NodeInfo full_info(*node->info());
4093 full_info.AddFromPreceding(info);
4094 bool cloned = false;
4095 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4096}
4097
4098
4099RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
4100 NodeInfo full_info(*this->info());
4101 full_info.AddFromPreceding(info);
4102 bool cloned = false;
4103 ActionNode* action = EnsureSibling(this, &full_info, &cloned);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004104 action->set_on_success(action->on_success()->PropagateForward(info));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004105 return action;
4106}
4107
4108
4109RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
4110 NodeInfo full_info(*this->info());
4111 full_info.AddFromPreceding(info);
4112 bool cloned = false;
4113 ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
4114 if (cloned) {
4115 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
4116 int count = old_alternatives->length();
4117 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
4118 for (int i = 0; i < count; i++) {
4119 GuardedAlternative alternative = old_alternatives->at(i);
4120 alternative.set_node(alternative.node()->PropagateForward(info));
4121 choice->alternatives()->Add(alternative);
4122 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004123 }
4124 return choice;
4125}
4126
4127
4128RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
4129 return PropagateToEndpoint(this, info);
4130}
4131
4132
4133RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
4134 NodeInfo full_info(*this->info());
4135 full_info.AddFromPreceding(info);
4136 bool cloned = false;
4137 BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
4138 if (cloned) {
4139 // TODO(erikcorry): A back reference has to have two successors (by default
4140 // the same node). The first is used if the back reference matches a non-
4141 // empty back reference, the second if it matches an empty one. This
4142 // doesn't matter for at_end, which is the only one implemented right now,
4143 // but it will matter for other pieces of info.
4144 back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
4145 }
4146 return back_ref;
4147}
4148
4149
4150RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
4151 return PropagateToEndpoint(this, info);
4152}
4153
4154
4155// -------------------------------------------------------------------
4156// Splay tree
4157
4158
4159OutSet* OutSet::Extend(unsigned value) {
4160 if (Get(value))
4161 return this;
4162 if (successors() != NULL) {
4163 for (int i = 0; i < successors()->length(); i++) {
4164 OutSet* successor = successors()->at(i);
4165 if (successor->Get(value))
4166 return successor;
4167 }
4168 } else {
4169 successors_ = new ZoneList<OutSet*>(2);
4170 }
4171 OutSet* result = new OutSet(first_, remaining_);
4172 result->Set(value);
4173 successors()->Add(result);
4174 return result;
4175}
4176
4177
4178void OutSet::Set(unsigned value) {
4179 if (value < kFirstLimit) {
4180 first_ |= (1 << value);
4181 } else {
4182 if (remaining_ == NULL)
4183 remaining_ = new ZoneList<unsigned>(1);
4184 if (remaining_->is_empty() || !remaining_->Contains(value))
4185 remaining_->Add(value);
4186 }
4187}
4188
4189
4190bool OutSet::Get(unsigned value) {
4191 if (value < kFirstLimit) {
4192 return (first_ & (1 << value)) != 0;
4193 } else if (remaining_ == NULL) {
4194 return false;
4195 } else {
4196 return remaining_->Contains(value);
4197 }
4198}
4199
4200
4201const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4202const DispatchTable::Entry DispatchTable::Config::kNoValue;
4203
4204
4205void DispatchTable::AddRange(CharacterRange full_range, int value) {
4206 CharacterRange current = full_range;
4207 if (tree()->is_empty()) {
4208 // If this is the first range we just insert into the table.
4209 ZoneSplayTree<Config>::Locator loc;
4210 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4211 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4212 return;
4213 }
4214 // First see if there is a range to the left of this one that
4215 // overlaps.
4216 ZoneSplayTree<Config>::Locator loc;
4217 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4218 Entry* entry = &loc.value();
4219 // If we've found a range that overlaps with this one, and it
4220 // starts strictly to the left of this one, we have to fix it
4221 // because the following code only handles ranges that start on
4222 // or after the start point of the range we're adding.
4223 if (entry->from() < current.from() && entry->to() >= current.from()) {
4224 // Snap the overlapping range in half around the start point of
4225 // the range we're adding.
4226 CharacterRange left(entry->from(), current.from() - 1);
4227 CharacterRange right(current.from(), entry->to());
4228 // The left part of the overlapping range doesn't overlap.
4229 // Truncate the whole entry to be just the left part.
4230 entry->set_to(left.to());
4231 // The right part is the one that overlaps. We add this part
4232 // to the map and let the next step deal with merging it with
4233 // the range we're adding.
4234 ZoneSplayTree<Config>::Locator loc;
4235 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4236 loc.set_value(Entry(right.from(),
4237 right.to(),
4238 entry->out_set()));
4239 }
4240 }
4241 while (current.is_valid()) {
4242 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4243 (loc.value().from() <= current.to()) &&
4244 (loc.value().to() >= current.from())) {
4245 Entry* entry = &loc.value();
4246 // We have overlap. If there is space between the start point of
4247 // the range we're adding and where the overlapping range starts
4248 // then we have to add a range covering just that space.
4249 if (current.from() < entry->from()) {
4250 ZoneSplayTree<Config>::Locator ins;
4251 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4252 ins.set_value(Entry(current.from(),
4253 entry->from() - 1,
4254 empty()->Extend(value)));
4255 current.set_from(entry->from());
4256 }
4257 ASSERT_EQ(current.from(), entry->from());
4258 // If the overlapping range extends beyond the one we want to add
4259 // we have to snap the right part off and add it separately.
4260 if (entry->to() > current.to()) {
4261 ZoneSplayTree<Config>::Locator ins;
4262 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4263 ins.set_value(Entry(current.to() + 1,
4264 entry->to(),
4265 entry->out_set()));
4266 entry->set_to(current.to());
4267 }
4268 ASSERT(entry->to() <= current.to());
4269 // The overlapping range is now completely contained by the range
4270 // we're adding so we can just update it and move the start point
4271 // of the range we're adding just past it.
4272 entry->AddValue(value);
4273 // Bail out if the last interval ended at 0xFFFF since otherwise
4274 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004275 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004276 break;
4277 ASSERT(entry->to() + 1 > current.from());
4278 current.set_from(entry->to() + 1);
4279 } else {
4280 // There is no overlap so we can just add the range
4281 ZoneSplayTree<Config>::Locator ins;
4282 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4283 ins.set_value(Entry(current.from(),
4284 current.to(),
4285 empty()->Extend(value)));
4286 break;
4287 }
4288 }
4289}
4290
4291
4292OutSet* DispatchTable::Get(uc16 value) {
4293 ZoneSplayTree<Config>::Locator loc;
4294 if (!tree()->FindGreatestLessThan(value, &loc))
4295 return empty();
4296 Entry* entry = &loc.value();
4297 if (value <= entry->to())
4298 return entry->out_set();
4299 else
4300 return empty();
4301}
4302
4303
4304// -------------------------------------------------------------------
4305// Analysis
4306
4307
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004308void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004309 if (that->info()->been_analyzed || that->info()->being_analyzed)
4310 return;
4311 that->info()->being_analyzed = true;
4312 that->Accept(this);
4313 that->info()->being_analyzed = false;
4314 that->info()->been_analyzed = true;
4315}
4316
4317
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004318void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004319 // nothing to do
4320}
4321
4322
ager@chromium.org8bb60582008-12-11 12:02:20 +00004323void TextNode::CalculateOffsets() {
4324 int element_count = elements()->length();
4325 // Set up the offsets of the elements relative to the start. This is a fixed
4326 // quantity since a TextNode can only contain fixed-width things.
4327 int cp_offset = 0;
4328 for (int i = 0; i < element_count; i++) {
4329 TextElement& elm = elements()->at(i);
4330 elm.cp_offset = cp_offset;
4331 if (elm.type == TextElement::ATOM) {
4332 cp_offset += elm.data.u_atom->data().length();
4333 } else {
4334 cp_offset++;
4335 Vector<const uc16> quarks = elm.data.u_atom->data();
4336 }
4337 }
4338}
4339
4340
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004341void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004342 if (ignore_case_) {
4343 that->MakeCaseIndependent();
4344 }
4345 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004346 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004347}
4348
4349
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004350void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004351 RegExpNode* target = that->on_success();
4352 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004353 // If the next node is interested in what it follows then this node
4354 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004355 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004356}
4357
4358
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004359void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004360 NodeInfo* info = that->info();
4361 for (int i = 0; i < that->alternatives()->length(); i++) {
4362 RegExpNode* node = that->alternatives()->at(i).node();
4363 EnsureAnalyzed(node);
4364 // Anything the following nodes need to know has to be known by
4365 // this node also, so it can pass it on.
4366 info->AddFromFollowing(node->info());
4367 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004368}
4369
4370
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004371void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4372 NodeInfo* info = that->info();
4373 for (int i = 0; i < that->alternatives()->length(); i++) {
4374 RegExpNode* node = that->alternatives()->at(i).node();
4375 if (node != that->loop_node()) {
4376 EnsureAnalyzed(node);
4377 info->AddFromFollowing(node->info());
4378 }
4379 }
4380 // Check the loop last since it may need the value of this node
4381 // to get a correct result.
4382 EnsureAnalyzed(that->loop_node());
4383 info->AddFromFollowing(that->loop_node()->info());
4384}
4385
4386
4387void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004388 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004389}
4390
4391
4392// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004393// Dispatch table construction
4394
4395
4396void DispatchTableConstructor::VisitEnd(EndNode* that) {
4397 AddRange(CharacterRange::Everything());
4398}
4399
4400
4401void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4402 node->set_being_calculated(true);
4403 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4404 for (int i = 0; i < alternatives->length(); i++) {
4405 set_choice_index(i);
4406 alternatives->at(i).node()->Accept(this);
4407 }
4408 node->set_being_calculated(false);
4409}
4410
4411
4412class AddDispatchRange {
4413 public:
4414 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4415 : constructor_(constructor) { }
4416 void Call(uc32 from, DispatchTable::Entry entry);
4417 private:
4418 DispatchTableConstructor* constructor_;
4419};
4420
4421
4422void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4423 CharacterRange range(from, entry.to());
4424 constructor_->AddRange(range);
4425}
4426
4427
4428void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4429 if (node->being_calculated())
4430 return;
4431 DispatchTable* table = node->GetTable(ignore_case_);
4432 AddDispatchRange adder(this);
4433 table->ForEach(&adder);
4434}
4435
4436
4437void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4438 // TODO(160): Find the node that we refer back to and propagate its start
4439 // set back to here. For now we just accept anything.
4440 AddRange(CharacterRange::Everything());
4441}
4442
4443
4444
4445static int CompareRangeByFrom(const CharacterRange* a,
4446 const CharacterRange* b) {
4447 return Compare<uc16>(a->from(), b->from());
4448}
4449
4450
4451void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4452 ranges->Sort(CompareRangeByFrom);
4453 uc16 last = 0;
4454 for (int i = 0; i < ranges->length(); i++) {
4455 CharacterRange range = ranges->at(i);
4456 if (last < range.from())
4457 AddRange(CharacterRange(last, range.from() - 1));
4458 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004459 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004460 return;
4461 } else {
4462 last = range.to() + 1;
4463 }
4464 }
4465 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004466 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004467}
4468
4469
4470void DispatchTableConstructor::VisitText(TextNode* that) {
4471 TextElement elm = that->elements()->at(0);
4472 switch (elm.type) {
4473 case TextElement::ATOM: {
4474 uc16 c = elm.data.u_atom->data()[0];
4475 AddRange(CharacterRange(c, c));
4476 break;
4477 }
4478 case TextElement::CHAR_CLASS: {
4479 RegExpCharacterClass* tree = elm.data.u_char_class;
4480 ZoneList<CharacterRange>* ranges = tree->ranges();
4481 if (tree->is_negated()) {
4482 AddInverse(ranges);
4483 } else {
4484 for (int i = 0; i < ranges->length(); i++)
4485 AddRange(ranges->at(i));
4486 }
4487 break;
4488 }
4489 default: {
4490 UNIMPLEMENTED();
4491 }
4492 }
4493}
4494
4495
4496void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004497 RegExpNode* target = that->on_success();
4498 target->Accept(this);
4499}
4500
4501
ager@chromium.org8bb60582008-12-11 12:02:20 +00004502Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004503 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004504 bool is_multiline,
4505 Handle<String> pattern,
4506 bool is_ascii) {
4507 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004508 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004509 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004510 0,
4511 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004512 compiler.accept());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004513 // Add a .*? at the beginning, outside the body capture.
4514 // Note: We could choose to not add this if the regexp is anchored at
4515 // the start of the input but I'm not sure how best to do that and
4516 // since we don't even handle ^ yet I'm saving that optimization for
4517 // later.
4518 RegExpNode* node = RegExpQuantifier::ToNode(0,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004519 RegExpTree::kInfinity,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004520 false,
4521 new RegExpCharacterClass('*'),
4522 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004523 captured_body);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004524 data->node = node;
4525 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004526 analysis.EnsureAnalyzed(node);
4527
4528 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004529
4530 if (is_multiline && !FLAG_attempt_multiline_irregexp) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004531 return Handle<FixedArray>::null();
4532 }
4533
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004534 if (FLAG_irregexp_native) {
4535#ifdef ARM
4536 // Unimplemented, fall-through to bytecode implementation.
4537#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004538 RegExpMacroAssemblerIA32::Mode mode;
4539 if (is_ascii) {
4540 mode = RegExpMacroAssemblerIA32::ASCII;
4541 } else {
4542 mode = RegExpMacroAssemblerIA32::UC16;
4543 }
4544 RegExpMacroAssemblerIA32 macro_assembler(mode,
4545 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004546 return compiler.Assemble(&macro_assembler,
4547 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004548 data->capture_count,
4549 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004550#endif
4551 }
4552 EmbeddedVector<byte, 1024> codes;
4553 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4554 return compiler.Assemble(&macro_assembler,
4555 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004556 data->capture_count,
4557 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004558}
4559
4560
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004561}} // namespace v8::internal