blob: fef0b0d97a43ecc2452df26addff84ccc290b890 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000031#include "execution.h"
32#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000034#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000035#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000036#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000037#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000038#include "string-stream.h"
39#include "parser.h"
40#include "regexp-macro-assembler.h"
41#include "regexp-macro-assembler-tracer.h"
42#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000043#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000044
45#ifdef ARM
46#include "regexp-macro-assembler-arm.h"
47#else // IA32
48#include "macro-assembler-ia32.h"
49#include "regexp-macro-assembler-ia32.h"
50#endif
51
52#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000053
54// Including pcre.h undefines DEBUG to avoid getting debug output from
55// the JSCRE implementation. Make sure to redefine it in debug mode
56// after having included the header file.
57#ifdef DEBUG
58#include "third_party/jscre/pcre.h"
59#define DEBUG
60#else
61#include "third_party/jscre/pcre.h"
62#endif
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000063
ager@chromium.orga74f0da2008-12-03 16:05:52 +000064
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000065namespace v8 { namespace internal {
66
67
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000068static Failure* malloc_failure;
69
70static void* JSREMalloc(size_t size) {
71 Object* obj = Heap::AllocateByteArray(size);
72
73 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
74 // will return NULL to the caller, performs GC there.
75 // Also pass failure information to the caller.
76 if (obj->IsFailure()) {
77 malloc_failure = Failure::cast(obj);
78 return NULL;
79 }
80
81 // Note: object is unrooted, the caller of jsRegExpCompile must
82 // create a handle for the return value before doing heap allocation.
83 return reinterpret_cast<void*>(ByteArray::cast(obj)->GetDataStartAddress());
84}
85
86
87static void JSREFree(void* p) {
88 USE(p); // Do nothing, memory is garbage collected.
89}
90
91
92String* RegExpImpl::last_ascii_string_ = NULL;
93String* RegExpImpl::two_byte_cached_string_ = NULL;
94
95
96void RegExpImpl::NewSpaceCollectionPrologue() {
97 // The two byte string is always in the old space. The Ascii string may be
98 // in either place. If it is in the old space we don't need to do anything.
99 if (Heap::InNewSpace(last_ascii_string_)) {
100 // Invalidate the cache.
101 last_ascii_string_ = NULL;
102 two_byte_cached_string_ = NULL;
103 }
104}
105
106
107void RegExpImpl::OldSpaceCollectionPrologue() {
108 last_ascii_string_ = NULL;
109 two_byte_cached_string_ = NULL;
110}
111
112
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000113Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
114 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000115 Handle<String> flags,
116 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000117 // Ensure that the constructor function has been loaded.
118 if (!constructor->IsLoaded()) {
119 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000120 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000121 }
122 // Call the construct code with 2 arguments.
123 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
124 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +0000125 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000126}
127
128
129// Converts a source string to a 16 bit flat string or a SlicedString containing
130// a 16 bit flat string).
131Handle<String> RegExpImpl::CachedStringToTwoByte(Handle<String> subject) {
132 if (*subject == last_ascii_string_) {
133 ASSERT(two_byte_cached_string_ != NULL);
134 return Handle<String>(String::cast(two_byte_cached_string_));
135 }
136 Handle<String> two_byte_string = StringToTwoByte(subject);
137 last_ascii_string_ = *subject;
138 two_byte_cached_string_ = *two_byte_string;
139 return two_byte_string;
140}
141
142
143// Converts a source string to a 16 bit flat string or a SlicedString containing
144// a 16 bit flat string).
145Handle<String> RegExpImpl::StringToTwoByte(Handle<String> pattern) {
ager@chromium.org870a0b62008-11-04 11:43:05 +0000146 StringShape shape(*pattern);
147 if (!pattern->IsFlat(shape)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000148 FlattenString(pattern);
ager@chromium.orgc3e50d82008-11-05 11:53:10 +0000149 shape = StringShape(*pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000150 }
ager@chromium.org870a0b62008-11-04 11:43:05 +0000151 Handle<String> flat_string(shape.IsCons() ?
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000152 String::cast(ConsString::cast(*pattern)->first()) :
153 *pattern);
ager@chromium.org870a0b62008-11-04 11:43:05 +0000154 ASSERT(flat_string->IsString());
155 StringShape flat_shape(*flat_string);
156 ASSERT(!flat_shape.IsCons());
157 ASSERT(flat_shape.IsSequential() ||
158 flat_shape.IsSliced() ||
159 flat_shape.IsExternal());
160 if (!flat_shape.IsAsciiRepresentation()) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000161 return flat_string;
162 }
163
ager@chromium.org870a0b62008-11-04 11:43:05 +0000164 int len = flat_string->length(flat_shape);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000165 Handle<String> two_byte_string =
ager@chromium.org870a0b62008-11-04 11:43:05 +0000166 Factory::NewRawTwoByteString(len, TENURED);
167 uc16* dest = SeqTwoByteString::cast(*two_byte_string)->GetChars();
168 String::WriteToFlat(*flat_string, flat_shape, dest, 0, len);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000169 return two_byte_string;
170}
171
172
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000173static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
174 int flags = JSRegExp::NONE;
ager@chromium.org870a0b62008-11-04 11:43:05 +0000175 StringShape shape(*str);
176 for (int i = 0; i < str->length(shape); i++) {
177 switch (str->Get(shape, i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000178 case 'i':
179 flags |= JSRegExp::IGNORE_CASE;
180 break;
181 case 'g':
182 flags |= JSRegExp::GLOBAL;
183 break;
184 case 'm':
185 flags |= JSRegExp::MULTILINE;
186 break;
187 }
188 }
189 return JSRegExp::Flags(flags);
190}
191
192
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000193static inline void ThrowRegExpException(Handle<JSRegExp> re,
194 Handle<String> pattern,
195 Handle<String> error_text,
196 const char* message) {
197 Handle<JSArray> array = Factory::NewJSArray(2);
198 SetElement(array, 0, pattern);
199 SetElement(array, 1, error_text);
200 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
201 Top::Throw(*regexp_err);
202}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000203
204
ager@chromium.org8bb60582008-12-11 12:02:20 +0000205// Generic RegExp methods. Dispatches to implementation specific methods.
206
207
208class OffsetsVector {
209 public:
210 inline OffsetsVector(int num_registers)
211 : offsets_vector_length_(num_registers) {
212 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
213 vector_ = NewArray<int>(offsets_vector_length_);
214 } else {
215 vector_ = static_offsets_vector_;
216 }
217 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000218 inline ~OffsetsVector() {
219 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
220 DeleteArray(vector_);
221 vector_ = NULL;
222 }
223 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000224 inline int* vector() { return vector_; }
225 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000226
227 private:
228 int* vector_;
229 int offsets_vector_length_;
230 static const int kStaticOffsetsVectorSize = 50;
231 static int static_offsets_vector_[kStaticOffsetsVectorSize];
232};
233
234
235int OffsetsVector::static_offsets_vector_[
236 OffsetsVector::kStaticOffsetsVectorSize];
237
238
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000239Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
240 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000241 Handle<String> flag_str) {
242 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
243 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
244 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000245 LOG(RegExpCompileEvent(re, in_cache));
246
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000247 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000248 if (in_cache) {
249 re->set_data(*cached);
250 result = re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000251 } else {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000252 FlattenString(pattern);
253 ZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000254 RegExpCompileData parse_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000255 FlatStringReader reader(pattern);
256 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
257 // Throw an exception if we fail to parse the pattern.
258 ThrowRegExpException(re,
259 pattern,
260 parse_result.error,
261 "malformed_regexp");
ager@chromium.org8bb60582008-12-11 12:02:20 +0000262 return Handle<Object>::null();
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000263 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000264
265 if (parse_result.simple && !flags.is_ignore_case()) {
266 // Parse-tree is a single atom that is equal to the pattern.
267 result = AtomCompile(re, pattern, flags, pattern);
268 } else if (parse_result.tree->IsAtom() &&
269 !flags.is_ignore_case() &&
270 parse_result.capture_count == 0) {
271 RegExpAtom* atom = parse_result.tree->AsAtom();
272 Vector<const uc16> atom_pattern = atom->data();
273 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
274 result = AtomCompile(re, pattern, flags, atom_string);
275 } else if (FLAG_irregexp) {
276 result = IrregexpPrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000277 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000278 result = JscrePrepare(re, pattern, flags);
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000279 }
280 Object* data = re->data();
281 if (data->IsFixedArray()) {
282 // If compilation succeeded then the data is set on the regexp
283 // and we can store it in the cache.
284 Handle<FixedArray> data(FixedArray::cast(re->data()));
285 CompilationCache::PutRegExp(pattern, flags, data);
286 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000287 }
288
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000289 return result;
290}
291
292
293Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
294 Handle<String> subject,
295 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000296 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000297 case JSRegExp::ATOM:
298 return AtomExec(regexp, subject, index);
299 case JSRegExp::IRREGEXP: {
300 Handle<Object> result = IrregexpExec(regexp, subject, index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000301 if (result.is_null()) ASSERT(Top::has_pending_exception());
302 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000303 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000304 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000305 return JscreExec(regexp, subject, index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000306 default:
307 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000308 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000309 }
310}
311
312
313Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
314 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000315 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000316 case JSRegExp::ATOM:
317 return AtomExecGlobal(regexp, subject);
318 case JSRegExp::IRREGEXP: {
319 Handle<Object> result = IrregexpExecGlobal(regexp, subject);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000320 if (result.is_null()) ASSERT(Top::has_pending_exception());
321 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000322 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000323 case JSRegExp::JSCRE:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000324 return JscreExecGlobal(regexp, subject);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000325 default:
326 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000327 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000328 }
329}
330
331
ager@chromium.org8bb60582008-12-11 12:02:20 +0000332// RegExp Atom implementation: Simple string search using indexOf.
333
334
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000335Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000336 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000337 JSRegExp::Flags flags,
338 Handle<String> match_pattern) {
339 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000340 return re;
341}
342
343
344Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
345 Handle<String> subject,
346 Handle<Object> index) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000347 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000348
349 uint32_t start_index;
350 if (!Array::IndexFromObject(*index, &start_index)) {
351 return Handle<Smi>(Smi::FromInt(-1));
352 }
353
ager@chromium.org7c537e22008-10-16 08:43:32 +0000354 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000355 if (value == -1) return Factory::null_value();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000356
357 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000358 array->set(0, Smi::FromInt(value));
359 array->set(1, Smi::FromInt(value + needle->length()));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000360 return Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000361}
362
363
364Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
365 Handle<String> subject) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000366 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000367 Handle<JSArray> result = Factory::NewJSArray(1);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000368 int index = 0;
369 int match_count = 0;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000370 int subject_length = subject->length();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000371 int needle_length = needle->length();
ager@chromium.org7c537e22008-10-16 08:43:32 +0000372 while (true) {
ager@chromium.org7c537e22008-10-16 08:43:32 +0000373 int value = -1;
374 if (index + needle_length <= subject_length) {
375 value = Runtime::StringMatch(subject, needle, index);
376 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000377 if (value == -1) break;
378 HandleScope scope;
379 int end = value + needle_length;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000380
381 Handle<FixedArray> array = Factory::NewFixedArray(2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000382 array->set(0, Smi::FromInt(value));
383 array->set(1, Smi::FromInt(end));
ager@chromium.org7c537e22008-10-16 08:43:32 +0000384 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000385 SetElement(result, match_count, pair);
386 match_count++;
387 index = end;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000388 if (needle_length == 0) index++;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000389 }
390 return result;
391}
392
393
ager@chromium.org8bb60582008-12-11 12:02:20 +0000394// JSCRE implementation.
395
396
397int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
398 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
399 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
400}
401
402
403ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
404 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
405 return ByteArray::cast(value->get(kJscreInternalIndex));
406}
407
408
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000409Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
410 Handle<String> pattern,
411 JSRegExp::Flags flags) {
412 Handle<Object> value(Heap::undefined_value());
413 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
414 return re;
415}
416
417
ager@chromium.org8bb60582008-12-11 12:02:20 +0000418static inline Object* JscreDoCompile(String* pattern,
419 JSRegExp::Flags flags,
420 unsigned* number_of_captures,
421 const char** error_message,
422 v8::jscre::JscreRegExp** code) {
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000423 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
424 ? v8::jscre::JSRegExpIgnoreCase
425 : v8::jscre::JSRegExpDoNotIgnoreCase;
426 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
427 ? v8::jscre::JSRegExpMultiline
428 : v8::jscre::JSRegExpSingleLine;
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000429 *error_message = NULL;
430 malloc_failure = Failure::Exception();
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000431 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
432 pattern->length(),
433 case_option,
434 multiline_option,
435 number_of_captures,
436 error_message,
437 &JSREMalloc,
438 &JSREFree);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000439 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000440 malloc_failure->IsOutOfMemoryFailure())) {
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000441 return malloc_failure;
442 } else {
443 // It doesn't matter which object we return here, we just need to return
444 // a non-failure to indicate to the GC-retry code that there was no
445 // allocation failure.
446 return pattern;
447 }
448}
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000449
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000450
ager@chromium.org8bb60582008-12-11 12:02:20 +0000451static void JscreCompileWithRetryAfterGC(Handle<String> pattern,
452 JSRegExp::Flags flags,
453 unsigned* number_of_captures,
454 const char** error_message,
455 v8::jscre::JscreRegExp** code) {
456 CALL_HEAP_FUNCTION_VOID(JscreDoCompile(*pattern,
457 flags,
458 number_of_captures,
459 error_message,
460 code));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000461}
462
463
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000464Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
465 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
466 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
467
468 Handle<String> pattern(re->Pattern());
469 JSRegExp::Flags flags = re->GetFlags();
470
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000471 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
472
473 unsigned number_of_captures;
474 const char* error_message = NULL;
475
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000476 v8::jscre::JscreRegExp* code = NULL;
ager@chromium.org7c537e22008-10-16 08:43:32 +0000477 FlattenString(pattern);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000478
ager@chromium.org8bb60582008-12-11 12:02:20 +0000479 JscreCompileWithRetryAfterGC(two_byte_pattern,
480 flags,
481 &number_of_captures,
482 &error_message,
483 &code);
ager@chromium.org7c537e22008-10-16 08:43:32 +0000484
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000485 if (code == NULL) {
486 // Throw an exception.
487 Handle<JSArray> array = Factory::NewJSArray(2);
488 SetElement(array, 0, pattern);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000489 const char* message =
490 (error_message == NULL) ? "Unknown regexp error" : error_message;
491 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000492 Handle<Object> regexp_err =
493 Factory::NewSyntaxError("malformed_regexp", array);
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000494 Top::Throw(*regexp_err);
495 return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000496 }
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000497
498 // Convert the return address to a ByteArray pointer.
499 Handle<ByteArray> internal(
500 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
501
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000502 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
503 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
504 value->set(kJscreInternalIndex, *internal);
kasperl@chromium.org9bbf9682008-10-30 11:53:07 +0000505 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
506
507 return re;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000508}
509
510
ager@chromium.org8bb60582008-12-11 12:02:20 +0000511Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
512 Handle<String> subject,
513 Handle<Object> index) {
514 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
515 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
516 Handle<Object> compile_result = JscreCompile(regexp);
517 if (compile_result.is_null()) return compile_result;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000518 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000519 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000520
ager@chromium.org8bb60582008-12-11 12:02:20 +0000521 int num_captures = JscreNumberOfCaptures(regexp);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000522
ager@chromium.org8bb60582008-12-11 12:02:20 +0000523 OffsetsVector offsets((num_captures + 1) * 3);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000524
ager@chromium.org8bb60582008-12-11 12:02:20 +0000525 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000526
ager@chromium.org8bb60582008-12-11 12:02:20 +0000527 Handle<String> subject16 = CachedStringToTwoByte(subject);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000528
ager@chromium.org8bb60582008-12-11 12:02:20 +0000529 return JscreExecOnce(regexp,
530 num_captures,
531 subject,
532 previous_index,
533 subject16->GetTwoByteData(),
534 offsets.vector(),
535 offsets.length());
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000536}
537
538
539Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
540 int num_captures,
541 Handle<String> subject,
542 int previous_index,
543 const uc16* two_byte_subject,
544 int* offsets_vector,
545 int offsets_vector_length) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000546 int rc;
547 {
548 AssertNoAllocation a;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000549 ByteArray* internal = JscreInternal(regexp);
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000550 const v8::jscre::JscreRegExp* js_regexp =
551 reinterpret_cast<v8::jscre::JscreRegExp*>(
552 internal->GetDataStartAddress());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000553
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000554 rc = v8::jscre::jsRegExpExecute(js_regexp,
555 two_byte_subject,
556 subject->length(),
557 previous_index,
558 offsets_vector,
559 offsets_vector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000560 }
561
562 // The KJS JavaScript engine returns null (ie, a failed match) when
563 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
iposva@chromium.org96f667e2008-11-26 23:48:02 +0000564 if (rc == v8::jscre::JSRegExpErrorNoMatch
565 || rc == v8::jscre::JSRegExpErrorHitLimit) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000566 return Factory::null_value();
567 }
568
569 // Other JSRE errors:
570 if (rc < 0) {
571 // Throw an exception.
572 Handle<Object> code(Smi::FromInt(rc));
573 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
574 Handle<Object> regexp_err(
575 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
576 return Handle<Object>(Top::Throw(*regexp_err));
577 }
578
ager@chromium.org7c537e22008-10-16 08:43:32 +0000579 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000580 // The captures come in (start, end+1) pairs.
581 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000582 array->set(i, Smi::FromInt(offsets_vector[i]));
583 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000584 }
ager@chromium.org7c537e22008-10-16 08:43:32 +0000585 return Factory::NewJSArrayWithElements(array);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000586}
587
588
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000589Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
590 Handle<String> subject) {
591 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
592 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
593 Handle<Object> compile_result = JscreCompile(regexp);
594 if (compile_result.is_null()) return compile_result;
595 }
596 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
597
598 // Prepare space for the return values.
599 int num_captures = JscreNumberOfCaptures(regexp);
600
601 OffsetsVector offsets((num_captures + 1) * 3);
602
603 int previous_index = 0;
604
605 Handle<JSArray> result = Factory::NewJSArray(0);
606 int i = 0;
607 Handle<Object> matches;
608
609 Handle<String> subject16 = CachedStringToTwoByte(subject);
610
611 do {
612 if (previous_index > subject->length() || previous_index < 0) {
613 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
614 // string length, there is no match.
615 matches = Factory::null_value();
616 } else {
617 matches = JscreExecOnce(regexp,
618 num_captures,
619 subject,
620 previous_index,
621 subject16->GetTwoByteData(),
622 offsets.vector(),
623 offsets.length());
624
625 if (matches->IsJSArray()) {
626 SetElement(result, i, matches);
627 i++;
628 previous_index = offsets.vector()[1];
629 if (offsets.vector()[0] == offsets.vector()[1]) {
630 previous_index++;
631 }
632 }
633 }
634 } while (matches->IsJSArray());
635
636 // If we exited the loop with an exception, throw it.
637 if (matches->IsNull()) {
638 // Exited loop normally.
639 return result;
640 } else {
641 // Exited loop with the exception in matches.
642 return matches;
643 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000644}
645
646
ager@chromium.org8bb60582008-12-11 12:02:20 +0000647// Irregexp implementation.
648
649
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000650// Retrieves a compiled version of the regexp for either ASCII or non-ASCII
651// strings. If the compiled version doesn't already exist, it is compiled
652// from the source pattern.
653// Irregexp is not feature complete yet. If there is something in the
654// regexp that the compiler cannot currently handle, an empty
655// handle is returned, but no exception is thrown.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000656static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
657 bool is_ascii) {
658 ASSERT(re->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
659 Handle<FixedArray> alternatives(
660 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)));
661 ASSERT_EQ(2, alternatives->length());
662
663 int index = is_ascii ? 0 : 1;
664 Object* entry = alternatives->get(index);
665 if (!entry->IsNull()) {
666 return Handle<FixedArray>(FixedArray::cast(entry));
667 }
668
669 // Compile the RegExp.
670 ZoneScope zone_scope(DELETE_ON_EXIT);
671
672 JSRegExp::Flags flags = re->GetFlags();
673
674 Handle<String> pattern(re->Pattern());
675 StringShape shape(*pattern);
676 if (!pattern->IsFlat(shape)) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000677 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000678 }
679
680 RegExpCompileData compile_data;
681 FlatStringReader reader(pattern);
682 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
683 // Throw an exception if we fail to parse the pattern.
684 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
685 ThrowRegExpException(re,
686 pattern,
687 compile_data.error,
688 "malformed_regexp");
689 return Handle<FixedArray>::null();
690 }
691 Handle<FixedArray> compiled_entry =
692 RegExpEngine::Compile(&compile_data,
693 flags.is_ignore_case(),
694 flags.is_multiline(),
695 pattern,
696 is_ascii);
697 if (!compiled_entry.is_null()) {
698 alternatives->set(index, *compiled_entry);
699 }
700 return compiled_entry;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000701}
702
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000703
ager@chromium.org8bb60582008-12-11 12:02:20 +0000704int RegExpImpl::IrregexpNumberOfCaptures(Handle<FixedArray> irre) {
705 return Smi::cast(irre->get(kIrregexpNumberOfCapturesIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000706}
707
708
ager@chromium.org8bb60582008-12-11 12:02:20 +0000709int RegExpImpl::IrregexpNumberOfRegisters(Handle<FixedArray> irre) {
710 return Smi::cast(irre->get(kIrregexpNumberOfRegistersIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000711}
712
713
ager@chromium.org8bb60582008-12-11 12:02:20 +0000714Handle<ByteArray> RegExpImpl::IrregexpByteCode(Handle<FixedArray> irre) {
715 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
716 == RegExpMacroAssembler::kBytecodeImplementation);
717 return Handle<ByteArray>(ByteArray::cast(irre->get(kIrregexpCodeIndex)));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000718}
719
720
ager@chromium.org8bb60582008-12-11 12:02:20 +0000721Handle<Code> RegExpImpl::IrregexpNativeCode(Handle<FixedArray> irre) {
722 ASSERT(Smi::cast(irre->get(kIrregexpImplementationIndex))->value()
723 != RegExpMacroAssembler::kBytecodeImplementation);
724 return Handle<Code>(Code::cast(irre->get(kIrregexpCodeIndex)));
725}
726
727
728Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
729 Handle<String> pattern,
730 JSRegExp::Flags flags) {
731 // Make space for ASCII and UC16 versions.
732 Handle<FixedArray> alternatives = Factory::NewFixedArray(2);
733 alternatives->set_null(0);
734 alternatives->set_null(1);
735 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, alternatives);
736 return re;
737}
738
739
740Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
741 Handle<String> subject,
742 Handle<Object> index) {
743 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
744 ASSERT(regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsFixedArray());
745
746 bool is_ascii = StringShape(*subject).IsAsciiRepresentation();
747 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
748 if (irregexp.is_null()) {
749 // We can't handle the RegExp with IRRegExp.
750 return Handle<Object>::null();
751 }
752
753 // Prepare space for the return values.
754 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
755 OffsetsVector offsets(number_of_registers);
756
757 int num_captures = IrregexpNumberOfCaptures(irregexp);
758
759 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
760
761#ifdef DEBUG
762 if (FLAG_trace_regexp_bytecodes) {
763 String* pattern = regexp->Pattern();
764 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
765 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
766 }
767#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000768
769 if (!subject->IsFlat(StringShape(*subject))) {
770 FlattenString(subject);
771 }
772
ager@chromium.org8bb60582008-12-11 12:02:20 +0000773 return IrregexpExecOnce(irregexp,
774 num_captures,
775 subject,
776 previous_index,
777 offsets.vector(),
778 offsets.length());
779}
780
781
782Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
783 Handle<String> subject) {
784 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
785
786 StringShape shape(*subject);
787 bool is_ascii = shape.IsAsciiRepresentation();
788 Handle<FixedArray> irregexp = GetCompiledIrregexp(regexp, is_ascii);
789 if (irregexp.is_null()) {
790 return Handle<Object>::null();
791 }
792
793 // Prepare space for the return values.
794 int number_of_registers = IrregexpNumberOfRegisters(irregexp);
795 OffsetsVector offsets(number_of_registers);
796
797 int previous_index = 0;
798
799 Handle<JSArray> result = Factory::NewJSArray(0);
800 int i = 0;
801 Handle<Object> matches;
802
803 if (!subject->IsFlat(shape)) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000804 FlattenString(subject);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000805 }
806
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000807 while (true) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000808 if (previous_index > subject->length() || previous_index < 0) {
809 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
810 // string length, there is no match.
811 matches = Factory::null_value();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000812 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000813 } else {
814#ifdef DEBUG
815 if (FLAG_trace_regexp_bytecodes) {
816 String* pattern = regexp->Pattern();
817 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
818 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
819 }
820#endif
ager@chromium.org8bb60582008-12-11 12:02:20 +0000821 matches = IrregexpExecOnce(irregexp,
822 IrregexpNumberOfCaptures(irregexp),
823 subject,
824 previous_index,
825 offsets.vector(),
826 offsets.length());
827
828 if (matches->IsJSArray()) {
829 SetElement(result, i, matches);
830 i++;
831 previous_index = offsets.vector()[1];
832 if (offsets.vector()[0] == offsets.vector()[1]) {
833 previous_index++;
834 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000835 } else if (matches->IsNull()) {
836 return result;
837 } else {
838 return matches;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000839 }
840 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000841 }
842}
843
844
845Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
846 int num_captures,
847 Handle<String> subject,
848 int previous_index,
849 int* offsets_vector,
850 int offsets_vector_length) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000851 ASSERT(subject->IsFlat(StringShape(*subject)));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000852 bool rc;
853
854 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
855
ager@chromium.org8bb60582008-12-11 12:02:20 +0000856 switch (tag) {
857 case RegExpMacroAssembler::kIA32Implementation: {
858#ifndef ARM
859 Handle<Code> code = IrregexpNativeCode(irregexp);
860
861 StringShape shape(*subject);
862
863 // Character offsets into string.
864 int start_offset = previous_index;
865 int end_offset = subject->length(shape);
866
867 if (shape.IsCons()) {
868 subject = Handle<String>(ConsString::cast(*subject)->first());
869 } else if (shape.IsSliced()) {
870 SlicedString* slice = SlicedString::cast(*subject);
871 start_offset += slice->start();
872 end_offset += slice->start();
873 subject = Handle<String>(slice->buffer());
874 }
875
876 // String is now either Sequential or External
877 StringShape flatshape(*subject);
878 bool is_ascii = flatshape.IsAsciiRepresentation();
879 int char_size_shift = is_ascii ? 0 : 1;
880
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000881 RegExpMacroAssemblerIA32::Result res;
882
ager@chromium.org8bb60582008-12-11 12:02:20 +0000883 if (flatshape.IsExternal()) {
884 const byte* address;
885 if (is_ascii) {
886 ExternalAsciiString* ext = ExternalAsciiString::cast(*subject);
887 address = reinterpret_cast<const byte*>(ext->resource()->data());
888 } else {
889 ExternalTwoByteString* ext = ExternalTwoByteString::cast(*subject);
890 address = reinterpret_cast<const byte*>(ext->resource()->data());
891 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000892 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000893 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000894 const_cast<Address*>(&address),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000895 start_offset << char_size_shift,
896 end_offset << char_size_shift,
897 offsets_vector,
898 previous_index == 0);
899 } else { // Sequential string
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000900 ASSERT(StringShape(*subject).IsSequential());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000901 Address char_address =
902 is_ascii ? SeqAsciiString::cast(*subject)->GetCharsAddress()
903 : SeqTwoByteString::cast(*subject)->GetCharsAddress();
904 int byte_offset = char_address - reinterpret_cast<Address>(*subject);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000905 res = RegExpMacroAssemblerIA32::Execute(
ager@chromium.org8bb60582008-12-11 12:02:20 +0000906 *code,
ager@chromium.org32912102009-01-16 10:38:43 +0000907 reinterpret_cast<Address*>(subject.location()),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000908 byte_offset + (start_offset << char_size_shift),
909 byte_offset + (end_offset << char_size_shift),
910 offsets_vector,
911 previous_index == 0);
912 }
913
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000914 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
915 ASSERT(Top::has_pending_exception());
916 return Handle<Object>::null();
917 }
918 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
919
ager@chromium.org8bb60582008-12-11 12:02:20 +0000920 if (rc) {
921 // Capture values are relative to start_offset only.
922 for (int i = 0; i < offsets_vector_length; i++) {
923 if (offsets_vector[i] >= 0) {
924 offsets_vector[i] += previous_index;
925 }
926 }
927 }
928 break;
929#else
930 UNIMPLEMENTED();
931 rc = false;
932 break;
933#endif
934 }
935 case RegExpMacroAssembler::kBytecodeImplementation: {
936 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
937 offsets_vector[i] = -1;
938 }
939 Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
940
941 rc = IrregexpInterpreter::Match(byte_codes,
942 subject,
943 offsets_vector,
944 previous_index);
945 break;
946 }
947 case RegExpMacroAssembler::kARMImplementation:
948 default:
949 UNREACHABLE();
950 rc = false;
951 break;
952 }
953
954 if (!rc) {
955 return Factory::null_value();
956 }
957
958 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
959 // The captures come in (start, end+1) pairs.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000960 for (int i = 0; i < 2 * (num_captures + 1); i += 2) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000961 array->set(i, Smi::FromInt(offsets_vector[i]));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000962 array->set(i + 1, Smi::FromInt(offsets_vector[i + 1]));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000963 }
964 return Factory::NewJSArrayWithElements(array);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000965}
966
967
968// -------------------------------------------------------------------
969// Implmentation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000970//
971// The Irregexp regular expression engine is intended to be a complete
972// implementation of ECMAScript regular expressions. It generates either
973// bytecodes or native code.
974
975// The Irregexp regexp engine is structured in three steps.
976// 1) The parser generates an abstract syntax tree. See ast.cc.
977// 2) From the AST a node network is created. The nodes are all
978// subclasses of RegExpNode. The nodes represent states when
979// executing a regular expression. Several optimizations are
980// performed on the node network.
981// 3) From the nodes we generate either byte codes or native code
982// that can actually execute the regular expression (perform
983// the search). The code generation step is described in more
984// detail below.
985
986// Code generation.
987//
988// The nodes are divided into four main categories.
989// * Choice nodes
990// These represent places where the regular expression can
991// match in more than one way. For example on entry to an
992// alternation (foo|bar) or a repetition (*, +, ? or {}).
993// * Action nodes
994// These represent places where some action should be
995// performed. Examples include recording the current position
996// in the input string to a register (in order to implement
997// captures) or other actions on register for example in order
998// to implement the counters needed for {} repetitions.
999// * Matching nodes
1000// These attempt to match some element part of the input string.
1001// Examples of elements include character classes, plain strings
1002// or back references.
1003// * End nodes
1004// These are used to implement the actions required on finding
1005// a successful match or failing to find a match.
1006//
1007// The code generated (whether as byte codes or native code) maintains
1008// some state as it runs. This consists of the following elements:
1009//
1010// * The capture registers. Used for string captures.
1011// * Other registers. Used for counters etc.
1012// * The current position.
1013// * The stack of backtracking information. Used when a matching node
1014// fails to find a match and needs to try an alternative.
1015//
1016// Conceptual regular expression execution model:
1017//
1018// There is a simple conceptual model of regular expression execution
1019// which will be presented first. The actual code generated is a more
1020// efficient simulation of the simple conceptual model:
1021//
1022// * Choice nodes are implemented as follows:
1023// For each choice except the last {
1024// push current position
1025// push backtrack code location
1026// <generate code to test for choice>
1027// backtrack code location:
1028// pop current position
1029// }
1030// <generate code to test for last choice>
1031//
1032// * Actions nodes are generated as follows
1033// <push affected registers on backtrack stack>
1034// <generate code to perform action>
1035// push backtrack code location
1036// <generate code to test for following nodes>
1037// backtrack code location:
1038// <pop affected registers to restore their state>
1039// <pop backtrack location from stack and go to it>
1040//
1041// * Matching nodes are generated as follows:
1042// if input string matches at current position
1043// update current position
1044// <generate code to test for following nodes>
1045// else
1046// <pop backtrack location from stack and go to it>
1047//
1048// Thus it can be seen that the current position is saved and restored
1049// by the choice nodes, whereas the registers are saved and restored by
1050// by the action nodes that manipulate them.
1051//
1052// The other interesting aspect of this model is that nodes are generated
1053// at the point where they are needed by a recursive call to Emit(). If
1054// the node has already been code generated then the Emit() call will
1055// generate a jump to the previously generated code instead. In order to
1056// limit recursion it is possible for the Emit() function to put the node
1057// on a work list for later generation and instead generate a jump. The
1058// destination of the jump is resolved later when the code is generated.
1059//
1060// Actual regular expression code generation.
1061//
1062// Code generation is actually more complicated than the above. In order
1063// to improve the efficiency of the generated code some optimizations are
1064// performed
1065//
1066// * Choice nodes have 1-character lookahead.
1067// A choice node looks at the following character and eliminates some of
1068// the choices immediately based on that character. This is not yet
1069// implemented.
1070// * Simple greedy loops store reduced backtracking information.
1071// A quantifier like /.*foo/m will greedily match the whole input. It will
1072// then need to backtrack to a point where it can match "foo". The naive
1073// implementation of this would push each character position onto the
1074// backtracking stack, then pop them off one by one. This would use space
1075// proportional to the length of the input string. However since the "."
1076// can only match in one way and always has a constant length (in this case
1077// of 1) it suffices to store the current position on the top of the stack
1078// once. Matching now becomes merely incrementing the current position and
1079// backtracking becomes decrementing the current position and checking the
1080// result against the stored current position. This is faster and saves
1081// space.
1082// * The current state is virtualized.
1083// This is used to defer expensive operations until it is clear that they
1084// are needed and to generate code for a node more than once, allowing
1085// specialized an efficient versions of the code to be created. This is
1086// explained in the section below.
1087//
1088// Execution state virtualization.
1089//
1090// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +00001091// manipulation in an object called the Trace. The Trace object can record a
1092// current position offset, an optional backtrack code location on the top of
1093// the virtualized backtrack stack and some register changes. When a node is
1094// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +00001095// will emit code to bring the actual state into line with the virtual state.
1096// Avoiding flushing the state can postpone some work (eg updates of capture
1097// registers). Postponing work can save time when executing the regular
1098// expression since it may be found that the work never has to be done as a
1099// failure to match can occur. In addition it is much faster to jump to a
1100// known backtrack code location than it is to pop an unknown backtrack
1101// location from the stack and jump there.
1102//
ager@chromium.org32912102009-01-16 10:38:43 +00001103// The virtual state found in the Trace affects code generation. For example
1104// the virtual state contains the difference between the actual current
1105// position and the virtual current position, and matching code needs to use
1106// this offset to attempt a match in the correct location of the input
1107// string. Therefore code generated for a non-trivial trace is specialized
1108// to that trace. The code generator therefore has the ability to generate
1109// code for each node several times. In order to limit the size of the
1110// generated code there is an arbitrary limit on how many specialized sets of
1111// code may be generated for a given node. If the limit is reached, the
1112// trace is flushed and a generic version of the code for a node is emitted.
1113// This is subsequently used for that node. The code emitted for non-generic
1114// trace is not recorded in the node and so it cannot currently be reused in
1115// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001116
1117
1118void RegExpTree::AppendToText(RegExpText* text) {
1119 UNREACHABLE();
1120}
1121
1122
1123void RegExpAtom::AppendToText(RegExpText* text) {
1124 text->AddElement(TextElement::Atom(this));
1125}
1126
1127
1128void RegExpCharacterClass::AppendToText(RegExpText* text) {
1129 text->AddElement(TextElement::CharClass(this));
1130}
1131
1132
1133void RegExpText::AppendToText(RegExpText* text) {
1134 for (int i = 0; i < elements()->length(); i++)
1135 text->AddElement(elements()->at(i));
1136}
1137
1138
1139TextElement TextElement::Atom(RegExpAtom* atom) {
1140 TextElement result = TextElement(ATOM);
1141 result.data.u_atom = atom;
1142 return result;
1143}
1144
1145
1146TextElement TextElement::CharClass(
1147 RegExpCharacterClass* char_class) {
1148 TextElement result = TextElement(CHAR_CLASS);
1149 result.data.u_char_class = char_class;
1150 return result;
1151}
1152
1153
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001154int TextElement::length() {
1155 if (type == ATOM) {
1156 return data.u_atom->length();
1157 } else {
1158 ASSERT(type == CHAR_CLASS);
1159 return 1;
1160 }
1161}
1162
1163
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001164DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
1165 if (table_ == NULL) {
1166 table_ = new DispatchTable();
1167 DispatchTableConstructor cons(table_, ignore_case);
1168 cons.BuildTable(this);
1169 }
1170 return table_;
1171}
1172
1173
1174class RegExpCompiler {
1175 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001176 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001177
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001178 int AllocateRegister() {
1179 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1180 reg_exp_too_big_ = true;
1181 return next_register_;
1182 }
1183 return next_register_++;
1184 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001185
1186 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
1187 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001188 int capture_count,
1189 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001190
1191 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1192
1193 static const int kImplementationOffset = 0;
1194 static const int kNumberOfRegistersOffset = 0;
1195 static const int kCodeOffset = 1;
1196
1197 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1198 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001199
1200 static const int kMaxRecursion = 100;
1201 inline int recursion_depth() { return recursion_depth_; }
1202 inline void IncrementRecursionDepth() { recursion_depth_++; }
1203 inline void DecrementRecursionDepth() { recursion_depth_--; }
1204
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001205 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1206
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001207 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001208 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001209
ager@chromium.org32912102009-01-16 10:38:43 +00001210 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001211 private:
1212 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001213 int next_register_;
1214 List<RegExpNode*>* work_list_;
1215 int recursion_depth_;
1216 RegExpMacroAssembler* macro_assembler_;
1217 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001218 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001219 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001220};
1221
1222
1223class RecursionCheck {
1224 public:
1225 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1226 compiler->IncrementRecursionDepth();
1227 }
1228 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1229 private:
1230 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001231};
1232
1233
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001234static Handle<FixedArray> IrregexpRegExpTooBig(Handle<String> pattern) {
1235 Handle<JSArray> array = Factory::NewJSArray(2);
1236 SetElement(array, 0, pattern);
1237 const char* message = "RegExp too big";
1238 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
1239 Handle<Object> regexp_err =
1240 Factory::NewSyntaxError("malformed_regexp", array);
1241 Top::Throw(*regexp_err);
1242 return Handle<FixedArray>();
1243}
1244
1245
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001246// Attempts to compile the regexp using an Irregexp code generator. Returns
1247// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001248RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001249 : next_register_(2 * (capture_count + 1)),
1250 work_list_(NULL),
1251 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001252 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001253 ascii_(ascii),
1254 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001255 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001256 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001257}
1258
1259
1260Handle<FixedArray> RegExpCompiler::Assemble(
1261 RegExpMacroAssembler* macro_assembler,
1262 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001263 int capture_count,
1264 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001265#ifdef DEBUG
1266 if (FLAG_trace_regexp_assembler)
1267 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1268 else
1269#endif
1270 macro_assembler_ = macro_assembler;
1271 List <RegExpNode*> work_list(0);
1272 work_list_ = &work_list;
1273 Label fail;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001274 macro_assembler->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +00001275 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001276 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001277 macro_assembler_->Bind(&fail);
1278 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001279 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001280 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001281 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001282 if (reg_exp_too_big_) return IrregexpRegExpTooBig(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001283 Handle<FixedArray> array =
1284 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
1285 array->set(RegExpImpl::kIrregexpImplementationIndex,
1286 Smi::FromInt(macro_assembler_->Implementation()));
1287 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
1288 Smi::FromInt(next_register_));
1289 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
1290 Smi::FromInt(capture_count));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001291 Handle<Object> code = macro_assembler_->GetCode(pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001292 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
1293 work_list_ = NULL;
1294#ifdef DEBUG
1295 if (FLAG_trace_regexp_assembler) {
1296 delete macro_assembler_;
1297 }
1298#endif
1299 return array;
1300}
1301
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001302
ager@chromium.org32912102009-01-16 10:38:43 +00001303bool Trace::DeferredAction::Mentions(int that) {
1304 if (type() == ActionNode::CLEAR_CAPTURES) {
1305 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1306 return range.Contains(that);
1307 } else {
1308 return reg() == that;
1309 }
1310}
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001311
ager@chromium.org32912102009-01-16 10:38:43 +00001312
1313bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001314 for (DeferredAction* action = actions_;
1315 action != NULL;
1316 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001317 if (action->Mentions(reg))
1318 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001319 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001320 return false;
1321}
1322
1323
ager@chromium.org32912102009-01-16 10:38:43 +00001324bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1325 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001326 for (DeferredAction* action = actions_;
1327 action != NULL;
1328 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001329 if (action->Mentions(reg)) {
1330 if (action->type() == ActionNode::STORE_POSITION) {
1331 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1332 return true;
1333 } else {
1334 return false;
1335 }
1336 }
1337 }
1338 return false;
1339}
1340
1341
1342int Trace::FindAffectedRegisters(OutSet* affected_registers) {
1343 int max_register = RegExpCompiler::kNoRegister;
1344 for (DeferredAction* action = actions_;
1345 action != NULL;
1346 action = action->next()) {
1347 if (action->type() == ActionNode::CLEAR_CAPTURES) {
1348 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1349 for (int i = range.from(); i <= range.to(); i++)
1350 affected_registers->Set(i);
1351 if (range.to() > max_register) max_register = range.to();
1352 } else {
1353 affected_registers->Set(action->reg());
1354 if (action->reg() > max_register) max_register = action->reg();
1355 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001356 }
1357 return max_register;
1358}
1359
1360
ager@chromium.org32912102009-01-16 10:38:43 +00001361void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1362 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001363 OutSet& registers_to_pop,
1364 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001365 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001366 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1367 else if (registers_to_clear.Get(reg)) {
1368 int clear_to = reg;
1369 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1370 reg--;
1371 }
1372 assembler->ClearRegisters(reg, clear_to);
1373 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001374 }
1375}
1376
1377
ager@chromium.org32912102009-01-16 10:38:43 +00001378void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1379 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001380 OutSet& affected_registers,
1381 OutSet* registers_to_pop,
1382 OutSet* registers_to_clear) {
1383 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1384 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1385
ager@chromium.org8bb60582008-12-11 12:02:20 +00001386 for (int reg = 0; reg <= max_register; reg++) {
1387 if (!affected_registers.Get(reg)) {
1388 continue;
1389 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001390 // Count pushes performed to force a stack limit check occasionally.
1391 int pushes = 0;
1392
1393 // The chronologically first deferred action in the trace
1394 // is used to infer the action needed to restore a register
1395 // to its previous state (or not, if it's safe to ignore it).
1396 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1397 DeferredActionUndoType undo_action = IGNORE;
1398
ager@chromium.org8bb60582008-12-11 12:02:20 +00001399 int value = 0;
1400 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +00001401 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001402 int store_position = -1;
1403 // This is a little tricky because we are scanning the actions in reverse
1404 // historical order (newest first).
1405 for (DeferredAction* action = actions_;
1406 action != NULL;
1407 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +00001408 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001409 switch (action->type()) {
1410 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00001411 Trace::DeferredSetRegister* psr =
1412 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001413 if (!absolute) {
1414 value += psr->value();
1415 absolute = true;
1416 }
1417 // SET_REGISTER is currently only used for newly introduced loop
1418 // counters. They can have a significant previous value if they
1419 // occour in a loop. TODO(lrn): Propagate this information, so
1420 // we can set undo_action to IGNORE if we know there is no value to
1421 // restore.
1422 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001423 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001424 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001425 break;
1426 }
1427 case ActionNode::INCREMENT_REGISTER:
1428 if (!absolute) {
1429 value++;
1430 }
1431 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +00001432 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001433 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001434 break;
1435 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00001436 Trace::DeferredCapture* pc =
1437 static_cast<Trace::DeferredCapture*>(action);
1438 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001439 store_position = pc->cp_offset();
1440 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001441
1442 // For captures we know that stores and clears alternate.
1443 // Other register, are never cleared, and if the occur
1444 // inside a loop, they might be assigned more than once.
1445 if (reg <= 1) {
1446 // Registers zero and one, aka "capture zero", is
1447 // always set correctly if we succeed. There is no
1448 // need to undo a setting on backtrack, because we
1449 // will set it again or fail.
1450 undo_action = IGNORE;
1451 } else {
1452 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1453 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001454 ASSERT(!absolute);
1455 ASSERT_EQ(value, 0);
1456 break;
1457 }
ager@chromium.org32912102009-01-16 10:38:43 +00001458 case ActionNode::CLEAR_CAPTURES: {
1459 // Since we're scanning in reverse order, if we've already
1460 // set the position we have to ignore historically earlier
1461 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001462 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +00001463 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001464 }
1465 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +00001466 ASSERT(!absolute);
1467 ASSERT_EQ(value, 0);
1468 break;
1469 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001470 default:
1471 UNREACHABLE();
1472 break;
1473 }
1474 }
1475 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001476 // Prepare for the undo-action (e.g., push if it's going to be popped).
1477 if (undo_action == RESTORE) {
1478 pushes++;
1479 RegExpMacroAssembler::StackCheckFlag stack_check =
1480 RegExpMacroAssembler::kNoStackLimitCheck;
1481 if (pushes == push_limit) {
1482 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1483 pushes = 0;
1484 }
1485
1486 assembler->PushRegister(reg, stack_check);
1487 registers_to_pop->Set(reg);
1488 } else if (undo_action == CLEAR) {
1489 registers_to_clear->Set(reg);
1490 }
1491 // Perform the chronologically last action (or accumulated increment)
1492 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001493 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001494 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001495 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001496 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001497 } else if (absolute) {
1498 assembler->SetRegister(reg, value);
1499 } else if (value != 0) {
1500 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001501 }
1502 }
1503}
1504
1505
ager@chromium.org8bb60582008-12-11 12:02:20 +00001506// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001507// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001508// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001509void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001510 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001511
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001512 ASSERT(actions_ != NULL ||
1513 cp_offset_ != 0 ||
1514 backtrack() != NULL ||
1515 characters_preloaded_ != 0 ||
1516 quick_check_performed_.characters() != 0 ||
1517 bound_checked_up_to_ != 0);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001518
1519 if (actions_ == NULL && backtrack() == NULL) {
1520 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001521 // a normal situation. We may also have to forget some information gained
1522 // through a quick check that was already performed.
1523 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001524 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001525 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001526 successor->Emit(compiler, &new_state);
1527 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001528 }
1529
1530 // Generate deferred actions here along with code to undo them again.
1531 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001532
ager@chromium.org8bb60582008-12-11 12:02:20 +00001533 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001534 OutSet registers_to_pop;
1535 OutSet registers_to_clear;
1536 PerformDeferredActions(assembler,
1537 max_register,
1538 affected_registers,
1539 &registers_to_pop,
1540 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001541 if (backtrack() != NULL) {
1542 // Here we have a concrete backtrack location. These are set up by choice
1543 // nodes and so they indicate that we have a deferred save of the current
1544 // position which we may need to emit here.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001545 assembler->PushCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001546 }
1547 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001548 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001549 }
1550
1551 // Create a new trivial state and generate the node with that.
1552 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001553 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001554 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001555 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001556
1557 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001558 assembler->Bind(&undo);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001559 if (backtrack() != NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001560 assembler->PopCurrentPosition();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001561 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001562 RestoreAffectedRegisters(assembler,
1563 max_register,
1564 registers_to_pop,
1565 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001566 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001567 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001568 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001569 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001570 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001571}
1572
1573
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001574void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001575 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001576
1577 // Omit flushing the trace. We discard the entire stack frame anyway.
1578
ager@chromium.org8bb60582008-12-11 12:02:20 +00001579 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001580 // We are completely independent of the trace, since we ignore it,
1581 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001582 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001583 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001584
1585 // Throw away everything on the backtrack stack since the start
1586 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001587 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1588 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001589 if (clear_capture_count_ > 0) {
1590 // Clear any captures that might have been performed during the success
1591 // of the body of the negative look-ahead.
1592 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1593 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1594 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001595 // Now that we have unwound the stack we find at the top of the stack the
1596 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001597 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001598}
1599
1600
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001601void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001602 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001603 trace->Flush(compiler, this);
1604 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001605 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001606 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001607 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001608 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001609 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001610 switch (action_) {
1611 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001612 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001613 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001614 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001615 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001616 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001617 case NEGATIVE_SUBMATCH_SUCCESS:
1618 // This case is handled in a different virtual method.
1619 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001620 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001621 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001622}
1623
1624
1625void GuardedAlternative::AddGuard(Guard* guard) {
1626 if (guards_ == NULL)
1627 guards_ = new ZoneList<Guard*>(1);
1628 guards_->Add(guard);
1629}
1630
1631
ager@chromium.org8bb60582008-12-11 12:02:20 +00001632ActionNode* ActionNode::SetRegister(int reg,
1633 int val,
1634 RegExpNode* on_success) {
1635 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001636 result->data_.u_store_register.reg = reg;
1637 result->data_.u_store_register.value = val;
1638 return result;
1639}
1640
1641
1642ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1643 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1644 result->data_.u_increment_register.reg = reg;
1645 return result;
1646}
1647
1648
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001649ActionNode* ActionNode::StorePosition(int reg,
1650 bool is_capture,
1651 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001652 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1653 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001654 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001655 return result;
1656}
1657
1658
ager@chromium.org32912102009-01-16 10:38:43 +00001659ActionNode* ActionNode::ClearCaptures(Interval range,
1660 RegExpNode* on_success) {
1661 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1662 result->data_.u_clear_captures.range_from = range.from();
1663 result->data_.u_clear_captures.range_to = range.to();
1664 return result;
1665}
1666
1667
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001668ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1669 int position_reg,
1670 RegExpNode* on_success) {
1671 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1672 result->data_.u_submatch.stack_pointer_register = stack_reg;
1673 result->data_.u_submatch.current_position_register = position_reg;
1674 return result;
1675}
1676
1677
ager@chromium.org8bb60582008-12-11 12:02:20 +00001678ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1679 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001680 int clear_register_count,
1681 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001682 RegExpNode* on_success) {
1683 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001684 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001685 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001686 result->data_.u_submatch.clear_register_count = clear_register_count;
1687 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001688 return result;
1689}
1690
1691
ager@chromium.org32912102009-01-16 10:38:43 +00001692ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1693 int repetition_register,
1694 int repetition_limit,
1695 RegExpNode* on_success) {
1696 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1697 result->data_.u_empty_match_check.start_register = start_register;
1698 result->data_.u_empty_match_check.repetition_register = repetition_register;
1699 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1700 return result;
1701}
1702
1703
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001704#define DEFINE_ACCEPT(Type) \
1705 void Type##Node::Accept(NodeVisitor* visitor) { \
1706 visitor->Visit##Type(this); \
1707 }
1708FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1709#undef DEFINE_ACCEPT
1710
1711
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001712void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1713 visitor->VisitLoopChoice(this);
1714}
1715
1716
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001717// -------------------------------------------------------------------
1718// Emit code.
1719
1720
1721void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1722 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001723 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001724 switch (guard->op()) {
1725 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001726 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001727 macro_assembler->IfRegisterGE(guard->reg(),
1728 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001729 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001730 break;
1731 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001732 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001733 macro_assembler->IfRegisterLT(guard->reg(),
1734 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001735 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001736 break;
1737 }
1738}
1739
1740
1741static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1742static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1743
1744
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001745// Only emits non-letters (things that don't have case). Only used for case
1746// independent matches.
1747static inline bool EmitAtomNonLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001748 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001749 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001750 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001751 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001752 bool check,
1753 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001754 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001755 int length = uncanonicalize.get(c, '\0', chars);
1756 bool checked = false;
1757 if (length <= 1) {
1758 if (!preloaded) {
1759 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1760 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001761 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001762 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001763 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001764 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001765}
1766
1767
1768static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001769 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001770 uc16 c1,
1771 uc16 c2,
1772 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001773 uc16 char_mask;
1774 if (ascii) {
1775 char_mask = String::kMaxAsciiCharCode;
1776 } else {
1777 char_mask = String::kMaxUC16CharCode;
1778 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001779 uc16 exor = c1 ^ c2;
1780 // Check whether exor has only one bit set.
1781 if (((exor - 1) & exor) == 0) {
1782 // If c1 and c2 differ only by one bit.
1783 // Ecma262UnCanonicalize always gives the highest number last.
1784 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001785 uc16 mask = char_mask ^ exor;
1786 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001787 return true;
1788 }
1789 ASSERT(c2 > c1);
1790 uc16 diff = c2 - c1;
1791 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1792 // If the characters differ by 2^n but don't differ by one bit then
1793 // subtract the difference from the found character, then do the or
1794 // trick. We avoid the theoretical case where negative numbers are
1795 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001796 uc16 mask = char_mask ^ diff;
1797 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1798 diff,
1799 mask,
1800 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001801 return true;
1802 }
1803 return false;
1804}
1805
1806
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001807// Only emits letters (things that have case). Only used for case independent
1808// matches.
1809static inline bool EmitAtomLetter(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001810 RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001811 bool ascii,
1812 uc16 c,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001813 Label* on_failure,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001814 int cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001815 bool check,
1816 bool preloaded) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001817 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001818 int length = uncanonicalize.get(c, '\0', chars);
1819 if (length <= 1) return false;
1820 // We may not need to check against the end of the input string
1821 // if this character lies before a character that matched.
1822 if (!preloaded) {
1823 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001824 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001825 Label ok;
1826 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1827 switch (length) {
1828 case 2: {
1829 if (ShortCutEmitCharacterPair(macro_assembler,
1830 ascii,
1831 chars[0],
1832 chars[1],
1833 on_failure)) {
1834 } else {
1835 macro_assembler->CheckCharacter(chars[0], &ok);
1836 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1837 macro_assembler->Bind(&ok);
1838 }
1839 break;
1840 }
1841 case 4:
1842 macro_assembler->CheckCharacter(chars[3], &ok);
1843 // Fall through!
1844 case 3:
1845 macro_assembler->CheckCharacter(chars[0], &ok);
1846 macro_assembler->CheckCharacter(chars[1], &ok);
1847 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1848 macro_assembler->Bind(&ok);
1849 break;
1850 default:
1851 UNREACHABLE();
1852 break;
1853 }
1854 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001855}
1856
1857
1858static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1859 RegExpCharacterClass* cc,
1860 int cp_offset,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001861 Label* on_failure,
1862 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001863 bool ascii,
1864 bool preloaded) {
1865 if (cc->is_standard() &&
1866 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1867 cp_offset,
1868 check_offset,
1869 on_failure)) {
1870 return;
1871 }
1872
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001873 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001874 int max_char;
1875 if (ascii) {
1876 max_char = String::kMaxAsciiCharCode;
1877 } else {
1878 max_char = String::kMaxUC16CharCode;
1879 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001880
1881 Label success;
1882
1883 Label* char_is_in_class =
1884 cc->is_negated() ? on_failure : &success;
1885
1886 int range_count = ranges->length();
1887
ager@chromium.org8bb60582008-12-11 12:02:20 +00001888 int last_valid_range = range_count - 1;
1889 while (last_valid_range >= 0) {
1890 CharacterRange& range = ranges->at(last_valid_range);
1891 if (range.from() <= max_char) {
1892 break;
1893 }
1894 last_valid_range--;
1895 }
1896
1897 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001898 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001899 // TODO(plesner): We can remove this when the node level does our
1900 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001901 macro_assembler->GoTo(on_failure);
1902 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001903 if (check_offset) {
1904 macro_assembler->CheckPosition(cp_offset, on_failure);
1905 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001906 return;
1907 }
1908
ager@chromium.org8bb60582008-12-11 12:02:20 +00001909 if (last_valid_range == 0 &&
1910 !cc->is_negated() &&
1911 ranges->at(0).IsEverything(max_char)) {
1912 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001913 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001914 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001915 }
1916 return;
1917 }
1918
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001919 if (!preloaded) {
1920 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001921 }
1922
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001923 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001924 CharacterRange& range = ranges->at(i);
1925 Label next_range;
1926 uc16 from = range.from();
1927 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001928 if (from > max_char) {
1929 continue;
1930 }
1931 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001932 if (to == from) {
1933 macro_assembler->CheckCharacter(to, char_is_in_class);
1934 } else {
1935 if (from != 0) {
1936 macro_assembler->CheckCharacterLT(from, &next_range);
1937 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001938 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001939 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1940 } else {
1941 macro_assembler->GoTo(char_is_in_class);
1942 }
1943 }
1944 macro_assembler->Bind(&next_range);
1945 }
1946
ager@chromium.org8bb60582008-12-11 12:02:20 +00001947 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001948 uc16 from = range.from();
1949 uc16 to = range.to();
1950
ager@chromium.org8bb60582008-12-11 12:02:20 +00001951 if (to > max_char) to = max_char;
1952 ASSERT(to >= from);
1953
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001954 if (to == from) {
1955 if (cc->is_negated()) {
1956 macro_assembler->CheckCharacter(to, on_failure);
1957 } else {
1958 macro_assembler->CheckNotCharacter(to, on_failure);
1959 }
1960 } else {
1961 if (from != 0) {
1962 if (cc->is_negated()) {
1963 macro_assembler->CheckCharacterLT(from, &success);
1964 } else {
1965 macro_assembler->CheckCharacterLT(from, on_failure);
1966 }
1967 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001968 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001969 if (cc->is_negated()) {
1970 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1971 } else {
1972 macro_assembler->CheckCharacterGT(to, on_failure);
1973 }
1974 } else {
1975 if (cc->is_negated()) {
1976 macro_assembler->GoTo(on_failure);
1977 }
1978 }
1979 }
1980 macro_assembler->Bind(&success);
1981}
1982
1983
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001984RegExpNode::~RegExpNode() {
1985}
1986
1987
ager@chromium.org8bb60582008-12-11 12:02:20 +00001988RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001989 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001990 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001991 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001992 return CONTINUE;
1993 }
1994
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001995 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001996 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001997 if (label_.is_bound()) {
1998 // We are being asked to generate a generic version, but that's already
1999 // been done so just go to it.
2000 macro_assembler->GoTo(&label_);
2001 return DONE;
2002 }
2003 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2004 // To avoid too deep recursion we push the node to the work queue and just
2005 // generate a goto here.
2006 compiler->AddWork(this);
2007 macro_assembler->GoTo(&label_);
2008 return DONE;
2009 }
2010 // Generate generic version of the node and bind the label for later use.
2011 macro_assembler->Bind(&label_);
2012 return CONTINUE;
2013 }
2014
2015 // We are being asked to make a non-generic version. Keep track of how many
2016 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00002017 trace_count_++;
2018 if (trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00002019 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2020 return CONTINUE;
2021 }
2022
ager@chromium.org32912102009-01-16 10:38:43 +00002023 // If we get here code has been generated for this node too many times or
2024 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00002025 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002026 trace->Flush(compiler, this);
2027 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002028}
2029
2030
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002031int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002032 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2033 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002034 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002035}
2036
2037
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002038int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2039 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2040 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2041}
2042
2043
2044int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2045 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2046 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
2047}
2048
2049
2050int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002051 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002052 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002053 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002054 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2055 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002056}
2057
2058
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002059int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
2060 int recursion_depth) {
2061 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2062 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2063 // afterwards.
2064 RegExpNode* node = alternatives_->at(1).node();
2065 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
2066}
2067
2068
2069void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2070 QuickCheckDetails* details,
2071 RegExpCompiler* compiler,
2072 int filled_in) {
2073 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2074 // afterwards.
2075 RegExpNode* node = alternatives_->at(1).node();
2076 return node->GetQuickCheckDetails(details, compiler, filled_in);
2077}
2078
2079
2080int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2081 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002082 RegExpNode* ignore_this_node) {
2083 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2084 int min = 100;
2085 int choice_count = alternatives_->length();
2086 for (int i = 0; i < choice_count; i++) {
2087 RegExpNode* node = alternatives_->at(i).node();
2088 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002089 int node_eats_at_least = node->EatsAtLeast(still_to_find,
2090 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002091 if (node_eats_at_least < min) min = node_eats_at_least;
2092 }
2093 return min;
2094}
2095
2096
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002097int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2098 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002099}
2100
2101
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002102int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
2103 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002104}
2105
2106
2107// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2108static inline uint32_t SmearBitsRight(uint32_t v) {
2109 v |= v >> 1;
2110 v |= v >> 2;
2111 v |= v >> 4;
2112 v |= v >> 8;
2113 v |= v >> 16;
2114 return v;
2115}
2116
2117
2118bool QuickCheckDetails::Rationalize(bool asc) {
2119 bool found_useful_op = false;
2120 uint32_t char_mask;
2121 if (asc) {
2122 char_mask = String::kMaxAsciiCharCode;
2123 } else {
2124 char_mask = String::kMaxUC16CharCode;
2125 }
2126 mask_ = 0;
2127 value_ = 0;
2128 int char_shift = 0;
2129 for (int i = 0; i < characters_; i++) {
2130 Position* pos = &positions_[i];
2131 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
2132 found_useful_op = true;
2133 }
2134 mask_ |= (pos->mask & char_mask) << char_shift;
2135 value_ |= (pos->value & char_mask) << char_shift;
2136 char_shift += asc ? 8 : 16;
2137 }
2138 return found_useful_op;
2139}
2140
2141
2142bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002143 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002144 bool preload_has_checked_bounds,
2145 Label* on_possible_success,
2146 QuickCheckDetails* details,
2147 bool fall_through_on_failure) {
2148 if (details->characters() == 0) return false;
2149 GetQuickCheckDetails(details, compiler, 0);
2150 if (!details->Rationalize(compiler->ascii())) return false;
2151 uint32_t mask = details->mask();
2152 uint32_t value = details->value();
2153
2154 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2155
ager@chromium.org32912102009-01-16 10:38:43 +00002156 if (trace->characters_preloaded() != details->characters()) {
2157 assembler->LoadCurrentCharacter(trace->cp_offset(),
2158 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002159 !preload_has_checked_bounds,
2160 details->characters());
2161 }
2162
2163
2164 bool need_mask = true;
2165
2166 if (details->characters() == 1) {
2167 // If number of characters preloaded is 1 then we used a byte or 16 bit
2168 // load so the value is already masked down.
2169 uint32_t char_mask;
2170 if (compiler->ascii()) {
2171 char_mask = String::kMaxAsciiCharCode;
2172 } else {
2173 char_mask = String::kMaxUC16CharCode;
2174 }
2175 if ((mask & char_mask) == char_mask) need_mask = false;
2176 mask &= char_mask;
2177 } else {
2178 // For 2-character preloads in ASCII mode we also use a 16 bit load with
2179 // zero extend.
2180 if (details->characters() == 2 && compiler->ascii()) {
2181 if ((mask & 0xffff) == 0xffff) need_mask = false;
2182 } else {
2183 if (mask == 0xffffffff) need_mask = false;
2184 }
2185 }
2186
2187 if (fall_through_on_failure) {
2188 if (need_mask) {
2189 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2190 } else {
2191 assembler->CheckCharacter(value, on_possible_success);
2192 }
2193 } else {
2194 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00002195 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002196 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002197 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002198 }
2199 }
2200 return true;
2201}
2202
2203
2204// Here is the meat of GetQuickCheckDetails (see also the comment on the
2205// super-class in the .h file).
2206//
2207// We iterate along the text object, building up for each character a
2208// mask and value that can be used to test for a quick failure to match.
2209// The masks and values for the positions will be combined into a single
2210// machine word for the current character width in order to be used in
2211// generating a quick check.
2212void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2213 RegExpCompiler* compiler,
2214 int characters_filled_in) {
2215 ASSERT(characters_filled_in < details->characters());
2216 int characters = details->characters();
2217 int char_mask;
2218 int char_shift;
2219 if (compiler->ascii()) {
2220 char_mask = String::kMaxAsciiCharCode;
2221 char_shift = 8;
2222 } else {
2223 char_mask = String::kMaxUC16CharCode;
2224 char_shift = 16;
2225 }
2226 for (int k = 0; k < elms_->length(); k++) {
2227 TextElement elm = elms_->at(k);
2228 if (elm.type == TextElement::ATOM) {
2229 Vector<const uc16> quarks = elm.data.u_atom->data();
2230 for (int i = 0; i < characters && i < quarks.length(); i++) {
2231 QuickCheckDetails::Position* pos =
2232 details->positions(characters_filled_in);
2233 if (compiler->ignore_case()) {
2234 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2235 uc16 c = quarks[i];
2236 int length = uncanonicalize.get(c, '\0', chars);
2237 if (length < 2) {
2238 // This letter has no case equivalents, so it's nice and simple
2239 // and the mask-compare will determine definitely whether we have
2240 // a match at this character position.
2241 pos->mask = char_mask;
2242 pos->value = c;
2243 pos->determines_perfectly = true;
2244 } else {
2245 uint32_t common_bits = char_mask;
2246 uint32_t bits = chars[0];
2247 for (int j = 1; j < length; j++) {
2248 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2249 common_bits ^= differing_bits;
2250 bits &= common_bits;
2251 }
2252 // If length is 2 and common bits has only one zero in it then
2253 // our mask and compare instruction will determine definitely
2254 // whether we have a match at this character position. Otherwise
2255 // it can only be an approximate check.
2256 uint32_t one_zero = (common_bits | ~char_mask);
2257 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2258 pos->determines_perfectly = true;
2259 }
2260 pos->mask = common_bits;
2261 pos->value = bits;
2262 }
2263 } else {
2264 // Don't ignore case. Nice simple case where the mask-compare will
2265 // determine definitely whether we have a match at this character
2266 // position.
2267 pos->mask = char_mask;
2268 pos->value = quarks[i];
2269 pos->determines_perfectly = true;
2270 }
2271 characters_filled_in++;
2272 ASSERT(characters_filled_in <= details->characters());
2273 if (characters_filled_in == details->characters()) {
2274 return;
2275 }
2276 }
2277 } else {
2278 QuickCheckDetails::Position* pos =
2279 details->positions(characters_filled_in);
2280 RegExpCharacterClass* tree = elm.data.u_char_class;
2281 ZoneList<CharacterRange>* ranges = tree->ranges();
2282 CharacterRange range = ranges->at(0);
2283 if (tree->is_negated()) {
2284 // A quick check uses multi-character mask and compare. There is no
2285 // useful way to incorporate a negative char class into this scheme
2286 // so we just conservatively create a mask and value that will always
2287 // succeed.
2288 pos->mask = 0;
2289 pos->value = 0;
2290 } else {
2291 uint32_t differing_bits = (range.from() ^ range.to());
2292 // A mask and compare is only perfect if the differing bits form a
2293 // number like 00011111 with one single block of trailing 1s.
2294 if ((differing_bits & (differing_bits + 1)) == 0) {
2295 pos->determines_perfectly = true;
2296 }
2297 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2298 uint32_t bits = (range.from() & common_bits);
2299 for (int i = 1; i < ranges->length(); i++) {
2300 // Here we are combining more ranges into the mask and compare
2301 // value. With each new range the mask becomes more sparse and
2302 // so the chances of a false positive rise. A character class
2303 // with multiple ranges is assumed never to be equivalent to a
2304 // mask and compare operation.
2305 pos->determines_perfectly = false;
2306 CharacterRange range = ranges->at(i);
2307 uint32_t new_common_bits = (range.from() ^ range.to());
2308 new_common_bits = ~SmearBitsRight(new_common_bits);
2309 common_bits &= new_common_bits;
2310 bits &= new_common_bits;
2311 uint32_t differing_bits = (range.from() & common_bits) ^ bits;
2312 common_bits ^= differing_bits;
2313 bits &= common_bits;
2314 }
2315 pos->mask = common_bits;
2316 pos->value = bits;
2317 }
2318 characters_filled_in++;
2319 ASSERT(characters_filled_in <= details->characters());
2320 if (characters_filled_in == details->characters()) {
2321 return;
2322 }
2323 }
2324 }
2325 ASSERT(characters_filled_in != details->characters());
2326 on_success()-> GetQuickCheckDetails(details, compiler, characters_filled_in);
2327}
2328
2329
2330void QuickCheckDetails::Clear() {
2331 for (int i = 0; i < characters_; i++) {
2332 positions_[i].mask = 0;
2333 positions_[i].value = 0;
2334 positions_[i].determines_perfectly = false;
2335 }
2336 characters_ = 0;
2337}
2338
2339
2340void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002341 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002342 if (by >= characters_) {
2343 Clear();
2344 return;
2345 }
2346 for (int i = 0; i < characters_ - by; i++) {
2347 positions_[i] = positions_[by + i];
2348 }
2349 for (int i = characters_ - by; i < characters_; i++) {
2350 positions_[i].mask = 0;
2351 positions_[i].value = 0;
2352 positions_[i].determines_perfectly = false;
2353 }
2354 characters_ -= by;
2355 // We could change mask_ and value_ here but we would never advance unless
2356 // they had already been used in a check and they won't be used again because
2357 // it would gain us nothing. So there's no point.
2358}
2359
2360
2361void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2362 ASSERT(characters_ == other->characters_);
2363 for (int i = from_index; i < characters_; i++) {
2364 QuickCheckDetails::Position* pos = positions(i);
2365 QuickCheckDetails::Position* other_pos = other->positions(i);
2366 if (pos->mask != other_pos->mask ||
2367 pos->value != other_pos->value ||
2368 !other_pos->determines_perfectly) {
2369 // Our mask-compare operation will be approximate unless we have the
2370 // exact same operation on both sides of the alternation.
2371 pos->determines_perfectly = false;
2372 }
2373 pos->mask &= other_pos->mask;
2374 pos->value &= pos->mask;
2375 other_pos->value &= pos->mask;
2376 uc16 differing_bits = (pos->value ^ other_pos->value);
2377 pos->mask &= ~differing_bits;
2378 pos->value &= pos->mask;
2379 }
2380}
2381
2382
ager@chromium.org32912102009-01-16 10:38:43 +00002383class VisitMarker {
2384 public:
2385 explicit VisitMarker(NodeInfo* info) : info_(info) {
2386 ASSERT(!info->visited);
2387 info->visited = true;
2388 }
2389 ~VisitMarker() {
2390 info_->visited = false;
2391 }
2392 private:
2393 NodeInfo* info_;
2394};
2395
2396
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002397void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2398 RegExpCompiler* compiler,
2399 int characters_filled_in) {
ager@chromium.org32912102009-01-16 10:38:43 +00002400 if (body_can_be_zero_length_ || info()->visited) return;
2401 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002402 return ChoiceNode::GetQuickCheckDetails(details,
2403 compiler,
2404 characters_filled_in);
2405}
2406
2407
2408void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2409 RegExpCompiler* compiler,
2410 int characters_filled_in) {
2411 int choice_count = alternatives_->length();
2412 ASSERT(choice_count > 0);
2413 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2414 compiler,
2415 characters_filled_in);
2416 for (int i = 1; i < choice_count; i++) {
2417 QuickCheckDetails new_details(details->characters());
2418 RegExpNode* node = alternatives_->at(i).node();
2419 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in);
2420 // Here we merge the quick match details of the two branches.
2421 details->Merge(&new_details, characters_filled_in);
2422 }
2423}
2424
2425
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002426// Check for [0-9A-Z_a-z].
2427static void EmitWordCheck(RegExpMacroAssembler* assembler,
2428 Label* word,
2429 Label* non_word,
2430 bool fall_through_on_word) {
2431 assembler->CheckCharacterGT('z', non_word);
2432 assembler->CheckCharacterLT('0', non_word);
2433 assembler->CheckCharacterGT('a' - 1, word);
2434 assembler->CheckCharacterLT('9' + 1, word);
2435 assembler->CheckCharacterLT('A', non_word);
2436 assembler->CheckCharacterLT('Z' + 1, word);
2437 if (fall_through_on_word) {
2438 assembler->CheckNotCharacter('_', non_word);
2439 } else {
2440 assembler->CheckCharacter('_', word);
2441 }
2442}
2443
2444
2445// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2446// that matches newline or the start of input).
2447static void EmitHat(RegExpCompiler* compiler,
2448 RegExpNode* on_success,
2449 Trace* trace) {
2450 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2451 // We will be loading the previous character into the current character
2452 // register.
2453 Trace new_trace(*trace);
2454 new_trace.InvalidateCurrentCharacter();
2455
2456 Label ok;
2457 if (new_trace.cp_offset() == 0) {
2458 // The start of input counts as a newline in this context, so skip to
2459 // ok if we are at the start.
2460 assembler->CheckAtStart(&ok);
2461 }
2462 // We already checked that we are not at the start of input so it must be
2463 // OK to load the previous character.
2464 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2465 new_trace.backtrack(),
2466 false);
2467 // Newline means \n, \r, 0x2028 or 0x2029.
2468 if (!compiler->ascii()) {
2469 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2470 }
2471 assembler->CheckCharacter('\n', &ok);
2472 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2473 assembler->Bind(&ok);
2474 on_success->Emit(compiler, &new_trace);
2475}
2476
2477
2478// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2479static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2480 RegExpCompiler* compiler,
2481 RegExpNode* on_success,
2482 Trace* trace) {
2483 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2484 Label before_non_word;
2485 Label before_word;
2486 if (trace->characters_preloaded() != 1) {
2487 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2488 }
2489 // Fall through on non-word.
2490 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2491
2492 // We will be loading the previous character into the current character
2493 // register.
2494 Trace new_trace(*trace);
2495 new_trace.InvalidateCurrentCharacter();
2496
2497 Label ok;
2498 Label* boundary;
2499 Label* not_boundary;
2500 if (type == AssertionNode::AT_BOUNDARY) {
2501 boundary = &ok;
2502 not_boundary = new_trace.backtrack();
2503 } else {
2504 not_boundary = &ok;
2505 boundary = new_trace.backtrack();
2506 }
2507
2508 // Next character is not a word character.
2509 assembler->Bind(&before_non_word);
2510 if (new_trace.cp_offset() == 0) {
2511 // The start of input counts as a non-word character, so the question is
2512 // decided if we are at the start.
2513 assembler->CheckAtStart(not_boundary);
2514 }
2515 // We already checked that we are not at the start of input so it must be
2516 // OK to load the previous character.
2517 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2518 &ok, // Unused dummy label in this call.
2519 false);
2520 // Fall through on non-word.
2521 EmitWordCheck(assembler, boundary, not_boundary, false);
2522 assembler->GoTo(not_boundary);
2523
2524 // Next character is a word character.
2525 assembler->Bind(&before_word);
2526 if (new_trace.cp_offset() == 0) {
2527 // The start of input counts as a non-word character, so the question is
2528 // decided if we are at the start.
2529 assembler->CheckAtStart(boundary);
2530 }
2531 // We already checked that we are not at the start of input so it must be
2532 // OK to load the previous character.
2533 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2534 &ok, // Unused dummy label in this call.
2535 false);
2536 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2537 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2538
2539 assembler->Bind(&ok);
2540
2541 on_success->Emit(compiler, &new_trace);
2542}
2543
2544
2545void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2546 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2547 switch (type_) {
2548 case AT_END: {
2549 Label ok;
2550 assembler->CheckPosition(trace->cp_offset(), &ok);
2551 assembler->GoTo(trace->backtrack());
2552 assembler->Bind(&ok);
2553 break;
2554 }
2555 case AT_START:
2556 assembler->CheckNotAtStart(trace->backtrack());
2557 break;
2558 case AFTER_NEWLINE:
2559 EmitHat(compiler, on_success(), trace);
2560 return;
2561 case AT_NON_BOUNDARY:
2562 case AT_BOUNDARY:
2563 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2564 return;
2565 }
2566 on_success()->Emit(compiler, trace);
2567}
2568
2569
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002570// We call this repeatedly to generate code for each pass over the text node.
2571// The passes are in increasing order of difficulty because we hope one
2572// of the first passes will fail in which case we are saved the work of the
2573// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2574// we will check the '%' in the first pass, the case independent 'a' in the
2575// second pass and the character class in the last pass.
2576//
2577// The passes are done from right to left, so for example to test for /bar/
2578// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2579// and then a 'b' with offset 0. This means we can avoid the end-of-input
2580// bounds check most of the time. In the example we only need to check for
2581// end-of-input when loading the putative 'r'.
2582//
2583// A slight complication involves the fact that the first character may already
2584// be fetched into a register by the previous node. In this case we want to
2585// do the test for that character first. We do this in separate passes. The
2586// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2587// pass has been performed then subsequent passes will have true in
2588// first_element_checked to indicate that that character does not need to be
2589// checked again.
2590//
ager@chromium.org32912102009-01-16 10:38:43 +00002591// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002592// contain an AlternativeGeneration object. In this AlternativeGeneration
2593// object we can see details of any quick check that was already passed in
2594// order to get to the code we are now generating. The quick check can involve
2595// loading characters, which means we do not need to recheck the bounds
2596// up to the limit the quick check already checked. In addition the quick
2597// check can have involved a mask and compare operation which may simplify
2598// or obviate the need for further checks at some character positions.
2599void TextNode::TextEmitPass(RegExpCompiler* compiler,
2600 TextEmitPassType pass,
2601 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002602 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002603 bool first_element_checked,
2604 int* checked_up_to) {
2605 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2606 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002607 Label* backtrack = trace->backtrack();
2608 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002609 int element_count = elms_->length();
2610 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2611 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002612 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002613 if (elm.type == TextElement::ATOM) {
2614 if (pass == NON_ASCII_MATCH ||
2615 pass == CHARACTER_MATCH ||
2616 pass == CASE_CHARACTER_MATCH) {
2617 Vector<const uc16> quarks = elm.data.u_atom->data();
2618 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2619 bool bound_checked = true; // Most ops will check their bounds.
2620 if (first_element_checked && i == 0 && j == 0) continue;
2621 if (quick_check != NULL &&
2622 elm.cp_offset + j < quick_check->characters() &&
2623 quick_check->positions(elm.cp_offset + j)->determines_perfectly) {
2624 continue;
2625 }
2626 if (pass == NON_ASCII_MATCH) {
2627 ASSERT(ascii);
2628 if (quarks[j] > String::kMaxAsciiCharCode) {
2629 assembler->GoTo(backtrack);
2630 return;
2631 }
2632 } else if (pass == CHARACTER_MATCH) {
2633 if (compiler->ignore_case()) {
2634 bound_checked = EmitAtomNonLetter(assembler,
2635 quarks[j],
2636 backtrack,
2637 cp_offset + j,
2638 *checked_up_to < cp_offset + j,
2639 preloaded);
2640 } else {
2641 if (!preloaded) {
2642 assembler->LoadCurrentCharacter(cp_offset + j,
2643 backtrack,
2644 *checked_up_to < cp_offset + j);
2645 }
2646 assembler->CheckNotCharacter(quarks[j], backtrack);
2647 }
2648 } else {
2649 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
2650 ASSERT(compiler->ignore_case());
2651 bound_checked = EmitAtomLetter(assembler,
2652 compiler->ascii(),
2653 quarks[j],
2654 backtrack,
2655 cp_offset + j,
2656 *checked_up_to < cp_offset + j,
2657 preloaded);
2658 }
2659 if (pass != NON_ASCII_MATCH && bound_checked) {
2660 if (cp_offset + j > *checked_up_to) {
2661 *checked_up_to = cp_offset + j;
2662 }
2663 }
2664 }
2665 }
2666 } else {
2667 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2668 if (first_element_checked && i == 0) continue;
2669 if (quick_check != NULL &&
2670 elm.cp_offset < quick_check->characters() &&
2671 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2672 continue;
2673 }
2674 if (pass == CHARACTER_CLASS_MATCH) {
2675 RegExpCharacterClass* cc = elm.data.u_char_class;
2676 EmitCharClass(assembler,
2677 cc,
2678 cp_offset,
2679 backtrack,
2680 *checked_up_to < cp_offset,
2681 ascii,
2682 preloaded);
2683 if (cp_offset > *checked_up_to) {
2684 *checked_up_to = cp_offset;
2685 }
2686 }
2687 }
2688 }
2689}
2690
2691
2692int TextNode::Length() {
2693 TextElement elm = elms_->last();
2694 ASSERT(elm.cp_offset >= 0);
2695 if (elm.type == TextElement::ATOM) {
2696 return elm.cp_offset + elm.data.u_atom->data().length();
2697 } else {
2698 return elm.cp_offset + 1;
2699 }
2700}
2701
2702
ager@chromium.org8bb60582008-12-11 12:02:20 +00002703// This generates the code to match a text node. A text node can contain
2704// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002705// way) and character classes. For efficiency we do not do this in a single
2706// pass from left to right. Instead we pass over the text node several times,
2707// emitting code for some character positions every time. See the comment on
2708// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002709void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002710 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002711 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002712 ASSERT(limit_result == CONTINUE);
2713
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002714 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2715 compiler->SetRegExpTooBig();
2716 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002717 }
2718
2719 if (compiler->ascii()) {
2720 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002721 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002722 }
2723
2724 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002725 int bound_checked_to = trace->cp_offset() - 1;
2726 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002727
2728 // If a character is preloaded into the current character register then
2729 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002730 if (trace->characters_preloaded() == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002731 TextEmitPass(compiler,
2732 CHARACTER_MATCH,
2733 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002734 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002735 false,
2736 &bound_checked_to);
2737 if (compiler->ignore_case()) {
2738 TextEmitPass(compiler,
2739 CASE_CHARACTER_MATCH,
2740 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002741 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002742 false,
2743 &bound_checked_to);
2744 }
2745 TextEmitPass(compiler,
2746 CHARACTER_CLASS_MATCH,
2747 true,
ager@chromium.org32912102009-01-16 10:38:43 +00002748 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002749 false,
2750 &bound_checked_to);
2751 first_elt_done = true;
2752 }
2753
2754 TextEmitPass(compiler,
2755 CHARACTER_MATCH,
2756 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002757 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002758 first_elt_done,
2759 &bound_checked_to);
2760 if (compiler->ignore_case()) {
2761 TextEmitPass(compiler,
2762 CASE_CHARACTER_MATCH,
2763 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002764 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002765 first_elt_done,
2766 &bound_checked_to);
2767 }
2768 TextEmitPass(compiler,
2769 CHARACTER_CLASS_MATCH,
2770 false,
ager@chromium.org32912102009-01-16 10:38:43 +00002771 trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002772 first_elt_done,
2773 &bound_checked_to);
2774
ager@chromium.org32912102009-01-16 10:38:43 +00002775 Trace successor_trace(*trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002776 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002777 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002778 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002779}
2780
2781
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002782void Trace::InvalidateCurrentCharacter() {
2783 characters_preloaded_ = 0;
2784}
2785
2786
2787void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002788 ASSERT(by > 0);
2789 // We don't have an instruction for shifting the current character register
2790 // down or for using a shifted value for anything so lets just forget that
2791 // we preloaded any characters into it.
2792 characters_preloaded_ = 0;
2793 // Adjust the offsets of the quick check performed information. This
2794 // information is used to find out what we already determined about the
2795 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002796 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002797 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002798 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2799 compiler->SetRegExpTooBig();
2800 cp_offset_ = 0;
2801 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002802 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002803}
2804
2805
2806void TextNode::MakeCaseIndependent() {
2807 int element_count = elms_->length();
2808 for (int i = 0; i < element_count; i++) {
2809 TextElement elm = elms_->at(i);
2810 if (elm.type == TextElement::CHAR_CLASS) {
2811 RegExpCharacterClass* cc = elm.data.u_char_class;
2812 ZoneList<CharacterRange>* ranges = cc->ranges();
2813 int range_count = ranges->length();
2814 for (int i = 0; i < range_count; i++) {
2815 ranges->at(i).AddCaseEquivalents(ranges);
2816 }
2817 }
2818 }
2819}
2820
2821
ager@chromium.org8bb60582008-12-11 12:02:20 +00002822int TextNode::GreedyLoopTextLength() {
2823 TextElement elm = elms_->at(elms_->length() - 1);
2824 if (elm.type == TextElement::CHAR_CLASS) {
2825 return elm.cp_offset + 1;
2826 } else {
2827 return elm.cp_offset + elm.data.u_atom->data().length();
2828 }
2829}
2830
2831
2832// Finds the fixed match length of a sequence of nodes that goes from
2833// this alternative and back to this choice node. If there are variable
2834// length nodes or other complications in the way then return a sentinel
2835// value indicating that a greedy loop cannot be constructed.
2836int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2837 int length = 0;
2838 RegExpNode* node = alternative->node();
2839 // Later we will generate code for all these text nodes using recursion
2840 // so we have to limit the max number.
2841 int recursion_depth = 0;
2842 while (node != this) {
2843 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2844 return kNodeIsTooComplexForGreedyLoops;
2845 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002846 int node_length = node->GreedyLoopTextLength();
2847 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2848 return kNodeIsTooComplexForGreedyLoops;
2849 }
2850 length += node_length;
2851 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2852 node = seq_node->on_success();
2853 }
2854 return length;
2855}
2856
2857
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002858void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2859 ASSERT_EQ(loop_node_, NULL);
2860 AddAlternative(alt);
2861 loop_node_ = alt.node();
2862}
2863
2864
2865void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2866 ASSERT_EQ(continue_node_, NULL);
2867 AddAlternative(alt);
2868 continue_node_ = alt.node();
2869}
2870
2871
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002872void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002873 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002874 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002875 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2876 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2877 // Update the counter-based backtracking info on the stack. This is an
2878 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002879 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002880 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002881 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002882 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002883 }
ager@chromium.org32912102009-01-16 10:38:43 +00002884 ASSERT(trace->stop_node() == NULL);
2885 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002886 trace->Flush(compiler, this);
2887 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002888 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002889 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002890}
2891
2892
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002893int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002894 int preload_characters = EatsAtLeast(4, 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002895#ifdef CAN_READ_UNALIGNED
2896 bool ascii = compiler->ascii();
2897 if (ascii) {
2898 if (preload_characters > 4) preload_characters = 4;
2899 // We can't preload 3 characters because there is no machine instruction
2900 // to do that. We can't just load 4 because we could be reading
2901 // beyond the end of the string, which could cause a memory fault.
2902 if (preload_characters == 3) preload_characters = 2;
2903 } else {
2904 if (preload_characters > 2) preload_characters = 2;
2905 }
2906#else
2907 if (preload_characters > 1) preload_characters = 1;
2908#endif
2909 return preload_characters;
2910}
2911
2912
2913// This class is used when generating the alternatives in a choice node. It
2914// records the way the alternative is being code generated.
2915class AlternativeGeneration: public Malloced {
2916 public:
2917 AlternativeGeneration()
2918 : possible_success(),
2919 expects_preload(false),
2920 after(),
2921 quick_check_details() { }
2922 Label possible_success;
2923 bool expects_preload;
2924 Label after;
2925 QuickCheckDetails quick_check_details;
2926};
2927
2928
2929// Creates a list of AlternativeGenerations. If the list has a reasonable
2930// size then it is on the stack, otherwise the excess is on the heap.
2931class AlternativeGenerationList {
2932 public:
2933 explicit AlternativeGenerationList(int count)
2934 : alt_gens_(count) {
2935 for (int i = 0; i < count && i < kAFew; i++) {
2936 alt_gens_.Add(a_few_alt_gens_ + i);
2937 }
2938 for (int i = kAFew; i < count; i++) {
2939 alt_gens_.Add(new AlternativeGeneration());
2940 }
2941 }
2942 ~AlternativeGenerationList() {
2943 for (int i = 0; i < alt_gens_.length(); i++) {
2944 alt_gens_[i]->possible_success.Unuse();
2945 alt_gens_[i]->after.Unuse();
2946 }
2947 for (int i = kAFew; i < alt_gens_.length(); i++) {
2948 delete alt_gens_[i];
2949 alt_gens_[i] = NULL;
2950 }
2951 }
2952
2953 AlternativeGeneration* at(int i) {
2954 return alt_gens_[i];
2955 }
2956 private:
2957 static const int kAFew = 10;
2958 ZoneList<AlternativeGeneration*> alt_gens_;
2959 AlternativeGeneration a_few_alt_gens_[kAFew];
2960};
2961
2962
2963/* Code generation for choice nodes.
2964 *
2965 * We generate quick checks that do a mask and compare to eliminate a
2966 * choice. If the quick check succeeds then it jumps to the continuation to
2967 * do slow checks and check subsequent nodes. If it fails (the common case)
2968 * it falls through to the next choice.
2969 *
2970 * Here is the desired flow graph. Nodes directly below each other imply
2971 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2972 * 3 doesn't have a quick check so we have to call the slow check.
2973 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2974 * regexp continuation is generated directly after the Sn node, up to the
2975 * next GoTo if we decide to reuse some already generated code. Some
2976 * nodes expect preload_characters to be preloaded into the current
2977 * character register. R nodes do this preloading. Vertices are marked
2978 * F for failures and S for success (possible success in the case of quick
2979 * nodes). L, V, < and > are used as arrow heads.
2980 *
2981 * ----------> R
2982 * |
2983 * V
2984 * Q1 -----> S1
2985 * | S /
2986 * F| /
2987 * | F/
2988 * | /
2989 * | R
2990 * | /
2991 * V L
2992 * Q2 -----> S2
2993 * | S /
2994 * F| /
2995 * | F/
2996 * | /
2997 * | R
2998 * | /
2999 * V L
3000 * S3
3001 * |
3002 * F|
3003 * |
3004 * R
3005 * |
3006 * backtrack V
3007 * <----------Q4
3008 * \ F |
3009 * \ |S
3010 * \ F V
3011 * \-----S4
3012 *
3013 * For greedy loops we reverse our expectation and expect to match rather
3014 * than fail. Therefore we want the loop code to look like this (U is the
3015 * unwind code that steps back in the greedy loop). The following alternatives
3016 * look the same as above.
3017 * _____
3018 * / \
3019 * V |
3020 * ----------> S1 |
3021 * /| |
3022 * / |S |
3023 * F/ \_____/
3024 * /
3025 * |<-----------
3026 * | \
3027 * V \
3028 * Q2 ---> S2 \
3029 * | S / |
3030 * F| / |
3031 * | F/ |
3032 * | / |
3033 * | R |
3034 * | / |
3035 * F VL |
3036 * <------U |
3037 * back |S |
3038 * \______________/
3039 */
3040
3041
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003042void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003043 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3044 int choice_count = alternatives_->length();
3045#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003046 for (int i = 0; i < choice_count - 1; i++) {
3047 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003048 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003049 int guard_count = (guards == NULL) ? 0 : guards->length();
3050 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003051 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003052 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003053 }
3054#endif
3055
ager@chromium.org32912102009-01-16 10:38:43 +00003056 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003057 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003058 ASSERT(limit_result == CONTINUE);
3059
3060 RecursionCheck rc(compiler);
3061
ager@chromium.org32912102009-01-16 10:38:43 +00003062 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003063
3064 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
3065 bool greedy_loop = false;
3066 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00003067 Trace counter_backtrack_trace;
3068 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003069 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3070 // Here we have special handling for greedy loops containing only text nodes
3071 // and other simple nodes. These are handled by pushing the current
3072 // position on the stack and then incrementing the current position each
3073 // time around the switch. On backtrack we decrement the current position
3074 // and check it against the pushed value. This avoids pushing backtrack
3075 // information for each iteration of the loop, which could take up a lot of
3076 // space.
3077 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00003078 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003079 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00003080 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003081 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00003082 Trace greedy_match_trace;
3083 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003084 Label loop_label;
3085 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00003086 greedy_match_trace.set_stop_node(this);
3087 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003088 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003089 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003090 }
3091
3092 Label second_choice; // For use in greedy matches.
3093 macro_assembler->Bind(&second_choice);
3094
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003095 int first_normal_choice = greedy_loop ? 1 : 0;
3096
3097 int preload_characters = CalculatePreloadCharacters(compiler);
3098 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00003099 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003100 bool preload_has_checked_bounds = preload_is_current;
3101
3102 AlternativeGenerationList alt_gens(choice_count);
3103
ager@chromium.org8bb60582008-12-11 12:02:20 +00003104 // For now we just call all choices one after the other. The idea ultimately
3105 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003106 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003107 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00003108 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003109 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003110 ZoneList<Guard*>* guards = alternative.guards();
3111 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00003112 Trace new_trace(*current_trace);
3113 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003114 preload_characters :
3115 0);
3116 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00003117 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003118 }
ager@chromium.org32912102009-01-16 10:38:43 +00003119 new_trace.quick_check_performed()->Clear();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003120 alt_gen->expects_preload = preload_is_current;
3121 bool generate_full_check_inline = false;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003122 if (try_to_emit_quick_check_for_alternative(i) &&
3123 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003124 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003125 preload_has_checked_bounds,
3126 &alt_gen->possible_success,
3127 &alt_gen->quick_check_details,
3128 i < choice_count - 1)) {
3129 // Quick check was generated for this choice.
3130 preload_is_current = true;
3131 preload_has_checked_bounds = true;
3132 // On the last choice in the ChoiceNode we generated the quick
3133 // check to fall through on possible success. So now we need to
3134 // generate the full check inline.
3135 if (i == choice_count - 1) {
3136 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003137 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3138 new_trace.set_characters_preloaded(preload_characters);
3139 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003140 generate_full_check_inline = true;
3141 }
3142 } else {
3143 // No quick check was generated. Put the full code here.
3144 // If this is not the first choice then there could be slow checks from
3145 // previous cases that go here when they fail. There's no reason to
3146 // insist that they preload characters since the slow check we are about
3147 // to generate probably can't use it.
3148 if (i != first_normal_choice) {
3149 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00003150 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003151 }
3152 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00003153 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003154 }
3155 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003156 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003157 if (generate_full_check_inline) {
3158 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003159 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003160 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003161 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003162 preload_is_current = false;
3163 }
3164 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003165 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003166 if (greedy_loop) {
3167 macro_assembler->Bind(&greedy_loop_label);
3168 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00003169 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00003170 // Otherwise try the second priority at an earlier position.
3171 macro_assembler->AdvanceCurrentPosition(-text_length);
3172 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003173 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003174 // At this point we need to generate slow checks for the alternatives where
3175 // the quick check was inlined. We can recognize these because the associated
3176 // label was bound.
3177 for (int i = first_normal_choice; i < choice_count - 1; i++) {
3178 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003179 EmitOutOfLineContinuation(compiler,
3180 current_trace,
3181 alternatives_->at(i),
3182 alt_gen,
3183 preload_characters,
3184 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003185 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003186}
3187
3188
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003189void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00003190 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003191 GuardedAlternative alternative,
3192 AlternativeGeneration* alt_gen,
3193 int preload_characters,
3194 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003195 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003196
3197 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3198 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00003199 Trace out_of_line_trace(*trace);
3200 out_of_line_trace.set_characters_preloaded(preload_characters);
3201 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003202 ZoneList<Guard*>* guards = alternative.guards();
3203 int guard_count = (guards == NULL) ? 0 : guards->length();
3204 if (next_expects_preload) {
3205 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00003206 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003207 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003208 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003209 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003210 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003211 macro_assembler->Bind(&reload_current_char);
3212 // Reload the current character, since the next quick check expects that.
3213 // We don't need to check bounds here because we only get into this
3214 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00003215 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003216 NULL,
3217 false,
3218 preload_characters);
3219 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003220 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00003221 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003222 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00003223 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003224 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003225 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003226 }
3227}
3228
3229
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003230void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003231 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003232 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003233 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003234 ASSERT(limit_result == CONTINUE);
3235
3236 RecursionCheck rc(compiler);
3237
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003238 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003239 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00003240 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003241 new_capture(data_.u_position_register.reg,
3242 data_.u_position_register.is_capture,
3243 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003244 Trace new_trace = *trace;
3245 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003246 on_success()->Emit(compiler, &new_trace);
3247 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003248 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003249 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003250 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003251 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00003252 Trace new_trace = *trace;
3253 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003254 on_success()->Emit(compiler, &new_trace);
3255 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003256 }
3257 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00003258 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00003259 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00003260 Trace new_trace = *trace;
3261 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003262 on_success()->Emit(compiler, &new_trace);
3263 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003264 }
3265 case CLEAR_CAPTURES: {
3266 Trace::DeferredClearCaptures
3267 new_capture(Interval(data_.u_clear_captures.range_from,
3268 data_.u_clear_captures.range_to));
3269 Trace new_trace = *trace;
3270 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003271 on_success()->Emit(compiler, &new_trace);
3272 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003273 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003274 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003275 if (!trace->is_trivial()) {
3276 trace->Flush(compiler, this);
3277 } else {
3278 assembler->WriteCurrentPositionToRegister(
3279 data_.u_submatch.current_position_register, 0);
3280 assembler->WriteStackPointerToRegister(
3281 data_.u_submatch.stack_pointer_register);
3282 on_success()->Emit(compiler, trace);
3283 }
3284 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003285 case EMPTY_MATCH_CHECK: {
3286 int start_pos_reg = data_.u_empty_match_check.start_register;
3287 int stored_pos = 0;
3288 int rep_reg = data_.u_empty_match_check.repetition_register;
3289 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3290 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3291 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3292 // If we know we haven't advanced and there is no minimum we
3293 // can just backtrack immediately.
3294 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00003295 } else if (know_dist && stored_pos < trace->cp_offset()) {
3296 // If we know we've advanced we can generate the continuation
3297 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003298 on_success()->Emit(compiler, trace);
3299 } else if (!trace->is_trivial()) {
3300 trace->Flush(compiler, this);
3301 } else {
3302 Label skip_empty_check;
3303 // If we have a minimum number of repetitions we check the current
3304 // number first and skip the empty check if it's not enough.
3305 if (has_minimum) {
3306 int limit = data_.u_empty_match_check.repetition_limit;
3307 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3308 }
3309 // If the match is empty we bail out, otherwise we fall through
3310 // to the on-success continuation.
3311 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3312 trace->backtrack());
3313 assembler->Bind(&skip_empty_check);
3314 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00003315 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003316 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003317 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003318 case POSITIVE_SUBMATCH_SUCCESS: {
3319 if (!trace->is_trivial()) {
3320 trace->Flush(compiler, this);
3321 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003322 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003323 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00003324 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003325 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003326 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003327 int clear_register_count = data_.u_submatch.clear_register_count;
3328 if (clear_register_count == 0) {
3329 on_success()->Emit(compiler, trace);
3330 return;
3331 }
3332 int clear_registers_from = data_.u_submatch.clear_register_from;
3333 Label clear_registers_backtrack;
3334 Trace new_trace = *trace;
3335 new_trace.set_backtrack(&clear_registers_backtrack);
3336 on_success()->Emit(compiler, &new_trace);
3337
3338 assembler->Bind(&clear_registers_backtrack);
3339 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3340 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3341
3342 ASSERT(trace->backtrack() == NULL);
3343 assembler->Backtrack();
3344 return;
3345 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003346 default:
3347 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003348 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003349}
3350
3351
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003352void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003353 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003354 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003355 trace->Flush(compiler, this);
3356 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003357 }
3358
ager@chromium.org32912102009-01-16 10:38:43 +00003359 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003360 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003361 ASSERT(limit_result == CONTINUE);
3362
3363 RecursionCheck rc(compiler);
3364
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003365 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003366 if (compiler->ignore_case()) {
3367 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3368 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003369 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003370 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003371 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003372 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003373}
3374
3375
3376// -------------------------------------------------------------------
3377// Dot/dotty output
3378
3379
3380#ifdef DEBUG
3381
3382
3383class DotPrinter: public NodeVisitor {
3384 public:
3385 explicit DotPrinter(bool ignore_case)
3386 : ignore_case_(ignore_case),
3387 stream_(&alloc_) { }
3388 void PrintNode(const char* label, RegExpNode* node);
3389 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003390 void PrintAttributes(RegExpNode* from);
3391 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003392 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003393#define DECLARE_VISIT(Type) \
3394 virtual void Visit##Type(Type##Node* that);
3395FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3396#undef DECLARE_VISIT
3397 private:
3398 bool ignore_case_;
3399 HeapStringAllocator alloc_;
3400 StringStream stream_;
3401};
3402
3403
3404void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3405 stream()->Add("digraph G {\n graph [label=\"");
3406 for (int i = 0; label[i]; i++) {
3407 switch (label[i]) {
3408 case '\\':
3409 stream()->Add("\\\\");
3410 break;
3411 case '"':
3412 stream()->Add("\"");
3413 break;
3414 default:
3415 stream()->Put(label[i]);
3416 break;
3417 }
3418 }
3419 stream()->Add("\"];\n");
3420 Visit(node);
3421 stream()->Add("}\n");
3422 printf("%s", *(stream()->ToCString()));
3423}
3424
3425
3426void DotPrinter::Visit(RegExpNode* node) {
3427 if (node->info()->visited) return;
3428 node->info()->visited = true;
3429 node->Accept(this);
3430}
3431
3432
3433void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003434 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3435 Visit(on_failure);
3436}
3437
3438
3439class TableEntryBodyPrinter {
3440 public:
3441 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3442 : stream_(stream), choice_(choice) { }
3443 void Call(uc16 from, DispatchTable::Entry entry) {
3444 OutSet* out_set = entry.out_set();
3445 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3446 if (out_set->Get(i)) {
3447 stream()->Add(" n%p:s%io%i -> n%p;\n",
3448 choice(),
3449 from,
3450 i,
3451 choice()->alternatives()->at(i).node());
3452 }
3453 }
3454 }
3455 private:
3456 StringStream* stream() { return stream_; }
3457 ChoiceNode* choice() { return choice_; }
3458 StringStream* stream_;
3459 ChoiceNode* choice_;
3460};
3461
3462
3463class TableEntryHeaderPrinter {
3464 public:
3465 explicit TableEntryHeaderPrinter(StringStream* stream)
3466 : first_(true), stream_(stream) { }
3467 void Call(uc16 from, DispatchTable::Entry entry) {
3468 if (first_) {
3469 first_ = false;
3470 } else {
3471 stream()->Add("|");
3472 }
3473 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3474 OutSet* out_set = entry.out_set();
3475 int priority = 0;
3476 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3477 if (out_set->Get(i)) {
3478 if (priority > 0) stream()->Add("|");
3479 stream()->Add("<s%io%i> %i", from, i, priority);
3480 priority++;
3481 }
3482 }
3483 stream()->Add("}}");
3484 }
3485 private:
3486 bool first_;
3487 StringStream* stream() { return stream_; }
3488 StringStream* stream_;
3489};
3490
3491
3492class AttributePrinter {
3493 public:
3494 explicit AttributePrinter(DotPrinter* out)
3495 : out_(out), first_(true) { }
3496 void PrintSeparator() {
3497 if (first_) {
3498 first_ = false;
3499 } else {
3500 out_->stream()->Add("|");
3501 }
3502 }
3503 void PrintBit(const char* name, bool value) {
3504 if (!value) return;
3505 PrintSeparator();
3506 out_->stream()->Add("{%s}", name);
3507 }
3508 void PrintPositive(const char* name, int value) {
3509 if (value < 0) return;
3510 PrintSeparator();
3511 out_->stream()->Add("{%s|%x}", name, value);
3512 }
3513 private:
3514 DotPrinter* out_;
3515 bool first_;
3516};
3517
3518
3519void DotPrinter::PrintAttributes(RegExpNode* that) {
3520 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3521 "margin=0.1, fontsize=10, label=\"{",
3522 that);
3523 AttributePrinter printer(this);
3524 NodeInfo* info = that->info();
3525 printer.PrintBit("NI", info->follows_newline_interest);
3526 printer.PrintBit("WI", info->follows_word_interest);
3527 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003528 Label* label = that->label();
3529 if (label->is_bound())
3530 printer.PrintPositive("@", label->pos());
3531 stream()->Add("}\"];\n");
3532 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3533 "arrowhead=none];\n", that, that);
3534}
3535
3536
3537static const bool kPrintDispatchTable = false;
3538void DotPrinter::VisitChoice(ChoiceNode* that) {
3539 if (kPrintDispatchTable) {
3540 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3541 TableEntryHeaderPrinter header_printer(stream());
3542 that->GetTable(ignore_case_)->ForEach(&header_printer);
3543 stream()->Add("\"]\n", that);
3544 PrintAttributes(that);
3545 TableEntryBodyPrinter body_printer(stream(), that);
3546 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003547 } else {
3548 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3549 for (int i = 0; i < that->alternatives()->length(); i++) {
3550 GuardedAlternative alt = that->alternatives()->at(i);
3551 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3552 }
3553 }
3554 for (int i = 0; i < that->alternatives()->length(); i++) {
3555 GuardedAlternative alt = that->alternatives()->at(i);
3556 alt.node()->Accept(this);
3557 }
3558}
3559
3560
3561void DotPrinter::VisitText(TextNode* that) {
3562 stream()->Add(" n%p [label=\"", that);
3563 for (int i = 0; i < that->elements()->length(); i++) {
3564 if (i > 0) stream()->Add(" ");
3565 TextElement elm = that->elements()->at(i);
3566 switch (elm.type) {
3567 case TextElement::ATOM: {
3568 stream()->Add("'%w'", elm.data.u_atom->data());
3569 break;
3570 }
3571 case TextElement::CHAR_CLASS: {
3572 RegExpCharacterClass* node = elm.data.u_char_class;
3573 stream()->Add("[");
3574 if (node->is_negated())
3575 stream()->Add("^");
3576 for (int j = 0; j < node->ranges()->length(); j++) {
3577 CharacterRange range = node->ranges()->at(j);
3578 stream()->Add("%k-%k", range.from(), range.to());
3579 }
3580 stream()->Add("]");
3581 break;
3582 }
3583 default:
3584 UNREACHABLE();
3585 }
3586 }
3587 stream()->Add("\", shape=box, peripheries=2];\n");
3588 PrintAttributes(that);
3589 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3590 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003591}
3592
3593
3594void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3595 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3596 that,
3597 that->start_register(),
3598 that->end_register());
3599 PrintAttributes(that);
3600 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3601 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003602}
3603
3604
3605void DotPrinter::VisitEnd(EndNode* that) {
3606 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3607 PrintAttributes(that);
3608}
3609
3610
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003611void DotPrinter::VisitAssertion(AssertionNode* that) {
3612 stream()->Add(" n%p [", that);
3613 switch (that->type()) {
3614 case AssertionNode::AT_END:
3615 stream()->Add("label=\"$\", shape=septagon");
3616 break;
3617 case AssertionNode::AT_START:
3618 stream()->Add("label=\"^\", shape=septagon");
3619 break;
3620 case AssertionNode::AT_BOUNDARY:
3621 stream()->Add("label=\"\\b\", shape=septagon");
3622 break;
3623 case AssertionNode::AT_NON_BOUNDARY:
3624 stream()->Add("label=\"\\B\", shape=septagon");
3625 break;
3626 case AssertionNode::AFTER_NEWLINE:
3627 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3628 break;
3629 }
3630 stream()->Add("];\n");
3631 PrintAttributes(that);
3632 RegExpNode* successor = that->on_success();
3633 stream()->Add(" n%p -> n%p;\n", that, successor);
3634 Visit(successor);
3635}
3636
3637
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003638void DotPrinter::VisitAction(ActionNode* that) {
3639 stream()->Add(" n%p [", that);
3640 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003641 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003642 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3643 that->data_.u_store_register.reg,
3644 that->data_.u_store_register.value);
3645 break;
3646 case ActionNode::INCREMENT_REGISTER:
3647 stream()->Add("label=\"$%i++\", shape=octagon",
3648 that->data_.u_increment_register.reg);
3649 break;
3650 case ActionNode::STORE_POSITION:
3651 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3652 that->data_.u_position_register.reg);
3653 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003654 case ActionNode::BEGIN_SUBMATCH:
3655 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3656 that->data_.u_submatch.current_position_register);
3657 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003658 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003659 stream()->Add("label=\"escape\", shape=septagon");
3660 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003661 case ActionNode::EMPTY_MATCH_CHECK:
3662 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3663 that->data_.u_empty_match_check.start_register,
3664 that->data_.u_empty_match_check.repetition_register,
3665 that->data_.u_empty_match_check.repetition_limit);
3666 break;
3667 case ActionNode::CLEAR_CAPTURES: {
3668 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3669 that->data_.u_clear_captures.range_from,
3670 that->data_.u_clear_captures.range_to);
3671 break;
3672 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003673 }
3674 stream()->Add("];\n");
3675 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003676 RegExpNode* successor = that->on_success();
3677 stream()->Add(" n%p -> n%p;\n", that, successor);
3678 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003679}
3680
3681
3682class DispatchTableDumper {
3683 public:
3684 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3685 void Call(uc16 key, DispatchTable::Entry entry);
3686 StringStream* stream() { return stream_; }
3687 private:
3688 StringStream* stream_;
3689};
3690
3691
3692void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3693 stream()->Add("[%k-%k]: {", key, entry.to());
3694 OutSet* set = entry.out_set();
3695 bool first = true;
3696 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3697 if (set->Get(i)) {
3698 if (first) {
3699 first = false;
3700 } else {
3701 stream()->Add(", ");
3702 }
3703 stream()->Add("%i", i);
3704 }
3705 }
3706 stream()->Add("}\n");
3707}
3708
3709
3710void DispatchTable::Dump() {
3711 HeapStringAllocator alloc;
3712 StringStream stream(&alloc);
3713 DispatchTableDumper dumper(&stream);
3714 tree()->ForEach(&dumper);
3715 OS::PrintError("%s", *stream.ToCString());
3716}
3717
3718
3719void RegExpEngine::DotPrint(const char* label,
3720 RegExpNode* node,
3721 bool ignore_case) {
3722 DotPrinter printer(ignore_case);
3723 printer.PrintNode(label, node);
3724}
3725
3726
3727#endif // DEBUG
3728
3729
3730// -------------------------------------------------------------------
3731// Tree to graph conversion
3732
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003733static const int kSpaceRangeCount = 20;
3734static const int kSpaceRangeAsciiCount = 4;
3735static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3736 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3737 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3738
3739static const int kWordRangeCount = 8;
3740static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3741 '_', 'a', 'z' };
3742
3743static const int kDigitRangeCount = 2;
3744static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3745
3746static const int kLineTerminatorRangeCount = 6;
3747static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3748 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003749
3750RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003751 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003752 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3753 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003754 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003755}
3756
3757
3758RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003759 RegExpNode* on_success) {
3760 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003761}
3762
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003763static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3764 const uc16* special_class,
3765 int length) {
3766 ASSERT(ranges->length() != 0);
3767 ASSERT(length != 0);
3768 ASSERT(special_class[0] != 0);
3769 if (ranges->length() != (length >> 1) + 1) {
3770 return false;
3771 }
3772 CharacterRange range = ranges->at(0);
3773 if (range.from() != 0) {
3774 return false;
3775 }
3776 for (int i = 0; i < length; i += 2) {
3777 if (special_class[i] != (range.to() + 1)) {
3778 return false;
3779 }
3780 range = ranges->at((i >> 1) + 1);
3781 if (special_class[i+1] != range.from() - 1) {
3782 return false;
3783 }
3784 }
3785 if (range.to() != 0xffff) {
3786 return false;
3787 }
3788 return true;
3789}
3790
3791
3792static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3793 const uc16* special_class,
3794 int length) {
3795 if (ranges->length() * 2 != length) {
3796 return false;
3797 }
3798 for (int i = 0; i < length; i += 2) {
3799 CharacterRange range = ranges->at(i >> 1);
3800 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3801 return false;
3802 }
3803 }
3804 return true;
3805}
3806
3807
3808bool RegExpCharacterClass::is_standard() {
3809 // TODO(lrn): Remove need for this function, by not throwing away information
3810 // along the way.
3811 if (is_negated_) {
3812 return false;
3813 }
3814 if (set_.is_standard()) {
3815 return true;
3816 }
3817 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3818 set_.set_standard_set_type('s');
3819 return true;
3820 }
3821 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3822 set_.set_standard_set_type('S');
3823 return true;
3824 }
3825 if (CompareInverseRanges(set_.ranges(),
3826 kLineTerminatorRanges,
3827 kLineTerminatorRangeCount)) {
3828 set_.set_standard_set_type('.');
3829 return true;
3830 }
3831 return false;
3832}
3833
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003834
3835RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003836 RegExpNode* on_success) {
3837 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003838}
3839
3840
3841RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003842 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003843 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3844 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003845 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003846 for (int i = 0; i < length; i++) {
3847 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003848 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003849 result->AddAlternative(alternative);
3850 }
3851 return result;
3852}
3853
3854
3855RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003856 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003857 return ToNode(min(),
3858 max(),
3859 is_greedy(),
3860 body(),
3861 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003862 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003863}
3864
3865
3866RegExpNode* RegExpQuantifier::ToNode(int min,
3867 int max,
3868 bool is_greedy,
3869 RegExpTree* body,
3870 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003871 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003872 // x{f, t} becomes this:
3873 //
3874 // (r++)<-.
3875 // | `
3876 // | (x)
3877 // v ^
3878 // (r=0)-->(?)---/ [if r < t]
3879 // |
3880 // [if r >= f] \----> ...
3881 //
3882 //
3883 // TODO(someone): clear captures on repetition and handle empty
3884 // matches.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003885
3886 // 15.10.2.5 RepeatMatcher algorithm.
3887 // The parser has already eliminated the case where max is 0. In the case
3888 // where max_match is zero the parser has removed the quantifier if min was
3889 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3890
3891 // If we know that we cannot match zero length then things are a little
3892 // simpler since we don't need to make the special zero length match check
3893 // from step 2.1. If the min and max are small we can unroll a little in
3894 // this case.
3895 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3896 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3897 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003898 bool body_can_be_empty = (body->min_match() == 0);
3899 int body_start_reg = RegExpCompiler::kNoRegister;
3900 Interval capture_registers = body->CaptureRegisters();
3901 bool needs_capture_clearing = !capture_registers.is_empty();
3902 if (body_can_be_empty) {
3903 body_start_reg = compiler->AllocateRegister();
3904 } else if (!needs_capture_clearing) {
3905 // Only unroll if there are no captures and the body can't be
3906 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003907 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3908 int new_max = (max == kInfinity) ? max : max - min;
3909 // Recurse once to get the loop or optional matches after the fixed ones.
3910 RegExpNode* answer =
3911 ToNode(0, new_max, is_greedy, body, compiler, on_success);
3912 // Unroll the forced matches from 0 to min. This can cause chains of
3913 // TextNodes (which the parser does not generate). These should be
3914 // combined if it turns out they hinder good code generation.
3915 for (int i = 0; i < min; i++) {
3916 answer = body->ToNode(compiler, answer);
3917 }
3918 return answer;
3919 }
3920 if (max <= kMaxUnrolledMaxMatches) {
3921 ASSERT(min == 0);
3922 // Unroll the optional matches up to max.
3923 RegExpNode* answer = on_success;
3924 for (int i = 0; i < max; i++) {
3925 ChoiceNode* alternation = new ChoiceNode(2);
3926 if (is_greedy) {
3927 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3928 answer)));
3929 alternation->AddAlternative(GuardedAlternative(on_success));
3930 } else {
3931 alternation->AddAlternative(GuardedAlternative(on_success));
3932 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3933 answer)));
3934 }
3935 answer = alternation;
3936 }
3937 return answer;
3938 }
3939 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003940 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003941 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003942 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003943 int reg_ctr = needs_counter
3944 ? compiler->AllocateRegister()
3945 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003946 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003947 RegExpNode* loop_return = needs_counter
3948 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3949 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003950 if (body_can_be_empty) {
3951 // If the body can be empty we need to check if it was and then
3952 // backtrack.
3953 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3954 reg_ctr,
3955 min,
3956 loop_return);
3957 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003958 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003959 if (body_can_be_empty) {
3960 // If the body can be empty we need to store the start position
3961 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003962 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00003963 }
3964 if (needs_capture_clearing) {
3965 // Before entering the body of this loop we need to clear captures.
3966 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3967 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003968 GuardedAlternative body_alt(body_node);
3969 if (has_max) {
3970 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3971 body_alt.AddGuard(body_guard);
3972 }
3973 GuardedAlternative rest_alt(on_success);
3974 if (has_min) {
3975 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3976 rest_alt.AddGuard(rest_guard);
3977 }
3978 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003979 center->AddLoopAlternative(body_alt);
3980 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003981 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003982 center->AddContinueAlternative(rest_alt);
3983 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003984 }
3985 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003986 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003987 } else {
3988 return center;
3989 }
3990}
3991
3992
3993RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003994 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003995 NodeInfo info;
3996 switch (type()) {
3997 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003998 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003999 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004000 return AssertionNode::AtStart(on_success);
4001 case BOUNDARY:
4002 return AssertionNode::AtBoundary(on_success);
4003 case NON_BOUNDARY:
4004 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004005 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004006 return AssertionNode::AtEnd(on_success);
4007 case END_OF_LINE: {
4008 // Compile $ in multiline regexps as an alternation with a positive
4009 // lookahead in one side and an end-of-input on the other side.
4010 // We need two registers for the lookahead.
4011 int stack_pointer_register = compiler->AllocateRegister();
4012 int position_register = compiler->AllocateRegister();
4013 // The ChoiceNode to distinguish between a newline and end-of-input.
4014 ChoiceNode* result = new ChoiceNode(2);
4015 // Create a newline atom.
4016 ZoneList<CharacterRange>* newline_ranges =
4017 new ZoneList<CharacterRange>(3);
4018 CharacterRange::AddClassEscape('n', newline_ranges);
4019 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
4020 TextNode* newline_matcher = new TextNode(
4021 newline_atom,
4022 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4023 position_register,
4024 0, // No captures inside.
4025 -1, // Ignored if no captures.
4026 on_success));
4027 // Create an end-of-input matcher.
4028 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
4029 stack_pointer_register,
4030 position_register,
4031 newline_matcher);
4032 // Add the two alternatives to the ChoiceNode.
4033 GuardedAlternative eol_alternative(end_of_line);
4034 result->AddAlternative(eol_alternative);
4035 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4036 result->AddAlternative(end_alternative);
4037 return result;
4038 }
4039 default:
4040 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004041 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004042 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004043}
4044
4045
4046RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004047 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004048 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
4049 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00004050 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004051}
4052
4053
4054RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004055 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004056 return on_success;
4057}
4058
4059
4060RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004061 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004062 int stack_pointer_register = compiler->AllocateRegister();
4063 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004064
4065 const int registers_per_capture = 2;
4066 const int register_of_first_capture = 2;
4067 int register_count = capture_count_ * registers_per_capture;
4068 int register_start =
4069 register_of_first_capture + capture_from_ * registers_per_capture;
4070
ager@chromium.org8bb60582008-12-11 12:02:20 +00004071 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004072 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004073 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004074 stack_pointer_register,
4075 position_register,
4076 body()->ToNode(
4077 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004078 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4079 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004080 register_count,
4081 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004082 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004083 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004084 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004085 // We use a ChoiceNode for a negative lookahead because it has most of
4086 // the characteristics we need. It has the body of the lookahead as its
4087 // first alternative and the expression after the lookahead of the second
4088 // alternative. If the first alternative succeeds then the
4089 // NegativeSubmatchSuccess will unwind the stack including everything the
4090 // choice node set up and backtrack. If the first alternative fails then
4091 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004092 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
4093 // ChoiceNode that knows to ignore the first exit when calculating quick
4094 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004095 GuardedAlternative body_alt(
4096 body()->ToNode(
4097 compiler,
4098 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004099 position_register,
4100 register_count,
4101 register_start)));
4102 ChoiceNode* choice_node =
4103 new NegativeLookaheadChoiceNode(body_alt,
4104 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004105 return ActionNode::BeginSubmatch(stack_pointer_register,
4106 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004107 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004108 }
4109}
4110
4111
4112RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004113 RegExpNode* on_success) {
4114 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004115}
4116
4117
4118RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4119 int index,
4120 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004121 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004122 int start_reg = RegExpCapture::StartRegister(index);
4123 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004124 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004125 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004126 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004127}
4128
4129
4130RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004131 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004132 ZoneList<RegExpTree*>* children = nodes();
4133 RegExpNode* current = on_success;
4134 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004135 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004136 }
4137 return current;
4138}
4139
4140
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004141static void AddClass(const uc16* elmv,
4142 int elmc,
4143 ZoneList<CharacterRange>* ranges) {
4144 for (int i = 0; i < elmc; i += 2) {
4145 ASSERT(elmv[i] <= elmv[i + 1]);
4146 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4147 }
4148}
4149
4150
4151static void AddClassNegated(const uc16 *elmv,
4152 int elmc,
4153 ZoneList<CharacterRange>* ranges) {
4154 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00004155 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004156 uc16 last = 0x0000;
4157 for (int i = 0; i < elmc; i += 2) {
4158 ASSERT(last <= elmv[i] - 1);
4159 ASSERT(elmv[i] <= elmv[i + 1]);
4160 ranges->Add(CharacterRange(last, elmv[i] - 1));
4161 last = elmv[i + 1] + 1;
4162 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004163 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004164}
4165
4166
4167void CharacterRange::AddClassEscape(uc16 type,
4168 ZoneList<CharacterRange>* ranges) {
4169 switch (type) {
4170 case 's':
4171 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4172 break;
4173 case 'S':
4174 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4175 break;
4176 case 'w':
4177 AddClass(kWordRanges, kWordRangeCount, ranges);
4178 break;
4179 case 'W':
4180 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4181 break;
4182 case 'd':
4183 AddClass(kDigitRanges, kDigitRangeCount, ranges);
4184 break;
4185 case 'D':
4186 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4187 break;
4188 case '.':
4189 AddClassNegated(kLineTerminatorRanges,
4190 kLineTerminatorRangeCount,
4191 ranges);
4192 break;
4193 // This is not a character range as defined by the spec but a
4194 // convenient shorthand for a character class that matches any
4195 // character.
4196 case '*':
4197 ranges->Add(CharacterRange::Everything());
4198 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004199 // This is the set of characters matched by the $ and ^ symbols
4200 // in multiline mode.
4201 case 'n':
4202 AddClass(kLineTerminatorRanges,
4203 kLineTerminatorRangeCount,
4204 ranges);
4205 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004206 default:
4207 UNREACHABLE();
4208 }
4209}
4210
4211
4212Vector<const uc16> CharacterRange::GetWordBounds() {
4213 return Vector<const uc16>(kWordRanges, kWordRangeCount);
4214}
4215
4216
4217class CharacterRangeSplitter {
4218 public:
4219 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4220 ZoneList<CharacterRange>** excluded)
4221 : included_(included),
4222 excluded_(excluded) { }
4223 void Call(uc16 from, DispatchTable::Entry entry);
4224
4225 static const int kInBase = 0;
4226 static const int kInOverlay = 1;
4227
4228 private:
4229 ZoneList<CharacterRange>** included_;
4230 ZoneList<CharacterRange>** excluded_;
4231};
4232
4233
4234void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4235 if (!entry.out_set()->Get(kInBase)) return;
4236 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4237 ? included_
4238 : excluded_;
4239 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4240 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4241}
4242
4243
4244void CharacterRange::Split(ZoneList<CharacterRange>* base,
4245 Vector<const uc16> overlay,
4246 ZoneList<CharacterRange>** included,
4247 ZoneList<CharacterRange>** excluded) {
4248 ASSERT_EQ(NULL, *included);
4249 ASSERT_EQ(NULL, *excluded);
4250 DispatchTable table;
4251 for (int i = 0; i < base->length(); i++)
4252 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4253 for (int i = 0; i < overlay.length(); i += 2) {
4254 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4255 CharacterRangeSplitter::kInOverlay);
4256 }
4257 CharacterRangeSplitter callback(included, excluded);
4258 table.ForEach(&callback);
4259}
4260
4261
4262void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
4263 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4264 if (IsSingleton()) {
4265 // If this is a singleton we just expand the one character.
4266 int length = uncanonicalize.get(from(), '\0', chars);
4267 for (int i = 0; i < length; i++) {
4268 uc32 chr = chars[i];
4269 if (chr != from()) {
4270 ranges->Add(CharacterRange::Singleton(chars[i]));
4271 }
4272 }
4273 } else if (from() <= kRangeCanonicalizeMax &&
4274 to() <= kRangeCanonicalizeMax) {
4275 // If this is a range we expand the characters block by block,
4276 // expanding contiguous subranges (blocks) one at a time.
4277 // The approach is as follows. For a given start character we
4278 // look up the block that contains it, for instance 'a' if the
4279 // start character is 'c'. A block is characterized by the property
4280 // that all characters uncanonicalize in the same way as the first
4281 // element, except that each entry in the result is incremented
4282 // by the distance from the first element. So a-z is a block
4283 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
4284 // uncanonicalizes to ['a' + k, 'A' + k].
4285 // Once we've found the start point we look up its uncanonicalization
4286 // and produce a range for each element. For instance for [c-f]
4287 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
4288 // add a range if it is not already contained in the input, so [c-f]
4289 // will be skipped but [C-F] will be added. If this range is not
4290 // completely contained in a block we do this for all the blocks
4291 // covered by the range.
4292 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4293 // First, look up the block that contains the 'from' character.
4294 int length = canonrange.get(from(), '\0', range);
4295 if (length == 0) {
4296 range[0] = from();
4297 } else {
4298 ASSERT_EQ(1, length);
4299 }
4300 int pos = from();
4301 // The start of the current block. Note that except for the first
4302 // iteration 'start' is always equal to 'pos'.
4303 int start;
4304 // If it is not the start point of a block the entry contains the
4305 // offset of the character from the start point.
4306 if ((range[0] & kStartMarker) == 0) {
4307 start = pos - range[0];
4308 } else {
4309 start = pos;
4310 }
4311 // Then we add the ranges on at a time, incrementing the current
4312 // position to be after the last block each time. The position
4313 // always points to the start of a block.
4314 while (pos < to()) {
4315 length = canonrange.get(start, '\0', range);
4316 if (length == 0) {
4317 range[0] = start;
4318 } else {
4319 ASSERT_EQ(1, length);
4320 }
4321 ASSERT((range[0] & kStartMarker) != 0);
4322 // The start point of a block contains the distance to the end
4323 // of the range.
4324 int block_end = start + (range[0] & kPayloadMask) - 1;
4325 int end = (block_end > to()) ? to() : block_end;
4326 length = uncanonicalize.get(start, '\0', range);
4327 for (int i = 0; i < length; i++) {
4328 uc32 c = range[i];
4329 uc16 range_from = c + (pos - start);
4330 uc16 range_to = c + (end - start);
4331 if (!(from() <= range_from && range_to <= to())) {
4332 ranges->Add(CharacterRange(range_from, range_to));
4333 }
4334 }
4335 start = pos = block_end + 1;
4336 }
4337 } else {
4338 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
4339 }
4340}
4341
4342
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004343ZoneList<CharacterRange>* CharacterSet::ranges() {
4344 if (ranges_ == NULL) {
4345 ranges_ = new ZoneList<CharacterRange>(2);
4346 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4347 }
4348 return ranges_;
4349}
4350
4351
4352
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004353// -------------------------------------------------------------------
4354// Interest propagation
4355
4356
4357RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4358 for (int i = 0; i < siblings_.length(); i++) {
4359 RegExpNode* sibling = siblings_.Get(i);
4360 if (sibling->info()->Matches(info))
4361 return sibling;
4362 }
4363 return NULL;
4364}
4365
4366
4367RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4368 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004369 siblings_.Ensure(this);
4370 RegExpNode* result = TryGetSibling(info);
4371 if (result != NULL) return result;
4372 result = this->Clone();
4373 NodeInfo* new_info = result->info();
4374 new_info->ResetCompilationState();
4375 new_info->AddFromPreceding(info);
4376 AddSibling(result);
4377 *cloned = true;
4378 return result;
4379}
4380
4381
4382template <class C>
4383static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4384 NodeInfo full_info(*node->info());
4385 full_info.AddFromPreceding(info);
4386 bool cloned = false;
4387 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4388}
4389
4390
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004391// -------------------------------------------------------------------
4392// Splay tree
4393
4394
4395OutSet* OutSet::Extend(unsigned value) {
4396 if (Get(value))
4397 return this;
4398 if (successors() != NULL) {
4399 for (int i = 0; i < successors()->length(); i++) {
4400 OutSet* successor = successors()->at(i);
4401 if (successor->Get(value))
4402 return successor;
4403 }
4404 } else {
4405 successors_ = new ZoneList<OutSet*>(2);
4406 }
4407 OutSet* result = new OutSet(first_, remaining_);
4408 result->Set(value);
4409 successors()->Add(result);
4410 return result;
4411}
4412
4413
4414void OutSet::Set(unsigned value) {
4415 if (value < kFirstLimit) {
4416 first_ |= (1 << value);
4417 } else {
4418 if (remaining_ == NULL)
4419 remaining_ = new ZoneList<unsigned>(1);
4420 if (remaining_->is_empty() || !remaining_->Contains(value))
4421 remaining_->Add(value);
4422 }
4423}
4424
4425
4426bool OutSet::Get(unsigned value) {
4427 if (value < kFirstLimit) {
4428 return (first_ & (1 << value)) != 0;
4429 } else if (remaining_ == NULL) {
4430 return false;
4431 } else {
4432 return remaining_->Contains(value);
4433 }
4434}
4435
4436
4437const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4438const DispatchTable::Entry DispatchTable::Config::kNoValue;
4439
4440
4441void DispatchTable::AddRange(CharacterRange full_range, int value) {
4442 CharacterRange current = full_range;
4443 if (tree()->is_empty()) {
4444 // If this is the first range we just insert into the table.
4445 ZoneSplayTree<Config>::Locator loc;
4446 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4447 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4448 return;
4449 }
4450 // First see if there is a range to the left of this one that
4451 // overlaps.
4452 ZoneSplayTree<Config>::Locator loc;
4453 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4454 Entry* entry = &loc.value();
4455 // If we've found a range that overlaps with this one, and it
4456 // starts strictly to the left of this one, we have to fix it
4457 // because the following code only handles ranges that start on
4458 // or after the start point of the range we're adding.
4459 if (entry->from() < current.from() && entry->to() >= current.from()) {
4460 // Snap the overlapping range in half around the start point of
4461 // the range we're adding.
4462 CharacterRange left(entry->from(), current.from() - 1);
4463 CharacterRange right(current.from(), entry->to());
4464 // The left part of the overlapping range doesn't overlap.
4465 // Truncate the whole entry to be just the left part.
4466 entry->set_to(left.to());
4467 // The right part is the one that overlaps. We add this part
4468 // to the map and let the next step deal with merging it with
4469 // the range we're adding.
4470 ZoneSplayTree<Config>::Locator loc;
4471 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4472 loc.set_value(Entry(right.from(),
4473 right.to(),
4474 entry->out_set()));
4475 }
4476 }
4477 while (current.is_valid()) {
4478 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4479 (loc.value().from() <= current.to()) &&
4480 (loc.value().to() >= current.from())) {
4481 Entry* entry = &loc.value();
4482 // We have overlap. If there is space between the start point of
4483 // the range we're adding and where the overlapping range starts
4484 // then we have to add a range covering just that space.
4485 if (current.from() < entry->from()) {
4486 ZoneSplayTree<Config>::Locator ins;
4487 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4488 ins.set_value(Entry(current.from(),
4489 entry->from() - 1,
4490 empty()->Extend(value)));
4491 current.set_from(entry->from());
4492 }
4493 ASSERT_EQ(current.from(), entry->from());
4494 // If the overlapping range extends beyond the one we want to add
4495 // we have to snap the right part off and add it separately.
4496 if (entry->to() > current.to()) {
4497 ZoneSplayTree<Config>::Locator ins;
4498 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4499 ins.set_value(Entry(current.to() + 1,
4500 entry->to(),
4501 entry->out_set()));
4502 entry->set_to(current.to());
4503 }
4504 ASSERT(entry->to() <= current.to());
4505 // The overlapping range is now completely contained by the range
4506 // we're adding so we can just update it and move the start point
4507 // of the range we're adding just past it.
4508 entry->AddValue(value);
4509 // Bail out if the last interval ended at 0xFFFF since otherwise
4510 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004511 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004512 break;
4513 ASSERT(entry->to() + 1 > current.from());
4514 current.set_from(entry->to() + 1);
4515 } else {
4516 // There is no overlap so we can just add the range
4517 ZoneSplayTree<Config>::Locator ins;
4518 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4519 ins.set_value(Entry(current.from(),
4520 current.to(),
4521 empty()->Extend(value)));
4522 break;
4523 }
4524 }
4525}
4526
4527
4528OutSet* DispatchTable::Get(uc16 value) {
4529 ZoneSplayTree<Config>::Locator loc;
4530 if (!tree()->FindGreatestLessThan(value, &loc))
4531 return empty();
4532 Entry* entry = &loc.value();
4533 if (value <= entry->to())
4534 return entry->out_set();
4535 else
4536 return empty();
4537}
4538
4539
4540// -------------------------------------------------------------------
4541// Analysis
4542
4543
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004544void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004545 if (that->info()->been_analyzed || that->info()->being_analyzed)
4546 return;
4547 that->info()->being_analyzed = true;
4548 that->Accept(this);
4549 that->info()->being_analyzed = false;
4550 that->info()->been_analyzed = true;
4551}
4552
4553
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004554void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004555 // nothing to do
4556}
4557
4558
ager@chromium.org8bb60582008-12-11 12:02:20 +00004559void TextNode::CalculateOffsets() {
4560 int element_count = elements()->length();
4561 // Set up the offsets of the elements relative to the start. This is a fixed
4562 // quantity since a TextNode can only contain fixed-width things.
4563 int cp_offset = 0;
4564 for (int i = 0; i < element_count; i++) {
4565 TextElement& elm = elements()->at(i);
4566 elm.cp_offset = cp_offset;
4567 if (elm.type == TextElement::ATOM) {
4568 cp_offset += elm.data.u_atom->data().length();
4569 } else {
4570 cp_offset++;
4571 Vector<const uc16> quarks = elm.data.u_atom->data();
4572 }
4573 }
4574}
4575
4576
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004577void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004578 if (ignore_case_) {
4579 that->MakeCaseIndependent();
4580 }
4581 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004582 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004583}
4584
4585
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004586void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004587 RegExpNode* target = that->on_success();
4588 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004589 // If the next node is interested in what it follows then this node
4590 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004591 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004592}
4593
4594
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004595void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004596 NodeInfo* info = that->info();
4597 for (int i = 0; i < that->alternatives()->length(); i++) {
4598 RegExpNode* node = that->alternatives()->at(i).node();
4599 EnsureAnalyzed(node);
4600 // Anything the following nodes need to know has to be known by
4601 // this node also, so it can pass it on.
4602 info->AddFromFollowing(node->info());
4603 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004604}
4605
4606
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004607void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4608 NodeInfo* info = that->info();
4609 for (int i = 0; i < that->alternatives()->length(); i++) {
4610 RegExpNode* node = that->alternatives()->at(i).node();
4611 if (node != that->loop_node()) {
4612 EnsureAnalyzed(node);
4613 info->AddFromFollowing(node->info());
4614 }
4615 }
4616 // Check the loop last since it may need the value of this node
4617 // to get a correct result.
4618 EnsureAnalyzed(that->loop_node());
4619 info->AddFromFollowing(that->loop_node()->info());
4620}
4621
4622
4623void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004624 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004625}
4626
4627
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004628void Analysis::VisitAssertion(AssertionNode* that) {
4629 EnsureAnalyzed(that->on_success());
4630}
4631
4632
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004633// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004634// Dispatch table construction
4635
4636
4637void DispatchTableConstructor::VisitEnd(EndNode* that) {
4638 AddRange(CharacterRange::Everything());
4639}
4640
4641
4642void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4643 node->set_being_calculated(true);
4644 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4645 for (int i = 0; i < alternatives->length(); i++) {
4646 set_choice_index(i);
4647 alternatives->at(i).node()->Accept(this);
4648 }
4649 node->set_being_calculated(false);
4650}
4651
4652
4653class AddDispatchRange {
4654 public:
4655 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4656 : constructor_(constructor) { }
4657 void Call(uc32 from, DispatchTable::Entry entry);
4658 private:
4659 DispatchTableConstructor* constructor_;
4660};
4661
4662
4663void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4664 CharacterRange range(from, entry.to());
4665 constructor_->AddRange(range);
4666}
4667
4668
4669void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4670 if (node->being_calculated())
4671 return;
4672 DispatchTable* table = node->GetTable(ignore_case_);
4673 AddDispatchRange adder(this);
4674 table->ForEach(&adder);
4675}
4676
4677
4678void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4679 // TODO(160): Find the node that we refer back to and propagate its start
4680 // set back to here. For now we just accept anything.
4681 AddRange(CharacterRange::Everything());
4682}
4683
4684
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004685void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4686 RegExpNode* target = that->on_success();
4687 target->Accept(this);
4688}
4689
4690
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004691
4692static int CompareRangeByFrom(const CharacterRange* a,
4693 const CharacterRange* b) {
4694 return Compare<uc16>(a->from(), b->from());
4695}
4696
4697
4698void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4699 ranges->Sort(CompareRangeByFrom);
4700 uc16 last = 0;
4701 for (int i = 0; i < ranges->length(); i++) {
4702 CharacterRange range = ranges->at(i);
4703 if (last < range.from())
4704 AddRange(CharacterRange(last, range.from() - 1));
4705 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004706 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004707 return;
4708 } else {
4709 last = range.to() + 1;
4710 }
4711 }
4712 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004713 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004714}
4715
4716
4717void DispatchTableConstructor::VisitText(TextNode* that) {
4718 TextElement elm = that->elements()->at(0);
4719 switch (elm.type) {
4720 case TextElement::ATOM: {
4721 uc16 c = elm.data.u_atom->data()[0];
4722 AddRange(CharacterRange(c, c));
4723 break;
4724 }
4725 case TextElement::CHAR_CLASS: {
4726 RegExpCharacterClass* tree = elm.data.u_char_class;
4727 ZoneList<CharacterRange>* ranges = tree->ranges();
4728 if (tree->is_negated()) {
4729 AddInverse(ranges);
4730 } else {
4731 for (int i = 0; i < ranges->length(); i++)
4732 AddRange(ranges->at(i));
4733 }
4734 break;
4735 }
4736 default: {
4737 UNIMPLEMENTED();
4738 }
4739 }
4740}
4741
4742
4743void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004744 RegExpNode* target = that->on_success();
4745 target->Accept(this);
4746}
4747
4748
ager@chromium.org8bb60582008-12-11 12:02:20 +00004749Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004750 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004751 bool is_multiline,
4752 Handle<String> pattern,
4753 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004754 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
4755 return IrregexpRegExpTooBig(pattern);
4756 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004757 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004758 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004759 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004760 0,
4761 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004762 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004763 RegExpNode* node = captured_body;
4764 if (!data->tree->IsAnchored()) {
4765 // Add a .*? at the beginning, outside the body capture, unless
4766 // this expression is anchored at the beginning.
4767 node = RegExpQuantifier::ToNode(0,
4768 RegExpTree::kInfinity,
4769 false,
4770 new RegExpCharacterClass('*'),
4771 &compiler,
4772 captured_body);
4773 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004774 data->node = node;
4775 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004776 analysis.EnsureAnalyzed(node);
4777
4778 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004779
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004780 if (FLAG_irregexp_native) {
4781#ifdef ARM
4782 // Unimplemented, fall-through to bytecode implementation.
4783#else // IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004784 RegExpMacroAssemblerIA32::Mode mode;
4785 if (is_ascii) {
4786 mode = RegExpMacroAssemblerIA32::ASCII;
4787 } else {
4788 mode = RegExpMacroAssemblerIA32::UC16;
4789 }
4790 RegExpMacroAssemblerIA32 macro_assembler(mode,
4791 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004792 return compiler.Assemble(&macro_assembler,
4793 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004794 data->capture_count,
4795 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004796#endif
4797 }
4798 EmbeddedVector<byte, 1024> codes;
4799 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4800 return compiler.Assemble(&macro_assembler,
4801 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004802 data->capture_count,
4803 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004804}
4805
4806
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004807}} // namespace v8::internal